// tankwar_proj/content-security-service/services/sensitiveWords.js

/**
 * Sensitive Words Dictionary
 * Provides sensitive word lists for content security filtering.
 * This is the server-side master word list that gets served to clients.
 *
 * Categories:
 * - politics: Politically harmful content
 * - pornography: Pornographic and obscene content
 * - gambling: Gambling and illegal betting content
 * - violence: Violent and threatening content
 * - abuse: Abusive and insulting language
 * - fraud: Fraud and scam content
 * - other: Other regulated content
 */

// Master dictionary: each key is a category name and each value is the list of
// literal terms filed under that category. checkText() treats every entry as a
// case-insensitive substring, so a short entry also matches inside longer text
// (e.g. '法轮' already matches any text containing '法轮功').
// Iteration order of the categories and of the words within them determines the
// order of results returned by getAllWords() and checkText().
const SENSITIVE_WORDS = {
  politics: [
    // Politically sensitive terms (representative samples)
    '颠覆国家', '推翻政权', '分裂国家', '恐怖组织', '极端主义',
    '反动', '暴乱', '煽动颠覆', '分裂势力', '恐怖袭击',
    '邪教组织', '法轮', '法轮功', '台独', '藏独', '疆独',
    // Political figure names and common variants (homophone / split-char evasion)
    // Latin-script entries are matched case-insensitively; the two-character
    // fragments below match as substrings of the full name as well.
    '习近平', '刁近平', '习大大', '习主席', '习总',
    'XiJinping', 'xijinping', '习近', '近平',
    '李强', '王岐山', '栗战书', '汪洋', '韩正',
    '李克强', '胡锦涛', '江泽民', '温家宝', '朱镕基',
    '邓小平', '毛泽东', '周恩来', '刘少奇', '彭德怀',
    '薄熙来', '周永康', '徐才厚', '郭伯雄', '令计划',
    '孙政才', '赵乐际', '王沪宁', '丁薛祥', '蔡奇',
  ],

  pornography: [
    // Pornographic and obscene terms (representative samples)
    '色情', '淫秽', '裸体', '性交', '卖淫',
    '嫖娼', '成人电影', '情色', '黄色视频', '一夜情',
    '援交', '约炮', '色诱', '露点', '性服务',
  ],

  gambling: [
    // Gambling related terms (representative samples)
    '赌博', '赌场', '下注', '赌资', '博彩',
    '六合彩', '时时彩', '赌球', '网络赌', '百家乐',
    '老虎机', '扑克赌', '赌狗', '开盘下注', '庄家赔率',
  ],

  violence: [
    // Violence related terms (representative samples)
    '杀人', '砍人', '捅死', '爆炸装置', '自制炸弹',
    '灭门', '血腥屠杀', '残忍杀害', '暴力袭击', '砍杀',
  ],

  abuse: [
    // Abusive language (representative samples)
    '傻逼', '操你', '妈的', '去死', '废物',
    '滚蛋', '贱人', '狗日的', '草泥马', '脑残',
    '白痴', '弱智', '猪头', '王八蛋', '混蛋',
  ],

  fraud: [
    // Fraud and scam terms (representative samples)
    '代开发票', '虚假投资', '传销', '诈骗', '骗钱',
    '刷单', '套现', '洗钱', '假币', '传销组织',
  ],

  other: [
    // Other regulated terms
    '代孕', '买卖器官', '毒品', '吸毒', '走私',
    '枪支', '管制刀具', '假药', '违禁品',
  ],
};

/**
 * Collect the words of every category into one flat list.
 * Words appear in dictionary order: category by category, and within a
 * category in the order they are listed. A new array is built on each call.
 * @returns {string[]} All words from all categories.
 */
function getAllWords() {
  const groups = Object.values(SENSITIVE_WORDS);
  return groups.flatMap((group) => group);
}

/**
 * Get words grouped by category.
 *
 * The result is an independent copy: both the outer object and every
 * per-category array are freshly created, so callers may sort, filter or
 * extend what they receive without altering the master dictionary that
 * checkText() and getAllWords() read from.
 * @returns {Object<string, string[]>} Map of category name to a copy of its word list.
 */
function getWordsByCategory() {
  return Object.fromEntries(
    // A plain object spread would share the inner arrays with the master list.
    Object.entries(SENSITIVE_WORDS).map(([category, words]) => [category, [...words]]),
  );
}

/**
 * Count how many words the dictionary holds across all categories.
 * The figure is the length of the list produced by getAllWords().
 * @returns {number} Total number of words.
 */
function getWordCount() {
  const { length: total } = getAllWords();
  return total;
}

// Characters removed before the split-character check: whitespace plus every
// Unicode punctuation, symbol, separator and format (zero-width) character.
// Using Unicode classes instead of a hand-written list covers full-width
// punctuation, curly quotes and both halves of every bracket pair.
const EVASION_CHAR_PATTERN = /[\s\p{P}\p{S}\p{Z}\p{Cf}]/gu;

/**
 * Check if a text contains any sensitive words.
 *
 * Every dictionary entry is matched as a case-insensitive substring, first
 * against the text as given and then against a copy of the text with
 * whitespace, punctuation, symbols and zero-width characters removed, so that
 * a word broken up by filler characters (e.g. "法，轮 功") is still found.
 *
 * @param {string} text - Text to check. Empty or non-string input yields no violation.
 * @returns {{ hasViolation: boolean, matchedWords: string[], categories: string[] }}
 *   matchedWords holds each matched dictionary entry once, in dictionary
 *   order; categories holds each category that produced a match, once.
 */
function checkText(text) {
  if (!text || typeof text !== 'string') {
    return { hasViolation: false, matchedWords: [], categories: [] };
  }

  const lowerText = text.toLowerCase();
  const strippedText = text.replace(EVASION_CHAR_PATTERN, '').toLowerCase();

  // Sets keep first-insertion order, so results stay in dictionary order
  // while duplicates are dropped.
  const matchedWords = new Set();
  const categories = new Set();

  for (const [category, words] of Object.entries(SENSITIVE_WORDS)) {
    for (const word of words) {
      const lowerWord = word.toLowerCase();
      const matchesVerbatim = lowerText.includes(lowerWord);
      // The split-character check only applies to entries of two or more characters.
      const matchesStripped = word.length >= 2 && strippedText.includes(lowerWord);
      if (matchesVerbatim || matchesStripped) {
        matchedWords.add(word);
        categories.add(category);
      }
    }
  }

  return {
    hasViolation: matchedWords.size > 0,
    matchedWords: [...matchedWords],
    categories: [...categories],
  };
}

/**
 * Report the version tag of the word list.
 * The tag is a fixed string that clients can compare against a cached value
 * to decide whether their copy of the list is current.
 * @returns {string} Version identifier of the current word list.
 */
function getVersion() {
  const version = '2026-05-12-v2';
  return version;
}

// Public API (CommonJS): the raw dictionary plus the helper functions defined above.
// SENSITIVE_WORDS is exported by reference, so consumers that import it share
// the same object this module reads from.
module.exports = {
  SENSITIVE_WORDS,
  getAllWords,
  getWordsByCategory,
  getWordCount,
  checkText,
  getVersion,
};