/**
 * Sensitive Words Dictionary
 *
 * Provides sensitive word lists for content security filtering.
 * This is the server-side master word list that gets served to clients.
 *
 * Categories:
 * - politics: Politically harmful content
 * - pornography: Pornographic and obscene content
 * - gambling: Gambling and illegal betting content
 * - violence: Violent and threatening content
 * - abuse: Abusive and insulting language
 * - fraud: Fraud and scam content
 * - other: Other regulated content
 */

/**
 * Word lists keyed by category name.
 *
 * Key order and the order of entries inside each list are significant: they
 * determine the order of the flattened list and of reported matches.
 * Matching is done by substring, so a short entry (e.g. '传销') also covers
 * every longer entry that contains it (e.g. '传销组织').
 *
 * @type {Record<string, string[]>}
 */
const SENSITIVE_WORDS = {
  politics: [
    // Politically sensitive terms (representative samples)
    '颠覆国家', '推翻政权', '分裂国家', '恐怖组织', '极端主义',
    '反动', '暴乱', '煽动颠覆', '分裂势力', '恐怖袭击',
    '邪教组织', '法轮', '法轮功', '台独', '藏独', '疆独',
    // Political figure names and common variants (homophone / split-char evasion)
    '习近平', '刁近平', '习大大', '习主席', '习总',
    'XiJinping', 'xijinping', '习近', '近平',
    '李强', '王岐山', '栗战书', '汪洋', '韩正',
    '李克强', '胡锦涛', '江泽民', '温家宝', '朱镕基',
    '邓小平', '毛泽东', '周恩来', '刘少奇', '彭德怀',
    '薄熙来', '周永康', '徐才厚', '郭伯雄', '令计划',
    '孙政才', '赵乐际', '王沪宁', '丁薛祥', '蔡奇',
  ],

  pornography: [
    // Pornographic and obscene terms (representative samples)
    '色情', '淫秽', '裸体', '性交', '卖淫',
    '嫖娼', '成人电影', '情色', '黄色视频', '一夜情',
    '援交', '约炮', '色诱', '露点', '性服务',
  ],

  gambling: [
    // Gambling related terms (representative samples)
    '赌博', '赌场', '下注', '赌资', '博彩',
    '六合彩', '时时彩', '赌球', '网络赌', '百家乐',
    '老虎机', '扑克赌', '赌狗', '开盘下注', '庄家赔率',
  ],

  violence: [
    // Violence related terms (representative samples)
    '杀人', '砍人', '捅死', '爆炸装置', '自制炸弹',
    '灭门', '血腥屠杀', '残忍杀害', '暴力袭击', '砍杀',
  ],

  abuse: [
    // Abusive language (representative samples)
    '傻逼', '操你', '妈的', '去死', '废物',
    '滚蛋', '贱人', '狗日的', '草泥马', '脑残',
    '白痴', '弱智', '猪头', '王八蛋', '混蛋',
  ],

  fraud: [
    // Fraud and scam terms (representative samples)
    '代开发票', '虚假投资', '传销', '诈骗', '骗钱',
    '刷单', '套现', '洗钱', '假币', '传销组织',
  ],

  other: [
    // Other regulated terms
    '代孕', '买卖器官', '毒品', '吸毒', '走私',
    '枪支', '管制刀具', '假药', '违禁品',
  ],
};

/**
 * Get all sensitive words as a flat array.
 *
 * The category lists of SENSITIVE_WORDS are concatenated in key order. Words
 * that appear in more than one category are not de-duplicated. A new array is
 * built on every call, so modifying the result does not affect the master list.
 *
 * @returns {string[]}
 */
function getAllWords() {
  return Object.values(SENSITIVE_WORDS).flat();
}

/**
 * Get words grouped by category.
 *
 * Returns a fresh object whose values are copies of the category arrays, so a
 * caller that adds, removes or reorders entries in the result cannot alter the
 * master list held in SENSITIVE_WORDS.
 *
 * @returns {Object<string, string[]>} Map of category name to its word list.
 */
function getWordsByCategory() {
  // A plain object spread would copy only the top level and hand out the
  // master arrays themselves; copy each list as well.
  return Object.fromEntries(
    Object.entries(SENSITIVE_WORDS).map(([category, words]) => [category, [...words]]),
  );
}

/**
 * Get the total count of words.
 *
 * This is the length of the flattened list returned by getAllWords(), so a
 * word listed in several categories is counted once per occurrence.
 *
 * @returns {number}
 */
function getWordCount() {
  return getAllWords().length;
}

// Characters removed from the input before the split-character pass:
// whitespace (including the ideographic space), ASCII and CJK punctuation,
// symbols, bracket pairs, and zero-width characters. The typographic forms of
// the tilde, quotes and parentheses, and both halves of the angle and
// tortoise-shell bracket pairs, are included so they cannot be used as
// separators either.
const EVASION_CHARS_PATTERN = /[\s\u3000.,;:!?·…—\-_|\\/~\uff5e`@#$%^&*+=<>()\uff08\uff09\[\]{}"'\u201c\u201d\u2018\u2019「」『』【】〈〉〔〕\u200b\u200c\u200d\ufeff]/g;

/**
 * Check if a text contains any sensitive words.
 *
 * Matching is a case-insensitive substring search over every word in
 * SENSITIVE_WORDS. Each word is tried against the lower-cased text first and,
 * if that fails and the word has at least two characters, against a copy of
 * the text with separator characters removed (so "a b" or "a-b" still matches
 * the word "ab").
 *
 * @param {string} text - Text to check. Empty or non-string values are treated
 *   as clean input.
 * @returns {{ hasViolation: boolean, matchedWords: string[], categories: string[] }}
 *   `matchedWords` holds each matched dictionary entry once, in dictionary
 *   order; `categories` holds each category that produced a match once.
 */
function checkText(text) {
  if (!text || typeof text !== 'string') {
    return { hasViolation: false, matchedWords: [], categories: [] };
  }

  const lowerText = text.toLowerCase();
  // Strip common evasion characters for split-char detection.
  const strippedText = text.replace(EVASION_CHARS_PATTERN, '').toLowerCase();

  // Sets keep insertion order and drop repeats (the same word may be listed
  // under several categories).
  const matchedWords = new Set();
  const categories = new Set();

  for (const [category, words] of Object.entries(SENSITIVE_WORDS)) {
    for (const word of words) {
      const lowerWord = word.toLowerCase();
      const isDirectHit = lowerText.includes(lowerWord);
      // Single-character words are only matched directly.
      const isSplitHit = !isDirectHit && word.length >= 2 && strippedText.includes(lowerWord);

      if (isDirectHit || isSplitHit) {
        matchedWords.add(word);
        categories.add(category);
      }
    }
  }

  return {
    hasViolation: matchedWords.size > 0,
    matchedWords: [...matchedWords],
    categories: [...categories],
  };
}

/**
 * Get the version/timestamp of the word list for cache validation.
 *
 * The value is a hard-coded string; it does not change when SENSITIVE_WORDS is
 * modified at runtime.
 * NOTE(review): presumably this must be bumped by hand whenever the word list
 * is edited — confirm against how clients use it.
 *
 * @returns {string}
 */
function getVersion() {
  return '2026-05-12-v2';
}

module.exports = {
|
||
SENSITIVE_WORDS,
|
||
getAllWords,
|
||
getWordsByCategory,
|
||
getWordCount,
|
||
checkText,
|
||
getVersion,
|
||
}; |