Skip to content

Commit 2a5914b

Browse files
authored
Merge pull request #945 from lmanchu/feat/zh-entity-detection
feat(i18n): add Traditional + Simplified Chinese entity detection
2 parents 32ec74d + c88b8a2 commit 2a5914b

3 files changed

Lines changed: 275 additions & 0 deletions

File tree

mempalace/i18n/zh-CN.json

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,93 @@
4040
"stop_words": "的 了 在 是 我 有 和 就 不 人 都 一 一个 上 也 很 到 说 要 去 你 会 着 没有 看 好 自己 这 那 她 他 它 们 但是 因为 所以 如果 虽然 然后 或者 而且",
4141
"quote_pattern": "\\u201C([^\\u201D]{10,100})\\u201D|\"([^\"]{10,100})\"",
4242
"action_pattern": "(构建|修复|添加|删除|确认|创建|实现|修理|编写|测试|验证|更新|配置|启动|停止)(?:了|完成|成功)"
43+
},
44+
"entity": {
45+
"boundary_chars": "\\u4E00-\\u9FFF",
46+
"candidate_pattern": "[王李张刘陈杨赵黄周吴徐孙朱胡郭何高林罗郑梁谢宋唐许韩冯邓曹彭曾萧田董袁潘于蒋蔡余杜叶程苏魏吕丁任沈姚卢姜崔钟谭陆汪范金石廖贾夏韦方白邹孟熊秦邱江尹薛阎段雷侯龙史陶黎贺顾毛郝龚邵万钱严武戴莫孔向汤温庞殷章葛管甘卞冉蓝习][\\u4E00-\\u9FFF]{1,2}",
47+
"person_verb_patterns": [
48+
"{name}说",
49+
"{name}问",
50+
"{name}答",
51+
"{name}表示",
52+
"{name}回答",
53+
"{name}提出",
54+
"{name}决定",
55+
"{name}认为",
56+
"{name}指出",
57+
"{name}解释",
58+
"{name}告诉",
59+
"{name}写道",
60+
"{name}想",
61+
"{name}觉得",
62+
"{name}知道",
63+
"{name}喜欢",
64+
"{name}讨厌",
65+
"{name}确认",
66+
"{name}提醒",
67+
"{name}分享",
68+
"{name}建议",
69+
"{name}同意",
70+
"{name}反对"
71+
],
72+
"pronoun_patterns": [
73+
"他们",
74+
"她们",
75+
"",
76+
"",
77+
"",
78+
"",
79+
""
80+
],
81+
"dialogue_patterns": [
82+
"^>\\s*{name}[::\\s]",
83+
"^{name}[::]\\s?",
84+
"^\\[{name}\\]",
85+
"\u201C{name}[\u201D::]",
86+
"「{name}[」::]"
87+
],
88+
"direct_address_pattern": "嗨\\s*{name}|喂\\s*{name}|谢谢\\s*{name}|感谢\\s*{name}|哈喽\\s*{name}|亲爱的\\s*{name}",
89+
"project_verb_patterns": [
90+
"建立{name}",
91+
"打造{name}",
92+
"部署{name}",
93+
"启动{name}",
94+
"发布{name}",
95+
"上线{name}",
96+
"开发{name}",
97+
"维护{name}",
98+
"{name}系统",
99+
"{name}平台",
100+
"{name}项目",
101+
"{name}架构",
102+
"{name}管线",
103+
"{name}v\\d+",
104+
"\\bimport\\s+{name}\\b",
105+
"\\bpip\\s+install\\s+{name}\\b"
106+
],
107+
"stopwords": [
108+
"", "", "", "", "", "", "", "", "", "", "", "",
109+
"", "", "", "", "", "", "", "",
110+
"我们", "你们", "妳们", "他们", "她们", "它们", "咱们",
111+
"自己", "大家", "有人", "没人",
112+
"今天", "明天", "昨天", "前天", "后天", "今年", "明年", "去年",
113+
"早上", "下午", "晚上", "中午", "凌晨",
114+
"现在", "刚才", "刚刚", "等等", "等下", "待会",
115+
"最近", "以前", "之前", "之后", "以后", "后来",
116+
"什么", "为什么", "怎么", "怎样", "哪里", "哪个",
117+
"这个", "那个", "这里", "那里", "这些", "那些", "这样", "那样",
118+
"但是", "可是", "然后", "所以", "因为", "如果", "虽然",
119+
"而且", "或者", "或是", "还是", "不过", "只是", "不只",
120+
"既然", "不然", "否则", "此外", "另外",
121+
"", "非常", "相当", "真的", "确实", "当然", "其实",
122+
"已经", "正在", "即将", "将要", "刚好", "恰好",
123+
"可能", "也许", "或许", "大概", "应该", "必须", "一定",
124+
"完成", "执行", "进行", "开始", "结束", "继续", "停止", "完毕",
125+
"没有", "有点", "有些", "一些", "许多", "很多",
126+
"问题", "答案", "原因", "结果", "情况", "状况",
127+
"主要", "重要", "基本", "简单", "复杂", "特别",
128+
"谢谢", "感谢", "对不起", "不好意思", "请问",
129+
"欢迎", "再见", "你好", "您好", "哈喽", "拜拜"
130+
]
43131
}
44132
}

mempalace/i18n/zh-TW.json

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,93 @@
4040
"stop_words": "的 了 在 是 我 有 和 就 不 人 都 一 一個 上 也 很 到 說 要 去 你 會 著 沒有 看 好 自己 這 那 她 他 它 們 但是 因為 所以 如果 雖然 然後 或者 而且",
4141
"quote_pattern": "「([^」]{10,100})」|\u201c([^\u201d]{10,100})\u201d",
4242
"action_pattern": "(構建|修復|添加|刪除|確認|創建|實現|修理|編寫|測試|驗證|更新|配置|啟動|停止)(?:了|完成|成功)"
43+
},
44+
"entity": {
45+
"boundary_chars": "\\u4E00-\\u9FFF",
46+
"candidate_pattern": "[王李張劉陳楊趙黃周吳徐孫朱胡郭何高林羅鄭梁謝宋唐許韓馮鄧曹彭曾蕭田董袁潘于蔣蔡余杜葉程蘇魏呂丁任沈姚盧姜崔鍾譚陸汪范金石廖賈夏韋方白鄒孟熊秦邱江尹薛閻段雷侯龍史陶黎賀顧毛郝龔邵萬錢嚴武戴莫孔向湯溫龐殷章葛管甘卞冉藍習][\\u4E00-\\u9FFF]{1,2}",
47+
"person_verb_patterns": [
48+
"{name}說",
49+
"{name}問",
50+
"{name}答",
51+
"{name}表示",
52+
"{name}回答",
53+
"{name}提出",
54+
"{name}決定",
55+
"{name}認為",
56+
"{name}指出",
57+
"{name}解釋",
58+
"{name}告訴",
59+
"{name}寫道",
60+
"{name}想",
61+
"{name}覺得",
62+
"{name}知道",
63+
"{name}喜歡",
64+
"{name}討厭",
65+
"{name}確認",
66+
"{name}提醒",
67+
"{name}分享",
68+
"{name}建議",
69+
"{name}同意",
70+
"{name}反對"
71+
],
72+
"pronoun_patterns": [
73+
"他們",
74+
"她們",
75+
"",
76+
"",
77+
"",
78+
"",
79+
""
80+
],
81+
"dialogue_patterns": [
82+
"^>\\s*{name}[::\\s]",
83+
"^{name}[::]\\s?",
84+
"^\\[{name}\\]",
85+
"「{name}[」::]",
86+
"『{name}[』::]"
87+
],
88+
"direct_address_pattern": "嗨\\s*{name}|喂\\s*{name}|謝謝\\s*{name}|感謝\\s*{name}|哈囉\\s*{name}|親愛的\\s*{name}",
89+
"project_verb_patterns": [
90+
"建立{name}",
91+
"打造{name}",
92+
"部署{name}",
93+
"啟動{name}",
94+
"發布{name}",
95+
"上線{name}",
96+
"開發{name}",
97+
"維護{name}",
98+
"{name}系統",
99+
"{name}平台",
100+
"{name}專案",
101+
"{name}架構",
102+
"{name}管線",
103+
"{name}v\\d+",
104+
"\\bimport\\s+{name}\\b",
105+
"\\bpip\\s+install\\s+{name}\\b"
106+
],
107+
"stopwords": [
108+
"", "", "", "", "", "", "", "", "", "", "", "",
109+
"", "", "", "", "", "", "", "",
110+
"我們", "你們", "妳們", "他們", "她們", "它們", "咱們",
111+
"自己", "大家", "有人", "沒人",
112+
"今天", "明天", "昨天", "前天", "後天", "今年", "明年", "去年",
113+
"早上", "下午", "晚上", "中午", "凌晨",
114+
"現在", "剛才", "剛剛", "等等", "等下", "待會",
115+
"最近", "以前", "之前", "之後", "以後", "後來",
116+
"什麼", "甚麼", "為什麼", "怎麼", "怎樣", "哪裡", "哪個",
117+
"這個", "那個", "這裡", "那裡", "這些", "那些", "這樣", "那樣",
118+
"但是", "可是", "然後", "所以", "因為", "如果", "雖然",
119+
"而且", "或者", "或是", "還是", "不過", "只是", "不只",
120+
"既然", "不然", "否則", "此外", "另外",
121+
"", "非常", "相當", "真的", "確實", "當然", "其實",
122+
"已經", "正在", "即將", "將要", "剛好", "恰好",
123+
"可能", "也許", "或許", "大概", "應該", "必須", "一定",
124+
"完成", "執行", "進行", "開始", "結束", "繼續", "停止", "完畢",
125+
"沒有", "有點", "有些", "一些", "許多", "很多",
126+
"問題", "答案", "原因", "結果", "情況", "狀況",
127+
"主要", "重要", "基本", "簡單", "複雜", "特別",
128+
"謝謝", "感謝", "對不起", "不好意思", "請問",
129+
"歡迎", "再見", "你好", "您好", "哈囉", "掰掰"
130+
]
43131
}
44132
}

tests/test_entity_detector.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -661,3 +661,102 @@ def test_boundary_chars_english_regression():
661661
result = extract_candidates(text, languages=("en",))
662662
assert "Riley" in result
663663
assert result["Riley"] >= 3
664+
665+
666+
# ── Chinese (zh-TW / zh-CN) entity detection ──────────────────────────
667+
668+
# CJK scripts have no word delimiters — a regex-based extractor can only
669+
# catch names when they have a non-CJK neighbour (whitespace, punctuation,
670+
# newline, or mixed English). Real-world technical notes in zh-TW / zh-CN
671+
# routinely satisfy this: names appear at the start of bullet lines, next
672+
# to English terms, or before full-width punctuation 「」:,。. The patterns
673+
# below target that realistic regime.
674+
675+
676+
def test_zh_tw_candidate_extraction_at_boundaries():
677+
"""A 3-char Traditional Chinese name is extracted when neighboured by
678+
whitespace, English, full-width punctuation, or line-start."""
679+
text = (
680+
"# 會議紀錄\n"
681+
"- 朱宜振 主持\n"
682+
"朱宜振 跟 Jeffrey 討論。\n"
683+
"朱宜振: 方向正確。\n"
684+
"朱宜振, 明天 pitch。\n"
685+
)
686+
result = extract_candidates(text, languages=("zh-TW",))
687+
assert "朱宜振" in result, f"expected 朱宜振 in {result}"
688+
assert result["朱宜振"] >= 3
689+
690+
691+
def test_zh_tw_person_classification():
692+
"""A Traditional Chinese name with dialogue + verb context classifies
693+
as a person."""
694+
text = (
695+
"朱宜振: 「我們要 6 月 launch。」\n"
696+
"朱宜振 同意 Arnold 的方案。\n"
697+
"朱宜振 覺得 Hermes 方向對。\n"
698+
"朱宜振 決定 ship pitch。\n"
699+
)
700+
lines = text.splitlines()
701+
scores = score_entity("朱宜振", text, lines, languages=("zh-TW",))
702+
# Dialogue + action signals fire — person score dominates
703+
assert scores["person_score"] > 0, f"expected person signals, got {scores}"
704+
705+
706+
def test_zh_tw_stopwords_filter_common_particles():
707+
"""Common Chinese particles / pronouns should be stopword-filtered
708+
even if they happen to share a surname prefix like 甘 or 習."""
709+
from mempalace.i18n import get_entity_patterns
710+
711+
patterns = get_entity_patterns(("zh-TW",))
712+
stopwords = set(patterns["stopwords"])
713+
# Sanity: common particles / pronouns from the source list are present
714+
assert "這個" in stopwords
715+
assert "我們" in stopwords
716+
assert "他們" in stopwords
717+
assert "完成" in stopwords
718+
719+
720+
def test_zh_tw_falls_back_to_english_for_non_cjk_names():
721+
"""English names embedded in Chinese text are still captured via the
722+
English pattern — Lman's Chinese notes mix in names like 'Jeffrey Lai'."""
723+
text = (
724+
"朱宜振 跟 Jeffrey Lai 討論 pitch。\n"
725+
"Jeffrey Lai 報告進度。\n"
726+
"朱宜振 同意 Jeffrey Lai 的方案。\n"
727+
"朱宜振: 確認。\n"
728+
)
729+
result = extract_candidates(text, languages=("zh-TW", "en"))
730+
assert "Jeffrey Lai" in result or "Jeffrey" in result
731+
assert "朱宜振" in result
732+
733+
734+
def test_zh_cn_candidate_extraction():
735+
"""Simplified-Chinese name extraction mirrors zh-TW behaviour."""
736+
text = "张三 今天主持。\n- 张三 跟 Bob 谈。\n张三: 已经搞定了。\n张三, 明天继续。\n"
737+
result = extract_candidates(text, languages=("zh-CN",))
738+
assert "张三" in result, f"expected 张三 in {result}"
739+
assert result["张三"] >= 3
740+
741+
742+
def test_zh_cn_and_zh_tw_union_covers_both_variants():
743+
"""Passing both zh-CN and zh-TW unions the surname sets — a document
744+
mixing simplified 张三 and traditional 張三 extracts both."""
745+
text = "张三 说 hello。张三 笑了。张三 同意。\n張三 也參加。張三 寫 code。張三 決定。\n"
746+
result = extract_candidates(text, languages=("zh-TW", "zh-CN"))
747+
# At least one variant meets freq>=3
748+
assert "张三" in result or "張三" in result
749+
750+
751+
def test_zh_tw_known_limitation_inline_name_no_boundary():
752+
"""Documented limitation: a name sandwiched between CJK chars with no
753+
whitespace or punctuation break is not extracted. This is a fundamental
754+
limit of regex-based CJK entity detection — words have no delimiters.
755+
Realistic Chinese writing has enough non-CJK boundaries (punctuation,
756+
newlines, mixed English) that 3+ occurrences normally produce matches
757+
elsewhere in the document, so this rarely degrades real-world recall."""
758+
# 朱宜振 appears 4x but every instance is flanked by CJK on both sides.
759+
text = "他是朱宜振今天來。說朱宜振決定。又朱宜振負責。問朱宜振意見。"
760+
result = extract_candidates(text, languages=("zh-TW",))
761+
# Extraction is expected to miss this adversarial case.
762+
assert "朱宜振" not in result

0 commit comments

Comments
 (0)