Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions mempalace/i18n/zh-CN.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,93 @@
"stop_words": "的 了 在 是 我 有 和 就 不 人 都 一 一个 上 也 很 到 说 要 去 你 会 着 没有 看 好 自己 这 那 她 他 它 们 但是 因为 所以 如果 虽然 然后 或者 而且",
"quote_pattern": "\\u201C([^\\u201D]{10,100})\\u201D|\"([^\"]{10,200})\"",
"action_pattern": "(构建|修复|添加|删除|确认|创建|实现|修理|编写|测试|验证|更新|配置|启动|停止)(?:了|完成|成功)"
},
"entity": {
"boundary_chars": "\\u4E00-\\u9FFF",
"candidate_pattern": "[王李张刘陈杨赵黄周吴徐孙朱胡郭何高林罗郑梁谢宋唐许韩冯邓曹彭曾萧田董袁潘于蒋蔡余杜叶程苏魏吕丁任沈姚卢姜崔钟谭陆汪范金石廖贾夏韦方白邹孟熊秦邱江尹薛阎段雷侯龙史陶黎贺顾毛郝龚邵万钱严武戴莫孔向汤温庞殷章葛管甘卞冉蓝殷习][\\u4E00-\\u9FFF]{1,2}",
"person_verb_patterns": [
"{name}说",
"{name}问",
"{name}答",
"{name}表示",
"{name}回答",
"{name}提出",
"{name}决定",
"{name}认为",
"{name}指出",
"{name}解释",
"{name}告诉",
"{name}写道",
"{name}想",
"{name}觉得",
"{name}知道",
"{name}喜欢",
"{name}讨厌",
"{name}确认",
"{name}提醒",
"{name}分享",
"{name}建议",
"{name}同意",
"{name}反对"
],
"pronoun_patterns": [
"他们",
"她们",
"他",
"她",
"它",
"您",
"咱"
],
"dialogue_patterns": [
"^>\\s*{name}[::\\s]",
"^{name}[::]\\s?",
"^\\[{name}\\]",
"\u201C{name}[\u201D::]",
"「{name}[」::]"
],
"direct_address_pattern": "嘿\\s*{name}|喂\\s*{name}|谢谢\\s*{name}|感谢\\s*{name}|哈喽\\s*{name}|亲爱的\\s*{name}",
"project_verb_patterns": [
"建立{name}",
"打造{name}",
"部署{name}",
"启动{name}",
"发布{name}",
"上线{name}",
"开发{name}",
"维护{name}",
"{name}系统",
"{name}平台",
"{name}项目",
"{name}架构",
"{name}管线",
"{name}v\\d+",
"\\bimport\\s+{name}\\b",
"\\bpip\\s+install\\s+{name}\\b"
],
"stopwords": [
"的", "了", "着", "过", "得", "地", "吗", "吧", "呢", "啊", "喔", "耶",
"我", "你", "妳", "他", "她", "它", "您", "咱",
"我们", "你们", "妳们", "他们", "她们", "它们", "咱们",
"自己", "大家", "有人", "没人",
"今天", "明天", "昨天", "前天", "后天", "今年", "明年", "去年",
"早上", "下午", "晚上", "中午", "凌晨",
"现在", "刚才", "刚刚", "等等", "等下", "待会",
"最近", "以前", "之前", "之后", "以后", "后来",
"什么", "为什么", "怎么", "怎样", "哪里", "哪个",
"这个", "那个", "这里", "那里", "这些", "那些", "这样", "那样",
"但是", "可是", "然后", "所以", "因为", "如果", "虽然",
"而且", "或者", "或是", "还是", "不过", "只是", "不只",
"既然", "不然", "否则", "此外", "另外",
"很", "非常", "相当", "真的", "确实", "当然", "其实",
"已经", "正在", "即将", "将要", "刚好", "恰好",
"可能", "也许", "或许", "大概", "应该", "必须", "一定",
"完成", "执行", "进行", "开始", "结束", "继续", "停止", "完毕",
"没有", "有点", "有些", "一些", "许多", "很多",
"问题", "答案", "原因", "结果", "情况", "状况",
"主要", "重要", "基本", "简单", "复杂", "特别",
"谢谢", "感谢", "对不起", "不好意思", "请问",
"欢迎", "再见", "你好", "您好", "哈喽", "拜拜"
]
}
}
88 changes: 88 additions & 0 deletions mempalace/i18n/zh-TW.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,93 @@
"stop_words": "的 了 在 是 我 有 和 就 不 人 都 一 一個 上 也 很 到 說 要 去 你 會 著 沒有 看 好 自己 這 那 她 他 它 們 但是 因為 所以 如果 雖然 然後 或者 而且",
"quote_pattern": "「([^」]{10,100})」|\u201c([^\u201d]{10,100})\u201d",
"action_pattern": "(構建|修復|添加|刪除|確認|創建|實現|修理|編寫|測試|驗證|更新|配置|啟動|停止)(?:了|完成|成功)"
},
"entity": {
"boundary_chars": "\\u4E00-\\u9FFF",
"candidate_pattern": "[王李張劉陳楊趙黃周吳徐孫朱胡郭何高林羅鄭梁謝宋唐許韓馮鄧曹彭曾蕭田董袁潘于蔣蔡余杜葉程蘇魏呂丁任沈姚盧姜崔鍾譚陸汪范金石廖賈夏韋方白鄒孟熊秦邱江尹薛閻段雷侯龍史陶黎賀顧毛郝龔邵萬錢嚴武戴莫孔向湯溫龐殷章葛管甘卞冉藍殷習][\\u4E00-\\u9FFF]{1,2}",
"person_verb_patterns": [
"{name}說",
"{name}問",
"{name}答",
"{name}表示",
"{name}回答",
"{name}提出",
"{name}決定",
"{name}認為",
"{name}指出",
"{name}解釋",
"{name}告訴",
"{name}寫道",
"{name}想",
"{name}覺得",
"{name}知道",
"{name}喜歡",
"{name}討厭",
"{name}確認",
"{name}提醒",
"{name}分享",
"{name}建議",
"{name}同意",
"{name}反對"
],
"pronoun_patterns": [
"他們",
"她們",
"他",
"她",
"它",
"您",
"咱"
],
"dialogue_patterns": [
"^>\\s*{name}[::\\s]",
"^{name}[::]\\s?",
"^\\[{name}\\]",
"「{name}[」::]",
"『{name}[』::]"
],
"direct_address_pattern": "嘿\\s*{name}|喂\\s*{name}|謝謝\\s*{name}|感謝\\s*{name}|哈囉\\s*{name}|親愛的\\s*{name}",
"project_verb_patterns": [
"建立{name}",
"打造{name}",
"部署{name}",
"啟動{name}",
"發布{name}",
"上線{name}",
"開發{name}",
"維護{name}",
"{name}系統",
"{name}平台",
"{name}專案",
"{name}架構",
"{name}管線",
"{name}v\\d+",
"\\bimport\\s+{name}\\b",
"\\bpip\\s+install\\s+{name}\\b"
],
"stopwords": [
"的", "了", "著", "過", "得", "地", "嗎", "吧", "呢", "啊", "喔", "耶",
"我", "你", "妳", "他", "她", "它", "您", "咱",
"我們", "你們", "妳們", "他們", "她們", "它們", "咱們",
"自己", "大家", "有人", "沒人",
"今天", "明天", "昨天", "前天", "後天", "今年", "明年", "去年",
"早上", "下午", "晚上", "中午", "凌晨",
"現在", "剛才", "剛剛", "等等", "等下", "待會",
"最近", "以前", "之前", "之後", "以後", "後來",
"什麼", "甚麼", "為什麼", "怎麼", "怎樣", "哪裡", "哪個",
"這個", "那個", "這裡", "那裡", "這些", "那些", "這樣", "那樣",
"但是", "可是", "然後", "所以", "因為", "如果", "雖然",
"而且", "或者", "或是", "還是", "不過", "只是", "不只",
"既然", "不然", "否則", "此外", "另外",
"很", "非常", "相當", "真的", "確實", "當然", "其實",
"已經", "正在", "即將", "將要", "剛好", "恰好",
"可能", "也許", "或許", "大概", "應該", "必須", "一定",
"完成", "執行", "進行", "開始", "結束", "繼續", "停止", "完畢",
"沒有", "有點", "有些", "一些", "許多", "很多",
"問題", "答案", "原因", "結果", "情況", "狀況",
"主要", "重要", "基本", "簡單", "複雜", "特別",
"謝謝", "感謝", "對不起", "不好意思", "請問",
"歡迎", "再見", "你好", "您好", "哈囉", "掰掰"
]
}
}
99 changes: 99 additions & 0 deletions tests/test_entity_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,3 +661,102 @@ def test_boundary_chars_english_regression():
result = extract_candidates(text, languages=("en",))
assert "Riley" in result
assert result["Riley"] >= 3


# ── Chinese (zh-TW / zh-CN) entity detection ──────────────────────────

# CJK scripts have no word delimiters — a regex-based extractor can only
# catch names when they have a non-CJK neighbour (whitespace, punctuation,
# newline, or mixed English). Real-world technical notes in zh-TW / zh-CN
# routinely satisfy this: names appear at the start of bullet lines, next
# to English terms, or before full-width punctuation 「」:,。. The patterns
# below target that realistic regime.


def test_zh_tw_candidate_extraction_at_boundaries():
    """Every occurrence of the 3-char name 朱宜振 in this fixture sits next
    to a non-CJK neighbour (line start, whitespace, ASCII punctuation, or
    mixed English), so each instance is extractable and the frequency
    threshold of 3 is met."""
    doc = (
        "# 會議紀錄\n"
        "- 朱宜振 主持\n"
        "朱宜振 跟 Jeffrey 討論。\n"
        "朱宜振: 方向正確。\n"
        "朱宜振, 明天 pitch。\n"
    )
    candidates = extract_candidates(doc, languages=("zh-TW",))
    assert "朱宜振" in candidates, f"expected 朱宜振 in {candidates}"
    assert candidates["朱宜振"] >= 3


def test_zh_tw_person_classification():
    """Dialogue markers plus person-verbs (同意 / 覺得 / 決定) around a
    Traditional Chinese name should produce a positive person score."""
    doc = (
        "朱宜振: 「我們要 6 月 launch。」\n"
        "朱宜振 同意 Arnold 的方案。\n"
        "朱宜振 覺得 Hermes 方向對。\n"
        "朱宜振 決定 ship pitch。\n"
    )
    scores = score_entity("朱宜振", doc, doc.splitlines(), languages=("zh-TW",))
    assert scores["person_score"] > 0, f"expected person signals, got {scores}"


def test_zh_tw_stopwords_filter_common_particles():
    """Common particles, pronouns, and generic verbs from the zh-TW entity
    config must appear in the stopword set so they are filtered out and
    never surface as entity candidates."""
    from mempalace.i18n import get_entity_patterns

    stopword_set = set(get_entity_patterns(("zh-TW",))["stopwords"])
    for word in ("這個", "我們", "他們", "完成"):
        assert word in stopword_set


def test_zh_tw_falls_back_to_english_for_non_cjk_names():
    """Mixed-script notes: when both zh-TW and en are requested, an English
    name embedded in Chinese text is still captured via the English pattern
    alongside the Chinese name."""
    doc = (
        "朱宜振 跟 Jeffrey Lai 討論 pitch。\n"
        "Jeffrey Lai 報告進度。\n"
        "朱宜振 同意 Jeffrey Lai 的方案。\n"
        "朱宜振: 確認。\n"
    )
    candidates = extract_candidates(doc, languages=("zh-TW", "en"))
    assert "Jeffrey Lai" in candidates or "Jeffrey" in candidates
    assert "朱宜振" in candidates


def test_zh_cn_candidate_extraction():
    """Simplified-Chinese name extraction mirrors the zh-TW behaviour:
    张三 appears 4x, each time at a non-CJK boundary."""
    doc = "张三 今天主持。\n- 张三 跟 Bob 谈。\n张三: 已经搞定了。\n张三, 明天继续。\n"
    candidates = extract_candidates(doc, languages=("zh-CN",))
    assert "张三" in candidates, f"expected 张三 in {candidates}"
    assert candidates["张三"] >= 3


def test_zh_cn_and_zh_tw_union_covers_both_variants():
    """Requesting both zh-CN and zh-TW unions the surname sets, so a
    document mixing simplified 张三 and traditional 張三 extracts at least
    one variant (each appears 3x, satisfying the frequency threshold)."""
    doc = "张三 说 hello。张三 笑了。张三 同意。\n張三 也參加。張三 寫 code。張三 決定。\n"
    candidates = extract_candidates(doc, languages=("zh-TW", "zh-CN"))
    assert "张三" in candidates or "張三" in candidates


def test_zh_tw_known_limitation_inline_name_no_boundary():
    """Documented limitation of regex-based CJK entity detection: CJK text
    has no word delimiters, so a name flanked by CJK characters on both
    sides cannot be isolated by boundary matching. Realistic Chinese
    writing contains enough non-CJK boundaries (punctuation, newlines,
    mixed English) that 3+ occurrences normally match elsewhere in the
    document, so this rarely hurts real-world recall.

    Adversarial fixture: 朱宜振 appears 4x, every instance CJK-flanked."""
    doc = "他是朱宜振今天來。說朱宜振決定。又朱宜振負責。問朱宜振意見。"
    # Extraction is expected to miss this adversarial case.
    assert "朱宜振" not in extract_candidates(doc, languages=("zh-TW",))