"""Tests for mempalace.entity_detector."""
import contextlib
import json
import os
from pathlib import Path
from unittest.mock import patch
from mempalace.entity_detector import (
PROSE_EXTENSIONS,
STOPWORDS,
_print_entity_list,
classify_entity,
confirm_entities,
detect_entities,
extract_candidates,
scan_for_detection,
score_entity,
)


# ── extract_candidates ──────────────────────────────────────────────────
def test_extract_candidates_finds_frequent_names():
text = "Riley said hello. Riley laughed. Riley smiled. Riley waved."
result = extract_candidates(text)
assert "Riley" in result
assert result["Riley"] >= 3
def test_extract_candidates_ignores_stopwords():
# "The" appears many times but is a stopword
text = "The The The The The The"
result = extract_candidates(text)
assert "The" not in result
def test_extract_candidates_requires_min_frequency():
text = "Riley said hi. Devon waved."
result = extract_candidates(text)
# Each name appears only once, below the threshold of 3
assert "Riley" not in result
assert "Devon" not in result
def test_extract_candidates_finds_multi_word_names():
# Multi-word names need 3+ occurrences and no stopwords
text = "Claude Code is great. Claude Code rocks. Claude Code works. Claude Code rules."
result = extract_candidates(text)
assert "Claude Code" in result
def test_extract_candidates_empty_text():
result = extract_candidates("")
assert result == {}


# ── score_entity ────────────────────────────────────────────────────────
def test_score_entity_person_verbs():
text = "Riley said hello. Riley asked why. Riley told me."
lines = text.splitlines()
result = score_entity("Riley", text, lines)
assert result["person_score"] > 0
assert len(result["person_signals"]) > 0
def test_score_entity_project_verbs():
text = "We are building ChromaDB. We deployed ChromaDB. Install ChromaDB."
lines = text.splitlines()
result = score_entity("ChromaDB", text, lines)
assert result["project_score"] > 0
assert len(result["project_signals"]) > 0
def test_score_entity_dialogue_markers():
text = "Riley: Hey, how are you?\nRiley: I'm fine."
lines = text.splitlines()
result = score_entity("Riley", text, lines)
assert result["person_score"] > 0
def test_score_entity_code_ref():
text = "Check out ChromaDB.py for details. Also ChromaDB.js is good."
lines = text.splitlines()
result = score_entity("ChromaDB", text, lines)
assert result["project_score"] > 0
def test_score_entity_no_signals():
text = "Nothing interesting here at all."
lines = text.splitlines()
result = score_entity("Riley", text, lines)
assert result["person_score"] == 0
assert result["project_score"] == 0


# ── classify_entity ─────────────────────────────────────────────────────
def test_classify_entity_no_signals_gives_uncertain():
scores = {
"person_score": 0,
"project_score": 0,
"person_signals": [],
"project_signals": [],
}
result = classify_entity("Foo", 10, scores)
assert result["type"] == "uncertain"
assert result["name"] == "Foo"
def test_classify_entity_strong_project():
scores = {
"person_score": 0,
"project_score": 10,
"person_signals": [],
"project_signals": ["project verb (5x)", "code file reference (2x)"],
}
result = classify_entity("ChromaDB", 5, scores)
assert result["type"] == "project"
def test_classify_entity_strong_person_needs_two_signal_types():
scores = {
"person_score": 10,
"project_score": 0,
"person_signals": [
"dialogue marker (3x)",
"'Riley ...' action (4x)",
],
"project_signals": [],
}
result = classify_entity("Riley", 8, scores)
assert result["type"] == "person"
def test_classify_entity_pronoun_only_is_uncertain():
scores = {
"person_score": 8,
"project_score": 0,
"person_signals": ["pronoun nearby (4x)"],
"project_signals": [],
}
result = classify_entity("Riley", 5, scores)
assert result["type"] == "uncertain"
def test_classify_entity_mixed_signals():
scores = {
"person_score": 5,
"project_score": 5,
"person_signals": ["pronoun nearby (2x)"],
"project_signals": ["project verb (2x)"],
}
result = classify_entity("Lantern", 5, scores)
assert result["type"] == "uncertain"
assert "mixed signals" in result["signals"][-1]


# ── detect_entities (integration) ───────────────────────────────────────
def test_detect_entities_with_person_file(tmp_path):
f = tmp_path / "notes.txt"
content = "\n".join(
[
"Riley said hello today.",
"Riley asked about the project.",
"Riley told me she was happy.",
"Riley: I think we should go.",
"Hey Riley, thanks for the help.",
"Riley laughed and smiled.",
"Riley decided to join.",
"Riley pushed the change.",
]
)
f.write_text(content)
result = detect_entities([f])
all_names = [e["name"] for cat in result.values() for e in cat]
assert "Riley" in all_names
def test_detect_entities_with_project_file(tmp_path):
f = tmp_path / "readme.txt"
    # "ChromaDB" has a second uppercase run, so the candidate regex
    # /[A-Z][a-z]{1,19}/ truncates it to "Chroma". Use "Lantern", which the
    # capitalized-word pattern matches in full (see the regex sanity check
    # right after this test).
content = "\n".join(
[
"The Lantern project is great.",
"Building Lantern was fun.",
"We deployed Lantern today.",
"Install Lantern with pip install Lantern.",
"Check Lantern.py for the source.",
"Lantern v2 is faster.",
]
)
f.write_text(content)
result = detect_entities([f])
all_names = [e["name"] for cat in result.values() for e in cat]
assert "Lantern" in all_names
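
# Quick sanity check of the truncation note in the test above. Standard-library
# `re` only; assumes the candidate regex is exactly [A-Z][a-z]{1,19}, as that
# comment states. A sketch of the regex behaviour, not a test of mempalace
# internals.
def test_candidate_regex_truncation_demo():
    import re
    # The second uppercase run in "ChromaDB" ends the match at "Chroma".
    assert re.findall(r"[A-Z][a-z]{1,19}", "ChromaDB") == ["Chroma"]
    # "Lantern" is a single capitalized word, so it matches in full.
    assert re.findall(r"[A-Z][a-z]{1,19}", "Lantern") == ["Lantern"]
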
def test_detect_entities_empty_files(tmp_path):
f = tmp_path / "empty.txt"
f.write_text("")
result = detect_entities([f])
assert result == {"people": [], "projects": [], "uncertain": []}
def test_detect_entities_handles_missing_file(tmp_path):
missing = tmp_path / "nonexistent.txt"
result = detect_entities([missing])
assert result == {"people": [], "projects": [], "uncertain": []}
def test_detect_entities_respects_max_files(tmp_path):
files = []
for i in range(5):
f = tmp_path / f"file{i}.txt"
f.write_text("Riley said hello. " * 10)
files.append(f)
# max_files=2 should only read 2 files
result = detect_entities(files, max_files=2)
# Should still work without error
assert isinstance(result, dict)


# ── scan_for_detection ──────────────────────────────────────────────────
def test_scan_for_detection_finds_prose(tmp_path):
(tmp_path / "notes.md").write_text("hello")
(tmp_path / "data.txt").write_text("world")
(tmp_path / "code.py").write_text("import os")
files = scan_for_detection(str(tmp_path))
extensions = {os.path.splitext(str(f))[1] for f in files}
# Prose files should be found
assert ".md" in extensions or ".txt" in extensions
def test_scan_for_detection_skips_git_dir(tmp_path):
git_dir = tmp_path / ".git"
git_dir.mkdir()
(git_dir / "config.txt").write_text("git config")
(tmp_path / "readme.md").write_text("hello")
files = scan_for_detection(str(tmp_path))
file_strs = [str(f) for f in files]
assert not any(".git" in f for f in file_strs)


# ── module-level constants ──────────────────────────────────────────────
def test_stopwords_contains_common_words():
assert "the" in STOPWORDS
assert "import" in STOPWORDS
assert "class" in STOPWORDS
def test_prose_extensions():
assert ".txt" in PROSE_EXTENSIONS
assert ".md" in PROSE_EXTENSIONS


# ── _print_entity_list ─────────────────────────────────────────────────
def test_print_entity_list_with_entities(capsys):
entities = [
{"name": "Alice", "confidence": 0.9, "signals": ["dialogue marker (3x)"]},
{"name": "Bob", "confidence": 0.5, "signals": []},
]
_print_entity_list(entities, "PEOPLE")
out = capsys.readouterr().out
assert "PEOPLE" in out
assert "Alice" in out
assert "Bob" in out
def test_print_entity_list_empty(capsys):
_print_entity_list([], "PEOPLE")
out = capsys.readouterr().out
assert "none detected" in out


# ── confirm_entities ───────────────────────────────────────────────────
def test_confirm_entities_yes_mode():
detected = {
"people": [{"name": "Alice", "confidence": 0.9, "signals": ["test"]}],
"projects": [{"name": "Acme", "confidence": 0.8, "signals": ["test"]}],
"uncertain": [{"name": "Foo", "confidence": 0.4, "signals": ["test"]}],
}
result = confirm_entities(detected, yes=True)
assert result["people"] == ["Alice"]
assert result["projects"] == ["Acme"]
def test_confirm_entities_accept_all():
detected = {
"people": [{"name": "Alice", "confidence": 0.9, "signals": ["test"]}],
"projects": [],
"uncertain": [],
}
with patch("builtins.input", side_effect=["", "n"]):
result = confirm_entities(detected, yes=False)
assert "Alice" in result["people"]
def test_confirm_entities_edit_reclassify_uncertain():
detected = {
"people": [],
"projects": [],
"uncertain": [
{"name": "Foo", "confidence": 0.4, "signals": ["test"]},
{"name": "Bar", "confidence": 0.4, "signals": ["test"]},
],
}
with patch(
"builtins.input",
side_effect=[
"edit", # choice
"p", # Foo -> person
"s", # Bar -> skip
"", # no removals from people
"", # no removals from projects
"n", # don't add missing
],
):
result = confirm_entities(detected, yes=False)
assert "Foo" in result["people"]
assert "Bar" not in result["people"]
assert "Bar" not in result["projects"]
def test_confirm_entities_add_mode():
detected = {
"people": [],
"projects": [],
"uncertain": [],
}
with patch(
"builtins.input",
side_effect=[
"add", # choice = add
"NewPerson", # name
"p", # person
"NewProj", # name
"r", # project
"", # stop adding
],
):
result = confirm_entities(detected, yes=False)
assert "NewPerson" in result["people"]
assert "NewProj" in result["projects"]


# ── scan_for_detection fallback ────────────────────────────────────────
def test_scan_for_detection_fallback_to_all_readable(tmp_path):
"""When fewer than 3 prose files, falls back to include all readable files."""
(tmp_path / "one.md").write_text("hello")
(tmp_path / "two.txt").write_text("world")
# Only 2 prose files, so it should also include code files
(tmp_path / "code.py").write_text("import os")
(tmp_path / "app.js").write_text("console.log()")
files = scan_for_detection(str(tmp_path))
extensions = {os.path.splitext(str(f))[1] for f in files}
assert ".py" in extensions or ".js" in extensions
def test_scan_for_detection_max_files(tmp_path):
"""Caps to max_files."""
for i in range(20):
(tmp_path / f"note{i}.md").write_text(f"content {i}")
files = scan_for_detection(str(tmp_path), max_files=5)
assert len(files) <= 5


# ── multi-language infra ───────────────────────────────────────────────
@contextlib.contextmanager
def _temp_locale(locale_code: str, entity_section: dict):
"""Context manager that drops a locale JSON into mempalace/i18n/ for the test body.
Cleans up the file and clears every cache that depends on locale data on exit,
even if the test fails or the entity section is invalid.
    Note: this writes into the real mempalace/i18n/ directory. If a test
    process is SIGKILLed mid-test, the orphaned zz-test-*.json file will break
    test_all_languages_load on the next run (the stub fixture lacks the full
    terms/cli/aaak content that test requires).
Recover with `rm mempalace/i18n/zz-test-*.json`.
"""
from mempalace import i18n
from mempalace import entity_detector
locale_path = Path(i18n.__file__).parent / f"{locale_code}.json"
if locale_path.exists():
raise RuntimeError(f"Test locale {locale_code} collides with an existing file")
payload = {
"lang": locale_code,
"label": locale_code,
"terms": {},
"cli": {},
"aaak": {"instruction": "test"},
"entity": entity_section,
}
locale_path.write_text(json.dumps(payload), encoding="utf-8")
def _clear_caches():
i18n._entity_cache.clear()
entity_detector._build_patterns.cache_clear()
entity_detector._pronoun_re.cache_clear()
entity_detector._get_stopwords.cache_clear()
_clear_caches()
try:
yield locale_path
finally:
try:
locale_path.unlink()
except OSError:
pass
_clear_caches()
def test_extract_candidates_default_languages_is_english_only():
    """Default languages tuple is ('en',): accented names are dropped (current behaviour)."""
text = "João said hi. João laughed. João waved. João decided."
result = extract_candidates(text) # default ("en",)
assert "João" not in result
def test_extract_candidates_with_extra_locale_picks_up_new_charset():
"""A locale with a Latin+diacritics candidate_pattern catches accented names."""
locale = {
"candidate_pattern": "[A-ZÀ-Ú][a-zà-ÿ]{1,19}",
"multi_word_pattern": "[A-ZÀ-Ú][a-zà-ÿ]+(?:\\s+[A-ZÀ-Ú][a-zà-ÿ]+)+",
"person_verb_patterns": [],
"pronoun_patterns": [],
"dialogue_patterns": [],
"project_verb_patterns": [],
"stopwords": [],
}
with _temp_locale("zz-test-latin", locale):
text = "João said hi. João laughed. João waved. João decided."
result = extract_candidates(text, languages=("en", "zz-test-latin"))
assert "João" in result
assert result["João"] >= 3
def test_extract_candidates_with_cyrillic_locale():
"""A locale with a Cyrillic candidate_pattern catches Russian names."""
locale = {
"candidate_pattern": "[А-ЯЁ][а-яё]{1,19}",
"multi_word_pattern": "[А-ЯЁ][а-яё]+(?:\\s+[А-ЯЁ][а-яё]+)+",
"person_verb_patterns": [],
"pronoun_patterns": [],
"dialogue_patterns": [],
"project_verb_patterns": [],
"stopwords": [],
}
with _temp_locale("zz-test-cyrillic", locale):
text = "Иван сказал привет. Иван засмеялся. Иван помахал. Иван решил."
result = extract_candidates(text, languages=("en", "zz-test-cyrillic"))
assert "Иван" in result
def test_score_entity_unions_person_verbs_across_languages():
"""A non-English person-verb pattern fires when its locale is enabled."""
locale = {
"candidate_pattern": "[A-Z][a-z]{1,19}",
"multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
"person_verb_patterns": [
"\\b{name}\\s+disse\\b",
"\\b{name}\\s+falou\\b",
"\\b{name}\\s+riu\\b",
],
"pronoun_patterns": [],
"dialogue_patterns": [],
"project_verb_patterns": [],
"stopwords": [],
}
with _temp_locale("zz-test-verbs", locale):
text = "Maria disse oi. Maria falou. Maria riu."
lines = text.splitlines()
en_only = score_entity("Maria", text, lines, languages=("en",))
multi = score_entity("Maria", text, lines, languages=("en", "zz-test-verbs"))
assert multi["person_score"] > en_only["person_score"]
assert any("action" in s for s in multi["person_signals"])
def test_get_entity_patterns_unknown_lang_falls_back_to_english():
"""Asking for a non-existent language returns English defaults."""
from mempalace.i18n import get_entity_patterns
patterns = get_entity_patterns(("zz-does-not-exist",))
assert len(patterns["stopwords"]) > 0
assert patterns["candidate_patterns"] # English fallback
def test_get_entity_patterns_dedupes_across_overlapping_languages():
"""Loading ('en', 'en') doesn't double-count patterns or stopwords."""
from mempalace.i18n import get_entity_patterns
single = get_entity_patterns(("en",))
doubled = get_entity_patterns(("en", "en"))
assert len(doubled["person_verb_patterns"]) == len(single["person_verb_patterns"])
assert len(doubled["stopwords"]) == len(single["stopwords"])
def test_build_patterns_cache_is_keyed_by_language():
"""Same name with different language tuples yields different compiled sets."""
from mempalace.entity_detector import _build_patterns
locale = {
"candidate_pattern": "[A-Z][a-z]+",
"multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
"person_verb_patterns": ["\\b{name}\\s+ranxx\\b"],
"pronoun_patterns": [],
"dialogue_patterns": [],
"project_verb_patterns": [],
"stopwords": [],
}
with _temp_locale("zz-test-cache", locale):
en_patterns = _build_patterns("Sam", ("en",))
multi_patterns = _build_patterns("Sam", ("en", "zz-test-cache"))
assert len(multi_patterns["person_verbs"]) > len(en_patterns["person_verbs"])
def test_normalize_langs_handles_string_input():
"""Passing a bare string instead of a tuple still works."""
from mempalace.entity_detector import _normalize_langs
assert _normalize_langs("en") == ("en",)
assert _normalize_langs(["en", "pt-br"]) == ("en", "pt-br")
assert _normalize_langs(None) == ("en",)
assert _normalize_langs(()) == ("en",)
def test_config_entity_languages_defaults_to_english(tmp_path, monkeypatch):
"""MempalaceConfig.entity_languages defaults to ['en'] with no config file."""
from mempalace.config import MempalaceConfig
monkeypatch.delenv("MEMPALACE_ENTITY_LANGUAGES", raising=False)
monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False)
cfg = MempalaceConfig(config_dir=str(tmp_path))
assert cfg.entity_languages == ["en"]
def test_config_entity_languages_from_env(tmp_path, monkeypatch):
"""Env var overrides config file."""
from mempalace.config import MempalaceConfig
monkeypatch.setenv("MEMPALACE_ENTITY_LANGUAGES", "en,pt-br,ru")
cfg = MempalaceConfig(config_dir=str(tmp_path))
assert cfg.entity_languages == ["en", "pt-br", "ru"]
def test_config_set_entity_languages_persists(tmp_path, monkeypatch):
"""set_entity_languages writes to disk and is read back."""
from mempalace.config import MempalaceConfig
monkeypatch.delenv("MEMPALACE_ENTITY_LANGUAGES", raising=False)
monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False)
cfg = MempalaceConfig(config_dir=str(tmp_path))
cfg.set_entity_languages(["en", "pt-br"])
cfg2 = MempalaceConfig(config_dir=str(tmp_path))
assert cfg2.entity_languages == ["en", "pt-br"]
def test_config_set_entity_languages_empty_falls_back_to_english(tmp_path, monkeypatch):
"""An empty list normalizes to ['en']."""
from mempalace.config import MempalaceConfig
monkeypatch.delenv("MEMPALACE_ENTITY_LANGUAGES", raising=False)
monkeypatch.delenv("MEMPAL_ENTITY_LANGUAGES", raising=False)
cfg = MempalaceConfig(config_dir=str(tmp_path))
result = cfg.set_entity_languages([])
assert result == ["en"]
assert cfg.entity_languages == ["en"]


# ── boundary_chars for combining-mark scripts ─────────────────────────
# Devanagari vowel signs (matras) are Unicode Mc — not matched by \w.
# Without boundary_chars, \b truncates names like अनीता → अनीत and
# person_verb patterns never fire. With boundary_chars, the i18n loader
# replaces \b with a script-aware lookaround, fixing both (a standalone
# regex demo follows the fixture below).
_DEVANAGARI_ENTITY = {
"boundary_chars": "\\w\\u0900-\\u097F",
"candidate_pattern": "[\\u0900-\\u097F]{2,20}",
"multi_word_pattern": "[\\u0900-\\u097F]+(?:\\s+[\\u0900-\\u097F]+)+",
"person_verb_patterns": [
"\\b{name}\\s+ने\\s+कहा\\b",
"\\b{name}\\s+हँसा\\b",
],
"pronoun_patterns": ["\\bवह\\b", "\\bउसने\\b"],
"dialogue_patterns": ["^{name}:\\s"],
"direct_address_pattern": "\\bनमस्ते\\s+{name}\\b",
"project_verb_patterns": [],
"stopwords": ["यह", "वह", "और", "का", "के", "की"],
}
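
# A standalone regex check of the \b failure described above, using only the
# standard library. It assumes the loader's boundary_chars replacement is
# equivalent to the lookaround spelled out here; a sketch of the mechanism,
# not a test of the loader itself.
def test_boundary_chars_lookaround_demo():
    import re
    text = "अनीता ने कहा"
    # The trailing matra ा (U+093E, category Mc) is not a \w character, so no
    # word boundary exists after अनीता and a \b-wrapped literal cannot match.
    assert re.search(r"\bअनीता\b", text) is None
    # A script-aware lookaround built from boundary_chars matches in full.
    b = r"\w\u0900-\u097F"
    assert re.search(rf"(?<![{b}])अनीता(?![{b}])", text) is not None
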
def test_devanagari_candidate_extraction_with_boundary_chars():
"""Names ending in matras are extracted in full with boundary_chars."""
with _temp_locale("zz-test-hindi", _DEVANAGARI_ENTITY):
text = "अनीता ने कहा। अनीता हँसा। अनीता सोचा। अनीता बोला।"
result = extract_candidates(text, languages=("en", "zz-test-hindi"))
assert "अनीता" in result, f"expected अनीता in {result}"
assert result["अनीता"] >= 3
def test_devanagari_candidate_without_boundary_chars_truncates():
"""Without boundary_chars, a matra-ending name gets truncated."""
locale_no_boundary = dict(_DEVANAGARI_ENTITY)
del locale_no_boundary["boundary_chars"]
with _temp_locale("zz-test-hindi-no-b", locale_no_boundary):
text = "अनीता ने कहा। अनीता हँसा। अनीता सोचा।"
result = extract_candidates(text, languages=("en", "zz-test-hindi-no-b"))
# Without boundary_chars, \b splits on the matra — full name won't appear
assert "अनीता" not in result
def test_devanagari_person_verb_fires_with_boundary_chars():
"""Hindi person-verb patterns fire when boundary_chars extends \\b."""
with _temp_locale("zz-test-hindi", _DEVANAGARI_ENTITY):
text = "राज ने कहा कुछ। राज हँसा।"
lines = text.splitlines()
scores = score_entity("राज", text, lines, languages=("en", "zz-test-hindi"))
assert scores["person_score"] > 0, f"expected person_score > 0, got {scores}"
assert any("action" in s for s in scores["person_signals"])
def test_devanagari_person_verb_silent_without_boundary_chars():
"""Without boundary_chars, Hindi person verbs don't fire."""
locale_no_boundary = dict(_DEVANAGARI_ENTITY)
del locale_no_boundary["boundary_chars"]
with _temp_locale("zz-test-hindi-no-b", locale_no_boundary):
text = "राज ने कहा कुछ। राज हँसा।"
lines = text.splitlines()
scores = score_entity("राज", text, lines, languages=("en", "zz-test-hindi-no-b"))
assert scores["person_score"] == 0
def test_boundary_chars_english_regression():
"""English patterns (no boundary_chars) still work identically."""
text = "Riley said hello. Riley laughed. Riley smiled. Riley waved."
result = extract_candidates(text, languages=("en",))
assert "Riley" in result
assert result["Riley"] >= 3


# ── Chinese (zh-TW / zh-CN) entity detection ──────────────────────────
# CJK scripts have no word delimiters — a regex-based extractor can only
# catch names when they have a non-CJK neighbour (whitespace, punctuation,
# newline, or mixed English). Real-world technical notes in zh-TW / zh-CN
# routinely satisfy this: names appear at the start of bullet lines, next
# to English terms, or before full-width punctuation 「」:,。. The patterns
# below target that realistic regime.
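
# A standalone illustration of that boundary requirement, using only the
# standard library. The pattern below is hypothetical (the real zh-TW
# candidate_pattern lives in the locale JSON); it only shows why a non-CJK
# neighbour is required on each side.
def test_cjk_boundary_requirement_demo():
    import re
    cjk = r"\u4e00-\u9fff"
    pattern = rf"(?<![{cjk}])[{cjk}]{{2,4}}(?![{cjk}])"
    # Space-delimited: the name has non-CJK neighbours and matches.
    assert re.search(pattern, "- 朱宜振 主持") is not None
    # Flanked by CJK on both sides: no 2-4 char run has a non-CJK neighbour.
    assert re.search(pattern, "他是朱宜振今天來") is None
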
def test_zh_tw_candidate_extraction_at_boundaries():
"""A 3-char Traditional Chinese name is extracted when neighboured by
whitespace, English, full-width punctuation, or line-start."""
text = (
"# 會議紀錄\n"
"- 朱宜振 主持\n"
"朱宜振 跟 Jeffrey 討論。\n"
"朱宜振: 方向正確。\n"
"朱宜振, 明天 pitch。\n"
)
result = extract_candidates(text, languages=("zh-TW",))
assert "朱宜振" in result, f"expected 朱宜振 in {result}"
assert result["朱宜振"] >= 3
def test_zh_tw_person_classification():
"""A Traditional Chinese name with dialogue + verb context classifies
as a person."""
text = (
"朱宜振: 「我們要 6 月 launch。」\n"
"朱宜振 同意 Arnold 的方案。\n"
"朱宜振 覺得 Hermes 方向對。\n"
"朱宜振 決定 ship pitch。\n"
)
lines = text.splitlines()
scores = score_entity("朱宜振", text, lines, languages=("zh-TW",))
# Dialogue + action signals fire — person score dominates
assert scores["person_score"] > 0, f"expected person signals, got {scores}"
def test_zh_tw_stopwords_filter_common_particles():
"""Common Chinese particles / pronouns should be stopword-filtered
even if they happen to share a surname prefix like 甘 or 習."""
from mempalace.i18n import get_entity_patterns
patterns = get_entity_patterns(("zh-TW",))
stopwords = set(patterns["stopwords"])
# Sanity: stopwords are lower-cased from the source list
assert "這個" in stopwords
assert "我們" in stopwords
assert "他們" in stopwords
assert "完成" in stopwords
def test_zh_tw_falls_back_to_english_for_non_cjk_names():
"""English names embedded in Chinese text are still captured via the
English pattern — Lman's Chinese notes mix in names like 'Jeffrey Lai'."""
text = (
"朱宜振 跟 Jeffrey Lai 討論 pitch。\n"
"Jeffrey Lai 報告進度。\n"
"朱宜振 同意 Jeffrey Lai 的方案。\n"
"朱宜振: 確認。\n"
)
result = extract_candidates(text, languages=("zh-TW", "en"))
assert "Jeffrey Lai" in result or "Jeffrey" in result
assert "朱宜振" in result
def test_zh_cn_candidate_extraction():
"""Simplified-Chinese name extraction mirrors zh-TW behaviour."""
text = "张三 今天主持。\n- 张三 跟 Bob 谈。\n张三: 已经搞定了。\n张三, 明天继续。\n"
result = extract_candidates(text, languages=("zh-CN",))
assert "张三" in result, f"expected 张三 in {result}"
assert result["张三"] >= 3
def test_zh_cn_and_zh_tw_union_covers_both_variants():
"""Passing both zh-CN and zh-TW unions the surname sets — a document
mixing simplified 张三 and traditional 張三 extracts both."""
text = "张三 说 hello。张三 笑了。张三 同意。\n張三 也參加。張三 寫 code。張三 決定。\n"
result = extract_candidates(text, languages=("zh-TW", "zh-CN"))
# At least one variant meets freq>=3
assert "张三" in result or "張三" in result
def test_zh_tw_known_limitation_inline_name_no_boundary():
"""Documented limitation: a name sandwiched between CJK chars with no
whitespace or punctuation break is not extracted. This is a fundamental
limit of regex-based CJK entity detection — words have no delimiters.
Realistic Chinese writing has enough non-CJK boundaries (punctuation,
newlines, mixed English) that 3+ occurrences normally produce matches
elsewhere in the document, so this rarely degrades real-world recall."""
# 朱宜振 appears 4x but every instance is flanked by CJK on both sides.
text = "他是朱宜振今天來。說朱宜振決定。又朱宜振負責。問朱宜振意見。"
result = extract_candidates(text, languages=("zh-TW",))
# Extraction is expected to miss this adversarial case.
assert "朱宜振" not in result