MemPalace · igorls · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026
@@ -71,7 +71,8 @@ def _ensure_mempalace_files_gitignored(project_dir) -> bool:
 def cmd_init(args):
     import json
     from pathlib import Path
-    from .entity_detector import scan_for_detection, detect_entities, confirm_entities
+    from .entity_detector import confirm_entities
+    from .project_scanner import discover_entities
     from .room_detector_local import detect_rooms_local
 
     cfg = MempalaceConfig()
@@ -85,25 +86,23 @@ def cmd_init(args):
         languages = cfg.entity_languages
     languages_tuple = tuple(languages)
 
-    # Pass 1: auto-detect people and projects from file content
+    # Pass 1: discover entities — manifests + git authors first, prose detection
+    # as supplement for names mentioned only in docs/notes.
     print(f"\n  Scanning for entities in: {args.dir}")
     if languages_tuple != ("en",):
         print(f"  Languages: {', '.join(languages_tuple)}")
-    files = scan_for_detection(args.dir)
-    if files:
-        print(f"  Reading {len(files)} files...")
-        detected = detect_entities(files, languages=languages_tuple)
-        total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
-        if total > 0:
-            confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
-            # Save confirmed entities to <project>/entities.json for the miner
-            if confirmed["people"] or confirmed["projects"]:
-                entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
-                with open(entities_path, "w") as f:
-                    json.dump(confirmed, f, indent=2)
-                print(f"  Entities saved: {entities_path}")
-        else:
-            print("  No entities detected — proceeding with directory-based rooms.")
+    detected = discover_entities(args.dir, languages=languages_tuple)
+    total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
+    if total > 0:
+        confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
+        # Save confirmed entities to <project>/entities.json for the miner
+        if confirmed["people"] or confirmed["projects"]:
+            entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
+            with open(entities_path, "w") as f:
+                json.dump(confirmed, f, indent=2)
+            print(f"  Entities saved: {entities_path}")
+    else:
+        print("  No entities detected — proceeding with directory-based rooms.")
 
     # Pass 2: detect rooms from folder structure
     detect_rooms_local(project_dir=args.dir, yes=getattr(args, "yes", False))

@@ -113,6 +113,23 @@ def _get_stopwords(languages: tuple) -> frozenset:
     ".next",
     "coverage",
     ".mempalace",
+    ".terraform",
+    "vendor",
+    "target",
+}
+
+# Files whose content is boilerplate prose, which would poison entity detection.
+# Matched by stem (case-insensitive), with or without an extension.
+SKIP_FILENAMES = {
+    "license",
+    "licence",
+    "copying",
+    "copyright",
+    "notice",
+    "authors",
+    "patents",
+    "third_party_notices",
+    "third-party-notices",
 }
 
 
@@ -193,7 +210,7 @@ def _compile_each(raw_patterns, flags=re.IGNORECASE):
         "person_verbs": _compile_each(sources["person_verb_patterns"]),
         "project_verbs": _compile_each(sources["project_verb_patterns"]),
         "direct": direct_compiled,
-        "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
+        "versioned": re.compile(rf"\b{n}[-_]v?\d+(?:\.\d+)*\b", re.IGNORECASE),
         "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
     }
 
@@ -227,12 +244,19 @@ def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict:
 
     # --- Person signals ---
 
-    # Dialogue markers (strong signal)
+    # Dialogue markers (strong signal).
+    # The bare `^NAME:\s` colon-prefix pattern matches metadata lines like
+    # `Created: 2026-04-21`, so we require >= 2 hits for it to count as dialogue
+    # (real speaker markers repeat; single-line metadata doesn't).
     for rx in patterns["dialogue"]:
         matches = len(rx.findall(text))
-        if matches > 0:
-            person_score += matches * 3
-            person_signals.append(f"dialogue marker ({matches}x)")
+        if matches == 0:
+            continue
+        is_bare_colon = rx.pattern.endswith(r":\s") and not rx.pattern.endswith(r"[:\s]")
+        if is_bare_colon and matches < 2:
+            continue
+        person_score += matches * 3
+        person_signals.append(f"dialogue marker ({matches}x)")
 
     # Person verbs
     for rx in patterns["person_verbs"]:
@@ -328,17 +352,28 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
             signal_categories.add("addressed")
 
     has_two_signal_types = len(signal_categories) >= 2
-    _ = signal_categories - {"pronoun"}  # reserved for future thresholds
+    # Single-category pronoun signal still classifies as person when the
+    # evidence is overwhelming — a diary's main character is referenced
+    # with pronouns, not dialogue markers. Require both: many pronoun hits
+    # AND a high pronoun-to-frequency ratio so common sentence-start words
+    # (Never, Before, etc.) with incidental pronoun proximity don't qualify.
+    pronoun_hits = 0
+    for s in scores["person_signals"]:
+        m = re.search(r"pronoun nearby \((\d+)x\)", s)
+        if m:
+            pronoun_hits = int(m.group(1))
+            break
+    strong_pronoun_signal = pronoun_hits >= 5 and frequency > 0 and pronoun_hits / frequency >= 0.2
 
-    if person_ratio >= 0.7 and has_two_signal_types and ps >= 5:
+    if person_ratio >= 0.7 and (has_two_signal_types and ps >= 5 or strong_pronoun_signal):
         entity_type = "person"
         confidence = min(0.99, 0.5 + person_ratio * 0.5)
         signals = scores["person_signals"] or [f"appears {frequency}x"]
-    elif person_ratio >= 0.7 and (not has_two_signal_types or ps < 5):
-        # Pronoun-only match — downgrade to uncertain
+    elif person_ratio >= 0.7:
+        # Weak person signal (single category or low score) — downgrade to uncertain
         entity_type = "uncertain"
         confidence = 0.4
-        signals = scores["person_signals"] + [f"appears {frequency}x — pronoun-only match"]
+        signals = scores["person_signals"] + [f"appears {frequency}x — weak person signal"]
     elif person_ratio <= 0.3:
         entity_type = "project"
         confidence = min(0.99, 0.5 + (1 - person_ratio) * 0.5)
@@ -560,6 +595,8 @@ def scan_for_detection(project_dir: str, max_files: int = 10) -> list:
         dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
         for filename in filenames:
             filepath = Path(root) / filename
+            if filepath.stem.lower() in SKIP_FILENAMES:
+                continue
             ext = filepath.suffix.lower()
             if ext in PROSE_EXTENSIONS:
                 prose_files.append(filepath)

@@ -42,7 +42,7 @@
     "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
   },
   "entity": {
-    "candidate_pattern": "[A-Z][a-z]{1,19}",
+    "candidate_pattern": "[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+|[A-Z][a-z]{1,19}",
     "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
     "person_verb_patterns": [
       "\\b{name}\\s+said\\b",
@@ -140,7 +140,17 @@
       "agents", "tools", "others", "guards", "ethics", "regulation",
       "learning", "thinking", "memory", "language", "intelligence",
       "technology", "society", "culture", "future", "history", "science",
-      "model", "models", "network", "networks", "training", "inference"
+      "model", "models", "network", "networks", "training", "inference",
+      "created", "updated", "deleted", "added", "removed", "modified",
+      "extracted", "processed", "generated", "compiled", "launched", "installed",
+      "deployed", "executed", "loaded", "parsed", "validated", "configured",
+      "total", "summary", "covered", "included", "pending", "failed", "success",
+      "ready", "active", "disabled", "enabled", "available", "completed",
+      "auto", "multi", "mini", "micro", "meta", "super", "hybrid",
+      "context", "bridge", "batch", "local", "global", "native", "cloud",
+      "before", "after", "during", "often", "always", "never",
+      "project", "contributor", "software",
+      "backend", "frontend", "server", "client", "service", "app", "api"
     ]
   }
 }