Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 16 additions & 17 deletions mempalace/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ def _ensure_mempalace_files_gitignored(project_dir) -> bool:
def cmd_init(args):
import json
from pathlib import Path
from .entity_detector import scan_for_detection, detect_entities, confirm_entities
from .entity_detector import confirm_entities
from .project_scanner import discover_entities
from .room_detector_local import detect_rooms_local

cfg = MempalaceConfig()
Expand All @@ -85,25 +86,23 @@ def cmd_init(args):
languages = cfg.entity_languages
languages_tuple = tuple(languages)

# Pass 1: auto-detect people and projects from file content
# Pass 1: discover entities — manifests + git authors first, prose detection
# as supplement for names mentioned only in docs/notes.
print(f"\n Scanning for entities in: {args.dir}")
if languages_tuple != ("en",):
print(f" Languages: {', '.join(languages_tuple)}")
files = scan_for_detection(args.dir)
if files:
print(f" Reading {len(files)} files...")
detected = detect_entities(files, languages=languages_tuple)
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
if total > 0:
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
# Save confirmed entities to <project>/entities.json for the miner
if confirmed["people"] or confirmed["projects"]:
entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
with open(entities_path, "w") as f:
json.dump(confirmed, f, indent=2)
print(f" Entities saved: {entities_path}")
else:
print(" No entities detected — proceeding with directory-based rooms.")
detected = discover_entities(args.dir, languages=languages_tuple)
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
if total > 0:
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
# Save confirmed entities to <project>/entities.json for the miner
if confirmed["people"] or confirmed["projects"]:
entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
with open(entities_path, "w") as f:
json.dump(confirmed, f, indent=2)
print(f" Entities saved: {entities_path}")
else:
print(" No entities detected — proceeding with directory-based rooms.")

# Pass 2: detect rooms from folder structure
detect_rooms_local(project_dir=args.dir, yes=getattr(args, "yes", False))
Expand Down
57 changes: 47 additions & 10 deletions mempalace/entity_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,23 @@ def _get_stopwords(languages: tuple) -> frozenset:
".next",
"coverage",
".mempalace",
".terraform",
"vendor",
"target",
}

# Files whose content is boilerplate prose — poisons entity detection.
# Matched by stem (case-insensitive), with or without an extension.
SKIP_FILENAMES = {
"license",
"licence",
"copying",
"copyright",
"notice",
"authors",
"patents",
"third_party_notices",
"third-party-notices",
}


Expand Down Expand Up @@ -193,7 +210,7 @@ def _compile_each(raw_patterns, flags=re.IGNORECASE):
"person_verbs": _compile_each(sources["person_verb_patterns"]),
"project_verbs": _compile_each(sources["project_verb_patterns"]),
"direct": direct_compiled,
"versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
"versioned": re.compile(rf"\b{n}[-_]v?\d+(?:\.\d+)*\b", re.IGNORECASE),
"code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
}

Expand Down Expand Up @@ -227,12 +244,19 @@ def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict:

# --- Person signals ---

# Dialogue markers (strong signal)
# Dialogue markers (strong signal).
# The bare `^NAME:\s` colon-prefix pattern matches metadata lines like
# `Created: 2026-04-21`, so we require >= 2 hits for it to count as dialogue
# (real speaker markers repeat; single-line metadata doesn't).
for rx in patterns["dialogue"]:
matches = len(rx.findall(text))
if matches > 0:
person_score += matches * 3
person_signals.append(f"dialogue marker ({matches}x)")
if matches == 0:
continue
is_bare_colon = rx.pattern.endswith(r":\s") and not rx.pattern.endswith(r"[:\s]")
if is_bare_colon and matches < 2:
continue
person_score += matches * 3
person_signals.append(f"dialogue marker ({matches}x)")

# Person verbs
for rx in patterns["person_verbs"]:
Expand Down Expand Up @@ -328,17 +352,28 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
signal_categories.add("addressed")

has_two_signal_types = len(signal_categories) >= 2
_ = signal_categories - {"pronoun"} # reserved for future thresholds
# Single-category pronoun signal still classifies as person when the
# evidence is overwhelming — a diary's main character is referenced
# with pronouns, not dialogue markers. Require both: many pronoun hits
# AND a high pronoun-to-frequency ratio so common sentence-start words
# (Never, Before, etc.) with incidental pronoun proximity don't qualify.
pronoun_hits = 0
for s in scores["person_signals"]:
m = re.search(r"pronoun nearby \((\d+)x\)", s)
if m:
pronoun_hits = int(m.group(1))
break
strong_pronoun_signal = pronoun_hits >= 5 and frequency > 0 and pronoun_hits / frequency >= 0.2

if person_ratio >= 0.7 and has_two_signal_types and ps >= 5:
if person_ratio >= 0.7 and (has_two_signal_types and ps >= 5 or strong_pronoun_signal):
entity_type = "person"
confidence = min(0.99, 0.5 + person_ratio * 0.5)
signals = scores["person_signals"] or [f"appears {frequency}x"]
elif person_ratio >= 0.7 and (not has_two_signal_types or ps < 5):
# Pronoun-only match — downgrade to uncertain
elif person_ratio >= 0.7:
# Weak single-category person signal — downgrade to uncertain
entity_type = "uncertain"
confidence = 0.4
signals = scores["person_signals"] + [f"appears {frequency}x — pronoun-only match"]
signals = scores["person_signals"] + [f"appears {frequency}x — weak person signal"]
elif person_ratio <= 0.3:
entity_type = "project"
confidence = min(0.99, 0.5 + (1 - person_ratio) * 0.5)
Expand Down Expand Up @@ -560,6 +595,8 @@ def scan_for_detection(project_dir: str, max_files: int = 10) -> list:
dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
for filename in filenames:
filepath = Path(root) / filename
if filepath.stem.lower() in SKIP_FILENAMES:
continue
ext = filepath.suffix.lower()
if ext in PROSE_EXTENSIONS:
prose_files.append(filepath)
Expand Down
14 changes: 12 additions & 2 deletions mempalace/i18n/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
"action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
},
"entity": {
"candidate_pattern": "[A-Z][a-z]{1,19}",
"candidate_pattern": "[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+|[A-Z][a-z]{1,19}",
"multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
"person_verb_patterns": [
"\\b{name}\\s+said\\b",
Expand Down Expand Up @@ -140,7 +140,17 @@
"agents", "tools", "others", "guards", "ethics", "regulation",
"learning", "thinking", "memory", "language", "intelligence",
"technology", "society", "culture", "future", "history", "science",
"model", "models", "network", "networks", "training", "inference"
"model", "models", "network", "networks", "training", "inference",
"created", "updated", "deleted", "added", "removed", "modified",
"extracted", "processed", "generated", "compiled", "launched", "installed",
"deployed", "executed", "loaded", "parsed", "validated", "configured",
"total", "summary", "covered", "included", "pending", "failed", "success",
"ready", "active", "disabled", "enabled", "available", "completed",
"auto", "multi", "mini", "micro", "meta", "super", "hybrid",
"context", "bridge", "batch", "local", "global", "native", "cloud",
"before", "after", "during", "often", "always", "never",
"project", "contributor", "software",
"backend", "frontend", "server", "client", "service", "app", "api"
]
}
}
Loading
Loading