Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions mempalace/backends/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,7 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
os.rename(seg_dir, target)
moved.append(target)
logger.warning(
"Quarantined stale HNSW segment %s "
"(sqlite %.0fs newer than HNSW); renamed to %s",
"Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s",
seg_dir,
sqlite_mtime - hnsw_mtime,
target,
Expand Down
79 changes: 74 additions & 5 deletions mempalace/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,21 +86,53 @@ def cmd_init(args):
languages = cfg.entity_languages
languages_tuple = tuple(languages)

# Optional phase-2 LLM provider (opt-in via --llm).
llm_provider = None
if getattr(args, "llm", False):
from .llm_client import LLMError, get_provider

try:
llm_provider = get_provider(
name=args.llm_provider,
model=args.llm_model,
endpoint=args.llm_endpoint,
api_key=args.llm_api_key,
)
except LLMError as e:
print(f" ERROR: {e}", file=sys.stderr)
sys.exit(2)
ok, msg = llm_provider.check_available()
if not ok:
print(
f" ERROR: LLM provider '{args.llm_provider}' unavailable: {msg}",
file=sys.stderr,
)
sys.exit(2)
print(f" LLM refinement enabled: {args.llm_provider}/{args.llm_model}")

# Pass 1: discover entities — manifests + git authors first, prose detection
# as supplement for names mentioned only in docs/notes.
# as supplement for names mentioned only in docs/notes. Optional phase-2
# LLM refinement runs inside discover_entities when llm_provider is given.
print(f"\n Scanning for entities in: {args.dir}")
if languages_tuple != ("en",):
print(f" Languages: {', '.join(languages_tuple)}")
detected = discover_entities(args.dir, languages=languages_tuple)
detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
if total > 0:
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
# Save confirmed entities to <project>/entities.json for the miner
# Save confirmed entities to <project>/entities.json (per-project
# audit trail — user can inspect or hand-edit) AND merge into the
# global registry the miner reads at mine time.
if confirmed["people"] or confirmed["projects"]:
entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
with open(entities_path, "w") as f:
json.dump(confirmed, f, indent=2)
with open(entities_path, "w", encoding="utf-8") as f:
json.dump(confirmed, f, indent=2, ensure_ascii=False)
print(f" Entities saved: {entities_path}")

from .miner import add_to_known_entities

registry_path = add_to_known_entities(confirmed)
print(f" Registry updated: {registry_path}")
else:
print(" No entities detected — proceeding with directory-based rooms.")

Expand Down Expand Up @@ -550,6 +582,43 @@ def main():
"When given, the value is also persisted to config.json."
),
)
p_init.add_argument(
"--llm",
action="store_true",
help=(
"Enable LLM-assisted entity refinement (opt-in, local-first). "
"Runs after manifest/git/regex detection, asking the configured "
"provider to reclassify ambiguous candidates. "
"Ctrl-C during refinement returns partial results."
),
)
p_init.add_argument(
"--llm-provider",
default="ollama",
choices=["ollama", "openai-compat", "anthropic"],
help="LLM provider (default: ollama). Use --llm to enable.",
)
p_init.add_argument(
"--llm-model",
default="gemma4:e4b",
help="Model name for the chosen provider (default: gemma4:e4b for Ollama).",
)
p_init.add_argument(
"--llm-endpoint",
default=None,
help=(
"Provider endpoint URL. Default for Ollama: http://localhost:11434. "
"Required for openai-compat."
),
)
p_init.add_argument(
"--llm-api-key",
default=None,
help=(
"API key for the provider. For anthropic, defaults to $ANTHROPIC_API_KEY; "
"for openai-compat, defaults to $OPENAI_API_KEY."
),
)

# mine
p_mine = sub.add_parser("mine", help="Mine files into the palace")
Expand Down
160 changes: 160 additions & 0 deletions mempalace/convo_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""
convo_scanner.py — Parse Claude Code conversation directories into ProjectInfo.

Claude Code stores sessions under ``~/.claude/projects/<slug>/<id>.jsonl``,
where the ``<slug>`` is the original CWD with ``/`` replaced by ``-``. That
encoding is lossy: we can't tell whether ``foo-bar`` in a slug is the
literal project name ``foo-bar`` or two path segments ``foo/bar``.

Fortunately, every message record in the JSONL carries a ``cwd`` field with
the true path. This scanner reads one record per session to recover the
accurate project name, falling back to slug-decoding only if the JSONL
is malformed or empty.

Output is the same ``ProjectInfo`` shape used by ``project_scanner``, so the
``discover_entities`` orchestrator can mix-and-match sources.

Public:
is_claude_projects_root(path) -> bool
scan_claude_projects(path) -> list[ProjectInfo]
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Optional

from mempalace.project_scanner import ProjectInfo


MAX_HEADER_LINES = 20  # cap on lines read per session file when searching for a `cwd` record


def is_claude_projects_root(path: Path) -> bool:
    """Return True if path looks like `.claude/projects/`.

    Heuristic: the directory has at least one child directory whose name
    starts with ``-`` (a path slug) containing at least one ``.jsonl``
    session file. Unreadable directories are treated as non-matches.
    """
    if not path.is_dir():
        return False
    try:
        entries = list(path.iterdir())
    except OSError:
        return False
    for entry in entries:
        # Slug dirs encode the original CWD with `/` -> `-`, so they
        # always begin with a dash.
        if not entry.is_dir() or not entry.name.startswith("-"):
            continue
        try:
            has_session = any(
                f.is_file() and f.suffix == ".jsonl" for f in entry.iterdir()
            )
        except OSError:
            continue
        if has_session:
            return True
    return False


def _extract_cwd_from_session(session_file: Path) -> Optional[str]:
"""Return the ``cwd`` from the first message record that carries one.

Returns None if the file can't be read, has no JSON, or no record has cwd.
"""
try:
with open(session_file, encoding="utf-8", errors="replace") as f:
for i, line in enumerate(f):
if i >= MAX_HEADER_LINES:
break
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
except json.JSONDecodeError:
continue
cwd = obj.get("cwd")
if isinstance(cwd, str) and cwd:
return cwd
except OSError:
return None
return None


def _decode_slug_fallback(slug: str) -> str:
"""Best-effort project name from slug when cwd is unavailable.

The slug is lossy (`/` and `-` both become `-`). Last non-empty segment
is the closest guess at the project name, preserving kebab-case is
impossible without cwd.
"""
stripped = slug.lstrip("-")
parts = [p for p in stripped.split("-") if p]
return parts[-1] if parts else slug


def _safe_mtime(path: Path) -> float:
"""Return file mtime, defaulting old on permission or filesystem errors."""
try:
return path.stat().st_mtime
except OSError:
return 0.0


def _resolve_project_name(project_dir: Path) -> str:
    """Read one session's cwd to recover the original project name.

    Sessions are tried newest-first (most likely to be well-formed). Falls
    back to slug-decoding if no session has a readable cwd, or if the
    directory itself becomes unreadable between discovery and resolution.
    """
    try:
        sessions = sorted(
            (p for p in project_dir.iterdir() if p.is_file() and p.suffix == ".jsonl"),
            key=_safe_mtime,
            reverse=True,  # newest first — most likely to be well-formed
        )
    except OSError:
        # TOCTOU: the caller listed this dir moments ago, but it may have
        # vanished or lost read permission since. Degrade to the lossy
        # slug name instead of crashing the whole scan.
        sessions = []
    for session in sessions:
        cwd = _extract_cwd_from_session(session)
        if cwd:
            # Path(cwd).name is "" for a root path like "/" — keep the
            # raw cwd string in that edge case.
            return Path(cwd).name or cwd
    return _decode_slug_fallback(project_dir.name)


def scan_claude_projects(path: str | Path) -> list[ProjectInfo]:
    """Scan a ``.claude/projects/`` directory for Claude Code conversations.

    One ProjectInfo per subdir. ``has_git`` is False (the directory isn't a
    repo itself) but ``total_commits`` is repurposed here as session count so
    the UX surfaces a density signal for ranking.
    """
    root = Path(path).expanduser().resolve()
    if not is_claude_projects_root(root):
        return []

    by_name: dict[str, ProjectInfo] = {}
    for slug_dir in sorted(root.iterdir()):
        if not slug_dir.is_dir() or not slug_dir.name.startswith("-"):
            continue
        try:
            jsonl_files = [
                f for f in slug_dir.iterdir() if f.is_file() and f.suffix == ".jsonl"
            ]
        except OSError:
            continue
        if not jsonl_files:
            continue

        n_sessions = len(jsonl_files)
        info = ProjectInfo(
            name=_resolve_project_name(slug_dir),
            repo_root=slug_dir,
            manifest=None,
            has_git=False,
            total_commits=n_sessions,
            user_commits=n_sessions,
            is_mine=True,  # Claude Code sessions are authored by the user
        )
        # Different slugs can decode to the same name; keep the slug with
        # the highest session count.
        prev = by_name.get(info.name)
        if prev is None or n_sessions > prev.user_commits:
            by_name[info.name] = info

    # Most sessions first; ties break alphabetically by name.
    return sorted(by_name.values(), key=lambda p: (-p.user_commits, p.name))
Loading