Skip to content

Commit 19ce58c

Browse files
committed
chore: rescue merged stacked PRs MemPalace#1150 and MemPalace#1157 into develop
MemPalace#1148, MemPalace#1150, and MemPalace#1157 were reviewed and merged on GitHub, but the two stacked children landed on their parent feature branches (now stale) rather than on develop. Only MemPalace#1148's commits reached develop via the direct merge. Release PR MemPalace#1159 (develop → main for v3.3.3) is therefore missing the LLM refinement, Claude-conversation scanner, and miner-registry wire-up that were ostensibly part of the release. This merge brings the stale `feat/llm-entity-refine` branch (which contains the rolled-up merge commit for MemPalace#1157 → MemPalace#1150 → everything below) into develop so the release tag includes it. No code changes here — only history recovery.
2 parents a851c7a + 61d6c3c commit 19ce58c

14 files changed

Lines changed: 2588 additions & 12 deletions

mempalace/backends/chroma.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,7 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
120120
os.rename(seg_dir, target)
121121
moved.append(target)
122122
logger.warning(
123-
"Quarantined stale HNSW segment %s "
124-
"(sqlite %.0fs newer than HNSW); renamed to %s",
123+
"Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s",
125124
seg_dir,
126125
sqlite_mtime - hnsw_mtime,
127126
target,

mempalace/cli.py

Lines changed: 74 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,21 +86,53 @@ def cmd_init(args):
8686
languages = cfg.entity_languages
8787
languages_tuple = tuple(languages)
8888

89+
# Optional phase-2 LLM provider (opt-in via --llm).
90+
llm_provider = None
91+
if getattr(args, "llm", False):
92+
from .llm_client import LLMError, get_provider
93+
94+
try:
95+
llm_provider = get_provider(
96+
name=args.llm_provider,
97+
model=args.llm_model,
98+
endpoint=args.llm_endpoint,
99+
api_key=args.llm_api_key,
100+
)
101+
except LLMError as e:
102+
print(f" ERROR: {e}", file=sys.stderr)
103+
sys.exit(2)
104+
ok, msg = llm_provider.check_available()
105+
if not ok:
106+
print(
107+
f" ERROR: LLM provider '{args.llm_provider}' unavailable: {msg}",
108+
file=sys.stderr,
109+
)
110+
sys.exit(2)
111+
print(f" LLM refinement enabled: {args.llm_provider}/{args.llm_model}")
112+
89113
# Pass 1: discover entities — manifests + git authors first, prose detection
90-
# as supplement for names mentioned only in docs/notes.
114+
# as supplement for names mentioned only in docs/notes. Optional phase-2
115+
# LLM refinement runs inside discover_entities when llm_provider is given.
91116
print(f"\n Scanning for entities in: {args.dir}")
92117
if languages_tuple != ("en",):
93118
print(f" Languages: {', '.join(languages_tuple)}")
94-
detected = discover_entities(args.dir, languages=languages_tuple)
119+
detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
95120
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
96121
if total > 0:
97122
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
98-
# Save confirmed entities to <project>/entities.json for the miner
123+
# Save confirmed entities to <project>/entities.json (per-project
124+
# audit trail — user can inspect or hand-edit) AND merge into the
125+
# global registry the miner reads at mine time.
99126
if confirmed["people"] or confirmed["projects"]:
100127
entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
101-
with open(entities_path, "w") as f:
102-
json.dump(confirmed, f, indent=2)
128+
with open(entities_path, "w", encoding="utf-8") as f:
129+
json.dump(confirmed, f, indent=2, ensure_ascii=False)
103130
print(f" Entities saved: {entities_path}")
131+
132+
from .miner import add_to_known_entities
133+
134+
registry_path = add_to_known_entities(confirmed)
135+
print(f" Registry updated: {registry_path}")
104136
else:
105137
print(" No entities detected — proceeding with directory-based rooms.")
106138

@@ -550,6 +582,43 @@ def main():
550582
"When given, the value is also persisted to config.json."
551583
),
552584
)
585+
p_init.add_argument(
586+
"--llm",
587+
action="store_true",
588+
help=(
589+
"Enable LLM-assisted entity refinement (opt-in, local-first). "
590+
"Runs after manifest/git/regex detection, asking the configured "
591+
"provider to reclassify ambiguous candidates. "
592+
"Ctrl-C during refinement returns partial results."
593+
),
594+
)
595+
p_init.add_argument(
596+
"--llm-provider",
597+
default="ollama",
598+
choices=["ollama", "openai-compat", "anthropic"],
599+
help="LLM provider (default: ollama). Use --llm to enable.",
600+
)
601+
p_init.add_argument(
602+
"--llm-model",
603+
default="gemma4:e4b",
604+
help="Model name for the chosen provider (default: gemma4:e4b for Ollama).",
605+
)
606+
p_init.add_argument(
607+
"--llm-endpoint",
608+
default=None,
609+
help=(
610+
"Provider endpoint URL. Default for Ollama: http://localhost:11434. "
611+
"Required for openai-compat."
612+
),
613+
)
614+
p_init.add_argument(
615+
"--llm-api-key",
616+
default=None,
617+
help=(
618+
"API key for the provider. For anthropic, defaults to $ANTHROPIC_API_KEY; "
619+
"for openai-compat, defaults to $OPENAI_API_KEY."
620+
),
621+
)
553622

554623
# mine
555624
p_mine = sub.add_parser("mine", help="Mine files into the palace")

mempalace/convo_scanner.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
"""
2+
convo_scanner.py — Parse Claude Code conversation directories into ProjectInfo.
3+
4+
Claude Code stores sessions under ``~/.claude/projects/<slug>/<id>.jsonl``,
5+
where the ``<slug>`` is the original CWD with ``/`` replaced by ``-``. That
6+
encoding is lossy: we can't tell whether ``foo-bar`` in a slug is the
7+
literal project name ``foo-bar`` or two path segments ``foo/bar``.
8+
9+
Fortunately, every message record in the JSONL carries a ``cwd`` field with
10+
the true path. This scanner reads one record per session to recover the
11+
accurate project name, falling back to slug-decoding only if the JSONL
12+
is malformed or empty.
13+
14+
Output is the same ``ProjectInfo`` shape used by ``project_scanner``, so the
15+
``discover_entities`` orchestrator can mix-and-match sources.
16+
17+
Public:
18+
is_claude_projects_root(path) -> bool
19+
scan_claude_projects(path) -> list[ProjectInfo]
20+
"""
21+
22+
from __future__ import annotations
23+
24+
import json
25+
from pathlib import Path
26+
from typing import Optional
27+
28+
from mempalace.project_scanner import ProjectInfo
29+
30+
31+
MAX_HEADER_LINES = 20 # lines to read per session looking for `cwd`
32+
33+
34+
def is_claude_projects_root(path: str | Path) -> bool:
    """Return True if ``path`` looks like a ``.claude/projects/`` directory.

    Heuristic: at least one child directory whose name starts with ``-`` and
    which directly contains at least one ``.jsonl`` file.

    Args:
        path: Directory to probe. Accepts ``str`` or ``Path``; it is used as
            given (no ``~`` expansion, no symlink resolution).

    Returns:
        True when the heuristic matches. False when ``path`` is not a
        directory, cannot be listed, or has no matching child.
    """
    root = Path(path)
    try:
        if not root.is_dir():
            return False
        children = list(root.iterdir())
    except OSError:
        return False

    for child in children:
        try:
            # Cheap name test first; is_dir() costs a stat call.
            if not (child.name.startswith("-") and child.is_dir()):
                continue
            if any(p.suffix == ".jsonl" and p.is_file() for p in child.iterdir()):
                return True
        except OSError:
            # Unreadable child: keep probing its siblings.
            continue
    return False
55+
56+
57+
def _extract_cwd_from_session(
    session_file: Path, *, max_lines: Optional[int] = None
) -> Optional[str]:
    """Return the ``cwd`` from the first header record that carries one.

    Only the first ``max_lines`` physical lines of the file are examined;
    blank and malformed lines count toward that budget.

    Args:
        session_file: Path of the ``.jsonl`` session file.
        max_lines: Number of leading lines to examine. Defaults to the
            module-level ``MAX_HEADER_LINES`` when None.

    Returns:
        The first non-empty string ``cwd`` found, or None if the file can't
        be read or no examined line is a JSON object with such a field.
    """
    limit = MAX_HEADER_LINES if max_lines is None else max_lines
    try:
        with open(session_file, encoding="utf-8", errors="replace") as f:
            for i, line in enumerate(f):
                if i >= limit:
                    break
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A line may be valid JSON without being an object (list,
                # number, string, null); those have no .get() and carry
                # no cwd, so skip them instead of crashing.
                if not isinstance(record, dict):
                    continue
                cwd = record.get("cwd")
                if isinstance(cwd, str) and cwd:
                    return cwd
    except OSError:
        return None
    return None
80+
81+
82+
def _decode_slug_fallback(slug: str) -> str:
    """Guess a project name from the slug when no ``cwd`` is available.

    The slug is lossy (``/`` and ``-`` both become ``-``), so the last
    non-empty ``-``-separated segment is the closest guess at the project
    name. Kebab-case names cannot be recovered without ``cwd``.

    Returns the slug unchanged when it contains no non-empty segment.
    """
    trimmed = slug.lstrip("-")
    # Walk the segments from the right; the first non-empty one wins.
    for segment in reversed(trimmed.split("-")):
        if segment:
            return segment
    return slug
92+
93+
94+
def _safe_mtime(path: Path) -> float:
    """Return the modification time of ``path``, or 0.0 if it can't be stat'ed.

    The 0.0 default makes an unreadable file compare as the oldest possible
    when used as a sort key.
    """
    try:
        info = path.stat()
    except OSError:
        return 0.0
    return info.st_mtime
100+
101+
102+
def _resolve_project_name(project_dir: Path) -> str:
    """Recover the original project name for one slug directory.

    Tries each ``.jsonl`` session in ``project_dir`` (newest first) and uses
    the final path component of the first ``cwd`` found.

    Falls back to slug-decoding of the directory name if no session has a
    readable ``cwd`` or if the directory itself can't be listed.
    """
    try:
        sessions = [
            p for p in project_dir.iterdir() if p.is_file() and p.suffix == ".jsonl"
        ]
    except OSError:
        # Directory vanished or became unreadable between listing and
        # resolution; the slug is all that is left to go on.
        return _decode_slug_fallback(project_dir.name)

    # Newest first — most likely to be well-formed.
    sessions.sort(key=_safe_mtime, reverse=True)
    for session in sessions:
        cwd = _extract_cwd_from_session(session)
        if cwd:
            # Path("/").name is "", so keep the raw cwd in that case.
            return Path(cwd).name or cwd
    return _decode_slug_fallback(project_dir.name)
117+
118+
119+
def scan_claude_projects(path: str | Path) -> list[ProjectInfo]:
    """Scan a ``.claude/projects/`` directory for Claude Code conversations.

    Every ``-``-prefixed subdirectory holding at least one ``.jsonl`` session
    yields a candidate ProjectInfo. Candidates are keyed by resolved project
    name; when several subdirectories resolve to the same name, the one with
    the most sessions is kept (the first in sorted order wins a tie).

    ``has_git`` is False (the directory isn't a repo itself), while
    ``total_commits`` and ``user_commits`` are repurposed as the session
    count so the UX surfaces a density signal for ranking.

    Returns an empty list when ``path`` does not look like a Claude projects
    root; otherwise the projects ordered by descending session count, then
    by name.
    """
    projects_root = Path(path).expanduser().resolve()
    if not is_claude_projects_root(projects_root):
        return []

    best_by_name: dict[str, ProjectInfo] = {}
    for slug_dir in sorted(projects_root.iterdir()):
        if not slug_dir.is_dir() or not slug_dir.name.startswith("-"):
            continue
        try:
            n_sessions = sum(
                1
                for entry in slug_dir.iterdir()
                if entry.is_file() and entry.suffix == ".jsonl"
            )
        except OSError:
            continue
        if n_sessions == 0:
            continue

        project_name = _resolve_project_name(slug_dir)
        candidate = ProjectInfo(
            name=project_name,
            repo_root=slug_dir,
            manifest=None,
            has_git=False,
            total_commits=n_sessions,
            user_commits=n_sessions,
            is_mine=True,  # Claude Code sessions are authored by the user
        )
        incumbent = best_by_name.get(project_name)
        if incumbent is not None and n_sessions <= incumbent.user_commits:
            continue
        best_by_name[project_name] = candidate

    return sorted(
        best_by_name.values(),
        key=lambda info: (-info.user_commits, info.name),
    )

0 commit comments

Comments
 (0)