Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions mempalace/backends/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,7 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
os.rename(seg_dir, target)
moved.append(target)
logger.warning(
"Quarantined stale HNSW segment %s "
"(sqlite %.0fs newer than HNSW); renamed to %s",
"Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s",
seg_dir,
sqlite_mtime - hnsw_mtime,
target,
Expand Down
79 changes: 74 additions & 5 deletions mempalace/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,21 +86,53 @@ def cmd_init(args):
languages = cfg.entity_languages
languages_tuple = tuple(languages)

# Optional phase-2 LLM provider (opt-in via --llm).
llm_provider = None
if getattr(args, "llm", False):
from .llm_client import LLMError, get_provider

try:
llm_provider = get_provider(
name=args.llm_provider,
model=args.llm_model,
endpoint=args.llm_endpoint,
api_key=args.llm_api_key,
)
except LLMError as e:
print(f" ERROR: {e}", file=sys.stderr)
sys.exit(2)
ok, msg = llm_provider.check_available()
if not ok:
print(
f" ERROR: LLM provider '{args.llm_provider}' unavailable: {msg}",
file=sys.stderr,
)
sys.exit(2)
print(f" LLM refinement enabled: {args.llm_provider}/{args.llm_model}")

# Pass 1: discover entities — manifests + git authors first, prose detection
# as supplement for names mentioned only in docs/notes.
# as supplement for names mentioned only in docs/notes. Optional phase-2
# LLM refinement runs inside discover_entities when llm_provider is given.
print(f"\n Scanning for entities in: {args.dir}")
if languages_tuple != ("en",):
print(f" Languages: {', '.join(languages_tuple)}")
detected = discover_entities(args.dir, languages=languages_tuple)
detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
if total > 0:
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
# Save confirmed entities to <project>/entities.json for the miner
# Save confirmed entities to <project>/entities.json (per-project
# audit trail — user can inspect or hand-edit) AND merge into the
# global registry the miner reads at mine time.
if confirmed["people"] or confirmed["projects"]:
entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
with open(entities_path, "w") as f:
json.dump(confirmed, f, indent=2)
with open(entities_path, "w", encoding="utf-8") as f:
json.dump(confirmed, f, indent=2, ensure_ascii=False)
print(f" Entities saved: {entities_path}")

from .miner import add_to_known_entities

registry_path = add_to_known_entities(confirmed)
print(f" Registry updated: {registry_path}")
else:
print(" No entities detected — proceeding with directory-based rooms.")

Expand Down Expand Up @@ -550,6 +582,43 @@ def main():
"When given, the value is also persisted to config.json."
),
)
p_init.add_argument(
"--llm",
action="store_true",
help=(
"Enable LLM-assisted entity refinement (opt-in, local-first). "
"Runs after manifest/git/regex detection, asking the configured "
"provider to reclassify ambiguous candidates. "
"Ctrl-C during refinement returns partial results."
),
)
p_init.add_argument(
"--llm-provider",
default="ollama",
choices=["ollama", "openai-compat", "anthropic"],
help="LLM provider (default: ollama). Use --llm to enable.",
)
p_init.add_argument(
"--llm-model",
default="gemma4:e4b",
help="Model name for the chosen provider (default: gemma4:e4b for Ollama).",
)
p_init.add_argument(
"--llm-endpoint",
default=None,
help=(
"Provider endpoint URL. Default for Ollama: http://localhost:11434. "
"Required for openai-compat."
),
)
p_init.add_argument(
"--llm-api-key",
default=None,
help=(
"API key for the provider. For anthropic, defaults to $ANTHROPIC_API_KEY; "
"for openai-compat, defaults to $OPENAI_API_KEY."
),
)

# mine
p_mine = sub.add_parser("mine", help="Mine files into the palace")
Expand Down
160 changes: 160 additions & 0 deletions mempalace/convo_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""
convo_scanner.py — Parse Claude Code conversation directories into ProjectInfo.

Claude Code stores sessions under ``~/.claude/projects/<slug>/<id>.jsonl``,
where the ``<slug>`` is the original CWD with ``/`` replaced by ``-``. That
encoding is lossy: we can't tell whether ``foo-bar`` in a slug is the
literal project name ``foo-bar`` or two path segments ``foo/bar``.

Fortunately, every message record in the JSONL carries a ``cwd`` field with
the true path. This scanner reads one record per session to recover the
accurate project name, falling back to slug-decoding only if the JSONL
is malformed or empty.

Output is the same ``ProjectInfo`` shape used by ``project_scanner``, so the
``discover_entities`` orchestrator can mix-and-match sources.

Public:
is_claude_projects_root(path) -> bool
scan_claude_projects(path) -> list[ProjectInfo]
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Optional

from mempalace.project_scanner import ProjectInfo


MAX_HEADER_LINES = 20  # cap on lines read per session file when searching for a `cwd` record


def is_claude_projects_root(path: Path) -> bool:
    """Return True if path looks like `.claude/projects/`.

    Heuristic: the directory has at least one child directory whose name
    starts with ``-`` (a path slug) containing at least one ``.jsonl``
    session file. Unreadable directories are treated as non-matches.
    """
    if not path.is_dir():
        return False
    try:
        entries = list(path.iterdir())
    except OSError:
        return False
    for entry in entries:
        # Slug dirs encode the original CWD with `/` -> `-`, so they
        # always begin with a dash.
        if not entry.is_dir() or not entry.name.startswith("-"):
            continue
        try:
            has_session = any(
                f.is_file() and f.suffix == ".jsonl" for f in entry.iterdir()
            )
        except OSError:
            continue
        if has_session:
            return True
    return False


def _extract_cwd_from_session(session_file: Path) -> Optional[str]:
"""Return the ``cwd`` from the first message record that carries one.

Returns None if the file can't be read, has no JSON, or no record has cwd.
"""
try:
with open(session_file, encoding="utf-8", errors="replace") as f:
for i, line in enumerate(f):
if i >= MAX_HEADER_LINES:
break
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
except json.JSONDecodeError:
continue
cwd = obj.get("cwd")
if isinstance(cwd, str) and cwd:
return cwd
except OSError:
return None
return None


def _decode_slug_fallback(slug: str) -> str:
"""Best-effort project name from slug when cwd is unavailable.

The slug is lossy (`/` and `-` both become `-`). Last non-empty segment
is the closest guess at the project name, preserving kebab-case is
impossible without cwd.
"""
stripped = slug.lstrip("-")
parts = [p for p in stripped.split("-") if p]
return parts[-1] if parts else slug


def _safe_mtime(path: Path) -> float:
"""Return file mtime, defaulting old on permission or filesystem errors."""
try:
return path.stat().st_mtime
except OSError:
return 0.0


def _resolve_project_name(project_dir: Path) -> str:
    """Read one session's cwd to recover the original project name.

    Sessions are tried newest-first (most likely to be well-formed). Falls
    back to slug-decoding if no session has a readable cwd, or if the
    directory itself becomes unreadable between discovery and resolution.
    """
    try:
        sessions = sorted(
            (p for p in project_dir.iterdir() if p.is_file() and p.suffix == ".jsonl"),
            key=_safe_mtime,
            reverse=True,  # newest first — most likely to be well-formed
        )
    except OSError:
        # TOCTOU: the caller listed this dir moments ago, but it may have
        # vanished or lost read permission since. Degrade to the lossy
        # slug name instead of crashing the whole scan.
        sessions = []
    for session in sessions:
        cwd = _extract_cwd_from_session(session)
        if cwd:
            # Path(cwd).name is "" for a root path like "/" — keep the
            # raw cwd string in that edge case.
            return Path(cwd).name or cwd
    return _decode_slug_fallback(project_dir.name)


def scan_claude_projects(path: str | Path) -> list[ProjectInfo]:
    """Scan a ``.claude/projects/`` directory for Claude Code conversations.

    One ProjectInfo per subdir. ``has_git`` is False (the directory isn't a
    repo itself) but ``total_commits`` is repurposed here as session count so
    the UX surfaces a density signal for ranking.
    """
    root = Path(path).expanduser().resolve()
    if not is_claude_projects_root(root):
        return []

    by_name: dict[str, ProjectInfo] = {}
    for slug_dir in sorted(root.iterdir()):
        if not slug_dir.is_dir() or not slug_dir.name.startswith("-"):
            continue
        try:
            jsonl_files = [
                f for f in slug_dir.iterdir() if f.is_file() and f.suffix == ".jsonl"
            ]
        except OSError:
            continue
        if not jsonl_files:
            continue

        n_sessions = len(jsonl_files)
        info = ProjectInfo(
            name=_resolve_project_name(slug_dir),
            repo_root=slug_dir,
            manifest=None,
            has_git=False,
            total_commits=n_sessions,
            user_commits=n_sessions,
            is_mine=True,  # Claude Code sessions are authored by the user
        )
        # Different slugs can decode to the same name; keep the slug with
        # the highest session count.
        prev = by_name.get(info.name)
        if prev is None or n_sessions > prev.user_commits:
            by_name[info.name] = info

    # Most sessions first; ties break alphabetically by name.
    return sorted(by_name.values(), key=lambda p: (-p.user_commits, p.name))
Loading