diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 242922e8a..7c6bcaad9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,6 +17,7 @@ jobs:
- uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
+ cache: 'pip'
- run: pip install -e ".[dev]"
- run: python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing --cov-fail-under=80 --durations=10
@@ -26,7 +27,8 @@ jobs:
- uses: actions/checkout@v6
- uses: actions/setup-python@v6
with:
- python-version: "3.9"
+ python-version: "3.13"
+ cache: 'pip'
- run: pip install -e ".[dev]"
- run: python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing --cov-fail-under=80 --durations=10
@@ -36,7 +38,8 @@ jobs:
- uses: actions/checkout@v6
- uses: actions/setup-python@v6
with:
- python-version: "3.9"
+ python-version: "3.13"
+ cache: 'pip'
- run: pip install -e ".[dev]"
- run: python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing --cov-fail-under=80 --durations=10
lint:
@@ -46,6 +49,7 @@ jobs:
- uses: actions/setup-python@v6
with:
python-version: "3.11"
+ cache: 'pip'
- run: pip install "ruff>=0.4.0,<0.5"
- run: ruff check .
- run: ruff format --check .
diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml
index b516f36bf..b2b0d34f1 100644
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -23,7 +23,7 @@ jobs:
permissions:
contents: read
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -46,7 +46,7 @@ jobs:
DOCS_EDIT_BRANCH: ${{ github.ref_name }}
run: bun run docs:build
- - uses: actions/upload-pages-artifact@v3
+ - uses: actions/upload-pages-artifact@v5
with:
path: website/.vitepress/dist
@@ -63,4 +63,4 @@ jobs:
steps:
- name: Deploy to GitHub Pages
id: deployment
- uses: actions/deploy-pages@v4
+ uses: actions/deploy-pages@v5
diff --git a/.github/workflows/version-guard.yml b/.github/workflows/version-guard.yml
index 9cb30fe93..36155cb9d 100644
--- a/.github/workflows/version-guard.yml
+++ b/.github/workflows/version-guard.yml
@@ -16,7 +16,7 @@ jobs:
check-versions:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
- name: Extract versions from all sources
id: versions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2051ab360..25b7853f0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,26 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
---
+## [3.3.4] — unreleased
+
+### Added
+
+- **`mempalace init` now prompts to mine the same directory.** After entity confirmation, room detection, and gitignore guard, `init` shows a one-line scope estimate (e.g. `~423 files (~12 MB) would be mined into this palace.`) computed from its existing corpus walk, then asks `Mine this directory now? [Y/n]` (default yes) and runs `mine()` in-process if accepted. The estimate fires before the prompt so users on a real corpus aren't surprised by a minutes-long ChromaDB write. Declining prints the exact `mempalace mine <dir>` command for later. (#1181)
+- **New `--auto-mine` flag on `mempalace init`** for the non-interactive path (`mempalace init --auto-mine <dir>` skips the mine prompt and runs mine directly). `--yes` retains its existing scope of entity auto-accept only and still prompts for the mine step, so existing scripted callers see no behaviour change; combining `--yes --auto-mine` gives a fully non-interactive setup. (#1181)
+- **Cross-wing topic tunnels.** When two wings have confirmed `TOPIC` labels in common (the LLM-refine bucket from `mempalace init --llm`), the miner now drops a symmetric tunnel between them at mine time so the palace graph reflects shared themes (frameworks, vendors, recurring concepts). Tunnels are routed through the existing `create_tunnel` storage so they share dedup and persistence with explicit tunnels. Topic tunnels are stored under a synthetic `topic:<name>` room and tagged with `kind: "topic"` on the stored dict — this keeps them distinct from literal folder-derived rooms of the same name (a wing with both an `Angular` folder room and an `Angular` topic tunnel no longer collides at `follow_tunnels` read time) and gives LLMs scanning `list_tunnels` a visible discriminator. Threshold is configurable via `MEMPALACE_TOPIC_TUNNEL_MIN_COUNT` env var or `topic_tunnel_min_count` in `~/.mempalace/config.json` (default `1`; see the snippet below this list). Manifest-dependency overlap and per-topic allow/deny lists remain out of scope. (#1180)
+- **Context-aware corpus detection at `mempalace init`.** A new Pass 0 runs at the start of `init` — before entity detection — and answers one question: *is this corpus an AI-dialogue record, and if so, which platform and what persona names has the user assigned to the agents?* Tier 1 is a free regex heuristic (well-known AI brand terms + turn-marker patterns, with a co-occurrence rule that suppresses ambiguous terms like `Claude`/`Gemini`/`Haiku` when no unambiguous AI signal is present, so French novels and astrology forums don't false-positive). Tier 2 is an LLM call (~$0.01 with Anthropic Haiku, free with local Ollama/LM Studio/llama.cpp/vLLM) that extracts `user_name` and `agent_persona_names` from dialogue structure. Result is persisted to `<palace>/.mempalace/origin.json` with a `schema_version: 1` envelope so downstream tools can read it (example envelope below this list). Entity classification then routes names matching `agent_persona_names` (case-insensitive) into a new `agent_personas` bucket instead of `people`, so a Claude Code transcript no longer misclassifies the user's `Echo`/`Sparrow`/`Cipher` agents as biological people. `llm_refine` receives the same context as a system-prompt preamble so it can disambiguate other ambiguous candidates with corpus-level knowledge too. Backwards compatible: callers that don't pass `corpus_origin` see the v3.3.3 return shape unchanged. (#TBD)
+- **`mempalace init` runs LLM-assisted refinement by default.** v3.3.3 made `--llm` opt-in; the LLM-assisted path is qualitatively better (extracts persona names, refines ambiguous classifications) so it now runs by default. Provider precedence is unchanged — Ollama at `http://localhost:11434` first, then openai-compat, then anthropic with API key. **Never blocks init on a missing LLM**: if no provider is reachable (Ollama not running, no API key set), init prints a one-line message pointing at `--no-llm` and falls through to the heuristic-only path. `--no-llm` is the new explicit opt-out. The legacy `--llm` flag is preserved as a deprecated alias of the default so scripted callers see no behaviour change. Cost story: zero for users with a local LLM (the majority on this repo), ~$0.01 per init for users with `ANTHROPIC_API_KEY` set who explicitly choose `--llm-provider anthropic`, zero for users with no LLM (graceful fallback). (#TBD)
+- **`mempalace mine --redetect-origin` flag.** Re-runs corpus-origin detection on the current corpus state and overwrites `<palace>/.mempalace/origin.json`. Useful when the corpus has grown since `mempalace init` and the stored origin may be stale. Heuristic-only by design (the flag is meant to be cheap); re-run `mempalace init` for full Tier 2 LLM refinement. Default `mempalace mine` does not touch `origin.json` — the flag is opt-in. (#TBD)
+
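+Illustrative snippets for the entries above (values are examples, not captured output). Raising the topic-tunnel threshold in `~/.mempalace/config.json`:
+
+```json
+{ "topic_tunnel_min_count": 2 }
+```
+
+Shape of the persisted `<palace>/.mempalace/origin.json` envelope (field names from the detection result; values illustrative):
+
+```json
+{
+  "schema_version": 1,
+  "detected_at": "2026-04-27T09:00:00+00:00",
+  "result": {
+    "likely_ai_dialogue": true,
+    "confidence": 0.92,
+    "primary_platform": "Claude Code",
+    "user_name": "sam",
+    "agent_persona_names": ["Echo", "Sparrow", "Cipher"],
+    "evidence": ["Tier-1 heuristic: turn markers", "Tier-2 LLM: persona extraction"]
+  }
+}
+```
+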
+### Bug Fixes
+
+- **CLI `mempalace search` retrieval quality.** The CLI used pure ChromaDB cosine distance with no BM25 rerank, so drawers that contain every query term but whose embeddings read as noise (directory listings, diff output, shell logs) scored `Match: 0.0`, indistinguishable from genuinely irrelevant results. The CLI is now wired through the same `_hybrid_rank` the `mempalace_search` MCP tool already used, and both `cosine=` and `bm25=` scores are surfaced in the output so users can see which component of the match is firing. MCP search was unaffected; this fixes the human-facing CLI parity gap.
+- **Legacy-palace distance-metric warning.** CLI search now detects palaces created before `hnsw:space=cosine` was consistently set and prints a one-line notice pointing at `mempalace repair`. Without the warning such palaces silently used L2 distance, under which the similarity display floored every result to `Match: 0.0`. New palaces mined today already set cosine correctly and now have invariant tests pinning that behavior so future refactors can't silently regress it. (#1179)
+- **Graceful Ctrl-C during `mempalace mine`.** Interrupting a long mine no longer dumps a multi-frame `KeyboardInterrupt` traceback. The main file-processing loop now catches the signal, prints `files_processed: N/M`, `drawers_filed: K`, and `last_file:` so the user knows what landed, then exits with code 130 (standard SIGINT). Already-filed drawers are upserted idempotently on re-mine via deterministic IDs, so resuming is safe. The hooks PID lock at `~/.mempalace/hook_state/mine.pid` is now also actively cleaned up in a `finally` when its entry points at the current process — clean exit, error, or interrupt — preventing the next hook fire from briefly waiting on a stale PID. (#1182)
+- **`mempalace init` is now idempotent across re-runs.** Running `init` twice on the same project produced different `origin.json` results because the first run wrote `entities.json` into the project directory, and the second run's corpus-origin sampling included that file as corpus content — shifting Tier 1's character-density math. Sampling now skips the per-project artifacts (`entities.json`, `mempalace.yaml`), so re-running `init` produces the same classification it did the first time. Pinned by an integration test in `tests/test_corpus_origin_integration.py`. (#TBD)
+
+---
+
## [3.3.3] — 2026-04-23
### Bug Fixes
diff --git a/CLAUDE.md b/CLAUDE.md
index 27fd8fb4e..13dfac32a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -22,7 +22,7 @@ These are non-negotiable. Every PR, every feature, every refactor must honor the
- **Verbatim always** — Never summarize, paraphrase, or lossy-compress user data. The system searches the index and returns the original words. If a user said it, we store exactly what they said. This is the foundational promise.
- **Incremental only** — Append-only ingest after initial build. Never destroy existing data to rebuild. A crash mid-operation must leave the existing palace untouched.
- **Entity-first** — Everything is keyed by real names with disambiguation by DOB, ID, or context. People matter more than topics.
-- **Local-first, zero API** — All extraction, chunking, and embedding happens on the user's machine. No cloud dependency for memory operations. No API keys required.
+- **Local-first, zero external API by default** — All extraction, chunking, embedding, and LLM-assisted refinement happens on the user's machine by default, using locally hosted runtimes (Ollama, LM Studio, llama.cpp, vLLM, unsloth studio, etc.). External providers (Anthropic, OpenAI, Google) are supported via BYOK but are never required, never a default, and never a silent fallback; using one is always a deliberate user choice. The system never sends user content to a service the user has not explicitly configured. A "local LLM" is not an external API: Ollama and equivalents running on localhost are part of the user's machine.
- **Performance budgets** — Hooks under 500ms. Startup injection under 100ms. Memory should feel instant.
- **Privacy by architecture** — The system physically cannot send your data because it never leaves your machine. No telemetry, no phone-home, no external service dependencies for core operations.
- **Background everything** — Filing, indexing, timestamps, and pipeline work happen via hooks in the background. Nothing interrupts the user's conversation. Zero tokens spent on bookkeeping in the chat window.
diff --git a/benchmarks/mine_bench.py b/benchmarks/mine_bench.py
new file mode 100644
index 000000000..43b08ed95
--- /dev/null
+++ b/benchmarks/mine_bench.py
@@ -0,0 +1,301 @@
+"""Mining throughput benchmark: per-chunk vs batched upsert, CPU vs GPU.
+
+Compares the legacy per-chunk ``add_drawer`` loop against the batched
+``collection.upsert`` path introduced in the "batched upsert + GPU" PR.
+Runs both paths on an identical seeded synthetic corpus, reports
+wall-clock time + drawers/sec, and prints a markdown table suitable
+for pasting into a PR description.
+
+Usage
+-----
+
+ # CPU (whatever onnxruntime is installed — CPU if you don't have
+ # onnxruntime-gpu):
+ uv run python benchmarks/mine_bench.py
+
+ # GPU (NVIDIA):
+ uv venv /tmp/gpu && source /tmp/gpu/bin/activate
+ uv pip install -e '.[gpu]' 'nvidia-cudnn-cu12>=9,<10' \\
+ 'nvidia-cuda-runtime-cu12' 'nvidia-cublas-cu12'
+ export LD_LIBRARY_PATH=$(python -c "import nvidia.cudnn, os; \\
+ print(os.path.dirname(nvidia.cudnn.__file__)+'/lib')"):$LD_LIBRARY_PATH
+ MEMPALACE_EMBEDDING_DEVICE=cuda python benchmarks/mine_bench.py
+
+Flags
+-----
+
+ --device cpu|cuda|coreml|dml|auto Override MEMPALACE_EMBEDDING_DEVICE
+ --scenarios small,medium,large Which scenarios to run
+ --seed 42 RNG seed for reproducibility
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import os
+import random
+import shutil
+import string
+import sys
+import tempfile
+import time
+from datetime import datetime
+from pathlib import Path
+
+
+def build_corpus(dest: Path, n_files: int, paragraphs_per_file: int, seed: int) -> None:
+ """Generate ``n_files`` markdown files of random words under ``dest``."""
+ rng = random.Random(seed)
+ dest.mkdir(parents=True, exist_ok=True)
+ for i in range(n_files):
+ paragraphs = []
+ for _ in range(paragraphs_per_file):
+ words = [
+ "".join(rng.choices(string.ascii_lowercase, k=rng.randint(3, 10)))
+ for _ in range(12)
+ ]
+ paragraphs.append(" ".join(words))
+ (dest / f"doc_{i:03d}.md").write_text("\n\n".join(paragraphs))
+ (dest / "mempalace.yaml").write_text(
+ "wing: bench\n"
+ "rooms:\n"
+ " - name: general\n"
+ " description: all\n"
+ " keywords: [general]\n"
+ )
+
+
+def _process_file_unbatched(filepath, project_path, collection, wing, rooms, agent, closets_col):
+ """Legacy per-chunk upsert path (pre-batching).
+
+ Reproduces the exact loop shape the miner used before this PR so the
+ comparison is apples-to-apples; only the upsert granularity differs.
+ """
+ from mempalace import miner
+ from mempalace.palace import (
+ build_closet_lines,
+ file_already_mined,
+ mine_lock,
+ purge_file_closets,
+ upsert_closet_lines,
+ )
+
+ source_file = str(filepath)
+ if file_already_mined(collection, source_file, check_mtime=True):
+ return 0, "general"
+ try:
+ content = filepath.read_text(encoding="utf-8", errors="replace")
+ except OSError:
+ return 0, "general"
+ content = content.strip()
+ if len(content) < miner.MIN_CHUNK_SIZE:
+ return 0, "general"
+ room = miner.detect_room(filepath, content, rooms, project_path)
+ chunks = miner.chunk_text(content, source_file)
+
+ with mine_lock(source_file):
+ if file_already_mined(collection, source_file, check_mtime=True):
+ return 0, room
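+ # Purge any chunks a previous run filed for this file; tolerate "nothing to delete".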
+ try:
+ collection.delete(where={"source_file": source_file})
+ except Exception:
+ pass
+ drawers_added = 0
+ for chunk in chunks:
+ miner.add_drawer(
+ collection=collection,
+ wing=wing,
+ room=room,
+ content=chunk["content"],
+ source_file=source_file,
+ chunk_index=chunk["chunk_index"],
+ agent=agent,
+ )
+ drawers_added += 1
+ if closets_col and drawers_added > 0:
+ drawer_ids = [
+ f"drawer_{wing}_{room}_"
+ f"{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}"
+ for c in chunks
+ ]
+ closet_lines = build_closet_lines(source_file, drawer_ids, content, wing, room)
+ closet_id_base = (
+ f"closet_{wing}_{room}_"
+ f"{hashlib.sha256(source_file.encode()).hexdigest()[:24]}"
+ )
+ closet_meta = {
+ "wing": wing,
+ "room": room,
+ "source_file": source_file,
+ "drawer_count": drawers_added,
+ "filed_at": datetime.now().isoformat(),
+ "normalize_version": miner.NORMALIZE_VERSION,
+ }
+ purge_file_closets(closets_col, source_file)
+ upsert_closet_lines(closets_col, closet_id_base, closet_lines, closet_meta)
+ return drawers_added, room
+
+
+def mine_once(project_dir: str, palace_path: str, batched: bool) -> tuple[int, float]:
+ """Mine a project dir with either the batched (new) or per-chunk (old) path."""
+ from mempalace import miner
+ from mempalace.miner import load_config, scan_project
+ from mempalace.palace import get_closets_collection, get_collection
+
+ project_path = Path(project_dir).resolve()
+ config = load_config(project_dir)
+ wing = config["wing"]
+ rooms = config.get("rooms", [])
+ files = scan_project(project_dir)
+ collection = get_collection(palace_path)
+ closets = get_closets_collection(palace_path)
+
+ total = 0
+ t0 = time.perf_counter()
+ for filepath in files:
+ if batched:
+ drawers, _ = miner.process_file(
+ filepath=filepath,
+ project_path=project_path,
+ collection=collection,
+ wing=wing,
+ rooms=rooms,
+ agent="bench",
+ dry_run=False,
+ closets_col=closets,
+ )
+ else:
+ drawers, _ = _process_file_unbatched(
+ filepath, project_path, collection, wing, rooms, "bench", closets
+ )
+ total += drawers
+ return total, time.perf_counter() - t0
+
+
+def _reset_backend_caches() -> None:
+ """Drop the in-process client cache so each run pays cold-open cost equally."""
+ from mempalace.palace import _DEFAULT_BACKEND
+
+ _DEFAULT_BACKEND._clients.clear()
+ _DEFAULT_BACKEND._freshness.clear()
+
+
+def run_scenario(label: str, n_files: int, paragraphs_per_file: int, seed: int) -> dict:
+ """Run one scenario under both code paths and return a result dict."""
+ print(f"\n=== {label}: {n_files} files × {paragraphs_per_file} paragraphs ===")
+ results = {}
+ for mode in ("unbatched", "batched"):
+ tmp = Path(tempfile.mkdtemp(prefix=f"mp_{mode}_"))
+ try:
+ proj = tmp / "proj"
+ palace = tmp / "palace"
+ build_corpus(proj, n_files, paragraphs_per_file, seed=seed)
+ _reset_backend_caches()
+ drawers, dt = mine_once(str(proj), str(palace), batched=(mode == "batched"))
+ rate = drawers / dt if dt > 0 else 0.0
+ results[mode] = (drawers, dt, rate)
+ print(f" {mode:10} {drawers:5} drawers in {dt:6.2f}s → {rate:7.1f} drawers/sec")
+ finally:
+ shutil.rmtree(tmp, ignore_errors=True)
+
+ _, t_u, r_u = results["unbatched"]
+ d_b, t_b, r_b = results["batched"]
+ speedup = t_u / t_b if t_b > 0 else 0.0
+ print(f" speedup: {speedup:.2f}× ({t_u:.2f}s → {t_b:.2f}s)")
+ return {
+ "label": label,
+ "n_files": n_files,
+ "paragraphs": paragraphs_per_file,
+ "drawers": d_b,
+ "unbatched_time": t_u,
+ "unbatched_rate": r_u,
+ "batched_time": t_b,
+ "batched_rate": r_b,
+ "speedup": speedup,
+ }
+
+
+SCENARIOS = {
+ "small": ("Small files (~50 paragraphs)", 10, 50),
+ "medium": ("Medium files (~200 paragraphs)", 20, 200),
+ "large": ("Large files (~500 paragraphs)", 10, 500),
+}
+
+
+def _env_summary(device_label: str) -> list[str]:
+ """Short hardware + version lines included with the printed table."""
+ import platform
+
+ try:
+ import chromadb
+
+ chromadb_v = chromadb.__version__
+ except Exception:
+ chromadb_v = "?"
+ try:
+ import onnxruntime as ort
+
+ ort_v = ort.__version__
+ providers = ",".join(p.replace("ExecutionProvider", "") for p in ort.get_available_providers())
+ except Exception:
+ ort_v = "?"
+ providers = "?"
+
+ return [
+ f"device: **{device_label}** (onnxruntime {ort_v}, providers={providers})",
+ f"chromadb {chromadb_v} · python {sys.version.split()[0]} · {platform.platform()}",
+ ]
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__.split("\n\n", 1)[0])
+ parser.add_argument(
+ "--device",
+ default=None,
+ help="Override MEMPALACE_EMBEDDING_DEVICE (cpu|cuda|coreml|dml|auto)",
+ )
+ parser.add_argument(
+ "--scenarios",
+ default="small,medium,large",
+ help="Comma-separated scenario names (default: all)",
+ )
+ parser.add_argument("--seed", type=int, default=42)
+ args = parser.parse_args()
+
+ if args.device:
+ os.environ["MEMPALACE_EMBEDDING_DEVICE"] = args.device
+
+ from mempalace.embedding import describe_device, get_embedding_function
+
+ device_label = describe_device()
+ print(f"Warming up ONNX model on device={device_label}...")
+ ef = get_embedding_function()
+ ef(["warmup sentence one", "warmup sentence two"])
+
+ picked = [s.strip() for s in args.scenarios.split(",") if s.strip()]
+ results = []
+ for key in picked:
+ if key not in SCENARIOS:
+ print(f"Unknown scenario {key!r}; choices: {sorted(SCENARIOS)}", file=sys.stderr)
+ sys.exit(2)
+ label, n_files, paras = SCENARIOS[key]
+ results.append(run_scenario(label, n_files, paras, args.seed))
+
+ print("\n\n## Mining benchmark\n")
+ for line in _env_summary(device_label):
+ print(line + " ")
+ print()
+ print("| Scenario | Files | Drawers | Per-chunk (old) | Batched (new) | Speedup |")
+ print("| --- | ---: | ---: | ---: | ---: | ---: |")
+ for r in results:
+ print(
+ f"| {r['label']} | {r['n_files']} | {r['drawers']} | "
+ f"{r['unbatched_time']:.2f}s · {r['unbatched_rate']:.0f} drw/s | "
+ f"{r['batched_time']:.2f}s · {r['batched_rate']:.0f} drw/s | "
+ f"**{r['speedup']:.2f}×** |"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/HOOKS_TUTORIAL.md b/examples/HOOKS_TUTORIAL.md
index 3a34b81c2..7818e59ec 100644
--- a/examples/HOOKS_TUTORIAL.md
+++ b/examples/HOOKS_TUTORIAL.md
@@ -7,27 +7,42 @@ MemPalace hooks act as an "Auto-Save" feature. They help your AI keep a permanen
* **PreCompact Hook** (`mempal_precompact_hook.sh`): Saves your context right before the AI's memory window fills up.
### 2. Setup for Claude Code
-Add this to your configuration file to enable automatic background saving:
+Add this to `~/.claude/settings.local.json` (global) or `.claude/settings.local.json` (project-scoped) to enable automatic background saving:
```json
{
"hooks": {
"Stop": [
{
- "matcher": "",
- "hooks": [{"type": "command", "command": "./hooks/mempal_save_hook.sh"}]
+ "matcher": "*",
+ "hooks": [{
+ "type": "command",
+ "command": "/absolute/path/to/hooks/mempal_save_hook.sh",
+ "timeout": 30
+ }]
}
],
"PreCompact": [
{
- "matcher": "",
- "hooks": [{"type": "command", "command": "./hooks/mempal_precompact_hook.sh"}]
+ "hooks": [{
+ "type": "command",
+ "command": "/absolute/path/to/hooks/mempal_precompact_hook.sh",
+ "timeout": 30
+ }]
}
]
}
}
```
+Make the hooks executable:
+```bash
+chmod +x /absolute/path/to/hooks/mempal_save_hook.sh
+chmod +x /absolute/path/to/hooks/mempal_precompact_hook.sh
+```
+
+**Note:** Replace `/absolute/path/to/hooks/` with the actual path where you cloned the MemPalace repository (e.g., `~/projects/mempalace/hooks/`).
+
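+To sanity-check a hook before wiring it into Claude Code, pipe it a minimal payload by hand. Each hook reads JSON on stdin carrying a `session_id` (per the header comments in the scripts); the payload here is illustrative:
+
+```bash
+echo '{"session_id": "smoke-test"}' | /absolute/path/to/hooks/mempal_precompact_hook.sh
+# Expect a save nudge on stderr, then {} on stdout (compaction not blocked).
+```
+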
### 3. What changed (v3.1.0+)
Both hooks now have **two-layer capture**:
@@ -48,4 +63,4 @@ mempalace mine ~/.claude/projects/ --mode convos
- **`SAVE_INTERVAL=15`** — How many human messages between saves
- **`MEMPALACE_PYTHON`** — Python interpreter with mempalace + chromadb. Auto-detects: env var → repo venv → system python3
-- **`MEMPAL_DIR`** — Optional directory for auto-ingest via `mempalace mine`
\ No newline at end of file
+- **`MEMPAL_DIR`** — Optional directory for auto-ingest via `mempalace mine`
diff --git a/hooks/mempal_precompact_hook.sh b/hooks/mempal_precompact_hook.sh
index a14a0d0e9..036b128c6 100755
--- a/hooks/mempal_precompact_hook.sh
+++ b/hooks/mempal_precompact_hook.sh
@@ -8,8 +8,11 @@
# context about what was discussed. This hook forces one final save of
# EVERYTHING before that happens.
#
-# Unlike the save hook (which triggers every N exchanges), this ALWAYS
-# blocks — because compaction is always worth saving before.
+# Unlike the save hook (which triggers every N exchanges), the precompact
+# hook fires exactly once: right before /compact. By default it nudges the
+# model to save (via stderr) and returns {} so compaction proceeds. Set
+# MEMPAL_BLOCK_COMPACT=1 to opt into a two-phase hard-block (block once,
+# allow the retry) for users who want a forced save-before-compact pause.
#
# === INSTALL ===
# Add to .claude/settings.local.json:
@@ -37,8 +40,16 @@
# Claude Code sends JSON on stdin with:
# session_id — unique session identifier
#
-# We always return decision: "block" with a reason telling the AI
-# to save everything. After the AI saves, compaction proceeds normally.
+# Default behavior: print a save nudge to stderr (which Claude Code surfaces
+# to the model) and return {} so compaction is not blocked. This restores
+# the safety nudge that PR #885 dropped without re-introducing the
+# unconditional block from #858 (which had no escape path).
+#
+# Opt-in hard-block: export MEMPAL_BLOCK_COMPACT=1 to use the two-phase
+# behavior — the first /compact attempt this session blocks with the nudge
+# as the reason, the next attempt clears the per-session flag and allows
+# compaction. This gives users who want a forced save-before-compact pause
+# a way to do that without footgunning themselves.
#
# === MEMPALACE CLI ===
# This repo uses: mempalace mine
@@ -75,6 +86,27 @@ if [ -n "$MEMPAL_DIR" ] && [ -d "$MEMPAL_DIR" ]; then
mempalace mine "$MEMPAL_DIR" >> "$STATE_DIR/hook.log" 2>&1
fi
-# Silent: return empty JSON to not block. "decision": "allow" is invalid —
-# only "block" or {} are recognized.
+NUDGE="MemPalace: compaction imminent. If you have unsaved topics, decisions, quotes, code, or important context from this session, save them via your memory system NOW. After compaction, detailed context is lost."
+
+# Opt-in hard-block (MEMPAL_BLOCK_COMPACT=1) — two-phase: block once per
+# session, then allow on the next attempt. The per-session flag prevents
+# the original #858 footgun where the unconditional block had no escape.
+if [ "${MEMPAL_BLOCK_COMPACT:-0}" = "1" ]; then
+ BLOCKED_FLAG="$STATE_DIR/${SESSION_ID}_blocked_compact"
+ if [ -f "$BLOCKED_FLAG" ]; then
+ rm -f "$BLOCKED_FLAG"
+ echo '{}'
+ exit 0
+ fi
+ touch "$BLOCKED_FLAG"
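+ # Build the block JSON with json.dumps so the nudge text survives shell quoting intact.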
+ NUDGE="$NUDGE" "$MEMPAL_PYTHON_BIN" -c '
+import json, os
+print(json.dumps({"decision": "block", "reason": os.environ["NUDGE"]}))
+'
+ exit 0
+fi
+
+# Default: print nudge to stderr (Claude Code surfaces it to the model)
+# and return {} so compaction proceeds.
+echo "$NUDGE" >&2
echo '{}'
diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index 3a0d2c3f9..13976a2aa 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -4,6 +4,7 @@
import logging
import os
import sqlite3
+from pathlib import Path
from typing import Any, Optional
import chromadb
@@ -49,41 +50,105 @@ def _validate_where(where: Optional[dict]) -> None:
stack.extend(x for x in v if isinstance(x, dict))
-def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> list[str]:
- """Rename HNSW segment dirs whose files are stale vs. chroma.sqlite3.
+def _segment_appears_healthy(seg_dir: str) -> bool:
+ """Return True if a chromadb HNSW segment dir looks intact.
+
+ Sniff-tests the chromadb-written segment metadata file
+ (``index_metadata.pickle``) for its expected format bytes without
+ parsing it. ChromaDB writes that file after a successful HNSW flush;
+ a complete write starts with byte ``0x80`` and ends with byte
+ ``0x2e`` (the protocol/terminator byte sequence chromadb serializes
+ with). If both bytes are present and the file is non-trivially sized,
+ chromadb will load the segment cleanly even when its on-disk mtime
+ trails ``chroma.sqlite3`` — which is the *steady state* under
+ chromadb 1.5.x's async batched flush, not corruption.
+
+ A missing metadata file is treated as "fresh / never-flushed" and
+ considered healthy. Renaming an empty dir orphans nothing, and a
+ real corruption case manifests as a present-but-malformed file or a
+ chromadb load error caught downstream by palace-daemon's
+ ``_auto_repair`` retry path.
+
+ Deliberately format-sniffs only; never deserializes. Deserialization
+ can execute arbitrary code, and the byte-sniff is sufficient to
+ distinguish a complete write from truncation, zero-fill, or
+ partial-flush corruption.
+
+ Assumes pickle protocol >= 2 (``0x80`` PROTO marker). Matches what
+ chromadb writes today; if a future chromadb version emits protocol
+ 0/1 segments, this check would start returning False on healthy
+ files and quarantine_stale_hnsw would conservatively rename them
+ out of the way (lazy rebuild on next open recovers).
+ """
+ meta_path = os.path.join(seg_dir, "index_metadata.pickle")
+ if not os.path.isfile(meta_path):
+ # No metadata file yet — segment hasn't flushed (fresh / empty).
+ # Renaming would orphan nothing; consider healthy.
+ return True
+ try:
+ size = os.path.getsize(meta_path)
+ # A real chromadb metadata file is at least tens of bytes; a
+ # smaller-than-floor file is almost certainly truncated.
+ if size < 16:
+ return False
+ with open(meta_path, "rb") as f:
+ head = f.read(2)
+ f.seek(-1, 2) # last byte
+ tail = f.read(1)
+ except OSError:
+ return False
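+ # PROTO opcode (0x80) at the head and STOP opcode (0x2e, ".") at the tail mark a complete pickle write.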
+ return len(head) == 2 and head[0] == 0x80 and tail == b"\x2e"
- When a ChromaDB 1.5.x PersistentClient opens a palace whose on-disk
- HNSW segment is significantly older than ``chroma.sqlite3``, the Rust
- graph-walk can dereference dangling neighbor pointers for entries that
- exist in the metadata segment but not in the HNSW index, and segfault
- in a background thread on the next ``count()`` or ``query(...)`` call.
- This is the same failure mode reported at #823 (semantic search stale
+def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
+ """Rename HNSW segment dirs that are both stale-by-mtime AND fail an
+ integrity sniff-test.
+
+ Catches the segfault failure mode from #823 (semantic search stale
after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
``count()`` with chromadb 1.5.5), and acknowledged as by-design at
- chroma-core/chroma#2594. On one fork palace (135K drawers), the drift
- caused a 65–85% crash rate on fresh-process opens; fresh-process
- crash rate dropped to 0% after the segment dir was renamed out of the
- way and ChromaDB rebuilt lazily.
-
- Heuristic: if ``chroma.sqlite3`` is more than ``stale_seconds`` newer
- than the segment's ``data_level0.bin``, the segment is considered
- suspect and renamed to ``<segment>.drift-<timestamp>``. ChromaDB reopens
- cleanly without it and writes fresh index files on next use. The
- original directory is renamed, not deleted, so recovery remains
- possible if the heuristic misfires.
-
- The default threshold (1h) is deliberately conservative — ChromaDB's
- HNSW flush cadence means legitimate drift is normally on the order of
- seconds to minutes. A segment that is more than an hour out of date is
- almost certainly in a "crashed mid-write" state.
+ chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
+ rebuild lazily on next open instead of segfaulting.
+
+ Two-stage check:
+
+ 1. **mtime gate.** If ``chroma.sqlite3`` is less than
+ ``stale_seconds`` newer than the segment's ``data_level0.bin``,
+ skip — chromadb is in normal write-path territory.
+
+ 2. **Integrity gate** (``_segment_appears_healthy``). Even when the
+ mtime gap exceeds the threshold, a segment whose
+ ``index_metadata.pickle`` passes a format sniff-test is healthy:
+ chromadb 1.5.x flushes HNSW state asynchronously and a clean
+ shutdown does NOT force-flush, so the on-disk HNSW is *always*
+ somewhat older than ``chroma.sqlite3``. Production observation
+ (2026-04-26 disks daemon): three of three segments quarantined
+ on every cold start, with 538-557s gaps, leaving the 151K-drawer
+ palace with vector_ranked=0 until rebuild. Renaming a healthy
+ segment based on mtime alone destroys a valid index — chromadb
+ creates an empty replacement, orphaning every drawer in sqlite
+ from vector recall until the operator runs ``mempalace repair
+ --mode rebuild`` (15+ min on a 151K palace).
+
+ Only segments that pass stage 1 (suspiciously stale) AND fail stage
+ 2 (metadata file truncated, zero-filled, or otherwise malformed) are
+ renamed to ``<segment>.drift-<timestamp>``. The original directory is
+ renamed, not deleted, so recovery remains possible if the heuristic
+ misfires.
+
+ The default threshold (5 min) is advisory under daemon-strict; the
+ integrity gate is what actually distinguishes corruption from flush
+ lag. The threshold still matters for the cross-machine replication
+ case (#823), where it bounds how stale a Syncthing-replicated
+ segment can be before we look harder at it.
Args:
palace_path: path to the palace directory containing ``chroma.sqlite3``
- stale_seconds: minimum mtime gap to treat a segment as stale
+ stale_seconds: minimum mtime gap to *consider* a segment for quarantine
Returns:
- List of paths that were quarantined (empty if nothing drifted).
+ List of paths that were quarantined (empty if nothing actually
+ looked corrupt).
"""
db_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
@@ -114,22 +179,70 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 3600.0) -> li
continue
if sqlite_mtime - hnsw_mtime < stale_seconds:
continue
+
+ # Stage 2: integrity gate. mtime drift is necessary but not
+ # sufficient — chromadb's async flush makes drift the steady-
+ # state condition. A healthy segment metadata file proves
+ # chromadb can open the segment without segfault; don't
+ # quarantine a healthy index.
+ if _segment_appears_healthy(seg_dir):
+ logger.info(
+ "HNSW mtime gap %.0fs on %s exceeds threshold but segment "
+ "metadata file is intact — flush-lag, not corruption. "
+ "Leaving in place.",
+ sqlite_mtime - hnsw_mtime,
+ seg_dir,
+ )
+ continue
+
stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
target = f"{seg_dir}.drift-{stamp}"
try:
os.rename(seg_dir, target)
moved.append(target)
logger.warning(
- "Quarantined stale HNSW segment %s (sqlite %.0fs newer than HNSW); renamed to %s",
+ "Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
seg_dir,
sqlite_mtime - hnsw_mtime,
target,
)
except OSError:
- logger.exception("Failed to quarantine stale HNSW segment %s", seg_dir)
+ logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
return moved
+def _pin_hnsw_threads(collection) -> None:
+ """Best-effort retrofit: pin ``hnsw:num_threads=1`` on an existing collection.
+
+ Fresh collections set this via ``metadata=`` at creation. Legacy palaces
+ built before that change keep the default (parallel insert) and can hit
+ the HNSW race described in #974/#965. ChromaDB's
+ ``collection.modify(configuration=...)`` lets us re-apply ``num_threads=1``
+ in memory at load time so every new process is protected.
+
+ Note: in chromadb 1.5.x the modified ``configuration_json["hnsw"]`` does
+ not persist to disk across ``PersistentClient`` reopens, so this must
+ run on every ``get_collection`` call, not just once.
+ """
+ try:
+ from chromadb.api.collection_configuration import (
+ UpdateCollectionConfiguration,
+ UpdateHNSWConfiguration,
+ )
+ except ImportError:
+ logger.debug("_pin_hnsw_threads skipped: chromadb too old", exc_info=True)
+ return
+ try:
+ collection.modify(
+ configuration=UpdateCollectionConfiguration(hnsw=UpdateHNSWConfiguration(num_threads=1))
+ )
+ except Exception:
+ logger.debug("_pin_hnsw_threads modify failed", exc_info=True)
+
+
+_BLOB_FIX_MARKER = ".blob_seq_ids_migrated"
+
+
def _fix_blob_seq_ids(palace_path: str) -> None:
"""Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
@@ -139,10 +252,19 @@ def _fix_blob_seq_ids(palace_path: str) -> None:
type INTEGER) is not compatible with SQL type BLOB".
Must run BEFORE PersistentClient is created (the compactor fires on init).
+
+ Opening a Python sqlite3 connection against a ChromaDB 1.5.x WAL-mode
+ database leaves state that segfaults the next PersistentClient call. After
+ the migration has run once successfully, a marker file is written so
+ subsequent opens skip the sqlite connection entirely. Already-migrated
+ palaces can touch the marker manually to opt into the fast path.
"""
db_path = os.path.join(palace_path, "chroma.sqlite3")
if not os.path.isfile(db_path):
return
+ marker = os.path.join(palace_path, _BLOB_FIX_MARKER)
+ if os.path.isfile(marker):
+ return
try:
with sqlite3.connect(db_path) as conn:
for table in ("embeddings", "max_seq_id"):
@@ -160,6 +282,14 @@ def _fix_blob_seq_ids(palace_path: str) -> None:
conn.commit()
except Exception:
logger.exception("Could not fix BLOB seq_ids in %s", db_path)
+ return
+ # Write marker whether or not rows needed migration — the palace is now
+ # confirmed to be in the INTEGER-seq_id state and future opens can skip the
+ # sqlite3.connect() entirely.
+ try:
+ Path(marker).touch()
+ except OSError:
+ logger.exception("Could not write migration marker %s", marker)
# ---------------------------------------------------------------------------
@@ -368,6 +498,18 @@ def delete(self, *, ids=None, where=None):
def count(self):
return self._collection.count()
+ @property
+ def metadata(self) -> dict:
+ """Pass-through to the underlying ChromaDB collection's metadata.
+
+ Used by the searcher to detect legacy palaces that were created
+ without ``hnsw:space=cosine`` and therefore silently use L2
+ distance, which breaks cosine-based similarity interpretation.
+ Returns ``{}`` when metadata is absent so callers can do a plain
+ ``.get("hnsw:space")`` without None-checks.
+ """
+ return self._collection.metadata or {}
+
# ---------------------------------------------------------------------------
# Backend
@@ -405,6 +547,23 @@ def __init__(self):
self._freshness: dict[str, tuple[int, float]] = {}
self._closed = False
+ @staticmethod
+ def _resolve_embedding_function():
+ """Return the EF for the user's ``embedding_device`` setting.
+
+ Both ``get_collection`` and ``get_or_create_collection`` must receive
+ the EF explicitly — ChromaDB 1.x does not persist it with the
+ collection, so a reader that omits the argument silently gets the
+ library default and its queries won't match the writer's vectors.
+ """
+ try:
+ from ..embedding import get_embedding_function
+
+ return get_embedding_function()
+ except Exception:
+ logger.exception("Failed to build embedding function; using chromadb default")
+ return None
+
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
@@ -477,6 +636,28 @@ def _client(self, palace_path: str):
# Public static helpers (legacy; prefer :meth:`get_collection`)
# ------------------------------------------------------------------
+ # Per-process record of palaces that have already had quarantine_stale_hnsw
+ # invoked at least once. The proactive drift check is a *cold-start*
+ # protection — it catches HNSW segments that arrived stale relative to
+ # ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore,
+ # crashed-mid-write). Once a long-running process has opened the palace
+ # cleanly, re-firing on every reconnect is a *runtime thrash*: the
+ # daemon's own writes bump sqlite mtime but HNSW flushes batch on
+ # chromadb's internal cadence, so the mtime gap naturally exceeds the
+ # threshold under steady write load even though nothing is corrupt.
+ # Real runtime drift is still handled — palace-daemon's ``_auto_repair``
+ # calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
+ # which bypasses this gate.
+ #
+ # Thread-safety: this set is mutated without a lock. Two concurrent
+ # ``make_client()`` calls for the same palace can both pass the
+ # membership check and both invoke ``quarantine_stale_hnsw``. That's
+ # safe because the function is idempotent (mtime check + timestamped
+ # rename of distinct directories), so the worst-case race produces
+ # one redundant rename attempt that no-ops. Idempotency is the
+ # safety property; locking would add cost without correctness gain.
+ _quarantined_paths: set[str] = set()
+
@staticmethod
def make_client(palace_path: str):
"""Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
@@ -484,8 +665,15 @@ def make_client(palace_path: str):
Deprecated-ish: exposed for legacy long-lived callers that manage their
own client cache. New code should obtain a collection through
:meth:`get_collection` which manages caching internally.
+
+ Quarantines stale HNSW segments **once per palace per process**. See
+ :attr:`_quarantined_paths` for the rationale (cold-start protection
+ vs. runtime thrash on steady-write daemons).
"""
_fix_blob_seq_ids(palace_path)
+ if palace_path not in ChromaBackend._quarantined_paths:
+ quarantine_stale_hnsw(palace_path)
+ ChromaBackend._quarantined_paths.add(palace_path)
return chromadb.PersistentClient(path=palace_path)
@staticmethod
@@ -532,12 +720,18 @@ def get_collection(
if options and isinstance(options, dict):
hnsw_space = options.get("hnsw_space", hnsw_space)
+ ef = self._resolve_embedding_function()
+ ef_kwargs = {"embedding_function": ef} if ef is not None else {}
+
if create:
collection = client.get_or_create_collection(
- collection_name, metadata={"hnsw:space": hnsw_space}
+ collection_name,
+ metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
+ **ef_kwargs,
)
else:
- collection = client.get_collection(collection_name)
+ collection = client.get_collection(collection_name, **ef_kwargs)
+ _pin_hnsw_threads(collection)
return ChromaCollection(collection)
def close_palace(self, palace) -> None:
@@ -578,8 +772,12 @@ def create_collection(
self, palace_path: str, collection_name: str, hnsw_space: str = "cosine"
) -> ChromaCollection:
"""Create (not get-or-create) ``collection_name`` with the given HNSW space."""
+ ef = self._resolve_embedding_function()
+ ef_kwargs = {"embedding_function": ef} if ef is not None else {}
collection = self._client(palace_path).create_collection(
- collection_name, metadata={"hnsw:space": hnsw_space}
+ collection_name,
+ metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
+ **ef_kwargs,
)
return ChromaCollection(collection)
diff --git a/mempalace/cli.py b/mempalace/cli.py
index 714c64c7b..51e31093c 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -34,11 +34,168 @@
from pathlib import Path
from .config import MempalaceConfig
+from .corpus_origin import detect_origin_heuristic, detect_origin_llm
+from .llm_client import LLMError, get_provider
from .version import __version__
_MEMPALACE_PROJECT_FILES = ("mempalace.yaml", "entities.json")
+# Pass 0 corpus-origin sampling caps. Tier 1 reads FULL file content (no
+# front-bias sampling) but bounds total memory on enormous corpora. Tier 2
+# trims to a smaller view because LLM context windows are finite.
+_PASS_ZERO_MAX_FILES = 30
+_PASS_ZERO_PER_FILE_CAP = 100_000 # 100KB per file is generous for prose
+_PASS_ZERO_TOTAL_CAP = 5_000_000 # 5MB total ceiling — bounds memory
+_PASS_ZERO_LLM_PER_SAMPLE = 2_000 # for Tier 2 LLM call only
+_PASS_ZERO_LLM_MAX_SAMPLES = 20 # caps the LLM-tier sample count
+
+
+def _gather_origin_samples(project_dir) -> list:
+ """Collect Tier-1 samples for corpus-origin detection.
+
+ Reads FULL file content (capped at ``_PASS_ZERO_PER_FILE_CAP`` per file
+ and ``_PASS_ZERO_TOTAL_CAP`` overall). No front-bias sampling — AI
+ signal that lives past the first N chars of a file must still trip
+ detection, so we read the whole file up to the cap.
+
+ Skips mempalace's own per-project artifacts (``entities.json``,
+ ``mempalace.yaml``) so a re-run of ``mempalace init`` produces the
+ same classification result it did on the first run. Without this
+ filter, the first run writes entities.json into the corpus, the
+ second run picks it up as a sample, and the Tier-1 density math
+ drifts (different total_chars). That makes init non-idempotent.
+
+ Returns a list of strings (one per readable file). Empty list when
+ the project has no readable text.
+ """
+ from .entity_detector import scan_for_detection
+
+ files = scan_for_detection(project_dir, max_files=_PASS_ZERO_MAX_FILES)
+ samples: list = []
+ total_chars = 0
+ for filepath in files:
+ if filepath.name in _MEMPALACE_PROJECT_FILES:
+ continue
+ if total_chars >= _PASS_ZERO_TOTAL_CAP:
+ break
+ try:
+ with open(filepath, encoding="utf-8", errors="replace") as f:
+ content = f.read(_PASS_ZERO_PER_FILE_CAP)
+ except OSError:
+ continue
+ if not content:
+ continue
+ samples.append(content)
+ total_chars += len(content)
+ return samples
+
+
+def _trim_samples_for_llm(samples: list) -> list:
+ """Reduce Tier-1 full-content samples to LLM-friendly size.
+
+ Tier 2 hits an LLM with a finite context window — we trim each sample
+ to ``_PASS_ZERO_LLM_PER_SAMPLE`` chars and cap the overall sample
+ count at ``_PASS_ZERO_LLM_MAX_SAMPLES``.
+ """
+ return [s[:_PASS_ZERO_LLM_PER_SAMPLE] for s in samples[:_PASS_ZERO_LLM_MAX_SAMPLES]]
+
+
+def _run_pass_zero(project_dir, palace_dir, llm_provider) -> "dict | None":
+ """Pass 0: detect whether the corpus is AI-dialogue and persist the
+ result to ``<palace>/.mempalace/origin.json``.
+
+ Returns the wrapped result dict (same shape as origin.json) on success,
+ or ``None`` when there are no readable samples to detect from. The
+ return value is what cmd_init forwards to ``discover_entities`` via
+ the ``corpus_origin`` kwarg.
+
+ File-write failures (e.g. read-only palace) are caught and reported on
+ stderr; init never blocks on them.
+ """
+ import json
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ samples = _gather_origin_samples(project_dir)
+ if not samples:
+ print(" Skipping corpus-origin detection — no readable samples.")
+ return None
+
+ # Tier 1 — always runs. Cheap regex grep, no API.
+ result = detect_origin_heuristic(samples)
+
+ # Tier 2 — runs only when an LLM provider is available. The provider
+ # contract is best-effort: corpus_origin internally falls back to a
+ # conservative default on transport/parse failure, so we don't need a
+ # try/except here, but we still keep one for any unforeseen exception.
+ #
+ # MERGE-FIELDS, NOT REPLACE: Tier 2's persona/user/platform extraction
+ # is the whole reason to run it, but a weak local model (e.g. Ollama
+ # gemma4:e4b) can return a wrong likely_ai_dialogue/confidence call
+ # that overrides a confident heuristic answer. Per @igorls's review of
+ # PR #1211: keep the heuristic's likely_ai_dialogue + confidence
+ # (don't let a weak LLM flip a confident regex answer), and merge in
+ # LLM's persona-related fields + combined evidence.
+ if llm_provider is not None:
+ try:
+ llm_result = detect_origin_llm(_trim_samples_for_llm(samples), llm_provider)
+ # Heuristic owns: likely_ai_dialogue, confidence (do NOT touch).
+ # LLM contributes: primary_platform, user_name, agent_persona_names
+ # (heuristic doesn't extract any of these).
+ if llm_result.primary_platform:
+ result.primary_platform = llm_result.primary_platform
+ if llm_result.user_name:
+ result.user_name = llm_result.user_name
+ if llm_result.agent_persona_names:
+ result.agent_persona_names = list(llm_result.agent_persona_names)
+ # Combine evidence — keep both signal trails for the audit record,
+ # prefixed so the on-disk origin.json says which tier produced
+ # each entry. Idempotent: re-prefixing an already-tagged entry
+ # is a no-op.
+ tier1_prefix = "Tier-1 heuristic: "
+ tier2_prefix = "Tier-2 LLM: "
+ heuristic_evidence = [
+ s if s.startswith(tier1_prefix) else f"{tier1_prefix}{s}"
+ for s in (str(e) for e in result.evidence)
+ ]
+ llm_evidence = [
+ s if s.startswith(tier2_prefix) else f"{tier2_prefix}{s}"
+ for s in (str(e) for e in llm_result.evidence)
+ ]
+ result.evidence = heuristic_evidence + llm_evidence
+ except Exception as exc: # noqa: BLE001 — never block init on LLM failure
+ print(f" LLM corpus-origin tier failed ({exc}); using heuristic only.")
+
+ wrapped = {
+ "schema_version": 1,
+ "detected_at": datetime.now(timezone.utc).isoformat(),
+ "result": result.to_dict(),
+ }
+
+ origin_path = Path(palace_dir).expanduser() / ".mempalace" / "origin.json"
+ try:
+ origin_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(origin_path, "w", encoding="utf-8") as f:
+ json.dump(wrapped, f, indent=2, ensure_ascii=False)
+ except OSError as exc:
+ print(f" Could not write {origin_path}: {exc}", file=sys.stderr)
+ # Return the wrapped dict anyway so the in-memory pipeline still
+ # benefits from the detection result this run.
+ return wrapped
+
+ # Banner — one line, two-space indent matching existing init style.
+ res = result
+ if res.likely_ai_dialogue:
+ platform = res.primary_platform or "AI dialogue (platform unidentified)"
+ user = res.user_name or "—"
+ agents = ", ".join(res.agent_persona_names) if res.agent_persona_names else "—"
+ print(f" Detected: {platform} (user: {user}, agents: {agents})")
+ else:
+ print(f" Corpus origin: not AI-dialogue (confidence: {res.confidence:.2f})")
+
+ return wrapped
+
def _ensure_mempalace_files_gitignored(project_dir) -> bool:
"""If project_dir is a git repo, ensure MemPalace's per-project files
@@ -86,29 +243,59 @@ def cmd_init(args):
languages = cfg.entity_languages
languages_tuple = tuple(languages)
- # Optional phase-2 LLM provider (opt-in via --llm).
+ # --llm is ON by default. --no-llm is the explicit opt-out. Provider
+ # precedence is unchanged (Ollama localhost first, then openai-compat,
+ # then anthropic). Never block init on a missing LLM: when no provider
+ # responds, print a one-line message pointing at --no-llm and fall
+ # through to heuristics-only.
llm_provider = None
- if getattr(args, "llm", False):
- from .llm_client import LLMError, get_provider
-
+ if not getattr(args, "no_llm", False):
+ provider_name = getattr(args, "llm_provider", "ollama") or "ollama"
+ provider_model = getattr(args, "llm_model", "gemma4:e4b") or "gemma4:e4b"
try:
- llm_provider = get_provider(
- name=args.llm_provider,
- model=args.llm_model,
- endpoint=args.llm_endpoint,
- api_key=args.llm_api_key,
+ candidate = get_provider(
+ name=provider_name,
+ model=provider_model,
+ endpoint=getattr(args, "llm_endpoint", None),
+ api_key=getattr(args, "llm_api_key", None),
)
+ ok, msg = candidate.check_available()
+ if ok:
+ llm_provider = candidate
+ print(f" LLM enabled: {provider_name}/{provider_model}")
+ # Privacy warning (issue #24): if the configured endpoint
+ # sends data off the user's machine/network, surface that
+ # before init proceeds. URL-based — Ollama on localhost,
+ # LM Studio on LAN, etc. won't trigger; Anthropic /
+ # cloud OpenAI-compat / any non-local endpoint will.
+ if candidate.is_external_service:
+ print(
+ f" ⚠ {provider_name} is an EXTERNAL API. Your folder "
+ f"content will be sent to the provider during init. "
+ f"MemPalace does not control how the provider logs, "
+ f"retains, or uses your data. Pass --no-llm to keep "
+ f"init fully local."
+ )
+ else:
+ print(
+ f" No LLM provider reachable ({msg}). "
+ f"Running heuristics-only — pass --no-llm to silence this."
+ )
except LLMError as e:
- print(f" ERROR: {e}", file=sys.stderr)
- sys.exit(2)
- ok, msg = llm_provider.check_available()
- if not ok:
print(
- f" ERROR: LLM provider '{args.llm_provider}' unavailable: {msg}",
- file=sys.stderr,
+ f" LLM init failed ({e}). "
+ f"Running heuristics-only — pass --no-llm to silence this."
)
- sys.exit(2)
- print(f" LLM refinement enabled: {args.llm_provider}/{args.llm_model}")
+
+ # Pass 0: detect whether the corpus is AI-dialogue. Writes
+ # <palace>/.mempalace/origin.json and supplies corpus context to the
+ # entity classifier so it can correctly handle agent persona names
+ # (e.g. "Echo", "Sparrow") without misclassifying them as people.
+ corpus_origin = _run_pass_zero(
+ project_dir=args.dir,
+ palace_dir=cfg.palace_path,
+ llm_provider=llm_provider,
+ )
# Pass 1: discover entities — manifests + git authors first, prose detection
# as supplement for names mentioned only in docs/notes. Optional phase-2
@@ -116,22 +303,40 @@ def cmd_init(args):
print(f"\n Scanning for entities in: {args.dir}")
if languages_tuple != ("en",):
print(f" Languages: {', '.join(languages_tuple)}")
- detected = discover_entities(args.dir, languages=languages_tuple, llm_provider=llm_provider)
- total = len(detected["people"]) + len(detected["projects"]) + len(detected["uncertain"])
+ detected = discover_entities(
+ args.dir,
+ languages=languages_tuple,
+ llm_provider=llm_provider,
+ corpus_origin=corpus_origin,
+ )
+ total = (
+ len(detected["people"])
+ + len(detected["projects"])
+ + len(detected.get("topics", []))
+ + len(detected["uncertain"])
+ )
if total > 0:
confirmed = confirm_entities(detected, yes=getattr(args, "yes", False))
# Save confirmed entities to /entities.json (per-project
# audit trail — user can inspect or hand-edit) AND merge into the
- # global registry the miner reads at mine time.
- if confirmed["people"] or confirmed["projects"]:
- entities_path = Path(args.dir).expanduser().resolve() / "entities.json"
+ # global registry the miner reads at mine time. Topics are kept
+ # separately so the miner can later compute cross-wing tunnels
+ # from shared topics (see palace_graph.compute_topic_tunnels).
+ if confirmed["people"] or confirmed["projects"] or confirmed.get("topics"):
+ project_path = Path(args.dir).expanduser().resolve()
+ entities_path = project_path / "entities.json"
with open(entities_path, "w", encoding="utf-8") as f:
json.dump(confirmed, f, indent=2, ensure_ascii=False)
print(f" Entities saved: {entities_path}")
from .miner import add_to_known_entities
- registry_path = add_to_known_entities(confirmed)
+ # Wing matches the default produced by ``room_detector_local``
+ # (folder basename) and the miner fallback in ``load_config``.
+ # Used by the topics_by_wing map so cross-wing tunnels can be
+ # computed at mine time.
+ wing = project_path.name
+ registry_path = add_to_known_entities(confirmed, wing=wing)
print(f" Registry updated: {registry_path}")
else:
print(" No entities detected — proceeding with directory-based rooms.")
@@ -143,6 +348,107 @@ def cmd_init(args):
# Pass 3: protect git repos from accidentally committing per-project files
_ensure_mempalace_files_gitignored(args.dir)
+ # Pass 4: offer to run mine immediately. The directory just had its
+ # rooms + entities set up, so 99% of users will mine next anyway —
+ # asking here removes the "remember to type the next command" friction.
+ # `--auto-mine` skips the prompt and mines automatically; `--yes` is
+ # SCOPED to entity auto-accept and does NOT imply mining.
+ _maybe_run_mine_after_init(args, cfg)
+
+
+def _format_size_mb(num_bytes: int) -> str:
+ """Render a byte count as a human-readable size for the mine estimate.
+
+ < 1 MB rounds up to ``<1 MB`` so users never see a misleading ``0 MB``
+ on small projects. Otherwise reports an integer megabyte count.
+ """
+ if num_bytes <= 0:
+ return "<1 MB"
+ mb = num_bytes / (1024 * 1024)
+ if mb < 1:
+ return "<1 MB"
+ return f"{mb:.0f} MB"
+
+
+def _maybe_run_mine_after_init(args, cfg) -> None:
+ """Prompt the user to mine the directory just initialised, or auto-mine
+ when ``--auto-mine`` was passed. Extracted so the prompt path is
+ unit-testable.
+
+ Behaviour matrix:
+
+ - default (no flags) — prompt, default Yes, mine in-process if accepted
+ - ``--yes`` — entity auto-accept only; STILL prompts for the mine step
+ - ``--auto-mine`` — skip the mine prompt and mine directly
+ - ``--yes --auto-mine`` — fully non-interactive
+
+ Mine errors are surfaced (not swallowed): a failing mine exits with a
+ non-zero status via :func:`sys.exit` so downstream scripts can see it.
+ The pre-scan that produces the file-count estimate is reused as the
+ mine input so we never walk the corpus twice.
+ """
+ from .miner import mine, scan_project
+
+ project_dir = args.dir
+ auto_mine = bool(getattr(args, "auto_mine", False))
+
+ # Single corpus walk: this scan feeds BOTH the "what would be mined"
+ # estimate the user sees in the prompt AND the file list mine() will
+ # process. We pass the result into mine() via the `files` kwarg so it
+ # doesn't re-walk the tree.
+ try:
+ scanned_files = scan_project(project_dir)
+ file_count = len(scanned_files)
+ total_bytes = 0
+ for fp in scanned_files:
+ try:
+ total_bytes += fp.stat().st_size
+ except OSError:
+ # Skip files that vanished between scan and stat — mine()
+ # will skip them too.
+ continue
+ size_str = _format_size_mb(total_bytes)
+ except Exception:
+ scanned_files = None
+ file_count = None
+ size_str = None
+
+ # Show the scope estimate BEFORE the prompt so the user knows what
+ # they are agreeing to. On a real corpus mine takes minutes; hitting
+ # Enter on a default-Y prompt with no size cue is a footgun.
+ if isinstance(file_count, int):
+ if size_str:
+ print(f" ~{file_count} files (~{size_str}) would be mined into this palace.\n")
+ else:
+ print(f" ~{file_count} files would be mined into this palace.\n")
+
+ if not auto_mine:
+ try:
+ answer = input(" Mine this directory now? [Y/n] ").strip().lower()
+ except EOFError:
+ # Non-interactive stdin (e.g. piped) — treat like decline so
+ # we don't block. User can re-run with --auto-mine to opt in.
+ answer = "n"
+ if answer not in ("", "y", "yes"):
+ print(f"\n Skipped. Run `mempalace mine {shlex.quote(project_dir)}` when ready.")
+ return
+
+ palace_path = cfg.palace_path
+ try:
+ mine(
+ project_dir=project_dir,
+ palace_path=palace_path,
+ files=scanned_files,
+ )
+ except KeyboardInterrupt:
+        # mine() handles its own SIGINT summary + sys.exit(130). This
+        # clause is documentary: KeyboardInterrupt is not an Exception
+        # subclass, so the broad handler below cannot swallow it. Re-raise
+        # so the shell still sees a clean interrupt.
+ raise
+ except Exception as e:
+ print(f"\n ERROR: mine failed: {e}", file=sys.stderr)
+ sys.exit(1)
+
def cmd_mine(args):
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
@@ -150,6 +456,16 @@ def cmd_mine(args):
for raw in args.include_ignored or []:
include_ignored.extend(part.strip() for part in raw.split(",") if part.strip())
+ # --redetect-origin re-runs corpus_origin on the current corpus state
+ # and overwrites /.mempalace/origin.json before mining proceeds.
+ # Heuristic-only by design — full LLM detection lives on `mempalace init`.
+ if getattr(args, "redetect_origin", False):
+ _run_pass_zero(
+ project_dir=args.dir,
+ palace_dir=palace_path,
+ llm_provider=None,
+ )
+
if args.mode == "convos":
from .convo_miner import mine_convos
@@ -296,6 +612,7 @@ def cmd_repair(args):
import shutil
from .backends.chroma import ChromaBackend
from .migrate import confirm_destructive_action, contains_palace_database
+ from .repair import TruncationDetected, check_extraction_safety
palace_path = os.path.abspath(
os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
@@ -344,12 +661,31 @@ def cmd_repair(args):
offset = 0
while offset < total:
batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
+ if not batch["ids"]:
+ break
all_ids.extend(batch["ids"])
all_docs.extend(batch["documents"])
all_metas.extend(batch["metadatas"])
- offset += batch_size
+ offset += len(batch["ids"])
print(f" Extracted {len(all_ids)} drawers")
+ # ── #1208 guard ──────────────────────────────────────────────────
+ # Cross-check against the SQLite ground truth before doing anything
+ # destructive. Catches the user-reported case where chromadb's
+ # collection-layer get() silently caps at 10,000 rows even on much
+ # larger palaces (e.g. after manual HNSW quarantine). Override with
+ # --confirm-truncation-ok only after independently verifying the
+ # extraction count is real.
+ try:
+ check_extraction_safety(
+ palace_path,
+ len(all_ids),
+ confirm_truncation_ok=getattr(args, "confirm_truncation_ok", False),
+ )
+ except TruncationDetected as e:
+ print(e.message)
+ return
+
# Backup and rebuild
palace_path = os.path.normpath(palace_path)
backup_path = palace_path + ".backup"
@@ -572,6 +908,14 @@ def main():
action="store_true",
help="Auto-accept all detected entities (non-interactive)",
)
+ p_init.add_argument(
+ "--auto-mine",
+ action="store_true",
+ help=(
+ "Skip the post-init mine prompt and run mine automatically. "
+ "Combine with --yes for a fully non-interactive setup."
+ ),
+ )
p_init.add_argument(
"--lang",
default=None,
@@ -586,17 +930,25 @@ def main():
"--llm",
action="store_true",
help=(
- "Enable LLM-assisted entity refinement (opt-in, local-first). "
- "Runs after manifest/git/regex detection, asking the configured "
- "provider to reclassify ambiguous candidates. "
- "Ctrl-C during refinement returns partial results."
+ "DEPRECATED — LLM-assisted entity refinement is now ON by default. "
+ "This flag is preserved for backward compatibility; pass --no-llm "
+ "to opt out instead."
+ ),
+ )
+ p_init.add_argument(
+ "--no-llm",
+ action="store_true",
+ help=(
+ "Disable LLM-assisted entity refinement. Run init in heuristics-only "
+ "mode (no provider acquisition, no LLM calls). Use when running "
+ "without a local LLM and you don't want the graceful-fallback message."
),
)
p_init.add_argument(
"--llm-provider",
default="ollama",
choices=["ollama", "openai-compat", "anthropic"],
- help="LLM provider (default: ollama). Use --llm to enable.",
+ help="LLM provider (default: ollama). Pass --no-llm to disable LLM-assisted refinement entirely.",
)
p_init.add_argument(
"--llm-model",
@@ -647,6 +999,17 @@ def main():
help="Your name — recorded on every drawer (default: mempalace)",
)
p_mine.add_argument("--limit", type=int, default=0, help="Max files to process (0 = all)")
+ p_mine.add_argument(
+ "--redetect-origin",
+ action="store_true",
+ help=(
+ "Re-run corpus_origin detection on this directory and overwrite "
+ "/.mempalace/origin.json. Useful when the corpus has grown "
+ "since `mempalace init` and the stored origin may be stale. "
+ "Heuristic-only (no LLM call) — re-run `mempalace init --llm` for "
+ "Tier 2 refinement."
+ ),
+ )
p_mine.add_argument(
"--dry-run", action="store_true", help="Show what would be filed without filing"
)
@@ -744,10 +1107,23 @@ def main():
instructions_sub.add_parser(instr_name, help=f"Output {instr_name} instructions")
# repair
- sub.add_parser(
+ p_repair = sub.add_parser(
"repair",
help="Rebuild palace vector index from stored data (fixes segfaults after corruption)",
- ).add_argument("--yes", action="store_true", help="Skip confirmation for destructive changes")
+ )
+ p_repair.add_argument(
+ "--yes", action="store_true", help="Skip confirmation for destructive changes"
+ )
+ p_repair.add_argument(
+ "--confirm-truncation-ok",
+ action="store_true",
+ help=(
+ "Override the #1208 safety guard. Required when chromadb's collection-layer "
+ "extraction returns exactly 10,000 drawers and the SQLite ground-truth check "
+ "either matches or can't be read. Use only after independently confirming "
+ "the palace really contains that count."
+ ),
+ )
# mcp
sub.add_parser(
diff --git a/mempalace/config.py b/mempalace/config.py
index 616334e5c..8e12b6b75 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -236,6 +236,49 @@ def set_entity_languages(self, languages):
pass
return normalized
+ @property
+ def embedding_device(self):
+ """Hardware device for the ONNX embedding model.
+
+ Values: ``"auto"`` (default), ``"cpu"``, ``"cuda"``, ``"coreml"``,
+ ``"dml"``. Read from env ``MEMPALACE_EMBEDDING_DEVICE`` first, then
+ ``embedding_device`` in ``config.json``, then ``"auto"``.
+
+ ``auto`` resolves to the first available accelerator at runtime via
+ :mod:`mempalace.embedding`; requesting an unavailable accelerator
+ logs a warning and falls back to CPU.
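+
+        Illustrative override (shell; the path is a placeholder):
+
+            MEMPALACE_EMBEDDING_DEVICE=cuda mempalace mine ~/notes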
+ """
+ env_val = os.environ.get("MEMPALACE_EMBEDDING_DEVICE")
+ if env_val:
+ return env_val.strip().lower()
+ return str(self._file_config.get("embedding_device", "auto")).strip().lower()
+
+ @property
+ def topic_tunnel_min_count(self):
+ """Minimum number of overlapping confirmed topics required to create
+ a cross-wing tunnel between two wings.
+
+ Default is ``1`` — any single shared topic produces a tunnel. Bump
+ to ``2+`` if your projects share lots of common-tech labels (Python,
+ Docker, Git) and you want only meaningfully overlapping wings to
+ link. Reads ``MEMPALACE_TOPIC_TUNNEL_MIN_COUNT`` env first, then the
+ config-file value, then ``1``.
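+
+        Illustrative ``config.json`` fragment:
+
+            {"topic_tunnel_min_count": 2}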
+ """
+ env_val = os.environ.get("MEMPALACE_TOPIC_TUNNEL_MIN_COUNT")
+ if env_val:
+ try:
+ parsed = int(env_val)
+ if parsed >= 1:
+ return parsed
+ except ValueError:
+ pass
+ cfg_val = self._file_config.get("topic_tunnel_min_count")
+ try:
+ parsed = int(cfg_val) if cfg_val is not None else 1
+ except (TypeError, ValueError):
+ parsed = 1
+ return max(1, parsed)
+
@property
def hook_silent_save(self):
"""Whether the stop hook saves directly (True) or blocks for MCP calls (False)."""
diff --git a/mempalace/convo_miner.py b/mempalace/convo_miner.py
index 521d8fb13..a91cf33ac 100644
--- a/mempalace/convo_miner.py
+++ b/mempalace/convo_miner.py
@@ -55,6 +55,7 @@ def _detect_hall_cached(content: str) -> str:
MIN_CHUNK_SIZE = 30
CHUNK_SIZE = 800 # chars per drawer — align with miner.py
+DRAWER_UPSERT_BATCH_SIZE = 1000
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
# Matches miner.py at 500 MB. Long Claude Code sessions, multi-year
# ChatGPT exports, and lifetime Slack dumps routinely exceed 10 MB; the
@@ -332,31 +333,43 @@ def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extr
except Exception:
pass
- for chunk in chunks:
- chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
- if extract_mode == "general":
- room_counts_delta[chunk_room] += 1
- drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
+ # Batch chunks into bounded upserts so large transcripts keep most of
+ # the embedding speedup without one huge Chroma/SQLite request. Keep
+ # one filed_at per source file so all transcript drawers share an
+ # ingest timestamp.
+ filed_at = datetime.now().isoformat()
+ for batch_start in range(0, len(chunks), DRAWER_UPSERT_BATCH_SIZE):
+ batch_docs: list = []
+ batch_ids: list = []
+ batch_metas: list = []
+ for chunk in chunks[batch_start : batch_start + DRAWER_UPSERT_BATCH_SIZE]:
+ chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
+ if extract_mode == "general":
+ room_counts_delta[chunk_room] += 1
+ drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
+ batch_docs.append(chunk["content"])
+ batch_ids.append(drawer_id)
+ batch_metas.append(
+ {
+ "wing": wing,
+ "room": chunk_room,
+ "hall": _detect_hall_cached(chunk["content"]),
+ "source_file": source_file,
+ "chunk_index": chunk["chunk_index"],
+ "added_by": agent,
+ "filed_at": filed_at,
+ "ingest_mode": "convos",
+ "extract_mode": extract_mode,
+ "normalize_version": NORMALIZE_VERSION,
+ }
+ )
try:
collection.upsert(
- documents=[chunk["content"]],
- ids=[drawer_id],
- metadatas=[
- {
- "wing": wing,
- "room": chunk_room,
- "hall": _detect_hall_cached(chunk["content"]),
- "source_file": source_file,
- "chunk_index": chunk["chunk_index"],
- "added_by": agent,
- "filed_at": datetime.now().isoformat(),
- "ingest_mode": "convos",
- "extract_mode": extract_mode,
- "normalize_version": NORMALIZE_VERSION,
- }
- ],
+ documents=batch_docs,
+ ids=batch_ids,
+ metadatas=batch_metas,
)
- drawers_added += 1
+ drawers_added += len(batch_docs)
except Exception as e:
if "already exists" not in str(e).lower():
raise
diff --git a/mempalace/corpus_origin.py b/mempalace/corpus_origin.py
new file mode 100644
index 000000000..12d34ab1d
--- /dev/null
+++ b/mempalace/corpus_origin.py
@@ -0,0 +1,422 @@
+"""
+corpus_origin.py — Detect whether a corpus is an AI-dialogue record and,
+if so, what platform and what persona names the user has assigned to the
+agent.
+
+This is the first question any downstream Pass 2 classification needs
+answered. Without it, a drawer like "my three sons" in a Claude Code
+dialogue corpus can't be correctly resolved to "three AI instances"
+rather than "three biological children."
+
+Two-tier detection:
+
+ Tier 1 — detect_origin_heuristic(samples)
+ Cheap, no API. Grep for well-known AI brand terms + turn
+ markers. Always runs. Outputs a hypothesis.
+
+ Tier 2 — detect_origin_llm(samples, provider)
+ Uses an LLMProvider (typically Haiku via mempalace.llm_client)
+ with the model's pre-trained knowledge of Claude/ChatGPT/Gemini
+ etc. Confirms platform, extracts agent persona-names the user
+ has assigned. One call, ~$0.01 cost.
+
+Design principle:
+ Don't make the classifier re-discover what Claude, ChatGPT, Gemini, MCP,
+ or other well-known entities ARE — the LLM already knows them from its
+ training. Only corpus-specific entities (e.g. the user's persona-name
+ for their Claude instance) need discovery.
+
+Default stance (when evidence is thin):
+ "This IS an AI-dialogue corpus" — false-negative is catastrophic for
+ downstream classification; false-positive is recoverable via per-drawer
+ voice-profile detection in later passes.
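+
+Typical wiring (illustrative sketch; provider acquisition is elided and
+the 0.8 cut-over is an assumption, not a shipped threshold):
+
+    result = detect_origin_heuristic(samples)
+    if result.confidence < 0.8 and provider is not None:
+        result = detect_origin_llm(samples, provider)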
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass, field, asdict
+from typing import Optional
+
+
+# ── Well-known AI brand terms (expand as new platforms emerge) ────────────
+# Detection is by PATTERN + CONTEXT, not by capitalization or English-language
+# rules. Two categories:
+#
+# UNAMBIGUOUS — terms that have essentially no meaning outside of AI context.
+# Always counted toward AI-dialogue evidence.
+#
+# AMBIGUOUS — terms that share a string with common English words, names,
+# poetry forms, zodiac signs, animals, etc. Counted toward AI-dialogue
+# evidence ONLY when at least one unambiguous AI signal also appears in
+# the corpus (turn marker, unambiguous brand term, or AI infrastructure
+# term). This avoids false-positives on French novels with characters
+# named "Claude", astrology corpora discussing "Gemini", poetry corpora
+# full of "haiku" / "sonnet", etc.
+#
+# All matching is CASE-INSENSITIVE — users type lowercase constantly.
+
+_AI_UNAMBIGUOUS_TERMS = [
+ # Anthropic-specific
+ "Anthropic",
+ "Claude Code",
+ "Claude 3",
+ "Claude 4",
+ "claude mcp",
+ "CLAUDE.md",
+ ".claude/",
+ # OpenAI-specific
+ "ChatGPT",
+ "GPT-4",
+ "GPT-3",
+ "GPT-5",
+ "OpenAI",
+ "gpt-4o",
+ "gpt-4-turbo",
+ "o1-preview",
+ "o3",
+ # Google-specific
+ "gemini-pro",
+ "gemini-1.5",
+ "Google AI",
+ # Meta / others (specific model identifiers, not bare common words)
+ "Mixtral",
+ "Cohere",
+ # AI-infrastructure terms with no common-English collision
+ "MCP",
+ "LLM",
+ "RAG",
+ "fine-tune",
+ "context window",
+ "embedding",
+]
+
+_AI_AMBIGUOUS_TERMS = [
+ # Anthropic — bare brand/model names that collide with names + poetry
+ "Claude", # also a common French masculine name
+ "Opus", # also a musical work, comic strip, magazine
+ "Sonnet", # also a 14-line poem form
+ "Haiku", # also a 17-syllable poem form
+ # Google — bare brand that collides with zodiac sign
+ "Gemini", # also the zodiac sign
+ "Bard", # also a poet / Shakespeare
+ # Meta / others
+ "Llama", # also the South American animal
+ "Mistral", # also a Mediterranean wind
+ # Note: 'prompt', 'completion', 'tokens' previously lived here but were
+ # removed: they're suppressed without an unambiguous co-signal anyway,
+ # and by the time a co-signal is present the corpus is already flagged.
+ # Keeping them just produced noisier evidence strings.
+]
+
+# Turn-marker patterns commonly seen in AI-dialogue transcripts
+_TURN_MARKERS = [
+ r"\buser\s*:\s*",
+ r"\bassistant\s*:\s*",
+ r"\bhuman\s*:\s*",
+ r"\bai\s*:\s*",
+ r"\b>>>\s*User\b",
+ r"\b>>>\s*Assistant\b",
+]
+
+
+def _brand_pattern(term: str) -> str:
+ """Build a regex for a brand term that uses word boundaries
+ only on edges where the term itself starts/ends with a word
+ character. Without this nuance:
+ - 'Claude' would falsely match inside 'Claudette' (no \\b)
+ - '.claude/' would fail to match at start of string (\\b
+ before non-word char requires preceding word char)
+    So we only attach \\b where it actually makes sense.
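+
+    Illustrative behaviour (hypothetical doctest):
+
+        >>> import re
+        >>> bool(re.search(_brand_pattern("Claude"), "Claudette", re.IGNORECASE))
+        False
+        >>> bool(re.search(_brand_pattern(".claude/"), ".claude/settings.json", re.IGNORECASE))
+        True
+    """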
+ escaped = re.escape(term)
+ prefix = r"\b" if term[0].isalnum() or term[0] == "_" else ""
+ suffix = r"\b" if term[-1].isalnum() or term[-1] == "_" else ""
+ return prefix + escaped + suffix
+
+
+@dataclass
+class CorpusOriginResult:
+ """Structured output from corpus-origin detection.
+
+ Fields:
+ likely_ai_dialogue — best hypothesis about whether this is AI-dialogue
+ confidence — 0.0 to 1.0
+ primary_platform — e.g. "Claude Code (Anthropic CLI)" or None
+ user_name — the corpus author's name if identifiable from context, else None
+ agent_persona_names — names the user has assigned to the AI agent(s)
+ (e.g. ["Echo", "Sparrow"]). Does NOT include the user's own name.
+ evidence — human-readable reasons for the classification
+ """
+
+ likely_ai_dialogue: bool
+ confidence: float
+ primary_platform: Optional[str]
+ user_name: Optional[str] = None
+ agent_persona_names: list[str] = field(default_factory=list)
+ evidence: list[str] = field(default_factory=list)
+
+ def to_dict(self) -> dict:
+ return asdict(self)
+
+
+# ── Tier 1: cheap heuristic ───────────────────────────────────────────────
+
+
+def detect_origin_heuristic(samples: list[str]) -> CorpusOriginResult:
+ """Fast grep-based detection. No API calls.
+
+ Scores AI-dialogue likelihood by counting:
+ - occurrences of well-known AI brand terms
+ - turn-marker patterns (user:, assistant:, etc.)
+
+ Returns a CorpusOriginResult with confidence derived from signal density.
+ """
+ combined = "\n\n".join(samples)
+ total_chars = max(1, len(combined))
+
+ # Count UNAMBIGUOUS brand-term hits (case-insensitive — users type
+ # lowercase constantly, so 'chatgpt' must trip the same as 'ChatGPT').
+ # Word boundaries prevent false in-word matches (see _brand_pattern).
+ unambiguous_hits: dict[str, int] = {}
+ total_unambiguous = 0
+ for term in _AI_UNAMBIGUOUS_TERMS:
+ matches = re.findall(_brand_pattern(term), combined, re.IGNORECASE)
+ if matches:
+ unambiguous_hits[term] = len(matches)
+ total_unambiguous += len(matches)
+
+ # Count AMBIGUOUS brand-term hits separately. These will only be
+ # counted toward AI-dialogue evidence if the corpus also contains
+ # at least one unambiguous AI signal — see co-occurrence rule below.
+ ambiguous_hits: dict[str, int] = {}
+ total_ambiguous = 0
+ for term in _AI_AMBIGUOUS_TERMS:
+ matches = re.findall(_brand_pattern(term), combined, re.IGNORECASE)
+ if matches:
+ ambiguous_hits[term] = len(matches)
+ total_ambiguous += len(matches)
+
+ # Count turn-marker hits (case-insensitive — transcripts vary).
+ turn_hits = 0
+ turn_types_found = set()
+ for pattern in _TURN_MARKERS:
+ matches = re.findall(pattern, combined, re.IGNORECASE)
+ if matches:
+ turn_hits += len(matches)
+ turn_types_found.add(pattern)
+
+ # Co-occurrence rule for ambiguous terms.
+ # Ambiguous terms (e.g. 'Claude' as a French name, 'Gemini' as a zodiac
+ # sign, 'Haiku' as a poem form) only count toward brand evidence if
+ # the corpus also contains at least one unambiguous AI signal. Otherwise
+ # we'd false-positive on French novels, astrology forums, poetry corpora,
+ # llama-rancher journals, etc.
+ has_ai_context = total_unambiguous > 0 or turn_hits > 0
+ counted_brand_hits = total_unambiguous + (total_ambiguous if has_ai_context else 0)
+
+ # Brand-term density per 1000 chars; turn-marker density likewise.
+    # Thresholds below were tuned on a small example set; treat them as
+    # provisional and revisit as more corpora are seen.
+ brand_density = counted_brand_hits / (total_chars / 1000)
+ turn_density = turn_hits / (total_chars / 1000)
+
+ # Build evidence list
+ evidence: list[str] = []
+ shown_hits = dict(unambiguous_hits)
+ if has_ai_context:
+ shown_hits.update(ambiguous_hits)
+ if shown_hits:
+ top_terms = sorted(shown_hits.items(), key=lambda x: -x[1])[:5]
+ evidence.append("AI brand terms: " + ", ".join(f"'{k}' ({v}x)" for k, v in top_terms))
+ elif ambiguous_hits and not has_ai_context:
+ # Be transparent that we saw ambiguous matches but suppressed them
+ # for lack of co-occurring AI context.
+ suppressed = sorted(ambiguous_hits.items(), key=lambda x: -x[1])[:3]
+ evidence.append(
+ "Ambiguous terms present but suppressed (no co-occurring AI signal): "
+ + ", ".join(f"'{k}' ({v}x)" for k, v in suppressed)
+ )
+ if turn_hits:
+ evidence.append(
+ f"Turn markers detected: {turn_hits} occurrences across {len(turn_types_found)} pattern types"
+ )
+
+ # Decision logic:
+ # strong signal (brand OR turn hits both >= threshold) → confident AI-dialogue
+ # MEANINGFUL absence (enough text, zero brand, zero turn) → confident narrative
+ # ambiguous or insufficient text → default stance: AI-dialogue with low confidence
+ #
+ # Threshold for "meaningful absence": the samples collectively have to
+ # be long enough that the absence of AI signals would be expected to
+ # surface if the corpus really is narrative. 150 chars is the working
+ # floor — below that, we cannot confidently say "this is narrative."
+ MEANINGFUL_TEXT_FLOOR = 150
+
+ if brand_density >= 0.5 or turn_density >= 2.0:
+ return CorpusOriginResult(
+ likely_ai_dialogue=True,
+ confidence=min(0.95, 0.6 + 0.1 * (brand_density + turn_density)),
+ primary_platform=None, # tier 2 will refine
+ evidence=evidence,
+ )
+ if counted_brand_hits == 0 and turn_hits == 0 and total_chars >= MEANINGFUL_TEXT_FLOOR:
+ # Note: ambiguous-only matches (e.g. a French novel with 'Claude' as
+ # a character name) flow through here because counted_brand_hits == 0
+ # when no unambiguous AI signal co-occurs. The 'evidence' list still
+ # records that the ambiguous matches were seen and suppressed.
+ narrative_evidence = list(evidence) + [
+ f"no unambiguous AI signal across {total_chars} chars of text — pure narrative"
+ ]
+ return CorpusOriginResult(
+ likely_ai_dialogue=False,
+ confidence=0.9,
+ primary_platform=None,
+ evidence=narrative_evidence,
+ )
+ # Ambiguous or too-short-to-tell case: default stance is AI-dialogue
+ # with explicit low confidence. Tier 2 (LLM) should be called to confirm.
+ reason = "weak signal" if (counted_brand_hits or turn_hits) else "insufficient text"
+ return CorpusOriginResult(
+ likely_ai_dialogue=True,
+ confidence=0.4,
+ primary_platform=None,
+ evidence=evidence
+ + [
+ f"{reason} — applying default-stance (ai_dialogue=True, low confidence). "
+ "Tier 2 LLM check recommended to confirm or override."
+ ],
+ )
+
+
+# ── Tier 2: LLM-assisted confirmation + persona extraction ────────────────
+
+
+_SYSTEM_PROMPT = """You are analyzing a corpus of text to determine whether it is a \
+record of conversations with an AI agent (e.g. Claude, ChatGPT, Gemini, custom LLM \
+apps), or some other kind of text (personal narrative, story, research notes, \
+journal, code, etc.).
+
+Use your pre-existing knowledge of well-known AI platforms. You don't need the \
+corpus to explain what Claude or ChatGPT is — you already know. Your job is to \
+detect evidence of their presence and identify what persona-names the user has \
+assigned to the agent(s) they converse with.
+
+CRITICAL distinction:
+ - agent_persona_names are names the USER has assigned to the AI AGENT(S)
+ they converse with. Example: "Echo", "Sparrow", "Henry" might be names
+ the user calls a Claude instance they're building a relationship with.
+ - Do NOT include the USER's own name in agent_persona_names. The user
+ is the human author of the corpus, not a persona of the agent. Even
+ if the user's name appears frequently in the text (writing about
+ themselves), that is NOT an agent persona.
+ - If you can identify the user's name from context, put it in user_name
+ (separate field). If unclear, leave user_name null.
+
+Respond with JSON only (no prose before or after):
+{
+ "is_ai_dialogue_corpus": ,
+ "confidence": <0.0 to 1.0>,
+ "primary_platform": <"Claude (Anthropic)" | "ChatGPT (OpenAI)" | "Gemini (Google)" | other platform name | null>,
+ "user_name": ,
+ "agent_persona_names": [],
+ "evidence": []
+}
+
+Default stance: if evidence is thin or mixed, return is_ai_dialogue_corpus=true \
+with low confidence. False-negatives on AI-dialogue detection break downstream \
+classification; false-positives are recoverable later.
+"""
+
+
+def _extract_json(text: str) -> Optional[dict]:
+ """Pull the first JSON object out of a possibly-messy LLM response."""
+ text = text.strip()
+ if not text:
+ return None
+ # Try straight parse first
+ try:
+ return json.loads(text)
+ except json.JSONDecodeError:
+ pass
+ # Try to find a {...} block
+ start = text.find("{")
+ if start < 0:
+ return None
+ depth = 0
+ in_string = False
+ escape = False
+ for i in range(start, len(text)):
+ ch = text[i]
+ if in_string:
+ if escape:
+ escape = False
+ elif ch == "\\":
+ escape = True
+ elif ch == '"':
+ in_string = False
+ continue
+ if ch == '"':
+ in_string = True
+ elif ch == "{":
+ depth += 1
+ elif ch == "}":
+ depth -= 1
+ if depth == 0:
+ candidate = text[start : i + 1]
+ try:
+ return json.loads(candidate)
+ except json.JSONDecodeError:
+ return None
+ return None
+
+
+def detect_origin_llm(samples: list[str], provider) -> CorpusOriginResult:
+ """LLM-assisted detection. Takes samples (list of drawer-text excerpts)
+ and an LLMProvider (mempalace.llm_client.LLMProvider). Returns the
+ same CorpusOriginResult shape as the heuristic.
+
+ Falls back conservatively (default-stance ai=True, low confidence)
+ on any LLM error or malformed response — never raises.
+ """
+ # Build the user prompt: concise excerpts, capped so we stay cheap
+ max_excerpt_chars = 800
+ excerpts = "\n\n---\n\n".join(
+ f"[sample {i + 1}]\n{s[:max_excerpt_chars]}" for i, s in enumerate(samples[:20])
+ )
+ user_prompt = f"CORPUS EXCERPTS:\n\n{excerpts}\n\nAnalyze and respond with JSON."
+
+ try:
+ resp = provider.classify(system=_SYSTEM_PROMPT, user=user_prompt, json_mode=True)
+ raw = getattr(resp, "text", "") or ""
+ except Exception as e:
+ return CorpusOriginResult(
+ likely_ai_dialogue=True,
+ confidence=0.3,
+ primary_platform=None,
+ evidence=[f"LLM provider error (fallback to default stance): {e}"],
+ )
+
+ parsed = _extract_json(raw)
+ if not parsed or not isinstance(parsed, dict):
+ return CorpusOriginResult(
+ likely_ai_dialogue=True,
+ confidence=0.3,
+ primary_platform=None,
+ evidence=["LLM response was not valid JSON (fallback to default stance)"],
+ )
+
+ # Pull fields defensively. If the LLM leaked the user_name into
+ # agent_persona_names despite the prompt telling it not to, filter it out.
+ user_name = parsed.get("user_name") or None
+    personas = [p for p in (parsed.get("agent_persona_names") or []) if isinstance(p, str)]
+    if user_name:
+        personas = [p for p in personas if p.lower() != str(user_name).lower()]
+ return CorpusOriginResult(
+ likely_ai_dialogue=bool(parsed.get("is_ai_dialogue_corpus", True)),
+ confidence=float(parsed.get("confidence", 0.5)),
+ primary_platform=parsed.get("primary_platform") or None,
+ user_name=user_name,
+ agent_persona_names=personas,
+ evidence=list(parsed.get("evidence") or []),
+ )
diff --git a/mempalace/embedding.py b/mempalace/embedding.py
new file mode 100644
index 000000000..a565bd92d
--- /dev/null
+++ b/mempalace/embedding.py
@@ -0,0 +1,155 @@
+"""Embedding function factory with hardware acceleration.
+
+Returns a ChromaDB-compatible embedding function bound to a user-selected
+ONNX Runtime execution provider. The same ``all-MiniLM-L6-v2`` model and
+384-dim vectors ChromaDB ships by default are reused, so switching device
+does not invalidate existing palaces.
+
+Supported devices (env ``MEMPALACE_EMBEDDING_DEVICE`` or ``embedding_device``
+in ``~/.mempalace/config.json``):
+
+* ``auto`` — prefer CUDA ▸ CoreML ▸ DirectML, fall back to CPU
+* ``cpu`` — force CPU (the historical default)
+* ``cuda`` — NVIDIA GPU via ``onnxruntime-gpu`` (``pip install mempalace[gpu]``)
+* ``coreml`` — Apple Neural Engine (macOS)
+* ``dml`` — DirectML (Windows / AMD / Intel GPUs)
+
+Requesting an unavailable accelerator emits a warning and falls back to CPU
+rather than hard-failing — mining must still work on a laptop without CUDA.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+_PROVIDER_MAP = {
+ "cpu": ["CPUExecutionProvider"],
+ "cuda": ["CUDAExecutionProvider", "CPUExecutionProvider"],
+ "coreml": ["CoreMLExecutionProvider", "CPUExecutionProvider"],
+ "dml": ["DmlExecutionProvider", "CPUExecutionProvider"],
+}
+
+_DEVICE_EXTRA = {
+ "cuda": "mempalace[gpu]",
+ "coreml": "mempalace[coreml]",
+ "dml": "mempalace[dml]",
+}
+
+_AUTO_ORDER = [
+ ("CUDAExecutionProvider", "cuda"),
+ ("CoreMLExecutionProvider", "coreml"),
+ ("DmlExecutionProvider", "dml"),
+]
+
+_EF_CACHE: dict = {}
+_WARNED: set = set()
+
+
+def _resolve_providers(device: str) -> tuple[list, str]:
+ """Return ``(provider_list, effective_device)`` for ``device``.
+
+ Falls back to CPU (with a one-shot warning) when the requested
+ accelerator is not compiled into the installed ``onnxruntime``.
+ """
+ device = (device or "auto").strip().lower()
+
+ try:
+ import onnxruntime as ort
+
+ available = set(ort.get_available_providers())
+ except ImportError:
+ return (["CPUExecutionProvider"], "cpu")
+
+ if device == "auto":
+ for provider, name in _AUTO_ORDER:
+ if provider in available:
+ return ([provider, "CPUExecutionProvider"], name)
+ return (["CPUExecutionProvider"], "cpu")
+
+ requested = _PROVIDER_MAP.get(device)
+ if requested is None:
+ if device not in _WARNED:
+ logger.warning("Unknown embedding_device %r — falling back to cpu", device)
+ _WARNED.add(device)
+ return (["CPUExecutionProvider"], "cpu")
+
+ preferred = requested[0]
+ if preferred == "CPUExecutionProvider":
+ return (requested, "cpu")
+
+ if preferred not in available:
+ if device not in _WARNED:
+ extra = _DEVICE_EXTRA.get(device, "the matching mempalace extra for your device")
+ logger.warning(
+ "embedding_device=%r requested but %s is not installed — "
+ "falling back to CPU. Install %s.",
+ device,
+ preferred,
+ extra,
+ )
+ _WARNED.add(device)
+ return (["CPUExecutionProvider"], "cpu")
+
+ return (requested, device)
+
+
+def _build_ef_class():
+ """Subclass ``ONNXMiniLM_L6_V2`` with name ``"default"``.
+
+ Why the rename: ChromaDB 1.5 persists the EF identity on the collection
+ and rejects reads that pass a differently-named EF (``onnx_mini_lm_l6_v2``
+ vs ``default``). The vectors and model are identical — only the
+ ``name()`` tag differs — so spoofing the name lets one EF class serve
+ palaces created with ``DefaultEmbeddingFunction`` *and* palaces we
+ create ourselves, with the same GPU-capable ``preferred_providers``.
+ """
+ from chromadb.utils.embedding_functions import ONNXMiniLM_L6_V2
+
+ class _MempalaceONNX(ONNXMiniLM_L6_V2):
+ @staticmethod
+ def name() -> str:
+ return "default"
+
+ return _MempalaceONNX
+
+
+def get_embedding_function(device: Optional[str] = None):
+ """Return a cached embedding function bound to the requested device.
+
+ ``device=None`` reads from :class:`MempalaceConfig.embedding_device`.
+ The returned function is shared across calls with the same resolved
+ provider list so we only pay model-load cost once per process.
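+
+    Illustrative use (hypothetical; the first call loads the ONNX model):
+
+        >>> ef = get_embedding_function("cpu")
+        >>> ef is get_embedding_function("cpu")  # cached per provider list
+        True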
+ """
+ if device is None:
+ from .config import MempalaceConfig
+
+ device = MempalaceConfig().embedding_device
+
+ providers, effective = _resolve_providers(device)
+ cache_key = tuple(providers)
+ cached = _EF_CACHE.get(cache_key)
+ if cached is not None:
+ return cached
+
+ ef_cls = _build_ef_class()
+ ef = ef_cls(preferred_providers=providers)
+ _EF_CACHE[cache_key] = ef
+ logger.info("Embedding function initialized (device=%s providers=%s)", effective, providers)
+ return ef
+
+
+def describe_device(device: Optional[str] = None) -> str:
+ """Return a short human-readable label for the resolved device.
+
+ Used by the miner CLI header so users can see at a glance whether GPU
+ acceleration actually engaged.
+ """
+ if device is None:
+ from .config import MempalaceConfig
+
+ device = MempalaceConfig().embedding_device
+ _, effective = _resolve_providers(device)
+ return effective
diff --git a/mempalace/entity_detector.py b/mempalace/entity_detector.py
index 2f2aae481..c70dd576b 100644
--- a/mempalace/entity_detector.py
+++ b/mempalace/entity_detector.py
@@ -2,6 +2,9 @@
"""
entity_detector.py — Auto-detect people and projects from file content.
+Uses ``from __future__ import annotations`` so PEP 604 union syntax
+(``dict | None``) works on the Python 3.9 baseline.
+
Two-pass approach:
Pass 1: scan files, extract entity candidates with signal counts
Pass 2: score and classify each candidate as person, project, or uncertain
@@ -27,6 +30,8 @@
confirmed = confirm_entities(candidates) # interactive review
"""
+from __future__ import annotations
+
import re
import os
import functools
@@ -396,7 +401,12 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
# ==================== MAIN DETECT ====================
-def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) -> dict:
+def detect_entities(
+ file_paths: list,
+ max_files: int = 10,
+ languages=("en",),
+ corpus_origin: dict | None = None,
+) -> dict:
"""
Scan files and detect entity candidates.
@@ -405,12 +415,23 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
max_files: Max files to read (for speed)
languages: Tuple of language codes whose entity patterns should be
applied (union). Defaults to ``("en",)``.
+ corpus_origin: Optional corpus-origin context (the dict produced
+ by ``mempalace.corpus_origin`` and persisted to
+ ``/.mempalace/origin.json`` by ``mempalace init``).
+ When supplied and the corpus is identified as AI-dialogue with
+ known agent persona names, candidates whose name matches an
+ agent persona are moved out of ``people``/``uncertain`` and
+ into a new ``agent_personas`` bucket. Shape:
+ ``{"schema_version": 1, "result": {"agent_persona_names": [...], ...}}``.
Returns:
{
"people": [...entity dicts...],
"projects": [...entity dicts...],
"uncertain":[...entity dicts...],
+ # Only present when corpus_origin reclassifies at least one
+ # candidate as an agent persona:
+ "agent_personas": [...entity dicts...],
}
"""
langs = _normalize_langs(languages)
@@ -440,7 +461,10 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
candidates = extract_candidates(combined_text, languages=langs)
if not candidates:
- return {"people": [], "projects": [], "uncertain": []}
+ return _apply_corpus_origin(
+ {"people": [], "projects": [], "topics": [], "uncertain": []},
+ corpus_origin,
+ )
# Score and classify each candidate
people = []
@@ -463,13 +487,76 @@ def detect_entities(file_paths: list, max_files: int = 10, languages=("en",)) ->
projects.sort(key=lambda x: x["confidence"], reverse=True)
uncertain.sort(key=lambda x: x["frequency"], reverse=True)
- # Cap results to most relevant
- return {
+ detected = {
"people": people[:15],
"projects": projects[:10],
+ "topics": [],
"uncertain": uncertain[:8],
}
+ return _apply_corpus_origin(detected, corpus_origin)
+
+
+def _apply_corpus_origin(detected: dict, corpus_origin: dict | None) -> dict:
+ """Reclassify per-candidate buckets using corpus-origin context.
+
+ When the corpus is identified as AI-dialogue with known agent persona
+ names, a candidate whose name case-insensitively matches one of those
+ personas is moved from ``people``/``uncertain`` into an
+ ``agent_personas`` bucket. The candidate's per-entity ``type`` is also
+ rewritten to ``"agent_persona"``.
+
+ No-op when ``corpus_origin`` is ``None`` or contains no usable persona
+ names. Pure: returns a new dict, does not mutate the input.
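+
+    Illustrative ``corpus_origin`` input (values are placeholders):
+
+        {"schema_version": 1,
+         "result": {"likely_ai_dialogue": True,
+                    "agent_persona_names": ["Echo"]}}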
+ """
+ if not corpus_origin:
+ return detected
+
+ origin_result = corpus_origin.get("result") or {}
+ raw_personas = origin_result.get("agent_persona_names") or []
+ persona_lower = {n.lower() for n in raw_personas if isinstance(n, str)}
+ if not persona_lower:
+ return detected
+
+ agent_personas: list = []
+ new_people: list = []
+ new_uncertain: list = []
+
+ for entity in detected.get("people", []):
+ if entity["name"].lower() in persona_lower:
+ agent_personas.append(_tag_as_persona(entity))
+ else:
+ new_people.append(entity)
+
+ for entity in detected.get("uncertain", []):
+ if entity["name"].lower() in persona_lower:
+ agent_personas.append(_tag_as_persona(entity))
+ else:
+ new_uncertain.append(entity)
+
+ if not agent_personas:
+ return detected
+
+ agent_personas.sort(key=lambda x: x.get("confidence", 0), reverse=True)
+
+ return {
+ **detected,
+ "people": new_people,
+ "uncertain": new_uncertain,
+ "agent_personas": agent_personas,
+ }
+
+
+def _tag_as_persona(entity: dict) -> dict:
+ """Return a new entity dict tagged as agent_persona with provenance signal."""
+ existing_signals = entity.get("signals", [])
+ return {
+ **entity,
+ "type": "agent_persona",
+ "confidence": max(0.95, entity.get("confidence", 0.0)),
+ "signals": ["matched corpus_origin agent_persona_names"] + existing_signals[:2],
+ }
+
# ==================== INTERACTIVE CONFIRM ====================
@@ -489,7 +576,13 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
"""
Interactive confirmation step.
User reviews detected entities, removes wrong ones, adds missing ones.
- Returns confirmed {people: [names], projects: [names]}
+ Returns confirmed {people: [names], projects: [names], topics: [names]}.
+
+ Topics are not surfaced for interactive review — they come from the
+ LLM-refined ``TOPIC`` bucket and are passed through verbatim. They
+ feed cross-wing tunnel computation at mine time (see
+ ``palace_graph.compute_topic_tunnels``); a wrong topic at worst adds
+ a low-traffic tunnel and never alters drawer storage.
Pass yes=True to auto-accept all detected entities without prompting.
"""
@@ -501,18 +594,28 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
_print_entity_list(detected["people"], "PEOPLE")
_print_entity_list(detected["projects"], "PROJECTS")
+ if detected.get("topics"):
+ _print_entity_list(detected["topics"], "TOPICS (cross-wing tunnel signal)")
+
if detected["uncertain"]:
_print_entity_list(detected["uncertain"], "UNCERTAIN (need your call)")
confirmed_people = [e["name"] for e in detected["people"]]
confirmed_projects = [e["name"] for e in detected["projects"]]
+ confirmed_topics = [e["name"] for e in detected.get("topics", [])]
if yes:
# Auto-accept: include all detected (skip uncertain — ambiguous without user input)
print(
- f"\n Auto-accepting {len(confirmed_people)} people, {len(confirmed_projects)} projects."
+ f"\n Auto-accepting {len(confirmed_people)} people, "
+ f"{len(confirmed_projects)} projects, "
+ f"{len(confirmed_topics)} topics."
)
- return {"people": confirmed_people, "projects": confirmed_projects}
+ return {
+ "people": confirmed_people,
+ "projects": confirmed_projects,
+ "topics": confirmed_topics,
+ }
print(f"\n{'─' * 58}")
print(" Options:")
@@ -570,11 +673,14 @@ def confirm_entities(detected: dict, yes: bool = False) -> dict:
print(" Confirmed:")
print(f" People: {', '.join(confirmed_people) or '(none)'}")
print(f" Projects: {', '.join(confirmed_projects) or '(none)'}")
+ if confirmed_topics:
+ print(f" Topics: {', '.join(confirmed_topics)}")
print(f"{'=' * 58}\n")
return {
"people": confirmed_people,
"projects": confirmed_projects,
+ "topics": confirmed_topics,
}
diff --git a/mempalace/llm_client.py b/mempalace/llm_client.py
index 74982cea1..9d6514239 100644
--- a/mempalace/llm_client.py
+++ b/mempalace/llm_client.py
@@ -28,9 +28,81 @@
from dataclasses import dataclass
from typing import Optional
from urllib.error import HTTPError, URLError
+from urllib.parse import urlparse
from urllib.request import Request, urlopen
+# ── External-service heuristic (issue #24 — privacy warning support) ─────
+# Used by ``LLMProvider.is_external_service`` to decide whether the
+# provider's configured endpoint will send user content off the local
+# machine/network. Single source of truth so all three providers share
+# identical "local vs external" semantics.
+
+_LOCALHOST_HOSTS = frozenset({"localhost", "127.0.0.1", "::1"})
+
+
+def _endpoint_is_local(url: Optional[str]) -> bool:
+ """Return True if ``url``'s hostname is on the user's machine or
+ private network.
+
+ Local includes:
+ - localhost, 127.0.0.1, ::1
+ - hostnames ending in .local (mDNS/Bonjour)
+ - IPv4 RFC1918: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16
+ - IPv4 CGNAT (Tailscale and similar VPN/tunnel networks):
+ 100.64.0.0/10 — first octet 100, second octet 64-127 inclusive
+ - IPv6 unique-local addresses (fc00::/7) — fc.../fd... prefixes
+
+ None / empty / unparseable URLs are treated as local (defensive default —
+ no endpoint means no external request can happen yet).
+
+ Anything else (including public IPs and FQDNs) is external.
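+
+    Illustrative checks (hypothetical doctest):
+
+        >>> _endpoint_is_local("http://localhost:11434")
+        True
+        >>> _endpoint_is_local("http://100.101.1.2:8080")  # Tailscale CGNAT
+        True
+        >>> _endpoint_is_local("https://api.anthropic.com")
+        False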
+ """
+ if not url:
+ return True
+ try:
+ host = (urlparse(url).hostname or "").lower()
+    except (ValueError, AttributeError):
+        # Unparseable URL: defensive default, matching the docstring. A
+        # URL we can't parse can't be requested either, so treat it as local.
+        return True
+ if not host:
+ return True
+ if host in _LOCALHOST_HOSTS:
+ return True
+ if host.endswith(".local"):
+ return True
+ if host.startswith("10."):
+ return True
+ if host.startswith("192.168."):
+ return True
+ if host.startswith("172."):
+ # 172.16.0.0 - 172.31.255.255
+ parts = host.split(".")
+ if len(parts) >= 2:
+ try:
+ if 16 <= int(parts[1]) <= 31:
+ return True
+ except ValueError:
+ pass
+ if host.startswith("100."):
+ # 100.64.0.0/10 — Tailscale CGNAT range. First octet 100, second
+ # octet 64-127 inclusive. Users running a local LLM (LM Studio,
+ # Ollama, etc.) accessible via Tailscale on a 100.x.x.x address
+ # should not trigger the external-API privacy warning.
+ # 100.x.x.x outside this range is regular allocated public space
+ # and remains external.
+ parts = host.split(".")
+ if len(parts) >= 2:
+ try:
+ if 64 <= int(parts[1]) <= 127:
+ return True
+ except ValueError:
+ pass
+    # IPv6 unique-local addresses fc00::/7. Require a colon so public
+    # hostnames that merely start with "fc"/"fd" don't match; urlparse
+    # hostnames only contain colons for IPv6 literals.
+    if ":" in host and (host.startswith("fc") or host.startswith("fd")):
+        return True
+ return False
+
+
class LLMError(RuntimeError):
"""Raised for any provider failure — transport, parse, auth, missing model."""
@@ -68,6 +140,20 @@ def check_available(self) -> tuple[bool, str]:
"""Return ``(ok, message)``. Fast probe that the provider is reachable."""
raise NotImplementedError
+ @property
+ def is_external_service(self) -> bool:
+ """Return True if this provider's endpoint will send user content
+ off the local machine/network.
+
+ Used by ``mempalace init`` to decide whether to print a privacy
+        warning before first use (issue #24). URL-based heuristic only —
+        the endpoint alone determines locality, regardless of provider
+        class.
+ Subclasses that resolve their endpoint dynamically should override
+ if needed; the default works for the three in-tree providers
+ (Ollama / OpenAI-compat / Anthropic).
+ """
+ return not _endpoint_is_local(self.endpoint)
+
def _http_post_json(url: str, body: dict, headers: dict, timeout: int) -> dict:
"""POST JSON and return the parsed response. Raises LLMError on any failure."""
diff --git a/mempalace/llm_refine.py b/mempalace/llm_refine.py
index faa737ae4..e3afe6b8e 100644
--- a/mempalace/llm_refine.py
+++ b/mempalace/llm_refine.py
@@ -197,13 +197,23 @@ def _apply_classifications(
"""Merge LLM decisions back into the detected dict.
Returns (new_detected, reclassified_count, dropped_count).
+
+ Topics get their own bucket so the caller can persist them as
+ cross-wing tunnel signal. ``AMBIGUOUS`` still falls back to
+ ``uncertain`` for human review.
"""
label_to_bucket = {
"PERSON": "people",
"PROJECT": "projects",
- "TOPIC": "uncertain",
+ "TOPIC": "topics",
"AMBIGUOUS": "uncertain",
}
+ bucket_to_type = {
+ "people": "person",
+ "projects": "project",
+ "topics": "topic",
+ "uncertain": "uncertain",
+ }
# Index every entity by name for in-place update
all_entries: list[tuple[str, dict]] = []
@@ -216,6 +226,7 @@ def _apply_classifications(
new_detected: dict[str, list[dict]] = {
"people": [],
"projects": [],
+ "topics": [],
"uncertain": [],
}
@@ -223,7 +234,7 @@ def _apply_classifications(
decision = decisions.get(entry["name"])
if decision is None:
# No LLM opinion — keep as-is
- new_detected[old_bucket].append(entry)
+ new_detected.setdefault(old_bucket, []).append(entry)
continue
label, reason = decision
@@ -245,18 +256,58 @@ def _apply_classifications(
updated["signals"] = signals
if target_bucket != old_bucket:
reclassified += 1
- updated["type"] = (
- "person"
- if target_bucket == "people"
- else "project"
- if target_bucket == "projects"
- else "uncertain"
- )
+ updated["type"] = bucket_to_type.get(target_bucket, "uncertain")
new_detected[target_bucket].append(updated)
return new_detected, reclassified, dropped
+def _build_corpus_origin_preamble(corpus_origin: dict | None) -> str:
+ """Build a system-prompt preamble carrying corpus-origin context.
+
+ When the corpus has been identified as AI-dialogue with known persona
+ names, this preamble lets the LLM disambiguate ambiguous candidates
+ with knowledge that this is AI-dialogue. It does NOT add a new label
+ or change the classification schema — the post-refine sweep in
+ project_scanner.discover_entities still moves persona names into
+ ``agent_personas``. The preamble is purely classification context for
+ the OTHER candidates (ambiguous, common-word) that benefit from
+ knowing the corpus shape.
+
+ Returns ``""`` when no usable origin context is available, so callers
+ can concatenate unconditionally without changing the v3.3.3 prompt
+ shape for opt-out paths.
+ """
+ if not corpus_origin:
+ return ""
+ result = corpus_origin.get("result") or {}
+ if not result.get("likely_ai_dialogue"):
+ return ""
+
+ lines = ["\n\nCORPUS CONTEXT (corpus-origin detection):"]
+ platform = result.get("primary_platform")
+ if platform:
+ lines.append(f"- This corpus is AI-dialogue from {platform}.")
+ user_name = result.get("user_name")
+ if user_name:
+ lines.append(
+ f"- The corpus author (the human user) is named '{user_name}'. "
+ f"Treat this name as PERSON."
+ )
+ personas = result.get("agent_persona_names") or []
+ if personas:
+ lines.append(
+ "- The user has assigned these persona names to AI agents in "
+ f"this corpus: {', '.join(personas)}."
+ )
+ lines.append(
+ "- Persona names refer to AI agents, not biological people. "
+ "Classify them as PERSON (a downstream step tags them as "
+ "agent personas)."
+ )
+ return "\n".join(lines)
+
+
def _is_authoritative_person(entry: dict) -> bool:
"""Return True for git-author people that should not be second-guessed."""
signals = " ".join(entry.get("signals", [])).lower()
@@ -287,6 +338,7 @@ def refine_entities(
batch_size: int = BATCH_SIZE,
show_progress: bool = True,
allow_project_promotions: bool = True,
+ corpus_origin: dict | None = None,
) -> RefineResult:
"""Reclassify detected entities using the LLM provider.
@@ -349,12 +401,14 @@ def refine_entities(
completed = 0
cancelled = False
+ system_prompt = SYSTEM_PROMPT + _build_corpus_origin_preamble(corpus_origin)
+
for idx, batch in enumerate(batches, 1):
if show_progress and batch:
_print_progress(idx - 1, len(batches), batch[0][0])
user_prompt = _build_user_prompt(batch)
try:
- resp = provider.classify(SYSTEM_PROMPT, user_prompt, json_mode=True)
+ resp = provider.classify(system_prompt, user_prompt, json_mode=True)
except KeyboardInterrupt:
cancelled = True
break
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 2650e3073..9c87708db 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -57,7 +57,7 @@
sanitize_content,
)
from .version import __version__ # noqa: E402
-from .backends.chroma import ChromaBackend, ChromaCollection # noqa: E402
+from .backends.chroma import ChromaBackend, ChromaCollection, _pin_hnsw_threads # noqa: E402
from .query_sanitizer import sanitize_query # noqa: E402
from .searcher import search_memories # noqa: E402
from .palace_graph import ( # noqa: E402
@@ -217,15 +217,25 @@ def _get_collection(create=False):
try:
client = _get_client()
if create:
- _collection_cache = ChromaCollection(
- client.get_or_create_collection(
- _config.collection_name, metadata={"hnsw:space": "cosine"}
- )
+ # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
+ # HNSW insert path, which has a race in repairConnectionsForUpdate /
+ # addPoint (see issues #974, #965). Set via metadata on fresh
+ # collections and re-applied via _pin_hnsw_threads() for legacy
+ # palaces whose collections were created before this fix (the
+ # runtime config does not persist cross-process in chromadb 1.5.x,
+ # so the retrofit runs every time _get_collection opens a cache).
+ raw = client.get_or_create_collection(
+ _config.collection_name,
+ metadata={"hnsw:space": "cosine", "hnsw:num_threads": 1},
)
+ _pin_hnsw_threads(raw)
+ _collection_cache = ChromaCollection(raw)
_metadata_cache = None
_metadata_cache_time = 0
elif _collection_cache is None:
- _collection_cache = ChromaCollection(client.get_collection(_config.collection_name))
+ raw = client.get_collection(_config.collection_name)
+ _pin_hnsw_threads(raw)
+ _collection_cache = ChromaCollection(raw)
_metadata_cache = None
_metadata_cache_time = 0
return _collection_cache
@@ -929,6 +939,7 @@ def tool_diary_write(agent_name: str, entry: str, topic: str = "general", wing:
try:
agent_name = sanitize_name(agent_name, "agent_name")
entry = sanitize_content(entry)
+ topic = sanitize_name(topic, "topic")
except ValueError as e:
return {"success": False, "error": str(e)}
diff --git a/mempalace/migrate.py b/mempalace/migrate.py
index e6e393dc6..76aa054fa 100644
--- a/mempalace/migrate.py
+++ b/mempalace/migrate.py
@@ -18,6 +18,7 @@
mempalace migrate --dry-run # show what would be migrated
"""
+import errno
import os
import shutil
import sqlite3
@@ -25,6 +26,26 @@
from datetime import datetime
+def _restore_stale_palace(palace_path: str, stale_path: str) -> None:
+ """Roll back a failed swap.
+
+ shutil.move() can partially create palace_path before raising, which
+ would make a bare os.replace(stale_path, palace_path) fail (dest exists).
+ Clear any partial destination first, then restore. Best-effort: if the
+ restore itself fails, log both paths so the operator can recover by hand.
+ """
+ try:
+ if os.path.lexists(palace_path):
+ shutil.rmtree(palace_path, ignore_errors=True)
+ os.replace(stale_path, palace_path)
+ except Exception as err:
+ print(
+ f" CRITICAL: rollback failed — original palace at {stale_path}, "
+ f"partial migration data at {palace_path}. Restore manually. "
+ f"({err})"
+ )
+
+
def extract_drawers_from_sqlite(db_path: str) -> list:
"""Read all drawers directly from ChromaDB's SQLite, bypassing the API.
@@ -231,10 +252,27 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
del col
del fresh_backend
- # Swap: remove old palace, move new one into place
+ # Swap: rename old palace aside, then move new one into place.
+ # This avoids a window where both old and new are missing.
print(" Swapping old palace for migrated version...")
- shutil.rmtree(palace_path)
- shutil.move(temp_palace, palace_path)
+ stale_path = palace_path + ".old"
+ if os.path.exists(stale_path):
+ shutil.rmtree(stale_path)
+ os.replace(palace_path, stale_path)
+ try:
+ os.replace(temp_palace, palace_path)
+ except OSError as e:
+ # EXDEV = temp lives on a different filesystem; fall back to copy+delete.
+ # Anything else is a real error — don't mask it with shutil.move.
+ if getattr(e, "errno", None) != errno.EXDEV:
+ _restore_stale_palace(palace_path, stale_path)
+ raise
+ try:
+ shutil.move(temp_palace, palace_path)
+ except Exception:
+ _restore_stale_palace(palace_path, stale_path)
+ raise
+ shutil.rmtree(stale_path, ignore_errors=True)
print("\n Migration complete.")
print(f" Drawers migrated: {final_count}")
diff --git a/mempalace/miner.py b/mempalace/miner.py
index 9e8ff5eb7..2d610eafd 100644
--- a/mempalace/miner.py
+++ b/mempalace/miner.py
@@ -9,20 +9,24 @@
import os
import sys
+import shlex
import hashlib
import fnmatch
from pathlib import Path
from datetime import datetime
from collections import defaultdict
+from typing import Optional
from .palace import (
NORMALIZE_VERSION,
SKIP_DIRS,
+ MineAlreadyRunning,
build_closet_lines,
file_already_mined,
get_closets_collection,
get_collection,
mine_lock,
+ mine_palace_lock,
purge_file_closets,
upsert_closet_lines,
)
@@ -64,6 +68,7 @@
CHUNK_SIZE = 800 # chars per drawer
CHUNK_OVERLAP = 100 # overlap between chunks
MIN_CHUNK_SIZE = 50 # skip tiny chunks
+DRAWER_UPSERT_BATCH_SIZE = 1000
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
# Long Claude Code sessions and large transcript exports routinely exceed
# 10 MB. The cap exists as a defensive rail against pathological binary
@@ -437,7 +442,16 @@ def _refresh_known_entities_cache() -> None:
data = json.load(f)
if isinstance(data, dict):
raw = data
- for cat in data.values():
+ for cat_key, cat in data.items():
+ # Special wing-keyed map — its inner values are topic
+ # names but its outer keys are wings, which must NOT be
+ # surfaced as known entities. Pull the topic names out
+ # explicitly instead of treating it as a generic category.
+ if cat_key == "topics_by_wing" and isinstance(cat, dict):
+ for topic_list in cat.values():
+ if isinstance(topic_list, list):
+ names.update(str(n) for n in topic_list if n)
+ continue
if isinstance(cat, list):
names.update(str(n) for n in cat if n)
elif isinstance(cat, dict):
@@ -472,7 +486,39 @@ def _load_known_entities_raw() -> dict:
return dict(_ENTITY_REGISTRY_CACHE["raw"])
-def add_to_known_entities(entities_by_category: dict) -> str:
+def _set_wing_topics(existing: dict, wing_key: str, topics_for_wing: list, coerce) -> None:
+ """Update ``existing['topics_by_wing'][wing_key]`` to the deduped list.
+
+ Replaces (does not union) the wing's topic list — re-running ``init``
+ should reflect the user's latest confirmation rather than accumulate
+ stale labels. Empty input drops the wing entry; an empty map drops
+ the ``topics_by_wing`` key entirely.
+ """
+ topics_map = existing.get("topics_by_wing")
+ if not isinstance(topics_map, dict):
+ topics_map = {}
+ seen_lower: set = set()
+ ordered: list = []
+ for n in topics_for_wing:
+ name = coerce(n)
+ if not name:
+ continue
+ key = name.lower()
+ if key in seen_lower:
+ continue
+ seen_lower.add(key)
+ ordered.append(name)
+ if ordered:
+ topics_map[wing_key] = ordered
+ else:
+ topics_map.pop(wing_key, None)
+ if topics_map:
+ existing["topics_by_wing"] = topics_map
+ else:
+ existing.pop("topics_by_wing", None)
+
+
+def add_to_known_entities(entities_by_category: dict, wing: Optional[str] = None) -> str:
"""Union ``entities_by_category`` into ``~/.mempalace/known_entities.json``.
Accepts ``{category: [names]}`` shape as produced by ``mempalace init``
@@ -486,6 +532,15 @@ def add_to_known_entities(entities_by_category: dict) -> str:
added as keys with ``None`` values so existing code mappings aren't
overwritten. A later compress pass can assign codes.
+ When ``wing`` is provided AND ``entities_by_category`` contains a
+ ``topics`` list, those topics are also recorded under
+ ``topics_by_wing[wing]`` (case-insensitive dedup, preserving the
+ casing of the first observed name). This is the signal source for
+ ``palace_graph.compute_topic_tunnels`` at mine time. Topics for a
+ wing are *replaced*, not unioned, so a re-run of ``init`` reflects
+ the user's latest confirmation rather than accumulating stale labels
+ indefinitely.
+
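+    Illustrative call (names are placeholders):
+
+        add_to_known_entities(
+            {"people": ["Ada"], "topics": ["ChromaDB"]}, wing="mempalace"
+        )
+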
The in-process cache is invalidated on write so same-process callers
(notably ``cmd_init`` → ``cmd_mine`` in sequence) see the update
immediately instead of waiting for a mtime re-check.
@@ -513,7 +568,16 @@ def _coerce_name(value):
name = str(value)
return name if name else None
+ # Separate the topics_by_wing key from regular categories so we don't
+ # treat it as a flat name-list elsewhere in this function.
+ topics_for_wing = None
+ if wing and isinstance(wing, str) and wing.strip():
+ topics_for_wing = entities_by_category.get("topics") or []
+
for category, names in entities_by_category.items():
+ if category == "topics_by_wing":
+ # Reserved key — managed separately below.
+ continue
if not isinstance(names, list) or not names:
continue
current = existing.get(category)
@@ -549,6 +613,9 @@ def _coerce_name(value):
ordered.append(name)
existing[category] = ordered
+ if topics_for_wing is not None:
+ _set_wing_topics(existing, wing.strip(), topics_for_wing, _coerce_name)
+
registry_path.write_text(_json.dumps(existing, indent=2, ensure_ascii=False), encoding="utf-8")
try:
registry_path.chmod(0o600)
@@ -563,6 +630,28 @@ def _coerce_name(value):
return str(registry_path)
+def get_topics_by_wing() -> dict:
+ """Return ``topics_by_wing`` from the global registry as a dict.
+
+ Returns ``{}`` if the registry is missing, malformed, or has no
+ ``topics_by_wing`` key. Casing is preserved from disk; callers that
+ need case-insensitive comparison should normalize themselves.
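+
+    Illustrative return value (shape only):
+
+        {"mempalace": ["ChromaDB", "Python"], "notes": ["Python"]}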
+ """
+ raw = _load_known_entities_raw()
+ topics_map = raw.get("topics_by_wing")
+ if not isinstance(topics_map, dict):
+ return {}
+ out: dict = {}
+ for wing, topics in topics_map.items():
+ if not isinstance(wing, str) or not wing.strip():
+ continue
+ if isinstance(topics, list):
+ cleaned = [str(t) for t in topics if isinstance(t, str) and t.strip()]
+ if cleaned:
+ out[wing.strip()] = cleaned
+ return out
+
+
_HALL_KEYWORDS_CACHE = None
@@ -633,40 +722,62 @@ def _extract_entities_for_metadata(content: str) -> str:
return ";".join(capped)
+def _build_drawer_metadata(
+ wing: str,
+ room: str,
+ source_file: str,
+ chunk_index: int,
+ agent: str,
+ content: str,
+ source_mtime: Optional[float],
+) -> dict:
+ """Build the metadata dict for one drawer without upserting.
+
+ Split out from ``add_drawer`` so ``process_file`` can batch all chunks
+ of a file into a single ``collection.upsert`` — one embedding forward
+ pass per batch instead of per chunk.
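+
+ Keys always present in the result: ``wing``, ``room``, ``source_file``,
+ ``chunk_index``, ``added_by``, ``filed_at``, ``normalize_version``,
+ ``hall``. Conditional: ``source_mtime`` (only when the caller could
+ stat the file) and ``entities`` (only when extraction found any).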
+ """
+ metadata = {
+ "wing": wing,
+ "room": room,
+ "source_file": source_file,
+ "chunk_index": chunk_index,
+ "added_by": agent,
+ "filed_at": datetime.now().isoformat(),
+ "normalize_version": NORMALIZE_VERSION,
+ }
+ if source_mtime is not None:
+ metadata["source_mtime"] = source_mtime
+ metadata["hall"] = detect_hall(content)
+ entities = _extract_entities_for_metadata(content)
+ if entities:
+ metadata["entities"] = entities
+ return metadata
+
+
def add_drawer(
collection, wing: str, room: str, content: str, source_file: str, chunk_index: int, agent: str
):
- """Add one drawer to the palace."""
+ """Add one drawer to the palace.
+
+ Kept for backward compatibility with external callers. In-tree, the
+ miner uses ``_build_drawer_metadata`` + a batched ``collection.upsert``
+ to amortize the embedding model's forward-pass cost across chunks.
+ """
drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk_index)).encode()).hexdigest()[:24]}"
try:
- metadata = {
- "wing": wing,
- "room": room,
- "source_file": source_file,
- "chunk_index": chunk_index,
- "added_by": agent,
- "filed_at": datetime.now().isoformat(),
- "normalize_version": NORMALIZE_VERSION,
- }
- # Store file mtime so we can detect modifications later.
- try:
- metadata["source_mtime"] = os.path.getmtime(source_file)
- except OSError:
- pass
- # Tag with hall for graph connectivity within wings
- metadata["hall"] = detect_hall(content)
- # Tag with entity names for filterable search
- entities = _extract_entities_for_metadata(content)
- if entities:
- metadata["entities"] = entities
- collection.upsert(
- documents=[content],
- ids=[drawer_id],
- metadatas=[metadata],
- )
- return True
- except Exception:
- raise
+ source_mtime = os.path.getmtime(source_file)
+ except OSError:
+ source_mtime = None
+ metadata = _build_drawer_metadata(
+ wing, room, source_file, chunk_index, agent, content, source_mtime
+ )
+ collection.upsert(
+ documents=[content],
+ ids=[drawer_id],
+ metadatas=[metadata],
+ )
+ return True
# =============================================================================
@@ -725,19 +836,41 @@ def process_file(
except Exception:
pass
+ # Batch chunks into bounded upserts so the embedding model sees many
+ # chunks per forward pass without building one huge Chroma/SQLite
+ # request for pathological files. A bad chunk can fail its sub-batch;
+ # that is the deliberate trade-off for amortizing embedding overhead.
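+ # (Illustrative arithmetic, assuming DRAWER_UPSERT_BATCH_SIZE = 64;
+ # the constant is defined elsewhere in this module. A 1 000-chunk
+ # file then becomes 16 upsert calls instead of 1 000.)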
+ try:
+ source_mtime = os.path.getmtime(source_file)
+ except OSError:
+ source_mtime = None
+
drawers_added = 0
- for chunk in chunks:
- added = add_drawer(
- collection=collection,
- wing=wing,
- room=room,
- content=chunk["content"],
- source_file=source_file,
- chunk_index=chunk["chunk_index"],
- agent=agent,
+ for batch_start in range(0, len(chunks), DRAWER_UPSERT_BATCH_SIZE):
+ batch_docs: list = []
+ batch_ids: list = []
+ batch_metas: list = []
+ for chunk in chunks[batch_start : batch_start + DRAWER_UPSERT_BATCH_SIZE]:
+ drawer_id = f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
+ batch_docs.append(chunk["content"])
+ batch_ids.append(drawer_id)
+ batch_metas.append(
+ _build_drawer_metadata(
+ wing,
+ room,
+ source_file,
+ chunk["chunk_index"],
+ agent,
+ chunk["content"],
+ source_mtime,
+ )
+ )
+ collection.upsert(
+ documents=batch_docs,
+ ids=batch_ids,
+ metadatas=batch_metas,
)
- if added:
- drawers_added += 1
+ drawers_added += len(batch_docs)
# Build closet — the searchable index pointing to these drawers.
# Purge first: a re-mine (mtime change or normalize_version bump) must
@@ -851,23 +984,80 @@ def mine(
dry_run: bool = False,
respect_gitignore: bool = True,
include_ignored: list = None,
+ files: list = None,
):
- """Mine a project directory into the palace."""
+ """Mine a project directory into the palace.
+
+ ``files`` may optionally be a pre-scanned list of file paths from
+ :func:`scan_project`. When provided, the corpus walk is skipped — the
+ caller (e.g. ``init`` showing a file-count estimate before the mine
+ prompt) avoids walking the tree twice. When ``None`` (the default),
+ ``mine`` walks the tree itself just like before.
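+
+ Example (illustrative caller, mirroring what ``init`` does):
+
+ files = scan_project(project_dir)
+ print(f"~{len(files)} files would be mined.")
+ mine(project_dir, palace_path, files=files)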
+ """
+
+ if dry_run:
+ return _mine_impl(
+ project_dir,
+ palace_path,
+ wing_override=wing_override,
+ agent=agent,
+ limit=limit,
+ dry_run=dry_run,
+ respect_gitignore=respect_gitignore,
+ include_ignored=include_ignored,
+ files=files,
+ )
+ try:
+ with mine_palace_lock(palace_path):
+ return _mine_impl(
+ project_dir,
+ palace_path,
+ wing_override=wing_override,
+ agent=agent,
+ limit=limit,
+ dry_run=dry_run,
+ respect_gitignore=respect_gitignore,
+ include_ignored=include_ignored,
+ files=files,
+ )
+ except MineAlreadyRunning:
+ print(
+ f"mempalace: another `mine` is already running against "
+ f"{palace_path} — exiting cleanly.",
+ file=sys.stderr,
+ )
+ return
+
+
+def _mine_impl(
+ project_dir: str,
+ palace_path: str,
+ wing_override: str = None,
+ agent: str = "mempalace",
+ limit: int = 0,
+ dry_run: bool = False,
+ respect_gitignore: bool = True,
+ include_ignored: list = None,
+ files: list = None,
+):
project_path = Path(project_dir).expanduser().resolve()
config = load_config(project_dir)
wing = wing_override or config["wing"]
rooms = config.get("rooms", [{"name": "general", "description": "All project files"}])
- files = scan_project(
- project_dir,
- respect_gitignore=respect_gitignore,
- include_ignored=include_ignored,
- )
+ if files is None:
+ files = scan_project(
+ project_dir,
+ respect_gitignore=respect_gitignore,
+ include_ignored=include_ignored,
+ )
if limit > 0:
files = files[:limit]
+ from .embedding import describe_device
+
print(f"\n{'=' * 55}")
print(" MemPalace Mine")
print(f"{'=' * 55}")
@@ -875,6 +1065,7 @@ def mine(
print(f" Rooms: {', '.join(r['name'] for r in rooms)}")
print(f" Files: {len(files)}")
print(f" Palace: {palace_path}")
+ print(f" Device: {describe_device()}")
if dry_run:
print(" DRY RUN — nothing will be filed")
if not respect_gitignore:
@@ -892,37 +1083,136 @@ def mine(
total_drawers = 0
files_skipped = 0
+ files_processed = 0
+ last_file = None
room_counts = defaultdict(int)
- for i, filepath in enumerate(files, 1):
- drawers, room = process_file(
- filepath=filepath,
- project_path=project_path,
- collection=collection,
- wing=wing,
- rooms=rooms,
- agent=agent,
- dry_run=dry_run,
- closets_col=closets_col,
+ try:
+ for i, filepath in enumerate(files, 1):
+ try:
+ drawers, room = process_file(
+ filepath=filepath,
+ project_path=project_path,
+ collection=collection,
+ wing=wing,
+ rooms=rooms,
+ agent=agent,
+ dry_run=dry_run,
+ closets_col=closets_col,
+ )
+ except KeyboardInterrupt:
+ # Re-raise so the outer handler prints the summary; we
+ # capture the last-attempted file via last_file below.
+ last_file = filepath.name
+ raise
+ files_processed = i
+ last_file = filepath.name
+ if drawers == 0 and not dry_run:
+ files_skipped += 1
+ else:
+ total_drawers += drawers
+ room_counts[room] += 1
+ if not dry_run:
+ print(f" + [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}")
+
+ if not dry_run:
+ # Cross-wing topic tunnels: after every file in this wing has been
+ # processed, link this wing to any other wing that shares a
+ # confirmed TOPIC label. Out of scope for v1: manifest-dependency
+ # overlap, per-topic allow/deny lists, search-result surfacing.
+ try:
+ tunnels_added = _compute_topic_tunnels_for_wing(wing)
+ if tunnels_added:
+ print(f"\n Topic tunnels: +{tunnels_added} cross-wing link(s)")
+ except Exception as e:
+ # Tunnel computation must never fail a mine — degrade quietly.
+ print(
+ f"\n WARNING: topic tunnel computation skipped — {e}",
+ file=sys.stderr,
+ )
+
+ print(f"\n{'=' * 55}")
+ print(" Done.")
+ print(f" Files processed: {len(files) - files_skipped}")
+ print(f" Files skipped (already filed): {files_skipped}")
+ print(f" Drawers filed: {total_drawers}")
+ print("\n By room:")
+ for room, count in sorted(room_counts.items(), key=lambda x: x[1], reverse=True):
+ print(f" {room:20} {count} files")
+ print('\n Next: mempalace search "what you\'re looking for"')
+ print(f"{'=' * 55}\n")
+ except KeyboardInterrupt:
+ # Idempotent re-mine: deterministic drawer IDs mean already-filed
+ # drawers upsert to the same row on next run, so partial progress
+ # is safe to leave in place. A second Ctrl-C during this print
+ # propagates to the default handler — we don't try to catch
+ # everything.
+ print("\n\n Mine interrupted.")
+ print(f" files_processed: {files_processed}/{len(files)}")
+ print(f" drawers_filed: {total_drawers}")
+ print(f" last_file: {last_file or ''}")
+ print(
+ f"\n Re-run `mempalace mine {shlex.quote(project_dir)}` to resume — "
+ "already-filed drawers are\n upserted idempotently and will not duplicate.\n"
)
- if drawers == 0 and not dry_run:
- files_skipped += 1
- else:
- total_drawers += drawers
- room_counts[room] += 1
- if not dry_run:
- print(f" + [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers}")
+ sys.exit(130)
+ finally:
+ # Clean up the hooks-side PID lock if it points at us. Stale
+ # entries already fail the _pid_alive() check on POSIX, but
+ # actively removing the file makes the state observable
+ # (callers can stat it) and avoids accidental PID reuse on
+ # short-lived test runs. Only remove if the file claims our
+ # own PID — never another process's.
+ _cleanup_mine_pid_file()
+
+
+def _cleanup_mine_pid_file() -> None:
+ """Remove the global mine PID file if it currently points at us.
+
+ The PID file (``~/.mempalace/hook_state/mine.pid``, written by the
+ hook in :func:`mempalace.hooks_cli._spawn_mine`) tracks the PID of
+ the most recently spawned mine subprocess so the hook can dedup
+ concurrent auto-ingest fires. When that subprocess exits — cleanly,
+ on error, or via Ctrl-C — it should remove its own entry so the
+ next hook fire isn't briefly fooled by a stale PID before
+ ``_pid_alive`` returns False.
+
+ We only delete the file if it claims our own PID; any other PID is
+ left alone (could be an unrelated mine running concurrently from
+ a different worktree / session).
+ """
+ try:
+ from .hooks_cli import _MINE_PID_FILE
+ except Exception:
+ return
+ try:
+ if not _MINE_PID_FILE.exists():
+ return
+ recorded = _MINE_PID_FILE.read_text().strip()
+ if recorded and recorded.isdigit() and int(recorded) == os.getpid():
+ _MINE_PID_FILE.unlink()
+ except OSError:
+ # Best-effort cleanup; never fail the mine over PID bookkeeping.
+ pass
- print(f"\n{'=' * 55}")
- print(" Done.")
- print(f" Files processed: {len(files) - files_skipped}")
- print(f" Files skipped (already filed): {files_skipped}")
- print(f" Drawers filed: {total_drawers}")
- print("\n By room:")
- for room, count in sorted(room_counts.items(), key=lambda x: x[1], reverse=True):
- print(f" {room:20} {count} files")
- print('\n Next: mempalace search "what you\'re looking for"')
- print(f"{'=' * 55}\n")
+
+def _compute_topic_tunnels_for_wing(wing: str) -> int:
+ """Drop tunnels between ``wing`` and every other wing that shares
+ confirmed topics, honoring the ``topic_tunnel_min_count`` config knob.
+
+ Returns the number of tunnels created or refreshed. Zero means no
+ overlap found (or the registry has no ``topics_by_wing`` map yet).
+ """
+ from .config import MempalaceConfig
+ from .palace_graph import topic_tunnels_for_wing
+
+ topics_map = get_topics_by_wing()
+ if not topics_map or wing not in topics_map:
+ return 0
+ cfg = MempalaceConfig()
+ min_count = cfg.topic_tunnel_min_count
+ created = topic_tunnels_for_wing(wing, topics_map, min_count=min_count)
+ return len(created)
# =============================================================================
diff --git a/mempalace/palace.py b/mempalace/palace.py
index a2a4a8eba..07efb6a3e 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -310,6 +310,88 @@ def mine_lock(source_file: str):
lf.close()
+class MineAlreadyRunning(RuntimeError):
+ """Raised when another `mempalace mine` already holds the per-palace lock."""
+
+
+@contextlib.contextmanager
+def mine_palace_lock(palace_path: str):
+ """Per-palace non-blocking lock around the full `mine` pipeline.
+
+ The per-file `mine_lock` only protects delete+insert interleave for a
+ single source; it does not prevent N copies of `mempalace mine <dir>`
+ from being spawned concurrently by hooks. When that happens, each copy
+ drives ChromaDB HNSW inserts in parallel against the same palace,
+ which (combined with chromadb's multi-threaded ParallelFor) can
+ corrupt the HNSW graph and produce sparse link_lists.bin blowups.
+
+ The lock file is keyed by sha256(palace_path) so mines against
+ *different* palaces can still run in parallel — we only serialize
+ writes into the same palace, which is the correctness boundary.
+
+ The key is derived from a fully normalized form of the path:
+ `realpath` resolves symlinks and `..` segments, and `normcase` folds
+ case on Windows (which has a case-insensitive filesystem). Without
+ normcase, `C:\\Palace` and `c:\\palace` would hash to different keys
+ on Windows and let two concurrent mines touch the same on-disk palace.
+
+ Non-blocking: if another `mine` is already writing to this palace,
+ raise MineAlreadyRunning so the caller can exit cleanly instead of
+ piling up as a waiting worker.
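+
+ Example (sketch):
+
+ try:
+ with mine_palace_lock(palace_path):
+ ... # exclusive ChromaDB writes into this palace
+ except MineAlreadyRunning:
+ pass # another mine owns the lock; exit cleanly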
+ """
+ lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
+ os.makedirs(lock_dir, exist_ok=True)
+ resolved = os.path.realpath(os.path.expanduser(palace_path))
+ lock_key_source = os.path.normcase(resolved)
+ palace_key = hashlib.sha256(lock_key_source.encode()).hexdigest()[:16]
+ lock_path = os.path.join(lock_dir, f"mine_palace_{palace_key}.lock")
+
+ lf = open(lock_path, "w")
+ acquired = False
+ try:
+ if os.name == "nt":
+ import msvcrt
+
+ try:
+ msvcrt.locking(lf.fileno(), msvcrt.LK_NBLCK, 1)
+ acquired = True
+ except OSError as exc:
+ raise MineAlreadyRunning(
+ f"another `mempalace mine` is already running against {resolved}"
+ ) from exc
+ else:
+ import fcntl
+
+ try:
+ fcntl.flock(lf, fcntl.LOCK_EX | fcntl.LOCK_NB)
+ acquired = True
+ except BlockingIOError as exc:
+ raise MineAlreadyRunning(
+ f"another `mempalace mine` is already running against {resolved}"
+ ) from exc
+ yield
+ finally:
+ if acquired:
+ try:
+ if os.name == "nt":
+ import msvcrt
+
+ msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1)
+ else:
+ import fcntl
+
+ fcntl.flock(lf, fcntl.LOCK_UN)
+ except Exception:
+ pass
+ lf.close()
+
+
+# Backward-compatible alias (previous patch iteration used a single global
+# lock). Kept so third-party callers that imported it continue to work; new
+# code should use `mine_palace_lock(palace_path)` for per-palace scoping.
+mine_global_lock = mine_palace_lock
+
+
def file_already_mined(collection, source_file: str, check_mtime: bool = False) -> bool:
"""Check if a file has already been filed in the palace.
diff --git a/mempalace/palace_graph.py b/mempalace/palace_graph.py
index 125ec0d4a..5e4cec7c5 100644
--- a/mempalace/palace_graph.py
+++ b/mempalace/palace_graph.py
@@ -92,6 +92,16 @@ def build_graph(col=None, config=None):
while offset < total:
batch = col.get(limit=1000, offset=offset, include=["metadatas"])
for meta in batch["metadatas"]:
+ # ChromaDB can return ``None`` for drawers without metadata
+ # (legacy data, partial writes — upstream #1020 territory).
+ # Skip these silently rather than crash the whole graph
+ # build — a single None drawer shouldn't take down /stats
+ # or any caller of build_graph for the entire palace. Caught
+ # 2026-04-25 by palace-daemon's verify-routes.sh smoke test
+ # against the canonical 151K palace. Closes the same gap as
+ # upstream #999 / fork PR #1094 in a different read path.
+ if meta is None:
+ continue
room = meta.get("room", "")
wing = meta.get("wing", "")
hall = meta.get("hall", "")
@@ -313,8 +323,20 @@ def _save_tunnels(tunnels):
Writes to ``tunnels.json.tmp`` then ``os.replace``s it into place, so
a crash mid-write can never leave a partial/empty tunnels.json that
silently wipes every tunnel on next read.
+
+ Also restricts the parent directory to 0o700 and the file to 0o600 —
+ tunnels reveal cross-wing connections (which projects/people/rooms
+ the user has explicitly linked) and should not be world-readable on
+ shared Linux/multi-user systems. Matches the file-permission pattern
+ established by #814 for the other sensitive palace files.
"""
- os.makedirs(os.path.dirname(_TUNNEL_FILE), exist_ok=True)
+ parent = os.path.dirname(_TUNNEL_FILE)
+ os.makedirs(parent, exist_ok=True)
+ try:
+ os.chmod(parent, 0o700)
+ except (OSError, NotImplementedError):
+ # Windows / unsupported filesystems — tolerate.
+ pass
tmp_path = _TUNNEL_FILE + ".tmp"
with open(tmp_path, "w", encoding="utf-8") as f:
json.dump(tunnels, f, indent=2)
@@ -325,6 +347,10 @@ def _save_tunnels(tunnels):
# Not all filesystems (or Windows file handles) support fsync — tolerate.
pass
os.replace(tmp_path, _TUNNEL_FILE)
+ try:
+ os.chmod(_TUNNEL_FILE, 0o600)
+ except (OSError, NotImplementedError):
+ pass
def _endpoint_key(wing: str, room: str) -> str:
@@ -362,6 +388,7 @@ def create_tunnel(
label: str = "",
source_drawer_id: str = None,
target_drawer_id: str = None,
+ kind: str = "explicit",
):
"""Create an explicit (symmetric) tunnel between two locations in the palace.
@@ -382,6 +409,11 @@ def create_tunnel(
label: Description of the connection.
source_drawer_id: Optional specific drawer ID.
target_drawer_id: Optional specific drawer ID.
+ kind: Tunnel category — ``"explicit"`` (default, user-created link
+ between real rooms) or ``"topic"`` (auto-generated cross-wing
+ topical link where rooms are synthetic ``topic:<name>``
+ identifiers). Preserved on the stored dict so readers can
+ distinguish real-room traversals from topic connections.
Returns:
The stored tunnel dict.
@@ -401,6 +433,7 @@ def create_tunnel(
"source": {"wing": source_wing, "room": source_room},
"target": {"wing": target_wing, "room": target_room},
"label": label,
+ "kind": kind,
"created_at": datetime.now(timezone.utc).isoformat(),
}
if source_drawer_id:
@@ -499,3 +532,159 @@ def follow_tunnels(wing: str, room: str, col=None, config=None):
pass
return connections
+
+
+# =============================================================================
+# TOPIC TUNNELS — auto-link wings that share confirmed TOPIC labels
+# =============================================================================
+# When two wings have one or more confirmed topics in common (e.g. both
+# discuss "Angular" or "OpenAPI"), drop a symmetric tunnel between them.
+# Topics come from the LLM-refined ``TOPIC`` bucket in the per-project
+# ``entities.json`` and are persisted by wing in
+# ``~/.mempalace/known_entities.json`` under ``topics_by_wing``.
+#
+# Tunnels are created via the existing ``create_tunnel`` API so they share
+# storage and dedup with explicit tunnels. The room is a synthetic
+# ``topic:<name>`` identifier — the ``topic:`` prefix namespaces
+# these tunnels away from literal folder-derived rooms so a wing with an
+# auto-detected "Angular" folder room and a "shared topic: Angular" tunnel
+# remain distinct at ``follow_tunnels`` / ``list_tunnels`` time. The prefix
+# is also visible to any LLM scanning the tunnel list. The ``kind: "topic"``
+# field on the stored dict gives callers a machine-readable discriminator.
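+#
+# Illustrative stored tunnel (shape from ``create_tunnel``; names invented):
+#
+# {"source": {"wing": "acme-app", "room": "topic:Angular"},
+# "target": {"wing": "side-notes", "room": "topic:Angular"},
+# "label": "shared topic: Angular", "kind": "topic",
+# "created_at": "2026-04-26T13:24:00+00:00"}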
+
+TOPIC_ROOM_PREFIX = "topic:"
+
+
+def _normalize_topic(name: str) -> str:
+ """Lowercase + strip topics for case-insensitive overlap detection."""
+ return str(name).strip().lower()
+
+
+def topic_room(name: str) -> str:
+ """Return the synthetic room identifier for a topic tunnel.
+
+ Prefixing avoids collisions with literal folder-derived rooms of the
+ same name (e.g. a wing that has both an "Angular" folder room and an
+ "Angular" topic tunnel).
+ """
+ return f"{TOPIC_ROOM_PREFIX}{name}"
+
+
+def compute_topic_tunnels(
+ topics_by_wing: dict,
+ min_count: int = 1,
+ label_prefix: str = "shared topic",
+) -> list[dict]:
+ """Create tunnels for every pair of wings that share >= ``min_count`` topics.
+
+ Args:
+ topics_by_wing: ``{wing_name: [topic_name, ...]}`` mapping. Topic
+ names are compared case-insensitively; the first observed
+ casing is used for the tunnel room name.
+ min_count: minimum number of overlapping topics required to drop
+ any tunnel between a wing pair. ``1`` means a single shared
+ topic is enough; bumping to e.g. ``2`` requires multiple
+ overlaps and filters out coincidental single-topic links.
+ label_prefix: human-readable string prefixed to the tunnel label.
+
+ Returns:
+ List of tunnel dicts as returned by ``create_tunnel`` — one per
+ (wing_a, wing_b, topic) triple that crossed the threshold. A
+ wing-pair below ``min_count`` produces no tunnels at all (not
+ even for its single shared topic).
+
+ No-op semantics:
+ - empty/None ``topics_by_wing`` returns ``[]``.
+ - wings whose topic list is empty are skipped.
+ - ``min_count <= 0`` is clamped to 1.
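+
+ Worked example: ``{"a": ["X", "y"], "b": ["x"], "c": ["Y", "Z"]}`` with
+ ``min_count=1`` links (a, b) via "x" and (a, c) via "y", two tunnels in
+ total; with ``min_count=2`` neither pair crosses the threshold and the
+ result is ``[]``.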
+ """
+ if not topics_by_wing:
+ return []
+
+ min_count = max(1, int(min_count))
+
+ # Build a normalized-topic -> first-seen casing map per wing so we
+ # preserve display casing while still doing case-insensitive overlap.
+ wing_topics: dict[str, dict[str, str]] = {}
+ for wing, names in topics_by_wing.items():
+ if not isinstance(wing, str) or not wing.strip():
+ continue
+ if not isinstance(names, (list, tuple)):
+ continue
+ bucket: dict[str, str] = {}
+ for n in names:
+ if not isinstance(n, str):
+ continue
+ key = _normalize_topic(n)
+ if not key:
+ continue
+ bucket.setdefault(key, n.strip())
+ if bucket:
+ wing_topics[wing.strip()] = bucket
+
+ wings = sorted(wing_topics.keys())
+ created: list[dict] = []
+ for i, wa in enumerate(wings):
+ topics_a = wing_topics[wa]
+ for wb in wings[i + 1 :]:
+ topics_b = wing_topics[wb]
+ shared_keys = set(topics_a.keys()) & set(topics_b.keys())
+ if len(shared_keys) < min_count:
+ continue
+ # Stable sort for deterministic tunnel ordering across runs.
+ for key in sorted(shared_keys):
+ # Prefer the casing from whichever wing sorts first — both
+ # are valid; this just keeps the displayed room consistent.
+ topic_name = topics_a[key]
+ room = topic_room(topic_name)
+ tunnel = create_tunnel(
+ source_wing=wa,
+ source_room=room,
+ target_wing=wb,
+ target_room=room,
+ label=f"{label_prefix}: {topic_name}",
+ kind="topic",
+ )
+ created.append(tunnel)
+ return created
+
+
+def topic_tunnels_for_wing(
+ wing: str,
+ topics_by_wing: dict,
+ min_count: int = 1,
+ label_prefix: str = "shared topic",
+) -> list[dict]:
+ """Compute topic tunnels involving a single wing.
+
+ Used by the miner to incrementally update tunnels for the wing that
+ just finished mining without recomputing pairs that don't involve it.
+ Returns the list of tunnels created or refreshed.
+ """
+ if not topics_by_wing or not isinstance(wing, str) or not wing.strip():
+ return []
+
+ wing = wing.strip()
+ own = topics_by_wing.get(wing)
+ if not isinstance(own, (list, tuple)) or not own:
+ return []
+
+ # Restrict the pair-wise computation to (wing, other) pairs only by
+ # building a 2-wing slice for each other wing. Reusing
+ # ``compute_topic_tunnels`` keeps the threshold and casing logic in
+ # one place.
+ created: list[dict] = []
+ for other, other_topics in topics_by_wing.items():
+ if not isinstance(other, str) or not other.strip() or other == wing:
+ continue
+ if not isinstance(other_topics, (list, tuple)) or not other_topics:
+ continue
+ slice_map = {wing: list(own), other: list(other_topics)}
+ created.extend(
+ compute_topic_tunnels(
+ slice_map,
+ min_count=min_count,
+ label_prefix=label_prefix,
+ )
+ )
+ return created
diff --git a/mempalace/project_scanner.py b/mempalace/project_scanner.py
index 741a3e2e2..521bfa296 100644
--- a/mempalace/project_scanner.py
+++ b/mempalace/project_scanner.py
@@ -558,6 +558,7 @@ def to_detected_dict(
return {
"people": people_entries,
"projects": proj_entries,
+ "topics": [],
"uncertain": [],
}
@@ -577,7 +578,7 @@ def _merge_detected(primary: dict, secondary: dict, drop_secondary_uncertain: bo
"""
seen = {e["name"].lower() for cat in primary.values() for e in cat}
merged = {k: list(v) for k, v in primary.items()}
- for cat_key in ("people", "projects", "uncertain"):
+ for cat_key in ("people", "projects", "topics", "uncertain"):
if cat_key == "uncertain" and drop_secondary_uncertain:
continue
for e in secondary.get(cat_key, []):
@@ -596,6 +597,7 @@ def discover_entities(
people_cap: int = 15,
llm_provider: object = None,
show_progress: bool = True,
+ corpus_origin: dict | None = None,
) -> dict:
"""Top-level entity discovery: real signals first, prose detection second.
@@ -612,11 +614,19 @@ def discover_entities(
mentioned in docs/notes (not code)
5. Optional LLM refinement pass — reclassifies ambiguous candidates
using the caller-supplied provider
+ 6. Optional corpus-origin persona filter — when the corpus is
+ identified as AI-dialogue, candidates whose name matches an
+ agent_persona_name are moved to an ``agent_personas`` bucket
+ instead of being reported as people.
Passing ``llm_provider`` enables phase-2 refinement. The caller is
responsible for constructing the provider (``llm_client.get_provider``)
and confirming availability. Refinement is blocking-interactive:
progress prints to stderr; Ctrl-C returns partial results.
+
+ Passing ``corpus_origin`` enables corpus-origin persona reclassification.
+ The expected shape is the dict written by ``mempalace init`` to
+ ``<project_dir>/.mempalace/origin.json`` (see ``corpus_origin.py``).
"""
projects, people = scan(project_dir)
@@ -654,7 +664,7 @@ def discover_entities(
prose_detected = (
detect_entities(prose_files, languages=languages)
if prose_files
- else {"people": [], "projects": [], "uncertain": []}
+ else {"people": [], "projects": [], "topics": [], "uncertain": []}
)
# Without LLM refinement, suppress regex "uncertain" noise when real
@@ -667,7 +677,7 @@ def discover_entities(
drop_secondary_uncertain=has_real_signal and llm_provider is None,
)
- # Optional phase 2: LLM refinement.
+ # Optional LLM refinement pass (when an llm_provider was supplied).
if llm_provider is not None:
from mempalace.llm_refine import collect_corpus_text, refine_entities
@@ -678,6 +688,7 @@ def discover_entities(
llm_provider,
show_progress=show_progress,
allow_project_promotions=not has_real_signal,
+ corpus_origin=corpus_origin,
)
if show_progress:
status_bits = []
@@ -695,6 +706,14 @@ def discover_entities(
print(f" LLM refine: {', '.join(status_bits)}", file=_sys.stderr)
merged = result.merged
+ # Corpus-origin persona reclassification — applied last so it sweeps
+ # candidates contributed by every upstream source (manifests, git authors,
+ # prose, LLM refinement). No-op without corpus_origin: output keeps the exact v3.3.3 shape.
+ if corpus_origin is not None:
+ from mempalace.entity_detector import _apply_corpus_origin
+
+ merged = _apply_corpus_origin(merged, corpus_origin)
+
return merged
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 9a9aa8845..a75ce3559 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -201,13 +201,143 @@ def prune_corrupt(palace_path=None, confirm=False):
print(f" Collection size: {before:,} → {after:,}")
-def rebuild_index(palace_path=None):
+# ChromaDB's ``collection.get()`` enforces an internal default ``limit``
+# of 10 000 rows when the caller does not pass one. We pass an explicit
+# ``limit=batch_size`` below, but the underlying segment also caps reads
+# during stale/quarantined-HNSW recovery flows: extraction silently stops
+# at exactly 10 000 even on palaces with many more rows. Refusing to
+# overwrite when this exact value comes back is the simplest signal we
+# can detect without depending on chromadb internals.
+CHROMADB_DEFAULT_GET_LIMIT = 10_000
+
+
+class TruncationDetected(Exception):
+ """Raised by :func:`check_extraction_safety` when extraction looks short.
+
+ Carries the human-readable abort message so callers (CLI ``cmd_repair``,
+ ``rebuild_index``) can print and exit consistently without re-deriving
+ the wording.
+ """
+
+ def __init__(self, message: str, sqlite_count: "int | None", extracted: int):
+ super().__init__(message)
+ self.message = message
+ self.sqlite_count = sqlite_count
+ self.extracted = extracted
+
+
+def check_extraction_safety(
+ palace_path: str, extracted: int, confirm_truncation_ok: bool = False
+) -> None:
+ """Cross-check that ``extracted`` matches the SQLite ground truth.
+
+ Two signals trip the guard:
+
+ 1. **Strong** — ``chroma.sqlite3`` reports more drawers than were
+ extracted. This is the user-reported #1208 case: 67 580 on disk,
+ 10 000 came back through the chromadb collection layer, repair
+ would have destroyed the difference.
+ 2. **Weak** — extracted count equals exactly ``CHROMADB_DEFAULT_GET_LIMIT``
+ AND the SQLite check couldn't run (schema drift, locked file).
+ Hitting the chromadb default ``get()`` cap exactly is suspicious
+ enough to refuse without explicit acknowledgement.
+
+ Raises :class:`TruncationDetected` with a printable message when the
+ guard fires. Does nothing on safe extractions or when
+ ``confirm_truncation_ok`` is set.
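+
+ Worked example: SQLite reports 67 580 rows but extraction returned
+ 10 000 → strong signal, raise. Extraction returned exactly 10 000 and
+ the SQLite count came back ``None`` → weak signal, raise. Extraction
+ returned 9 990 with SQLite unreadable → no signal, return normally.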
+ """
+ if confirm_truncation_ok:
+ return
+
+ sqlite_count = sqlite_drawer_count(palace_path)
+ cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT
+
+ if sqlite_count is not None and sqlite_count > extracted:
+ loss = sqlite_count - extracted
+ pct = 100 * loss / sqlite_count
+ message = (
+ f"\n ABORT: chroma.sqlite3 reports {sqlite_count:,} drawers but only {extracted:,}\n"
+ " came back through the chromadb collection layer. The segment metadata is\n"
+ " stale (often after manual HNSW quarantine) — proceeding would silently\n"
+ f" destroy {loss:,} drawers (~{pct:.0f}%).\n"
+ "\n"
+ " Recovery options:\n"
+ " 1. Restore from your most recent palace backup, then re-mine.\n"
+ " 2. Direct-extract from chroma.sqlite3 (rows are still on disk) and\n"
+ " rebuild the palace from source files.\n"
+ " 3. If you have independently confirmed the palace really contains only\n"
+ f" {extracted:,} drawers, re-run with --confirm-truncation-ok.\n"
+ )
+ raise TruncationDetected(message, sqlite_count, extracted)
+
+ if cap_signal and sqlite_count is None:
+ message = (
+ f"\n ABORT: extracted exactly {CHROMADB_DEFAULT_GET_LIMIT:,} drawers, which matches\n"
+ " ChromaDB's internal default get() limit. The on-disk SQLite count couldn't\n"
+ " be cross-checked from this Python context, so we can't tell whether the\n"
+ f" palace genuinely holds {CHROMADB_DEFAULT_GET_LIMIT:,} rows or whether extraction was\n"
+ " silently capped. Refusing to overwrite the palace.\n"
+ "\n"
+ " If you have independently confirmed (e.g. via direct sqlite3 query) that\n"
+ f" the palace really contains exactly {CHROMADB_DEFAULT_GET_LIMIT:,} drawers, re-run with\n"
+ " --confirm-truncation-ok.\n"
+ )
+ raise TruncationDetected(message, sqlite_count, extracted)
+
+
+def sqlite_drawer_count(palace_path: str) -> "int | None":
+ """Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.
+
+ Used as an independent ground-truth check against the chromadb
+ collection-layer ``count()`` / ``get()``: when the on-disk SQLite
+ row count exceeds the extraction count, the segment metadata is
+ stale and repair would destroy the difference.
+
+ Returns ``None`` when the schema isn't readable (chromadb version
+ drift, missing tables, locked file). Callers treat ``None`` as
+ "unknown" and fall back to the cap-detection check.
+ """
+ sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
+ if not os.path.exists(sqlite_path):
+ return None
+ try:
+ import sqlite3
+
+ conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
+ try:
+ row = conn.execute(
+ """
+ SELECT COUNT(*)
+ FROM embeddings e
+ JOIN segments s ON e.segment_id = s.id
+ JOIN collections c ON s.collection = c.id
+ WHERE c.name = ?
+ """,
+ (COLLECTION_NAME,),
+ ).fetchone()
+ return int(row[0]) if row and row[0] is not None else None
+ finally:
+ conn.close()
+ except Exception:
+ # chromadb schema differs by version (segments / collections column
+ # names occasionally rename). Silent fallback is correct here —
+ # the cap-detection check still catches the user-reported case.
+ return None
+
+
+def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
"""Rebuild the HNSW index from scratch.
1. Extract all drawers via ChromaDB get()
- 2. Back up ONLY chroma.sqlite3 (not the bloated HNSW files)
- 3. Delete and recreate the collection with hnsw:space=cosine
- 4. Upsert all drawers back
+ 2. Cross-check against the SQLite ground truth (#1208 guard)
+ 3. Back up ONLY chroma.sqlite3 (not the bloated HNSW files)
+ 4. Delete and recreate the collection with hnsw:space=cosine
+ 5. Upsert all drawers back
+
+ ``confirm_truncation_ok`` overrides the safety guard from step 2.
+ Set to ``True`` only when you have independently verified that the
+ palace genuinely contains exactly the extracted number of drawers
+ (typically only a concern for palaces sized at exactly 10 000 rows).
"""
palace_path = palace_path or _get_palace_path()
@@ -252,10 +382,20 @@ def rebuild_index(palace_path=None):
offset += len(batch["ids"])
print(f" Extracted {len(all_ids)} drawers")
+ # ── #1208 guard ──────────────────────────────────────────────────
+ # Refuse to ``delete_collection`` + rebuild when extraction looks
+ # short of the SQLite ground truth (or when extraction == chromadb
+ # default get() cap and the SQLite check couldn't run).
+ try:
+ check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
+ except TruncationDetected as e:
+ print(e.message)
+ return
+
# Back up ONLY the SQLite database, not the bloated HNSW files
sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
+ backup_path = sqlite_path + ".backup"
if os.path.exists(sqlite_path):
- backup_path = sqlite_path + ".backup"
print(f" Backing up chroma.sqlite3 ({os.path.getsize(sqlite_path) / 1e6:.0f} MB)...")
shutil.copy2(sqlite_path, backup_path)
print(f" Backup: {backup_path}")
@@ -266,13 +406,25 @@ def rebuild_index(palace_path=None):
new_col = backend.create_collection(palace_path, COLLECTION_NAME)
filed = 0
- for i in range(0, len(all_ids), batch_size):
- batch_ids = all_ids[i : i + batch_size]
- batch_docs = all_docs[i : i + batch_size]
- batch_metas = all_metas[i : i + batch_size]
- new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
- filed += len(batch_ids)
- print(f" Re-filed {filed}/{len(all_ids)} drawers...")
+ try:
+ for i in range(0, len(all_ids), batch_size):
+ batch_ids = all_ids[i : i + batch_size]
+ batch_docs = all_docs[i : i + batch_size]
+ batch_metas = all_metas[i : i + batch_size]
+ new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
+ filed += len(batch_ids)
+ print(f" Re-filed {filed}/{len(all_ids)} drawers...")
+ except Exception as e:
+ print(f"\n ERROR during rebuild: {e}")
+ print(f" Only {filed}/{len(all_ids)} drawers were re-filed.")
+ if os.path.exists(backup_path):
+ print(f" Restoring from backup: {backup_path}")
+ backend.delete_collection(palace_path, COLLECTION_NAME)
+ shutil.copy2(backup_path, sqlite_path)
+ print(" Backup restored. Palace is back to pre-repair state.")
+ else:
+ print(" No backup available. Re-mine from source files to recover.")
+ raise
print(f"\n Repair complete. {filed} drawers rebuilt.")
print(" HNSW index is now clean with cosine distance metric.")
diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index 5208b13b7..c2fcdb4c3 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -46,7 +46,14 @@ def _first_or_empty(results, key: str) -> list:
def _tokenize(text: str) -> list:
- """Lowercase + strip to alphanumeric tokens of length ≥ 2."""
+ """Lowercase + strip to alphanumeric tokens of length ≥ 2.
+
+ Tolerates ``None`` documents — Chroma can return ``None`` in the
+ ``documents`` field for drawers without text content, which would
+ otherwise raise ``AttributeError`` mid-rerank.
+ """
+ if not text:
+ return []
return _TOKEN_RE.findall(text.lower())
@@ -236,6 +243,42 @@ def _expand_with_neighbors(drawers_col, matched_doc: str, matched_meta: dict, ra
}
+def _warn_if_legacy_metric(col) -> None:
+ """Print a one-line notice if the palace was created without
+ ``hnsw:space=cosine``.
+
+ ChromaDB's default is L2 (Euclidean), under which cosine-based
+ similarity interpretation falls apart — distances routinely exceed
+ 1.0 and the display ``max(0, 1 - dist)`` floors every result to 0.
+ Legacy palaces (mined before this metadata was consistently set)
+ need ``mempalace repair`` to rebuild with the correct metric.
+
+ The warning fires only for palaces that clearly have the wrong
+ metric; palaces with no collection metadata at all (an empty dict)
+ also trip the check, since that is the signature of a pre-metadata
+ palace.
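+
+ (For unit-norm embeddings the two metrics relate as
+ ``l2**2 == 2 * (1 - cosine_sim)``: orthogonal embeddings sit at L2
+ distance ~1.414, so the displayed ``1 - dist`` is already negative
+ there.)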
+ """
+ try:
+ meta = getattr(col, "metadata", None)
+ except Exception:
+ return
+ if not isinstance(meta, dict):
+ return
+ space = meta.get("hnsw:space")
+ if space == "cosine":
+ return
+ # Either missing or set to something else — both are suspect.
+ import sys as _sys
+
+ detail = f"hnsw:space={space!r}" if space else "no hnsw:space metadata"
+ print(
+ f"\n NOTICE: this palace was created without cosine distance ({detail}).\n"
+ " Semantic similarity scores will not be meaningful.\n"
+ " Run `mempalace repair` to rebuild the index with the correct metric.",
+ file=_sys.stderr,
+ )
+
+
def search(query: str, palace_path: str, wing: str = None, room: str = None, n_results: int = 5):
"""
Search the palace. Returns verbatim drawer content.
@@ -248,6 +291,10 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
print(" Run: mempalace init then mempalace mine ")
raise SearchError(f"No palace found at {palace_path}")
+ # Alert the user if this palace predates hnsw:space=cosine being set on
+ # creation — their similarity scores will be junk until they run repair.
+ _warn_if_legacy_metric(col)
+
where = build_where_filter(wing, room)
try:
@@ -273,6 +320,20 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
print(f'\n No results found for: "{query}"')
return
+ # Pure-cosine retrieval on the CLI path was missing lexical matches:
+ # a drawer whose text contains every query term can still score distance
+ # >= 1.0 against the natural-language query when the drawer is a
+ # mechanical artifact (directory listing, diff, log fragment) that
+ # embeds as file-tree noise rather than as prose about its subject.
+ # The MCP tool path already hybridizes BM25 with vector sim via
+ # `_hybrid_rank`; do the same here so CLI results match what agents
+ # see via `mempalace_search`.
+ hits = [
+ {"text": doc, "distance": float(dist), "metadata": meta or {}}
+ for doc, meta, dist in zip(docs, metas, dists)
+ ]
+ hits = _hybrid_rank(hits, query)
+
print(f"\n{'=' * 60}")
print(f' Results for: "{query}"')
if wing:
@@ -281,19 +342,20 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
print(f" Room: {room}")
print(f"{'=' * 60}\n")
- for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists), 1):
- similarity = round(max(0.0, 1 - dist), 3)
- meta = meta or {}
+ for i, hit in enumerate(hits, 1):
+ vec_sim = round(max(0.0, 1 - hit["distance"]), 3)
+ bm25 = hit.get("bm25_score", 0.0)
+ meta = hit["metadata"]
source = Path(meta.get("source_file", "?")).name
wing_name = meta.get("wing", "?")
room_name = meta.get("room", "?")
print(f" [{i}] {wing_name} / {room_name}")
print(f" Source: {source}")
- print(f" Match: {similarity}")
+ print(f" Match: cosine={vec_sim} bm25={bm25}")
print()
# Print the verbatim text, indented
- for line in doc.strip().split("\n"):
+ for line in hit["text"].strip().split("\n"):
print(f" {line}")
print()
print(f" {'─' * 56}")
diff --git a/openarena-claim.txt b/openarena-claim.txt
new file mode 100644
index 000000000..512b75016
--- /dev/null
+++ b/openarena-claim.txt
@@ -0,0 +1 @@
+OpenArena owner claim verification for MemPalace/mempalace: 09AE2C2E66CC4B5CBD7D
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 617c067c4..18228d7bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,14 @@ chroma = "mempalace.backends.chroma:ChromaBackend"
[project.optional-dependencies]
dev = ["pytest>=7.0", "pytest-cov>=4.0", "ruff>=0.4.0", "psutil>=5.9"]
spellcheck = ["autocorrect>=2.0"]
+# Hardware acceleration for the ONNX embedding model. Install exactly one:
+# pip install mempalace[gpu] — NVIDIA CUDA
+# pip install mempalace[dml] — DirectML (Windows AMD/Intel/NVIDIA)
+# pip install mempalace[coreml] — macOS Neural Engine
+# After install, set MEMPALACE_EMBEDDING_DEVICE=cuda|dml|coreml (or "auto").
+gpu = ["onnxruntime-gpu>=1.16"]
+dml = ["onnxruntime-directml>=1.16"]
+coreml = ["onnxruntime>=1.16"]
[dependency-groups]
dev = ["pytest>=7.0", "pytest-cov>=4.0", "ruff>=0.4.0", "psutil>=5.9"]
diff --git a/tests/conftest.py b/tests/conftest.py
index 7b2bb7713..4ed82cafd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -46,6 +46,14 @@ def _clear_cache():
mcp_server._collection_cache = None
except (ImportError, AttributeError):
pass
+ try:
+ # Reset the per-process quarantine gate so tests don't leak
+ # state through ChromaBackend._quarantined_paths.
+ from mempalace.backends.chroma import ChromaBackend
+
+ ChromaBackend._quarantined_paths.clear()
+ except (ImportError, AttributeError):
+ pass
_clear_cache()
yield
diff --git a/tests/test_backends.py b/tests/test_backends.py
index b3f009a25..9fe5ca1a8 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -1,5 +1,6 @@
import os
import sqlite3
+from pathlib import Path
import chromadb
import pytest
@@ -16,6 +17,7 @@
ChromaBackend,
ChromaCollection,
_fix_blob_seq_ids,
+ _pin_hnsw_threads,
quarantine_stale_hnsw,
)
@@ -380,39 +382,171 @@ def test_fix_blob_seq_ids_noop_without_database(tmp_path):
_fix_blob_seq_ids(str(tmp_path)) # should not raise
+def test_fix_blob_seq_ids_writes_marker_after_blob_path(tmp_path):
+ """The .blob_seq_ids_migrated marker is written after a successful BLOB → INTEGER conversion."""
+ from mempalace.backends.chroma import _BLOB_FIX_MARKER
+
+ db_path = tmp_path / "chroma.sqlite3"
+ conn = sqlite3.connect(str(db_path))
+ conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
+ conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", ((42).to_bytes(8, "big"),))
+ conn.commit()
+ conn.close()
+
+ marker = tmp_path / _BLOB_FIX_MARKER
+ assert not marker.exists()
+
+ _fix_blob_seq_ids(str(tmp_path))
+
+ assert marker.is_file(), "marker must be written after a successful migration"
+
+
+def test_fix_blob_seq_ids_writes_marker_when_already_integer(tmp_path):
+ """The marker is written even when the migration is a no-op (already INTEGER).
+
+ The point of the marker is to skip the sqlite3 open on subsequent calls,
+ not to record that a conversion happened. So a clean palace gets the
+ marker on first run too — next ``_fix_blob_seq_ids`` call short-circuits
+ before touching the sqlite3 file.
+ """
+ from mempalace.backends.chroma import _BLOB_FIX_MARKER
+
+ db_path = tmp_path / "chroma.sqlite3"
+ conn = sqlite3.connect(str(db_path))
+ conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id INTEGER)")
+ conn.execute("INSERT INTO embeddings (seq_id) VALUES (42)")
+ conn.commit()
+ conn.close()
+
+ marker = tmp_path / _BLOB_FIX_MARKER
+ assert not marker.exists()
+
+ _fix_blob_seq_ids(str(tmp_path))
+
+ assert marker.is_file(), "marker must be written even when no BLOBs found"
+
+
+def test_fix_blob_seq_ids_skips_sqlite_when_marker_present(tmp_path):
+ """When the marker exists, ``_fix_blob_seq_ids`` does not open sqlite3.
+
+ This is the load-bearing property of the marker — opening Python's
+ sqlite3 against a live ChromaDB 1.5.x WAL DB corrupts the next
+ PersistentClient call (#1090). Once a palace has been migrated, we
+ never want to open it again, even read-only.
+ """
+ from unittest.mock import patch
+ from mempalace.backends.chroma import _BLOB_FIX_MARKER
+
+ # Pre-create the marker so the function should short-circuit.
+ db_path = tmp_path / "chroma.sqlite3"
+ db_path.write_bytes(b"sentinel") # presence required for the function to proceed
+ (tmp_path / _BLOB_FIX_MARKER).touch()
+
+ with patch("mempalace.backends.chroma.sqlite3.connect") as mock_connect:
+ _fix_blob_seq_ids(str(tmp_path))
+
+ mock_connect.assert_not_called()
+
+
# ── quarantine_stale_hnsw ─────────────────────────────────────────────────
-def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime):
- """Helper: build a palace dir with one HNSW segment + sqlite at given mtimes."""
+# Marker bytes for the chromadb segment metadata file. A complete
+# write begins with PROTO opcode (0x80) and ends with STOP opcode
+# (0x2e); _segment_appears_healthy sniffs these bytes without parsing
+# the file.
+_HEALTHY_META = b"\x80\x04" + b"\x00" * 32 + b"\x2e"
+_CORRUPT_META = b"\x00" * 64
+
+
+def _make_palace_with_segment(tmp_path, hnsw_mtime, sqlite_mtime, meta_bytes=_HEALTHY_META):
+ """Helper: build a palace dir with one HNSW segment + sqlite at given
+ mtimes. ``meta_bytes`` controls whether the segment looks healthy
+ (default), corrupt (``_CORRUPT_META``), or has no metadata file at
+ all (``None``)."""
palace = tmp_path / "palace"
palace.mkdir()
(palace / "chroma.sqlite3").write_text("")
seg = palace / "abcd-1234-5678"
seg.mkdir()
(seg / "data_level0.bin").write_text("")
+ if meta_bytes is not None:
+ (seg / "index_metadata.pickle").write_bytes(meta_bytes)
os.utime(seg / "data_level0.bin", (hnsw_mtime, hnsw_mtime))
os.utime(palace / "chroma.sqlite3", (sqlite_mtime, sqlite_mtime))
return palace, seg
-def test_quarantine_stale_hnsw_renames_drifted_segment(tmp_path):
- """Segment whose data_level0.bin is 2h older than sqlite gets renamed."""
+def test_quarantine_stale_hnsw_renames_corrupt_segment(tmp_path):
+ """Segment with stale mtime AND a malformed metadata file gets renamed."""
now = 1_700_000_000.0
- palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 7200, sqlite_mtime=now)
+ palace, seg = _make_palace_with_segment(
+ tmp_path,
+ hnsw_mtime=now - 7200,
+ sqlite_mtime=now,
+ meta_bytes=_CORRUPT_META,
+ )
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert len(moved) == 1
assert ".drift-" in moved[0]
assert not seg.exists()
- # the renamed directory still exists and contains the original file
renamed = list(palace.iterdir())
drift_dirs = [p for p in renamed if ".drift-" in p.name]
assert len(drift_dirs) == 1
assert (drift_dirs[0] / "data_level0.bin").exists()
+def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path):
+ """Segment with stale mtime but a complete metadata file is NOT
+ renamed — this is the chromadb-1.5.x async-flush steady state, not
+ corruption. Production case at 06:24 PDT 2026-04-26: cold-start
+ quarantine renamed three healthy segments after a clean shutdown,
+ leaving the 151K-drawer palace with vector_ranked=0."""
+ now = 1_700_000_000.0
+ palace, seg = _make_palace_with_segment(
+ tmp_path,
+ hnsw_mtime=now - 7200,
+ sqlite_mtime=now,
+ meta_bytes=_HEALTHY_META,
+ )
+ moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
+ assert moved == []
+ assert seg.exists()
+
+
+def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path):
+ """Segment with no metadata file is treated as fresh / never-flushed
+ and not quarantined — renaming an empty dir orphans nothing."""
+ now = 1_700_000_000.0
+ palace, seg = _make_palace_with_segment(
+ tmp_path,
+ hnsw_mtime=now - 7200,
+ sqlite_mtime=now,
+ meta_bytes=None,
+ )
+ moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
+ assert moved == []
+ assert seg.exists()
+
+
+def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path):
+ """Segment with a truncated (under-floor-size) metadata file is
+ quarantined — shape of a partial-flush during process kill."""
+ now = 1_700_000_000.0
+ palace, seg = _make_palace_with_segment(
+ tmp_path,
+ hnsw_mtime=now - 7200,
+ sqlite_mtime=now,
+ meta_bytes=b"\x80\x04",
+ )
+ moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
+ assert len(moved) == 1
+ assert ".drift-" in moved[0]
+
+
def test_quarantine_stale_hnsw_leaves_fresh_segment_alone(tmp_path):
- """Segment with recent mtime vs sqlite is not touched."""
+ """Segment with recent mtime vs sqlite is not touched (mtime gate
+ short-circuits before integrity gate)."""
now = 1_700_000_000.0
palace, seg = _make_palace_with_segment(tmp_path, hnsw_mtime=now - 10, sqlite_mtime=now)
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
@@ -443,3 +577,115 @@ def test_quarantine_stale_hnsw_skips_already_quarantined(tmp_path):
moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
assert moved == []
assert drift.exists()
+
+
+# ── make_client cold-start gate ──────────────────────────────────────────
+
+
+def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch):
+ """Quarantine fires on first ``make_client()`` for a palace, then is
+ skipped on subsequent calls — prevents runtime thrash where a daemon's
+ own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
+ making the mtime heuristic falsely trigger every reconnect."""
+ from mempalace.backends.chroma import ChromaBackend
+
+ palace_path = str(tmp_path / "palace")
+ os.makedirs(palace_path, exist_ok=True)
+ (Path(palace_path) / "chroma.sqlite3").write_text("")
+
+ # Reset the per-process cache so this test is independent of others.
+ monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+ calls: list[str] = []
+
+ def _spy(path, stale_seconds=300.0):
+ calls.append(path)
+ return []
+
+ monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
+
+ ChromaBackend.make_client(palace_path)
+ ChromaBackend.make_client(palace_path)
+ ChromaBackend.make_client(palace_path)
+
+ assert calls == [
+ palace_path
+ ], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
+
+
+def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
+ """Two distinct palaces each get one quarantine attempt — the gate is
+ keyed by palace path, not global."""
+ from mempalace.backends.chroma import ChromaBackend
+
+ palace_a = str(tmp_path / "palace_a")
+ palace_b = str(tmp_path / "palace_b")
+ for p in (palace_a, palace_b):
+ os.makedirs(p, exist_ok=True)
+ (Path(p) / "chroma.sqlite3").write_text("")
+
+ monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+ calls: list[str] = []
+
+ def _spy(path, stale_seconds=300.0):
+ calls.append(path)
+ return []
+
+ monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
+
+ ChromaBackend.make_client(palace_a)
+ ChromaBackend.make_client(palace_b)
+ ChromaBackend.make_client(palace_a) # already gated
+ ChromaBackend.make_client(palace_b) # already gated
+
+ assert calls == [palace_a, palace_b]
+
+
+# ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──
+
+
+def test_pin_hnsw_threads_retrofits_legacy_collection(tmp_path):
+ """Legacy collections (created without num_threads) get the retrofit applied."""
+ palace_path = tmp_path / "legacy-palace"
+ palace_path.mkdir()
+
+ client = chromadb.PersistentClient(path=str(palace_path))
+ col = client.create_collection(
+ "mempalace_drawers",
+ metadata={"hnsw:space": "cosine"}, # no num_threads — legacy
+ )
+ assert col.configuration_json.get("hnsw", {}).get("num_threads") is None
+
+ _pin_hnsw_threads(col)
+
+ assert col.configuration_json["hnsw"]["num_threads"] == 1
+
+
+def test_pin_hnsw_threads_swallows_all_errors():
+ """Retrofit never raises even when collection.modify explodes."""
+
+ class _ExplodingCollection:
+ def modify(self, *args, **kwargs):
+ raise RuntimeError("boom")
+
+ _pin_hnsw_threads(_ExplodingCollection()) # must not raise
+
+
+def test_get_collection_applies_retrofit_on_existing_palace(tmp_path):
+ """ChromaBackend.get_collection(create=False) applies the retrofit."""
+ palace_path = tmp_path / "palace"
+ palace_path.mkdir()
+
+ # Simulate a legacy palace: create collection without num_threads
+ bootstrap_client = chromadb.PersistentClient(path=str(palace_path))
+ bootstrap_client.create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
+ del bootstrap_client # drop reference so a fresh client reopens cleanly
+
+ wrapper = ChromaBackend().get_collection(
+ str(palace_path),
+ collection_name="mempalace_drawers",
+ create=False,
+ )
+
+ assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 1c4dfbda3..b9427d5bc 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,6 +1,7 @@
"""Tests for mempalace.cli — the main CLI dispatcher."""
import argparse
+import shlex
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
@@ -108,6 +109,7 @@ def test_cmd_init_no_entities(mock_config_cls, tmp_path):
with (
patch("mempalace.entity_detector.scan_for_detection", return_value=[]),
patch("mempalace.room_detector_local.detect_rooms_local") as mock_rooms,
+ patch("mempalace.cli._maybe_run_mine_after_init"),
):
cmd_init(args)
mock_rooms.assert_called_once_with(project_dir=str(tmp_path), yes=True)
@@ -125,7 +127,13 @@ def test_cmd_init_with_entities(mock_config_cls, tmp_path):
patch("mempalace.entity_detector.detect_entities", return_value=detected),
patch("mempalace.entity_detector.confirm_entities", return_value=confirmed),
patch("mempalace.room_detector_local.detect_rooms_local"),
+ # Pass 0 (corpus_origin) needs real file IO; this test mocks
+ # builtins.open globally for the entities.json write, which would
+ # break Pass 0's file-reading path. Patch Pass 0 out — a separate
+ # suite (tests/test_corpus_origin_integration.py) covers it directly.
+ patch("mempalace.cli._run_pass_zero", return_value=None),
patch("builtins.open", MagicMock()),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
):
cmd_init(args)
@@ -140,12 +148,238 @@ def test_cmd_init_with_entities_zero_total(mock_config_cls, tmp_path, capsys):
patch("mempalace.entity_detector.scan_for_detection", return_value=fake_files),
patch("mempalace.entity_detector.detect_entities", return_value=detected),
patch("mempalace.room_detector_local.detect_rooms_local"),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
):
cmd_init(args)
out = capsys.readouterr().out
assert "No entities detected" in out
+# ── _maybe_run_mine_after_init (init → mine prompt, #1181) ─────────────
+
+
+def _init_args(tmp_path, *, yes=False, auto_mine=False):
+ return argparse.Namespace(dir=str(tmp_path), yes=yes, auto_mine=auto_mine)
+
+
+def _fake_cfg(tmp_path):
+ cfg = MagicMock()
+ cfg.palace_path = str(tmp_path / "palace")
+ return cfg
+
+
+def _fake_scanned(tmp_path, n=3):
+ """Build n real Path objects with stat()-able sizes for the scan estimate."""
+ paths = []
+ for i in range(n):
+ p = tmp_path / f"f{i}.txt"
+ p.write_text("x" * 1024) # 1 KB each
+ paths.append(p)
+ return paths
+
+
+def test_maybe_run_mine_prompt_accepted_runs_mine(tmp_path):
+ """Empty / 'y' / 'yes' on the prompt triggers mine() in-process."""
+ from mempalace.cli import _maybe_run_mine_after_init
+
+ args = _init_args(tmp_path, yes=False, auto_mine=False)
+ cfg = _fake_cfg(tmp_path)
+ scanned = _fake_scanned(tmp_path, n=3)
+ with (
+ patch("mempalace.miner.mine") as mock_mine,
+ patch("mempalace.miner.scan_project", return_value=scanned),
+ patch("builtins.input", return_value=""),
+ ):
+ _maybe_run_mine_after_init(args, cfg)
+ mock_mine.assert_called_once_with(
+ project_dir=str(tmp_path),
+ palace_path=cfg.palace_path,
+ files=scanned,
+ )
+
+
+def test_maybe_run_mine_prompt_yes_accepted_runs_mine(tmp_path):
+ """Explicit 'y' answer also runs mine()."""
+ from mempalace.cli import _maybe_run_mine_after_init
+
+ args = _init_args(tmp_path, yes=False, auto_mine=False)
+ cfg = _fake_cfg(tmp_path)
+ with (
+ patch("mempalace.miner.mine") as mock_mine,
+ patch("mempalace.miner.scan_project", return_value=[]),
+ patch("builtins.input", return_value="Y"),
+ ):
+ _maybe_run_mine_after_init(args, cfg)
+ mock_mine.assert_called_once()
+
+
+def test_maybe_run_mine_prompt_declined_prints_hint(tmp_path, capsys):
+ """'n' answer skips mine() and prints the resume hint."""
+ from mempalace.cli import _maybe_run_mine_after_init
+
+ args = _init_args(tmp_path, yes=False, auto_mine=False)
+ cfg = _fake_cfg(tmp_path)
+ with (
+ patch("mempalace.miner.mine") as mock_mine,
+ patch("mempalace.miner.scan_project", return_value=[]),
+ patch("builtins.input", return_value="n"),
+ ):
+ _maybe_run_mine_after_init(args, cfg)
+ mock_mine.assert_not_called()
+ out = capsys.readouterr().out
+ # shlex.quote is a no-op on POSIX-safe paths but wraps Windows paths
+ # (which contain backslashes) in single quotes, so the assertion has
+ # to mirror what the production code actually emits.
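+ # For example, shlex.quote("/tmp/proj") returns "/tmp/proj" unchanged,
+ # while shlex.quote("C:\\tmp\\proj") returns "'C:\\tmp\\proj'" (wrapped
+ # in single quotes, backslashes preserved).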
+ assert f"mempalace mine {shlex.quote(str(tmp_path))}" in out
+ assert "Skipped" in out
+
+
+def test_maybe_run_mine_yes_alone_still_prompts(tmp_path):
+ """`--yes` is scoped to entity auto-accept and MUST still prompt for mine.
+
+ Regression guard for the flag-overload review feedback on #1183: extending
+ `--yes` to also auto-mine would silently change behaviour for scripted
+ callers and turn a fast command into a minutes-long ChromaDB write.
+ """
+ from mempalace.cli import _maybe_run_mine_after_init
+
+ args = _init_args(tmp_path, yes=True, auto_mine=False)
+ cfg = _fake_cfg(tmp_path)
+ with (
+ patch("mempalace.miner.mine") as mock_mine,
+ patch("mempalace.miner.scan_project", return_value=[]),
+ patch("builtins.input", return_value="n") as mock_input,
+ ):
+ _maybe_run_mine_after_init(args, cfg)
+ mock_input.assert_called_once() # the prompt MUST fire
+ mock_mine.assert_not_called()
+
+
+def test_maybe_run_mine_auto_mine_skips_prompt(tmp_path):
+ """`--auto-mine` runs mine() automatically without calling input()."""
+ from mempalace.cli import _maybe_run_mine_after_init
+
+ args = _init_args(tmp_path, yes=False, auto_mine=True)
+ cfg = _fake_cfg(tmp_path)
+ scanned = _fake_scanned(tmp_path, n=2)
+ with (
+ patch("mempalace.miner.mine") as mock_mine,
+ patch("mempalace.miner.scan_project", return_value=scanned),
+ patch("builtins.input", side_effect=AssertionError("input() must not be called")),
+ ):
+ _maybe_run_mine_after_init(args, cfg)
+ mock_mine.assert_called_once_with(
+ project_dir=str(tmp_path),
+ palace_path=cfg.palace_path,
+ files=scanned,
+ )
+
+
+def test_maybe_run_mine_yes_and_auto_mine_fully_noninteractive(tmp_path):
+ """`--yes --auto-mine` together: never call input(), always mine."""
+ from mempalace.cli import _maybe_run_mine_after_init
+
+ args = _init_args(tmp_path, yes=True, auto_mine=True)
+ cfg = _fake_cfg(tmp_path)
+ with (
+ patch("mempalace.miner.mine") as mock_mine,
+ patch("mempalace.miner.scan_project", return_value=[]),
+ patch("builtins.input", side_effect=AssertionError("input() must not be called")),
+ ):
+ _maybe_run_mine_after_init(args, cfg)
+ mock_mine.assert_called_once()
+
+
+def test_maybe_run_mine_decline_quotes_path_with_spaces(tmp_path, capsys):
+ """The resume hint must shell-quote the project dir so paths with
+ spaces / metacharacters produce a copy-paste-safe command."""
+ from mempalace.cli import _maybe_run_mine_after_init
+
+ spaced_dir = tmp_path / "my project dir"
+ spaced_dir.mkdir()
+ args = argparse.Namespace(dir=str(spaced_dir), yes=False, auto_mine=False)
+ cfg = _fake_cfg(tmp_path)
+ with (
+ patch("mempalace.miner.mine"),
+ patch("mempalace.miner.scan_project", return_value=[]),
+ patch("builtins.input", return_value="n"),
+ ):
+ _maybe_run_mine_after_init(args, cfg)
+ out = capsys.readouterr().out
+ # shlex.quote wraps paths with spaces (and Windows backslashes) in
+ # single quotes — the assertion must use the same shlex form so the
+ # test passes on every platform's tmp_path layout.
+ assert f"mempalace mine {shlex.quote(str(spaced_dir))}" in out
+ # Bare unquoted form must NOT appear — that's the bug we're guarding.
+ assert f"mempalace mine {spaced_dir} " not in out
+ assert f"mempalace mine {spaced_dir}`" not in out
+
+
+def test_maybe_run_mine_eof_on_stdin_treated_as_decline(tmp_path, capsys):
+ """Piped / non-interactive stdin (EOFError) declines without crashing."""
+ from mempalace.cli import _maybe_run_mine_after_init
+
+ args = _init_args(tmp_path, yes=False, auto_mine=False)
+ cfg = _fake_cfg(tmp_path)
+ with (
+ patch("mempalace.miner.mine") as mock_mine,
+ patch("mempalace.miner.scan_project", return_value=[]),
+ patch("builtins.input", side_effect=EOFError),
+ ):
+ _maybe_run_mine_after_init(args, cfg)
+ mock_mine.assert_not_called()
+ assert "Skipped" in capsys.readouterr().out
+
+
+def test_maybe_run_mine_failure_surfaces_via_exit(tmp_path, capsys):
+ """Mine errors are not swallowed — they exit non-zero with an error line."""
+ from mempalace.cli import _maybe_run_mine_after_init
+
+ args = _init_args(tmp_path, yes=False, auto_mine=True)
+ cfg = _fake_cfg(tmp_path)
+ with (
+ patch("mempalace.miner.mine", side_effect=RuntimeError("boom")),
+ patch("mempalace.miner.scan_project", return_value=[]),
+ ):
+ with pytest.raises(SystemExit) as exc_info:
+ _maybe_run_mine_after_init(args, cfg)
+ assert exc_info.value.code == 1
+ err = capsys.readouterr().err
+ assert "boom" in err
+
+
+def test_maybe_run_mine_estimate_appears_before_prompt(tmp_path, capsys):
+ """The file-count + size estimate line MUST render BEFORE the prompt.
+
+ Required by the spec: hitting Enter on a default-Y prompt with no size
+ info is a footgun on a real corpus where mine takes minutes. The user
+ must see scope before being asked to confirm.
+ """
+ from mempalace.cli import _maybe_run_mine_after_init
+
+ args = _init_args(tmp_path, yes=False, auto_mine=False)
+ cfg = _fake_cfg(tmp_path)
+ scanned = _fake_scanned(tmp_path, n=4) # 4 files * 1 KB each
+ captured_when_prompted = {}
+
+ def fake_input(prompt):
+ # Snapshot what stdout looked like at the moment the prompt fires.
+ captured_when_prompted["stdout"] = capsys.readouterr().out
+ return "n"
+
+ with (
+ patch("mempalace.miner.mine"),
+ patch("mempalace.miner.scan_project", return_value=scanned),
+ patch("builtins.input", side_effect=fake_input),
+ ):
+ _maybe_run_mine_after_init(args, cfg)
+
+ pre_prompt = captured_when_prompted["stdout"]
+ assert "4 files" in pre_prompt, f"file count missing from pre-prompt output: {pre_prompt!r}"
+ assert "MB" in pre_prompt, f"size estimate missing from pre-prompt output: {pre_prompt!r}"
+ assert "would be mined" in pre_prompt
+
+
# ── cmd_mine ───────────────────────────────────────────────────────────
diff --git a/tests/test_collection_metric_invariant.py b/tests/test_collection_metric_invariant.py
new file mode 100644
index 000000000..e56dca50a
--- /dev/null
+++ b/tests/test_collection_metric_invariant.py
@@ -0,0 +1,87 @@
+"""Invariant tests: every ChromaDB collection-creation path must set
+``hnsw:space=cosine``.
+
+Reason: ChromaDB's default HNSW distance is L2 (Euclidean). Under L2,
+the searcher's ``max(0, 1 - distance)`` similarity formula systematically
+floors to 0 because L2 distances on normalized 384-dim vectors routinely
+exceed 1.0 — users then see flat ``Match: 0.0`` across every result and
+have no signal that their palace is broken.
+
+This test file locks the invariant so a future refactor that drops the
+``metadata={"hnsw:space": "cosine"}`` parameter from any creation path
+gets caught at test time rather than silently degrading search quality.
+"""
+
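+# Worked example of the failure mode (illustrative numbers, unit vectors):
+# at 60 degrees apart, cosine distance = 1 - cos(60°) = 0.5, so the searcher
+# reports max(0, 1 - 0.5) = 0.5; under L2 the distance is
+# sqrt(2 - 2*cos(60°)) = 1.0, so the same formula floors to 0.0, and every
+# pair further apart than 60° floors to 0 as well.
+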
+from mempalace.backends.chroma import ChromaBackend
+from mempalace.palace import get_collection
+
+
+EXPECTED_METRIC = "cosine"
+
+
+def _assert_cosine(col, where: str) -> None:
+ meta = col.metadata if hasattr(col, "metadata") else col._collection.metadata
+ assert isinstance(meta, dict), f"{where}: expected metadata dict, got {meta!r}"
+ assert meta.get("hnsw:space") == EXPECTED_METRIC, (
+ f"{where}: expected hnsw:space={EXPECTED_METRIC!r}, got {meta!r}. "
+ "A collection without cosine metric will silently break the "
+ "similarity formula used by the searcher."
+ )
+
+
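+# Every creation path under test is expected to end in something equivalent
+# to the chromadb call below (a sketch; the exact wrapper lives in
+# ChromaBackend):
+#
+#     client.get_or_create_collection(name, metadata={"hnsw:space": "cosine"})
+
+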
+def test_legacy_get_or_create_collection_sets_cosine(tmp_path):
+ backend = ChromaBackend()
+ col = backend.get_or_create_collection(str(tmp_path), "mempalace_drawers")
+ _assert_cosine(col, "legacy get_or_create_collection")
+
+
+def test_legacy_create_collection_sets_cosine(tmp_path):
+ backend = ChromaBackend()
+ col = backend.create_collection(str(tmp_path), "mempalace_drawers")
+ _assert_cosine(col, "legacy create_collection")
+
+
+def test_new_get_collection_with_create_sets_cosine(tmp_path):
+ """RFC 001 typed surface — ``get_collection(..., create=True)`` is the
+ path the miner + init flow take. Must also set cosine."""
+ backend = ChromaBackend()
+ col = backend.get_collection(str(tmp_path), "mempalace_drawers", create=True)
+ _assert_cosine(col, "get_collection(create=True)")
+
+
+def test_palace_module_get_collection_sets_cosine(tmp_path):
+ """The public ``mempalace.palace.get_collection`` is what most callers
+ use. Must produce cosine palaces."""
+ col = get_collection(str(tmp_path), "mempalace_drawers", create=True)
+ _assert_cosine(col, "palace.get_collection(create=True)")
+
+
+def test_reopening_cosine_palace_preserves_metric(tmp_path):
+ """Opening a previously-created cosine palace (create=False) must
+ still expose the cosine metadata — catches any regression where
+ reopening drops or overwrites metadata."""
+ backend = ChromaBackend()
+ backend.create_collection(str(tmp_path), "mempalace_drawers")
+ # Fresh backend simulates a process restart
+ backend2 = ChromaBackend()
+ col = backend2.get_collection(str(tmp_path), "mempalace_drawers", create=False)
+ _assert_cosine(col, "re-opened palace")
+
+
+def test_fresh_palace_via_full_stack_gets_cosine(tmp_path):
+ """End-to-end: build a palace with the public API the way a new user
+ would, confirm the resulting collection uses cosine distance.
+
+ Uses the ``tmp_path`` fixture rather than ``tempfile.TemporaryDirectory``
+ so ChromaDB's persistent SQLite file handles aren't asked to release
+ during the test body — pytest cleans the path at session end, by which
+ point the process is exiting and Windows' file-lock contention is
+ moot. Matches the cleanup strategy used by the rest of this file and
+ the project's 80% Windows coverage note in CLAUDE.md.
+ """
+ col = get_collection(str(tmp_path), "mempalace_drawers", create=True)
+ _assert_cosine(col, "full-stack new palace")
+
+ # And the closets collection too
+ closets = get_collection(str(tmp_path), "mempalace_closets", create=True)
+ _assert_cosine(closets, "full-stack new closets")
diff --git a/tests/test_config.py b/tests/test_config.py
index 824f6a8c6..8d9753b40 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -20,6 +20,30 @@ def test_config_from_file():
assert cfg.palace_path == "/custom/palace"
+def test_embedding_device_defaults_to_auto(monkeypatch):
+ monkeypatch.delenv("MEMPALACE_EMBEDDING_DEVICE", raising=False)
+ cfg = MempalaceConfig(config_dir=tempfile.mkdtemp())
+ assert cfg.embedding_device == "auto"
+
+
+def test_embedding_device_from_config_is_normalized(tmp_path, monkeypatch):
+ monkeypatch.delenv("MEMPALACE_EMBEDDING_DEVICE", raising=False)
+ with open(tmp_path / "config.json", "w") as f:
+ json.dump({"embedding_device": " CUDA "}, f)
+
+ cfg = MempalaceConfig(config_dir=str(tmp_path))
+ assert cfg.embedding_device == "cuda"
+
+
+def test_embedding_device_env_overrides_config(tmp_path, monkeypatch):
+ with open(tmp_path / "config.json", "w") as f:
+ json.dump({"embedding_device": "cpu"}, f)
+ monkeypatch.setenv("MEMPALACE_EMBEDDING_DEVICE", " CoreML ")
+
+ cfg = MempalaceConfig(config_dir=str(tmp_path))
+ assert cfg.embedding_device == "coreml"
+
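+# Precedence and normalization pinned by the three tests above: env var
+# beats config file beats the "auto" default, and the winning value is
+# normalized roughly as value.strip().lower() (a sketch of the rule, not
+# the config code itself).
+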
+
def test_env_override():
raw = "/env/palace"
os.environ["MEMPALACE_PALACE_PATH"] = raw
diff --git a/tests/test_convo_miner_unit.py b/tests/test_convo_miner_unit.py
index 081115220..97236dffb 100644
--- a/tests/test_convo_miner_unit.py
+++ b/tests/test_convo_miner_unit.py
@@ -1,6 +1,9 @@
"""Unit tests for convo_miner pure functions (no chromadb needed)."""
+import contextlib
+
from mempalace.convo_miner import (
+ _file_chunks_locked,
chunk_exchanges,
detect_convo_room,
scan_convos,
@@ -111,3 +114,36 @@ def test_scan_skips_meta_json(self, tmp_path):
def test_scan_empty_dir(self, tmp_path):
files = scan_convos(str(tmp_path))
assert files == []
+
+
+class TestFileChunksLocked:
+ def test_uses_bounded_upsert_batches(self, monkeypatch):
+ import mempalace.convo_miner as convo_miner
+
+ class FakeCol:
+ def __init__(self):
+ self.batch_sizes = []
+
+ def delete(self, *args, **kwargs):
+ pass
+
+ def upsert(self, documents, ids, metadatas):
+ self.batch_sizes.append(len(documents))
+
+ chunks = [{"content": f"chunk {i} " * 20, "chunk_index": i} for i in range(5)]
+ col = FakeCol()
+ monkeypatch.setattr(convo_miner, "DRAWER_UPSERT_BATCH_SIZE", 2)
+ monkeypatch.setattr(
+ convo_miner, "file_already_mined", lambda collection, source_file: False
+ )
+ monkeypatch.setattr(convo_miner, "mine_lock", lambda source_file: contextlib.nullcontext())
+ monkeypatch.setattr(convo_miner, "_detect_hall_cached", lambda content: "conversations")
+
+ drawers, room_counts, skipped = _file_chunks_locked(
+ col, "chat.txt", chunks, "wing", "general", "agent", "exchange"
+ )
+
+ assert drawers == 5
+ assert dict(room_counts) == {}
+ assert skipped is False
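+ # 5 chunks with DRAWER_UPSERT_BATCH_SIZE patched to 2 → ceil(5/2) = 3
+ # upsert calls of sizes 2, 2, 1.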
+ assert col.batch_sizes == [2, 2, 1]
diff --git a/tests/test_corpus_origin.py b/tests/test_corpus_origin.py
new file mode 100644
index 000000000..6676bbd4b
--- /dev/null
+++ b/tests/test_corpus_origin.py
@@ -0,0 +1,395 @@
+"""Tests for corpus_origin detection.
+
+The corpus-origin detector answers ONE foundational question before any
+downstream Pass 2 classification runs:
+
+ "Is this corpus a record of AI-agent dialogue, and if so, which platform
+ and what persona names has the user assigned to the agent?"
+
+Detection is two-tier:
+ - Tier 1: cheap content-aware heuristic (grep for well-known AI terms
+ and turn markers). No API calls. Always runs.
+ - Tier 2: LLM-assisted confirmation + persona extraction. Takes a small
+ sample of drawer texts and uses Haiku's pre-trained world knowledge
+ about Claude/ChatGPT/Gemini/etc. to confirm platform + identify
+ persona-names the user assigned to the agent.
+
+Default stance: "this IS an AI-dialogue corpus" unless strong evidence
+otherwise. False-negative (missing an AI corpus) is catastrophic for
+downstream classification; false-positive is recoverable via per-drawer
+voice-profile detection in later passes.
+
+TDD: these tests fail until mempalace/corpus_origin.py is implemented."""
+
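+# Result shape these tests assume (a sketch, not the implementation; field
+# names are pinned by TestResultDataclass below, user_name by the Tier 2
+# tests, and a to_dict() method by the serialization test):
+#
+#     @dataclass
+#     class CorpusOriginResult:
+#         likely_ai_dialogue: bool
+#         confidence: float
+#         primary_platform: str | None
+#         agent_persona_names: list[str]
+#         evidence: list[str]
+#         user_name: str | None = None
+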
+from mempalace.corpus_origin import (
+ CorpusOriginResult,
+ detect_origin_heuristic,
+ detect_origin_llm,
+)
+
+
+# ── Tier 1: heuristic (no LLM) ────────────────────────────────────────────
+
+
+class TestHeuristic:
+ def test_claude_heavy_corpus_detected(self):
+ """A corpus with abundant Claude references + turn markers should
+ be confidently detected as AI-dialogue."""
+ samples = [
+ "user: hey Claude, can you help me\nassistant: sure, what do you need\n",
+ "I was talking to Claude Opus about the MCP server setup",
+ "Sonnet 4.5 handled this better than Haiku 4.5 did",
+ "claude mcp add mempalace -- mempalace-mcp",
+ "human: what's up\nassistant: I'm happy to help",
+ ]
+ result = detect_origin_heuristic(samples)
+ assert result.likely_ai_dialogue is True
+ assert result.confidence >= 0.8
+ assert "claude" in " ".join(result.evidence).lower()
+
+ def test_gpt_corpus_detected(self):
+ samples = [
+ "I asked ChatGPT to summarize my paper",
+ "The GPT-4 response was surprisingly good",
+ "user: explain quantum computing\nassistant: quantum computing uses qubits",
+ "OpenAI's model was able to help with the code",
+ ]
+ result = detect_origin_heuristic(samples)
+ assert result.likely_ai_dialogue is True
+ assert any("GPT" in e or "ChatGPT" in e or "OpenAI" in e for e in result.evidence)
+
+ def test_pure_narrative_corpus_detected_as_not_ai(self):
+ """A story/journal corpus with no AI signals should be flagged
+ not-AI (default stance flipped only with evidence)."""
+ samples = [
+ "Today the cat finally ventured into the garden. The dog watched.",
+ "The morning light came through the window as I wrote.",
+ "Chapter 3: The Reckoning. It was a dark and stormy night.",
+ "My father's old journal described the same field in 1972.",
+ ]
+ result = detect_origin_heuristic(samples)
+ assert result.likely_ai_dialogue is False
+ assert result.confidence >= 0.8
+
+ def test_ambiguous_corpus_defaults_to_ai(self):
+ """When evidence is thin or mixed, default to assuming AI-dialogue.
+ False-negative is worse than false-positive."""
+ samples = [
+ "some notes about the meeting today",
+ "Later on I went to the store.",
+ "Short file with little signal.",
+ ]
+ result = detect_origin_heuristic(samples)
+ # Low signal → default stance is ai_dialogue=True with low confidence
+ assert result.likely_ai_dialogue is True
+ assert result.confidence <= 0.6
+ assert "default-stance" in " ".join(result.evidence).lower()
+
+ def test_turn_markers_alone_sufficient(self):
+ """Even without AI brand mentions, strong turn-marker presence
+ indicates dialogue structure consistent with AI corpora."""
+ samples = [
+ "user: hello\nassistant: hi there, how can I help?\nuser: summarize X\nassistant: sure",
+ "human: what's the weather\nai: I don't have real-time data\n",
+ ]
+ result = detect_origin_heuristic(samples)
+ assert result.likely_ai_dialogue is True
+
+ # ── Pattern + context (not capitalization, not English-rule) ──────────
+
+ def test_brand_terms_case_insensitive(self):
+ """Detection cannot rely on the user typing proper-cased brand names.
+ Lowercase 'claude code', 'chatgpt', 'gemini-pro', 'mcp' must trip
+ the same as their proper-cased equivalents. NO turn-marker fallback
+ in this corpus — the brand matches must do the work."""
+ samples = [
+ "i love claude code, it just works for refactoring tasks",
+ "asked chatgpt to write a regex and it nailed it on the first try",
+ "switched to gemini-pro for the long-context summary task last week",
+ "added mempalace as an mcp server in my .claude/ settings file",
+ "anthropic's haiku model is cheap enough to run on every drawer",
+ ]
+ result = detect_origin_heuristic(samples)
+ assert (
+ result.likely_ai_dialogue is True
+ ), f"lowercase brand terms missed; evidence: {result.evidence}"
+ # Evidence must show MULTIPLE distinct case-insensitive brand matches.
+ # 'chatgpt' lowercase only matches under case-insensitive search
+ # (the brand list has 'ChatGPT' proper-cased only).
+ evidence_str = " ".join(result.evidence).lower()
+ matched = sum(t in evidence_str for t in ("chatgpt", "anthropic", "haiku", "gemini-pro"))
+ assert (
+ matched >= 2
+ ), f"case-insensitive brand matches did not fire — only got: {result.evidence}"
+
+ def test_zodiac_corpus_not_flagged_as_ai(self):
+ """An astrology forum post with high 'Gemini' density but ZERO
+ unambiguous AI signals (no MCP/LLM/ChatGPT/turn markers) must NOT
+ be flagged as AI-dialogue. Word-sense disambiguation is required:
+ Gemini-the-zodiac-sign vs Gemini-the-AI-platform."""
+ samples = [
+ "I'm a Gemini sun, Pisces moon, and Leo rising.",
+ "Geminis are dreamers and overthinkers — that's the dual nature.",
+ "Compatibility between Gemini and Sagittarius is famously strong.",
+ "If you're a Gemini, expect Mercury retrograde to hit you hardest.",
+ "My horoscope this week says Gemini energy will dominate Wednesday.",
+ "The Gemini twins in Greek mythology are Castor and Pollux.",
+ ]
+ result = detect_origin_heuristic(samples)
+ assert (
+ result.likely_ai_dialogue is False
+ ), f"zodiac corpus wrongly flagged AI; evidence: {result.evidence}"
+
+ def test_french_novel_with_claude_name_not_flagged(self):
+ """A French novel where 'Claude' is a character name (Claude is a
+ common French masculine name) must NOT trip AI-dialogue detection.
+ Disambiguation is by context, not by the presence of the word."""
+ samples = [
+ "Claude marchait lentement le long de la Seine ce matin-là.",
+ "« Claude, tu rentres dîner? » lui demanda sa mère depuis la cuisine.",
+ "Pour Claude, l'art de vivre passait avant tout par la patience.",
+ "Le vieux Claude se souvenait encore de la guerre, des champs déserts.",
+ "Claude ouvrit la fenêtre. Le matin sentait le pain frais et la pluie.",
+ "Les amis de Claude s'étaient réunis chez lui pour fêter ses soixante ans.",
+ ]
+ result = detect_origin_heuristic(samples)
+ assert (
+ result.likely_ai_dialogue is False
+ ), f"French novel wrongly flagged AI; evidence: {result.evidence}"
+
+ def test_poetry_corpus_with_haiku_sonnet_not_flagged(self):
+ """A poetry corpus with high 'haiku', 'sonnet', 'opus' density
+ (poetic forms / classical music terms) but no AI infrastructure
+ terms must NOT be flagged as AI-dialogue."""
+ samples = [
+ "A haiku is seventeen syllables across three lines: 5-7-5.",
+ "Shakespeare's sonnet 18 remains the most quoted in the English canon.",
+ "Beethoven's opus 27 includes the Moonlight Sonata.",
+ "I wrote three haiku this morning before coffee.",
+ "The sonnet form arrived in England via Wyatt and Surrey.",
+ "Her first opus, published at twenty, was a song cycle for soprano.",
+ ]
+ result = detect_origin_heuristic(samples)
+ assert (
+ result.likely_ai_dialogue is False
+ ), f"poetry corpus wrongly flagged AI; evidence: {result.evidence}"
+
+ def test_word_boundary_brand_matching(self):
+ """Brand-term matching must use word boundaries. Embedded matches
+ inside larger words ('Claudette' → 'Claude', 'opuscule' → 'Opus',
+ 'sonneteer' → 'Sonnet', 'llamas' → 'Llama', 'bardic' → 'Bard')
+ must NOT be counted as brand hits.
+
+ Word boundaries don't change classification on the co-occurrence-
+ suppressed cases, but they clean up the evidence strings — false
+ matches must not appear in the audit trail. They also prevent a
+ single 'Claude Code' mention from double-counting as both a
+ 'Claude Code' hit and a 'Claude' hit."""
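+ # Boundary rule sketch (not necessarily the detector's exact pattern):
+ # re.search(r"\bclaude\b", "Claudette", re.IGNORECASE) is None, while
+ # the same pattern matches in "claude code".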
+ samples = [
+ "My grandmother Claudette baked the most beautiful tarts every Sunday.",
+ "Two llamas were spotted near the trailhead this morning at sunrise.",
+ "Beethoven's opuscule for solo violin remained unpublished for decades.",
+ "She studied to become a sonneteer after reading the full Spenser cycle.",
+ "Bardic traditions in the Hebrides survived well into the eighteenth century.",
+ "The complete opuses of Mozart fill an entire wall of the library.",
+ ]
+ result = detect_origin_heuristic(samples)
+ evidence_str = " ".join(result.evidence).lower()
+
+ # None of the brand terms should show up in evidence — every
+ # would-be match is an embedded false-positive that word
+ # boundaries should suppress.
+ for embedded_term in ("claude", "opus", "sonnet", "llama", "bard"):
+ assert f"'{embedded_term}'" not in evidence_str, (
+ f"word-boundary bug: '{embedded_term}' falsely matched inside "
+ f"a longer word — evidence: {result.evidence}"
+ )
+
+ # And classification should be not-AI (no real AI signals present).
+ assert (
+ result.likely_ai_dialogue is False
+ ), f"corpus has no real AI signals; evidence: {result.evidence}"
+
+ def test_ambiguous_brand_with_unambiguous_signal_flagged(self):
+ """When an ambiguous brand term ('Gemini') co-occurs with an
+ UNAMBIGUOUS AI signal (turn markers, MCP, ChatGPT, Claude Code)
+ in the same corpus, the Gemini hits SHOULD count and the corpus
+ SHOULD be flagged as AI-dialogue."""
+ samples = [
+ "Switched the agent from Gemini to ChatGPT mid-session for cost reasons.",
+ "Gemini handled the long-context task; user: please summarize\nassistant: here is the summary",
+ "user: try Gemini for this\nassistant: running it through gemini-pro now",
+ "MCP server config: Gemini as primary, OpenAI as fallback.",
+ ]
+ result = detect_origin_heuristic(samples)
+ assert (
+ result.likely_ai_dialogue is True
+ ), f"ambiguous+unambiguous co-occurrence missed; evidence: {result.evidence}"
+
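+# Taken together with the zodiac / French-novel / poetry cases above, this
+# pins a two-list term design: unambiguous signals (turn markers, MCP,
+# ChatGPT, Claude Code, OpenAI, ...) always count, while ambiguous terms
+# (Gemini, Claude, Haiku, Opus, Sonnet, Bard, Llama) count only when at
+# least one unambiguous signal co-occurs somewhere in the same corpus.
+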
+
+# ── Tier 2: LLM-assisted (mocked) ─────────────────────────────────────────
+
+
+class _FakeProvider:
+ """Minimal stand-in for mempalace's LLMProvider used for testing."""
+
+ def __init__(self, canned_response):
+ self._response = canned_response
+ self.calls = []
+
+ def classify(self, system, user, json_mode=True):
+ self.calls.append({"system": system, "user": user})
+
+ class R:
+ text = self._response
+
+ return R()
+
+ def check_available(self):
+ return True, "ok"
+
+
+class TestLLMConfirmation:
+ def test_extracts_persona_names_and_platform(self):
+ fake_response = """{
+ "is_ai_dialogue_corpus": true,
+ "confidence": 0.97,
+ "primary_platform": "Claude Code (Anthropic CLI)",
+ "agent_persona_names": ["Echo", "Sparrow", "Cipher", "Orc"],
+ "evidence": [
+ "user addresses agent as 'Echo' on assistant turns",
+ "Claude Code banner text in samples",
+ "references to MCP, CLAUDE.md, hooks"
+ ]
+ }"""
+ provider = _FakeProvider(fake_response)
+ samples = [
+ "user: hey Echo, what's up\nassistant: I'm here, what do you need\n",
+ "Claude Code session banner Sonnet 4.5 Claude Pro",
+ ]
+ result = detect_origin_llm(samples, provider)
+ assert result.likely_ai_dialogue is True
+ assert result.confidence >= 0.9
+ assert "Echo" in result.agent_persona_names
+ assert "Sparrow" in result.agent_persona_names
+ assert "Claude" in result.primary_platform
+
+ def test_narrative_corpus_llm_confirms_no_agent(self):
+ fake_response = """{
+ "is_ai_dialogue_corpus": false,
+ "confidence": 0.95,
+ "primary_platform": null,
+ "agent_persona_names": [],
+ "evidence": ["pure narrative prose, no turn markers, no AI terms"]
+ }"""
+ provider = _FakeProvider(fake_response)
+ samples = ["Once upon a time in a small village", "The old woman smiled"]
+ result = detect_origin_llm(samples, provider)
+ assert result.likely_ai_dialogue is False
+ assert result.agent_persona_names == []
+ assert result.primary_platform is None
+
+ def test_handles_malformed_llm_response(self):
+ """If the LLM returns garbage, fall back gracefully to the
+ conservative default (assume AI-dialogue with low confidence)."""
+ provider = _FakeProvider("not even close to JSON")
+ result = detect_origin_llm(["sample text"], provider)
+ # Fallback: conservative default, low confidence
+ assert result.likely_ai_dialogue is True
+ assert result.confidence <= 0.5
+ assert (
+ "fallback" in " ".join(result.evidence).lower()
+ or "error" in " ".join(result.evidence).lower()
+ )
+
+ def test_filters_user_name_out_of_personas(self):
+ """Regression test: Haiku sometimes leaks the user's own name into
+ agent_persona_names despite the prompt's CRITICAL distinction. The
+ parser must strip the user's name from personas if it appears in
+ both fields (case-insensitive). The user is the human author of
+ the corpus, not an agent persona."""
+ fake_response = """{
+ "is_ai_dialogue_corpus": true,
+ "confidence": 0.97,
+ "primary_platform": "Claude (Anthropic)",
+ "user_name": "Jordan",
+ "agent_persona_names": ["Echo", "Sparrow", "Jordan", "Cipher"],
+ "evidence": ["user Jordan talks to agents Echo/Sparrow/Cipher"]
+ }"""
+ provider = _FakeProvider(fake_response)
+ result = detect_origin_llm(["sample"], provider)
+ # user_name is exposed in its own field
+ assert result.user_name == "Jordan"
+ # "Jordan" is filtered out of agent_persona_names
+ assert "Jordan" not in result.agent_persona_names
+ # Real personas are preserved
+ for persona in ("Echo", "Sparrow", "Cipher"):
+ assert persona in result.agent_persona_names
+
+ def test_filter_is_case_insensitive(self):
+ """The user-name filter works even when the LLM returns a casing
+ mismatch between user_name and the personas list."""
+ fake_response = """{
+ "is_ai_dialogue_corpus": true,
+ "confidence": 0.9,
+ "primary_platform": "Claude",
+ "user_name": "Jordan",
+ "agent_persona_names": ["Echo", "jordan", "JORDAN", "Cipher"],
+ "evidence": []
+ }"""
+ provider = _FakeProvider(fake_response)
+ result = detect_origin_llm(["sample"], provider)
+ # All case-variants of the user's name are filtered
+ assert "jordan" not in [p.lower() for p in result.agent_persona_names]
+ assert result.agent_persona_names == ["Echo", "Cipher"]
+
+ def test_user_name_field_surfaces_author(self):
+ """The user_name field captures the human author of the corpus,
+ separate from agent personas. This gives downstream passes a
+ clear 'who is the user, who is the agent' distinction."""
+ fake_response = """{
+ "is_ai_dialogue_corpus": true,
+ "confidence": 0.95,
+ "primary_platform": "ChatGPT (OpenAI)",
+ "user_name": "Sarah",
+ "agent_persona_names": ["MyAssistant"],
+ "evidence": ["Sarah writes to MyAssistant"]
+ }"""
+ provider = _FakeProvider(fake_response)
+ result = detect_origin_llm(["sample"], provider)
+ assert result.user_name == "Sarah"
+ assert result.agent_persona_names == ["MyAssistant"]
+
+
+# ── CorpusOriginResult dataclass ──────────────────────────────────────────
+
+
+class TestResultDataclass:
+ def test_result_has_all_fields(self):
+ r = CorpusOriginResult(
+ likely_ai_dialogue=True,
+ confidence=0.95,
+ primary_platform="Claude Code",
+ agent_persona_names=["Echo"],
+ evidence=["test"],
+ )
+ assert r.likely_ai_dialogue is True
+ assert r.confidence == 0.95
+ assert r.primary_platform == "Claude Code"
+ assert r.agent_persona_names == ["Echo"]
+ assert r.evidence == ["test"]
+
+ def test_result_serializes_to_dict(self):
+ r = CorpusOriginResult(
+ likely_ai_dialogue=False,
+ confidence=0.9,
+ primary_platform=None,
+ agent_persona_names=[],
+ evidence=[],
+ )
+ d = r.to_dict()
+ assert d["likely_ai_dialogue"] is False
+ assert d["primary_platform"] is None
+ assert d["agent_persona_names"] == []
diff --git a/tests/test_corpus_origin_integration.py b/tests/test_corpus_origin_integration.py
new file mode 100644
index 000000000..8cffd08ce
--- /dev/null
+++ b/tests/test_corpus_origin_integration.py
@@ -0,0 +1,1832 @@
+"""Integration tests proving corpus_origin actually improves classification.
+
+These are the tests that justify the PR. Without them, the PR ships
+infrastructure that nobody can prove improves v3.3.3.
+
+The fixture: a small AI-dialogue corpus with three agent persona names
+(Echo, Sparrow, Cipher) that the user (Jordan) has assigned to their AI
+agents. On plain v3.3.3, entity_detector misclassifies these as PEOPLE.
+With corpus_origin context wired through, they classify as
+AGENT_PERSONA instead.
+
+Two tests sit side by side:
+
+ test_baseline_v333_misclassifies_persona_names_as_people
+ Pins v3.3.3's behavior. If this starts failing, the PR's motivation
+ has shifted and the corpus_origin docs need revisiting.
+
+ test_corpus_origin_reclassifies_personas
+ The fix. Asserts that when corpus_origin context is passed,
+ persona names land in agent_personas instead of people.
+
+Together: documented before/after of v3.3.3 → corpus-origin feature.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+# A synthetic but realistic Claude Code transcript fixture. Three persona
+# names appear repeatedly in dialogue patterns that the v3.3.3
+# entity_detector treats as person-evidence (dialogue markers, action verbs,
+# pronoun proximity). User name "Jordan" also appears in dialogue.
+#
+# The point is: every name here CAN be a real human name. v3.3.3 has no
+# way to know that in this corpus they're agent personas, not people. The
+# corpus-origin result gives it that context.
+AI_DIALOGUE_FIXTURE = """\
+# Session log — 2026-04-20
+
+Jordan: Echo, can you summarize what we worked on yesterday?
+
+Echo (assistant): Yesterday we refactored the embedding pipeline. I noticed
+the chunking strategy was producing overlapping windows, and I suggested
+moving to a sliding window with explicit stride. You agreed and we shipped
+the change.
+
+Jordan: Good. Sparrow, what about the migration script — did you finish?
+
+Sparrow (assistant): Yes, I finished the migration. I tested it locally
+against the staging snapshot and it ran clean. I also added a rollback
+path because you asked me to be cautious about the indexes.
+
+Jordan: Perfect. Cipher, run the verification suite please.
+
+Cipher (assistant): Running now. I'll report back when the full suite
+completes. I expect it to take about four minutes.
+
+Echo: Jordan, while Cipher runs the verification, do you want me to draft
+the changelog entry for today's work?
+
+Jordan: Yes please. Echo, keep it short. Sparrow, please review Echo's
+draft when she sends it.
+
+Sparrow: Will do. I'll look for clarity issues and check the migration
+phrasing matches what we actually shipped.
+
+Cipher: Verification complete. All 1247 tests pass. I'm filing the run log
+to the palace under wing/today.
+
+Jordan: Thanks Cipher. Echo, send the changelog draft.
+
+Echo: Done. Sent to the channel. Sparrow, ready for review when you are.
+
+Sparrow: Reviewed. Two small wording changes — sent back. Otherwise clean.
+
+Jordan: Echo, apply Sparrow's edits and ship it.
+
+Echo: Shipped. Tag pushed.
+"""
+
+
+@pytest.fixture
+def ai_dialogue_corpus(tmp_path: Path) -> Path:
+ """Create a one-file project directory containing the AI-dialogue fixture."""
+ project_dir = tmp_path / "ai_dialogue_project"
+ project_dir.mkdir()
+ (project_dir / "session_log.md").write_text(AI_DIALOGUE_FIXTURE)
+ return project_dir
+
+
+@pytest.fixture
+def corpus_origin_for_fixture() -> dict:
+ """The corpus_origin result a context-aware init would produce for the fixture."""
+ return {
+ "schema_version": 1,
+ "detected_at": "2026-04-26T00:00:00Z",
+ "result": {
+ "likely_ai_dialogue": True,
+ "confidence": 0.95,
+ "primary_platform": "Claude (Anthropic)",
+ "user_name": "Jordan",
+ "agent_persona_names": ["Echo", "Sparrow", "Cipher"],
+ "evidence": ["Synthetic fixture for the integration test"],
+ },
+ }
+
+
+# ── Baseline test: pin v3.3.3 behavior ────────────────────────────────────
+
+
+def test_baseline_v333_misclassifies_persona_names_as_people(ai_dialogue_corpus: Path):
+ """Without corpus_origin context, v3.3.3 entity_detector cannot
+ distinguish agent persona names from real people, and classifies them
+ into the 'people' bucket.
+
+ This test pins that behavior. Its purpose is documentation: the
+ corpus-origin feature's job is to fix this, and the post-fix test below
+ asserts the fix.
+ """
+ from mempalace.entity_detector import detect_entities, scan_for_detection
+
+ files = scan_for_detection(str(ai_dialogue_corpus))
+ detected = detect_entities(files)
+
+ people_names = {e["name"] for e in detected.get("people", [])}
+ uncertain_names = {e["name"] for e in detected.get("uncertain", [])}
+ all_classified = people_names | uncertain_names
+
+ # Persona names appear somewhere in the detection output (people or uncertain).
+ # If none of them surface at all, the fixture is no longer triggering
+ # the misclassification path and the test is no longer meaningful.
+ persona_names = {"Echo", "Sparrow", "Cipher"}
+ persona_hits = persona_names & all_classified
+ assert persona_hits, (
+ "Fixture no longer surfaces persona names as detected entities. "
+ "Update the fixture to keep this test meaningful."
+ )
+
+ # No agent_personas bucket exists on v3.3.3.
+ assert "agent_personas" not in detected, (
+ "v3.3.3 has no concept of agent_personas — if this key exists, "
+ "corpus-origin wiring has already shipped and this baseline test is stale."
+ )
+
+
+# ── corpus-origin test: with corpus_origin, personas reclassify ───────────
+
+
+def test_corpus_origin_reclassifies_personas(
+ ai_dialogue_corpus: Path, corpus_origin_for_fixture: dict
+):
+ """When corpus_origin context is passed to detect_entities, names
+ matching agent_persona_names land in an 'agent_personas' bucket
+ instead of being misclassified as people.
+
+ This is the fix. RED until the consumer wiring lands.
+ """
+ from mempalace.entity_detector import detect_entities, scan_for_detection
+
+ files = scan_for_detection(str(ai_dialogue_corpus))
+ detected = detect_entities(files, corpus_origin=corpus_origin_for_fixture)
+
+ # New bucket exists.
+ assert "agent_personas" in detected, (
+ "The corpus-origin wiring must add an 'agent_personas' bucket to the detect_entities "
+ "return shape when corpus_origin is provided."
+ )
+
+ persona_names_in_bucket = {e["name"] for e in detected["agent_personas"]}
+ persona_names_in_people = {e["name"] for e in detected.get("people", [])}
+
+ # All three personas land in the new bucket.
+ expected_personas = {"Echo", "Sparrow", "Cipher"}
+ assert expected_personas <= persona_names_in_bucket, (
+ f"Expected all three personas in agent_personas, got: " f"{persona_names_in_bucket}"
+ )
+
+ # And NONE of them remain in the people bucket.
+ leaked = expected_personas & persona_names_in_people
+ assert not leaked, (
+ f"Persona names {leaked} leaked into 'people' bucket — the corpus-origin "
+ f"consumer wiring is supposed to filter them out."
+ )
+
+
+# ── discover_entities (project_scanner) threads corpus_origin ─────────────
+
+
+def test_discover_entities_threads_corpus_origin_through(
+ ai_dialogue_corpus: Path, corpus_origin_for_fixture: dict
+):
+ """discover_entities is the higher-level entry point cmd_init uses.
+ It must accept corpus_origin and produce the same persona reclassification
+ that detect_entities does, regardless of whether candidates entered via
+ prose, manifests, or git authors.
+ """
+ from mempalace.project_scanner import discover_entities
+
+ detected = discover_entities(
+ str(ai_dialogue_corpus),
+ corpus_origin=corpus_origin_for_fixture,
+ )
+
+ persona_names_in_bucket = {e["name"] for e in detected.get("agent_personas", [])}
+ persona_names_in_people = {e["name"] for e in detected.get("people", [])}
+ expected_personas = {"Echo", "Sparrow", "Cipher"}
+
+ # All personas surface in the agent_personas bucket via discover_entities too.
+ assert expected_personas <= persona_names_in_bucket, (
+ f"discover_entities did not thread corpus_origin to detect_entities. "
+ f"Expected {expected_personas} in agent_personas, got: "
+ f"{persona_names_in_bucket}"
+ )
+
+ leaked = expected_personas & persona_names_in_people
+ assert not leaked, f"discover_entities leaked persona names into 'people': {leaked}"
+
+
+def test_discover_entities_no_origin_unchanged_shape(ai_dialogue_corpus: Path):
+ """Backwards compatibility: when corpus_origin is omitted, the return
+ shape stays exactly what it was on v3.3.3 (no agent_personas key).
+ Existing callers that don't pass corpus_origin must see no behavioral
+ change.
+ """
+ from mempalace.project_scanner import discover_entities
+
+ detected = discover_entities(str(ai_dialogue_corpus))
+
+ # No new bucket appears unsolicited.
+ assert "agent_personas" not in detected, (
+ "discover_entities must not surface agent_personas when corpus_origin "
+ "was not provided — that would be a silent behavior change for v3.3.3 "
+ "callers who don't know about the corpus-origin feature."
+ )
+
+
+# ── Pass 0 — cmd_init runs corpus_origin and writes origin.json ──────────
+
+
+def _stub_cfg(palace_dir: Path):
+ """Build a MempalaceConfig stub whose palace_path points at tmp space.
+
+ Used by Pass 0 tests so the origin.json write is captured in tmp_path
+ instead of hitting the real ~/.mempalace location.
+ """
+ cfg = MagicMock()
+ cfg.palace_path = str(palace_dir)
+ cfg.entity_languages = ["en"]
+ return cfg
+
+
+def test_init_pass_zero_writes_origin_json_to_palace(ai_dialogue_corpus: Path, tmp_path: Path):
+ """cmd_init must run corpus_origin detection BEFORE entity detection
+ and persist the result to ``<palace>/.mempalace/origin.json`` in the
+ documented schema_version=1 wrapper.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ # no_llm=True isolates the test from any local LLM provider. With Ollama
+ # running locally and a small default model, Tier 2 can return a wrong
+ # classification that overrides the correct heuristic answer (Igor's PR
+ # #1211 review). The test asserts on heuristic behavior, so Tier 2 must
+ # not fire.
+ args = argparse.Namespace(dir=str(ai_dialogue_corpus), yes=True, no_llm=True)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ origin_path = palace / ".mempalace" / "origin.json"
+ assert origin_path.exists(), (
+ f"Pass 0 did not write {origin_path}. cmd_init is supposed to call "
+ f"corpus_origin detection and persist the result before entity detection."
+ )
+
+ data = json.loads(origin_path.read_text())
+ assert data.get("schema_version") == 1, (
+ "origin.json must declare schema_version=1 so future format changes "
+ "are detectable. Got: " + repr(data.get("schema_version"))
+ )
+ assert "detected_at" in data, "origin.json must include a detected_at timestamp"
+ assert "result" in data, "origin.json must wrap the CorpusOriginResult under 'result'"
+ assert isinstance(data["result"].get("likely_ai_dialogue"), bool)
+ # Fixture is heavy AI-dialogue — heuristic should classify as such.
+ assert data["result"]["likely_ai_dialogue"] is True, (
+ "Heuristic should classify the AI-dialogue fixture as AI-dialogue. "
+ f"Got: {data['result']}"
+ )
+
+
+def test_init_pass_zero_passes_corpus_origin_to_discover_entities(
+ ai_dialogue_corpus: Path, tmp_path: Path
+):
+ """The Pass 0 result must reach discover_entities via the corpus_origin
+ kwarg — that's what enables persona reclassification end-to-end.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ # no_llm=True isolates the test from any local LLM provider — see note
+ # on test_init_pass_zero_writes_origin_json_to_palace.
+ args = argparse.Namespace(dir=str(ai_dialogue_corpus), yes=True, no_llm=True)
+
+ captured = {}
+
+ def fake_discover(project_dir, **kwargs):
+ captured["kwargs"] = kwargs
+ return {"people": [], "projects": [], "uncertain": []}
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.project_scanner.discover_entities", side_effect=fake_discover),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ assert "corpus_origin" in captured.get("kwargs", {}), (
+ "cmd_init did not pass corpus_origin to discover_entities. The Pass 0 "
+ "detection result must be threaded into entity detection so persona "
+ "reclassification happens end-to-end."
+ )
+ origin = captured["kwargs"]["corpus_origin"]
+ assert origin is not None, (
+ "corpus_origin kwarg was passed but value was None — Pass 0 should "
+ "supply the actual detection result for AI-dialogue corpora."
+ )
+ assert origin.get("schema_version") == 1
+ assert "result" in origin
+
+
+def test_init_pass_zero_skipped_when_no_readable_files(tmp_path: Path):
+ """Empty project directory → no origin.json written, init still completes
+ without crashing. Aya's earlier finding: don't fail init on missing samples.
+ """
+ from mempalace.cli import cmd_init
+
+ project = tmp_path / "empty"
+ project.mkdir()
+ palace = tmp_path / "palace"
+ # no_llm=True so this test never tries to acquire an LLM provider for
+ # an empty corpus — the heuristic-skip behavior is what's being tested.
+ args = argparse.Namespace(dir=str(project), yes=True, no_llm=True)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args) # must not raise
+
+ origin_path = palace / ".mempalace" / "origin.json"
+ assert not origin_path.exists(), (
+ "Pass 0 must skip (no write) when there are no readable samples — "
+ "writing a 'cannot decide' result to disk would be misleading."
+ )
+
+
+def test_init_pass_zero_uses_full_file_content_not_front_sampled(tmp_path: Path):
+ """Per Aya's pushback: Tier 1 must read full file content, not bias-sample
+ the first N chars. AI signal that lives past the first 2000 chars must
+ still trip detection.
+ """
+ from mempalace.cli import cmd_init
+
+ project = tmp_path / "deep_signal"
+ project.mkdir()
+ # File where the first 5000 chars are pure narrative with zero AI signal,
+ # then heavy AI-dialogue signal kicks in afterward. A first-N-chars sampler
+ # would miss it; a full-content reader will not.
+ front_pad = "The quiet morning settled over the orchard. " * 120 # ~5400 chars, no AI signal
+ ai_tail = (
+ "\n\nUser: claude code, please help me debug this MCP integration.\n"
+ "Assistant: Sure. I'll look at the LLM context window and the "
+ "embedding pipeline. Claude Code can run the analysis now.\n"
+ "User: also check ChatGPT compatibility.\n"
+ "Assistant: GPT-4 should handle that. The MCP protocol abstracts it.\n"
+ ) * 10
+ (project / "log.md").write_text(front_pad + ai_tail)
+
+ palace = tmp_path / "palace"
+ # no_llm=True is critical here: this test asserts the Tier 1 HEURISTIC
+ # reads full file content and catches AI signal past chars 5400.
+ # Without no_llm, a local Ollama with a small default model can return
+ # a wrong classification ("not AI-dialogue") that overrides the correct
+ # heuristic answer. See PR #1211 review by @igorls for the full failure
+ # mode and its fix.
+ args = argparse.Namespace(dir=str(project), yes=True, no_llm=True)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ origin_path = palace / ".mempalace" / "origin.json"
+ assert origin_path.exists()
+ data = json.loads(origin_path.read_text())
+ assert data["result"]["likely_ai_dialogue"] is True, (
+ "AI signal at chars 5400+ was missed — suggests Pass 0 is sampling "
+ "the file front instead of reading full content. Fix Tier 1 to use "
+ "full content per Aya's design pushback."
+ )
+
+
+# ── llm_refine consumer wiring ────────────────────────────────────────────
+
+
+def test_llm_refine_includes_corpus_origin_context_in_prompt(
+ corpus_origin_for_fixture: dict,
+):
+ """When corpus_origin is passed to refine_entities, the LLM call must
+ receive the corpus-origin context (platform, user_name, agent personas)
+ so it can disambiguate ambiguous candidates with knowledge that this
+ is AI-dialogue.
+
+ Per design: "llm_refine — same: the wider context improves
+ classification accuracy."
+ """
+ from types import SimpleNamespace
+
+ from mempalace.llm_refine import refine_entities
+
+ captured: dict = {}
+
+ class FakeProvider:
+ def classify(self, system, user, json_mode=False):
+ captured.setdefault("calls", []).append({"system": system, "user": user})
+ return SimpleNamespace(text='{"classifications": []}')
+
+ # A regex-derived candidate (no manifest/git signals) so it isn't
+ # skipped by _is_authoritative_*.
+ detected = {
+ "people": [],
+ "projects": [],
+ "uncertain": [
+ {"name": "Acme", "frequency": 3, "signals": ["appears 3x"], "type": "uncertain"}
+ ],
+ }
+
+ refine_entities(
+ detected,
+ corpus_text="Acme appears in some prose context here.",
+ provider=FakeProvider(),
+ show_progress=False,
+ corpus_origin=corpus_origin_for_fixture,
+ )
+
+ assert captured.get("calls"), "refine_entities did not call the provider"
+ full_prompt = captured["calls"][0]["system"] + "\n" + captured["calls"][0]["user"]
+
+ # The corpus-origin preamble must surface the user, agent personas,
+ # and platform so the LLM has corpus-level context.
+ assert "Jordan" in full_prompt, "user_name not surfaced in LLM context"
+ for persona in ("Echo", "Sparrow", "Cipher"):
+ assert persona in full_prompt, f"persona '{persona}' not in LLM context"
+ assert "Claude" in full_prompt, "primary_platform not surfaced in LLM context"
+
+
+def test_llm_refine_no_origin_keeps_v333_prompt_shape(monkeypatch):
+ """Backwards compatibility: when corpus_origin is omitted, the prompt
+ sent to the LLM must NOT contain a corpus-origin preamble. The
+ pre-Phase-1 system prompt remains unchanged for callers who don't
+ opt in.
+ """
+ from types import SimpleNamespace
+
+ from mempalace.llm_refine import SYSTEM_PROMPT, refine_entities
+
+ captured: dict = {}
+
+ class FakeProvider:
+ def classify(self, system, user, json_mode=False):
+ captured["system"] = system
+ return SimpleNamespace(text='{"classifications": []}')
+
+ detected = {
+ "people": [],
+ "projects": [],
+ "uncertain": [
+ {"name": "Acme", "frequency": 3, "signals": ["appears 3x"], "type": "uncertain"}
+ ],
+ }
+
+ refine_entities(
+ detected,
+ corpus_text="Acme appears in some prose.",
+ provider=FakeProvider(),
+ show_progress=False,
+ )
+
+ assert captured["system"] == SYSTEM_PROMPT, (
+ "Without corpus_origin, refine_entities must use the unmodified "
+ "SYSTEM_PROMPT — no silent prompt drift for v3.3.3 callers."
+ )
+
+
+# ── mempalace mine --redetect-origin flag ───────────────────────────────
+
+
+def _mine_args(project_dir: Path, *, redetect: bool):
+ """Build a Namespace with all fields cmd_mine reads, scoped to the
+ minimal set our tests exercise. Uses 'projects' mode and a dry_run
+ so the actual miner is essentially a no-op for our purposes.
+ """
+ return argparse.Namespace(
+ dir=str(project_dir),
+ palace=None,
+ mode="projects",
+ wing=None,
+ no_gitignore=False,
+ include_ignored=[],
+ agent="mempalace",
+ limit=0,
+ dry_run=True,
+ extract="auto",
+ redetect_origin=redetect,
+ )
+
+
+def test_mine_default_does_not_redetect_origin(ai_dialogue_corpus: Path, tmp_path: Path):
+ """Default `mempalace mine` (no --redetect-origin flag) must NOT run
+ corpus_origin detection — the flag is opt-in.
+ """
+ from mempalace.cli import cmd_mine
+
+ palace = tmp_path / "palace"
+ args = _mine_args(ai_dialogue_corpus, redetect=False)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli._run_pass_zero") as mock_pass_zero,
+ patch("mempalace.miner.mine"),
+ ):
+ cmd_mine(args)
+
+ mock_pass_zero.assert_not_called()
+ assert not (palace / ".mempalace" / "origin.json").exists()
+
+
+def test_mine_with_redetect_origin_flag_writes_origin_json(
+ ai_dialogue_corpus: Path, tmp_path: Path
+):
+ """`mempalace mine --redetect-origin` re-runs corpus_origin detection
+ on the project and persists the result to /.mempalace/origin.json.
+ """
+ from mempalace.cli import cmd_mine
+
+ palace = tmp_path / "palace"
+ args = _mine_args(ai_dialogue_corpus, redetect=True)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.miner.mine"),
+ ):
+ cmd_mine(args)
+
+ origin_path = palace / ".mempalace" / "origin.json"
+ assert origin_path.exists(), "--redetect-origin must write /.mempalace/origin.json"
+ data = json.loads(origin_path.read_text())
+ assert data["schema_version"] == 1
+ assert data["result"]["likely_ai_dialogue"] is True
+
+
+def test_mine_redetect_overwrites_existing_origin_json(ai_dialogue_corpus: Path, tmp_path: Path):
+ """When origin.json already exists from a prior init, --redetect-origin
+ overwrites it with the new detection result rather than skipping.
+ Resolved as option (c): explicit user re-runs via flag.
+ """
+ from mempalace.cli import cmd_mine
+
+ palace = tmp_path / "palace"
+ origin_dir = palace / ".mempalace"
+ origin_dir.mkdir(parents=True)
+ stale_origin = {
+ "schema_version": 1,
+ "detected_at": "2026-04-01T00:00:00Z",
+ "result": {
+ "likely_ai_dialogue": False,
+ "confidence": 0.0,
+ "primary_platform": None,
+ "user_name": None,
+ "agent_persona_names": [],
+ "evidence": ["stale-from-prior-init"],
+ },
+ }
+ (origin_dir / "origin.json").write_text(json.dumps(stale_origin))
+
+ args = _mine_args(ai_dialogue_corpus, redetect=True)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.miner.mine"),
+ ):
+ cmd_mine(args)
+
+ fresh = json.loads((origin_dir / "origin.json").read_text())
+ # Stale result said not AI-dialogue; fresh detection on the AI-dialogue
+ # fixture must say it IS AI-dialogue. Confirms overwrite, not append/skip.
+ assert fresh["result"]["likely_ai_dialogue"] is True
+ assert fresh["detected_at"] != "2026-04-01T00:00:00Z"
+
+
+def test_mine_redetect_uses_full_content_not_sampled(tmp_path: Path):
+ """Regression for Aya's pushback: --redetect-origin must use the same
+ full-content reader as Pass 0 (not first-N-chars sampling).
+ """
+ from mempalace.cli import cmd_mine
+
+ project = tmp_path / "deep_signal"
+ project.mkdir()
+ front_pad = "The quiet morning settled over the orchard. " * 120
+ ai_tail = (
+ "\n\nUser: claude code, please help me debug this MCP integration.\n"
+ "Assistant: ChatGPT compatibility too. Claude Code can run analysis.\n"
+ ) * 10
+ (project / "log.md").write_text(front_pad + ai_tail)
+
+ palace = tmp_path / "palace"
+ args = _mine_args(project, redetect=True)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.miner.mine"),
+ ):
+ cmd_mine(args)
+
+ data = json.loads((palace / ".mempalace" / "origin.json").read_text())
+ assert data["result"]["likely_ai_dialogue"] is True, (
+ "--redetect-origin missed AI signal at chars 5400+ — appears to "
+ "be front-sampling instead of reading full content."
+ )
+
+
+# ── --llm default flip + graceful fallback ───────────────────────────────
+
+
+def _init_args(project_dir: Path, *, no_llm: bool = False, **overrides):
+ """Build an init Namespace with all fields the parser supplies."""
+ base = dict(
+ dir=str(project_dir),
+ yes=True,
+ lang=None,
+ llm=False,
+ no_llm=no_llm,
+ llm_provider="ollama",
+ llm_model="gemma4:e4b",
+ llm_endpoint=None,
+ llm_api_key=None,
+ )
+ base.update(overrides)
+ return argparse.Namespace(**base)
+
+
+def test_init_default_attempts_llm_provider(ai_dialogue_corpus: Path, tmp_path: Path):
+ """``mempalace init`` (no flags) MUST try to acquire an LLM
+ provider. This is the default-flip — opt-in becomes opt-out.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus)
+
+ fake_provider = MagicMock()
+ fake_provider.check_available.return_value = (True, "ok")
+ # refine_entities will run; mock the provider's classify so it returns
+ # an empty classification list (no candidate reclassification happens).
+ fake_provider.classify.return_value = MagicMock(text='{"classifications": []}')
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider", return_value=fake_provider) as mock_get,
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ assert mock_get.call_count == 1, (
+ "Default `mempalace init` did not attempt LLM provider acquisition. "
+ "--llm is now ON by default."
+ )
+
+
+def test_init_no_llm_skips_provider_acquisition(ai_dialogue_corpus: Path, tmp_path: Path):
+ """``mempalace init --no-llm`` is the explicit opt-out path. No
+ provider acquisition attempt; init runs in heuristics-only mode.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus, no_llm=True)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider") as mock_get,
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ assert mock_get.call_count == 0, (
+ "--no-llm must NOT call get_provider — it's the heuristics-only opt-out."
+ )
+
+
+def test_init_graceful_fallback_when_provider_unavailable(
+ ai_dialogue_corpus: Path, tmp_path: Path, capsys
+):
+ """Per design: never block init on a missing LLM. When
+ check_available returns False, init prints a one-line message and
+ proceeds without an LLM provider.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus)
+
+ fake_provider = MagicMock()
+ fake_provider.check_available.return_value = (False, "Ollama not reachable at localhost:11434")
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider", return_value=fake_provider),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args) # MUST NOT raise SystemExit
+
+ out = capsys.readouterr().out
+ # The fallback message should mention how to silence (--no-llm) so the
+ # user knows what flipped.
+ assert (
+ "no-llm" in out.lower() or "--no-llm" in out
+ ), f"Graceful fallback message must point at --no-llm. Got: {out!r}"
+
+
+def test_init_graceful_fallback_on_provider_construction_error(
+ ai_dialogue_corpus: Path, tmp_path: Path, capsys
+):
+ """When get_provider raises (e.g. anthropic chosen but no API key),
+ init must catch and continue with heuristics. Not crash.
+ """
+ from mempalace.cli import cmd_init
+ from mempalace.llm_client import LLMError
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider", side_effect=LLMError("no api key")),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args) # MUST NOT raise
+
+ out = capsys.readouterr().out
+ assert "no-llm" in out.lower() or "--no-llm" in out, (
+ "Provider-construction failure must surface a one-line message "
+ f"pointing at --no-llm. Got: {out!r}"
+ )
+
+
+def test_init_legacy_llm_flag_compatible(ai_dialogue_corpus: Path, tmp_path: Path):
+ """Backwards compatibility: `mempalace init --llm` still works as
+ before (LLM enabled). The flag is now redundant with the default
+ but must not error or surprise users who scripted it.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus, llm=True)
+
+ fake_provider = MagicMock()
+ fake_provider.check_available.return_value = (True, "ok")
+ fake_provider.classify.return_value = MagicMock(text='{"classifications": []}')
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider", return_value=fake_provider) as mock_get,
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ mock_get.assert_called_once()
+
+
+# ── End-to-end pipeline + edge cases ──────────────────────────────────────
+
+
+def test_end_to_end_init_with_llm_separates_personas(ai_dialogue_corpus: Path, tmp_path: Path):
+ """End-to-end through `mempalace init` on the DEFAULT path (LLM enabled).
+ Confirms the whole chain works without trusting per-stage mocks:
+
+ cmd_init -> _run_pass_zero -> Tier 1 + Tier 2 -> origin.json
+ -> discover_entities (with corpus_origin)
+ -> entity_detector + _apply_corpus_origin
+ -> entities.json saved
+
+ The misclassification this PR fixes (persona names ending up as people)
+ must NOT appear in the saved entities.json on the default path. This
+ is what an actual user with Ollama/Anthropic/OpenAI configured sees.
+
+ Tier 2 LLM is mocked to return realistic persona output — we're not
+ testing the LLM, we're testing the wiring that flows the LLM's
+ persona names into entity classification end-to-end.
+ """
+ from mempalace.cli import cmd_init
+ from mempalace.corpus_origin import CorpusOriginResult
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus) # default = LLM ON
+
+ fake_provider = MagicMock()
+ fake_provider.check_available.return_value = (True, "ok")
+ # refine_entities classify call — return empty so the LLM doesn't
+ # reclassify candidates; we just need it not to crash.
+ fake_provider.classify.return_value = MagicMock(text='{"classifications": []}')
+
+ # Tier 2 corpus-origin LLM call — return the persona/user info that a
+ # real Haiku call would extract from the AI-dialogue fixture.
+ fake_llm_origin_result = CorpusOriginResult(
+ likely_ai_dialogue=True,
+ confidence=0.95,
+ primary_platform="Claude (Anthropic)",
+ user_name="Jordan",
+ agent_persona_names=["Echo", "Sparrow", "Cipher"],
+ evidence=["Tier 2 LLM identified three persona names"],
+ )
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider", return_value=fake_provider),
+ patch(
+ "mempalace.cli.detect_origin_llm",
+ return_value=fake_llm_origin_result,
+ ),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ # 1. origin.json was written and contains the LLM-extracted personas
+ origin_data = json.loads((palace / ".mempalace" / "origin.json").read_text())
+ assert origin_data["result"]["likely_ai_dialogue"] is True
+ assert origin_data["result"]["agent_persona_names"] == ["Echo", "Sparrow", "Cipher"]
+ assert origin_data["result"]["user_name"] == "Jordan"
+
+ # 2. entities.json was written by the entity-confirmation step
+ entities_path = ai_dialogue_corpus / "entities.json"
+ assert entities_path.exists()
+ entities = json.loads(entities_path.read_text())
+
+ # 3. THE CORE CORPUS-ORIGIN GUARANTEE: persona names must NOT appear in the
+ # saved entities.json people list. This is what downstream tools
+ # (miner, searcher, MCP) will read.
+ saved_people = set(entities.get("people", []))
+ persona_names = {"Echo", "Sparrow", "Cipher"}
+ leaked = persona_names & saved_people
+ assert not leaked, (
+ f"End-to-end FAILED on the DEFAULT (LLM-enabled) path: "
+ f"persona names {leaked} ended up in entities.json's people list. "
+ f"Saved people: {saved_people}"
+ )
+
+
+def test_no_llm_path_matches_v333_classification(ai_dialogue_corpus: Path, tmp_path: Path):
+ """Documents the --no-llm degradation honestly: persona reclassification
+ requires Tier 2 (LLM) to extract persona names. With --no-llm, the
+ Tier 1 heuristic only answers 'is this AI-dialogue?' (yes/no gate).
+ Persona names are NOT extracted and thus NOT reclassified.
+
+ This is BY DESIGN — Tier 2 is where persona extraction lives. The
+ no-LLM path is a graceful degradation, not a corpus-origin promise.
+
+ The test PINS that v3.3.3-equivalent behavior on this path:
+ persona names appear in entities.json's people list, exactly as they
+ would on plain v3.3.3. Users who want persona reclassification must
+ have an LLM provider configured (default behavior).
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus, no_llm=True) # explicit opt-out
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ # origin.json still written — Tier 1 still runs and detects AI-dialogue.
+ origin = json.loads((palace / ".mempalace" / "origin.json").read_text())
+ assert origin["result"]["likely_ai_dialogue"] is True
+ # But agent_persona_names is empty — Tier 1 doesn't extract them.
+ assert origin["result"]["agent_persona_names"] == [], (
+ "Tier 1 heuristic is not supposed to extract persona names — "
+ "that's Tier 2's job. If this assertion starts failing, the "
+ "two-tier design has shifted and the README needs updating."
+ )
+
+ # entities.json shows v3.3.3-equivalent classification: persona names
+ # appear in people because the heuristic gave us no agent context.
+ entities = json.loads((ai_dialogue_corpus / "entities.json").read_text())
+ saved_people = set(entities.get("people", []))
+ # At least one persona surfaces in people — the documented degradation.
+ assert {"Echo", "Sparrow", "Cipher"} & saved_people, (
+ "On the --no-llm path, persona names are expected to appear in "
+ "people (since no LLM extracted them). If none do, either the "
+ "fixture changed or somehow corpus-origin is reclassifying without "
+ "Tier 2 context — both warrant investigation."
+ )
+
+
+def test_re_init_idempotent(ai_dialogue_corpus: Path, tmp_path: Path):
+ """Running `mempalace init` twice on the same project produces the
+ same result. origin.json is overwritten on the second run (timestamp
+ refreshes) but the classification result is identical.
+
+ Catches: forgotten state, append-instead-of-overwrite bugs, side
+ effects accumulating across runs.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus, no_llm=True)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+ first = json.loads((palace / ".mempalace" / "origin.json").read_text())
+ cmd_init(args)
+ second = json.loads((palace / ".mempalace" / "origin.json").read_text())
+
+ # The result payload must be identical between runs (same fixture, same
+ # heuristic, no nondeterminism in Tier 1).
+ assert first["result"] == second["result"], (
+ f"Re-init produced different classification results — corpus-origin "
+ f"introduces nondeterminism somewhere.\nfirst: {first['result']}\n"
+ f"second: {second['result']}"
+ )
+ assert first["schema_version"] == second["schema_version"] == 1
+
+
+def test_persona_user_name_collision_behavior_is_pinned(tmp_path: Path):
+    """Edge case: a user_name that COLLIDES with a persona name string
+    (corpus_origin's own tests cover the collision at detection time).
+
+    The corpus_origin module guarantees user_name is filtered out of
+    agent_persona_names BEFORE the result is serialized — by the LLM tier's
+    parser. So by the time _apply_corpus_origin sees the dict, the persona
+    list is already user-clean.
+
+    This test pins the consumer-side handling of input that violates that
+    guarantee: if a user_name somehow also appears in agent_persona_names
+    (e.g. a future tool writes origin.json by hand with overlap), the
+    people/persona split must stay EXPLICIT. Today the persona list wins
+    and the colliding name moves out of people; the comment above the
+    assertion explains why that pin is deliberate.
+    """
+    from mempalace.entity_detector import detect_entities, scan_for_detection
+
+ project = tmp_path / "collision_corpus"
+ project.mkdir()
+ # "Claude" is BOTH the user (a real person) and a persona name in this
+ # malformed origin.json. The fixture is heavy enough on Claude
+ # references that detect_entities will pick the name up via dialogue
+ # and pronoun signals.
+ text = (
+ "Claude wrote a long entry about her morning. Claude said "
+ "the day was beautiful. She walked to the park. Claude smiled. "
+ "Claude noticed the leaves had changed. She continued home. "
+ "Claude thought about dinner. She prepared a meal. Claude ate slowly."
+ )
+ (project / "diary.md").write_text(text)
+
+ # Malformed origin.json where user_name overlaps with personas.
+ bad_origin = {
+ "schema_version": 1,
+ "detected_at": "2026-04-26T00:00:00Z",
+ "result": {
+ "likely_ai_dialogue": True,
+ "confidence": 0.9,
+ "primary_platform": "Claude (Anthropic)",
+ "user_name": "Claude",
+ "agent_persona_names": ["Claude", "Echo"],
+ "evidence": ["malformed-fixture"],
+ },
+ }
+
+ files = scan_for_detection(str(project))
+ # Apply corpus-origin with the malformed origin.
+ detected = detect_entities(files, corpus_origin=bad_origin)
+
+    # The current implementation moves any name matching a persona into
+    # agent_personas, so with the malformed input above "Claude" WOULD move.
+    # We document that today: keep the split explicit and consistent rather
+    # than silently leaving the name in both buckets. If/when
+    # user-name-precedence logic lands, this test should flip and assert
+    # Claude stays in people. Pinning current behavior keeps future changes
+    # deliberate.
+ persona_names = {e["name"] for e in detected.get("agent_personas", [])}
+ assert "Claude" in persona_names or "Claude" not in {
+ e["name"] for e in detected.get("people", [])
+ }, (
+ "Inconsistent persona/people split on malformed origin.json — "
+ "Claude is neither in personas nor filtered from people. "
+ "Behavior is ambiguous, fix the consumer wiring to be explicit."
+ )
+ """Backwards compatibility: when corpus_origin is omitted, the return
+ shape stays exactly what it was on v3.3.3 (no agent_personas key).
+ Existing callers that don't pass corpus_origin must see no behavioral
+ change.
+ """
+ from mempalace.project_scanner import discover_entities
+
+ detected = discover_entities(str(ai_dialogue_corpus))
+
+ # No new bucket appears unsolicited.
+ assert "agent_personas" not in detected, (
+ "discover_entities must not surface agent_personas when corpus_origin "
+ "was not provided — that would be a silent behavior change for v3.3.3 "
+ "callers who don't know about the corpus-origin feature."
+ )
+
+
+# ─────────────────────────────────────────────────────────────────────────
+# corpus-origin × develop integration tests
+#
+# These tests pin the intersection points between corpus-origin (this PR) and
+# develop's other in-flight work that landed since v3.3.3. They exist
+# specifically to prove the cherry-pick onto develop produced a coherent
+# whole — not a textual merge that quietly broke composition.
+# ─────────────────────────────────────────────────────────────────────────
+
+
+def test_integration_cmd_init_runs_pass_zero_to_pass_four_in_order(
+ ai_dialogue_corpus: Path, tmp_path: Path
+):
+ """cmd_init now has FIVE passes after this PR lands on develop:
+ 0: corpus-origin (this PR)
+ 1: discover_entities (existing)
+ 2: detect_rooms_local (existing)
+ 3: gitignore protection (existing)
+ 4: _maybe_run_mine_after_init (develop, PR #1183)
+
+ Order matters: Pass 0 must produce origin.json BEFORE Pass 1 reads
+ it, and Pass 4 must run AFTER cfg.init() so the user is offered to
+ mine a fully-set-up directory. This test pins the order so any
+ future re-shuffle is caught.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus, no_llm=True)
+ call_log: list = []
+
+    from mempalace.cli import _run_pass_zero as real_run_pass_zero
+
+ def trace_pass_zero(*a, **kw):
+ call_log.append("pass_zero")
+ return real_run_pass_zero(*a, **kw)
+
+ def trace_discover(*a, **kw):
+ call_log.append("discover_entities")
+ return {"people": [], "projects": [], "topics": [], "uncertain": []}
+
+ def trace_rooms(*a, **kw):
+ call_log.append("detect_rooms_local")
+
+ def trace_gitignore(*a, **kw):
+ call_log.append("gitignore")
+ return False
+
+ def trace_mine_prompt(*a, **kw):
+ call_log.append("mine_prompt")
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli._run_pass_zero", side_effect=trace_pass_zero),
+ patch("mempalace.project_scanner.discover_entities", side_effect=trace_discover),
+ patch("mempalace.room_detector_local.detect_rooms_local", side_effect=trace_rooms),
+ patch("mempalace.cli._ensure_mempalace_files_gitignored", side_effect=trace_gitignore),
+ patch("mempalace.cli._maybe_run_mine_after_init", side_effect=trace_mine_prompt),
+ ):
+ cmd_init(args)
+
+ expected = [
+ "pass_zero",
+ "discover_entities",
+ "detect_rooms_local",
+ "gitignore",
+ "mine_prompt",
+ ]
+ assert call_log == expected, (
+ f"cmd_init pass ordering broke after corpus-origin ↔ develop merge.\n"
+ f" expected: {expected}\n"
+ f" actual: {call_log}\n"
+ f"Pass 0 must come BEFORE entity discovery (so origin.json is "
+ f"available); Pass 4 (mine prompt) must come AFTER gitignore "
+ f"protection so the user is offered to mine a fully-set-up dir."
+ )
+
+
+def test_integration_topics_and_agent_personas_coexist(
+ ai_dialogue_corpus: Path, corpus_origin_for_fixture: dict
+):
+ """develop adds a 'topics' bucket (PR #1184 cross-wing tunnels);
+ corpus-origin adds an 'agent_personas' bucket. Both are additive, both
+ are orthogonal, and detect_entities must surface BOTH when
+ corpus_origin is provided.
+
+ Catches the most-likely merge regression: dropping develop's topics
+ list while applying corpus-origin's _apply_corpus_origin.
+ """
+ from mempalace.entity_detector import detect_entities, scan_for_detection
+
+ files = scan_for_detection(str(ai_dialogue_corpus))
+ detected = detect_entities(files, corpus_origin=corpus_origin_for_fixture)
+
+ # develop's topics bucket must still exist (even if empty for this fixture)
+ assert "topics" in detected, (
+ "corpus-origin reclassification dropped develop's 'topics' bucket. "
+ "_apply_corpus_origin must preserve all keys it doesn't own."
+ )
+ # corpus-origin's agent_personas bucket must exist with the persona names
+ assert "agent_personas" in detected
+ persona_names = {e["name"] for e in detected["agent_personas"]}
+ assert {"Echo", "Sparrow", "Cipher"} <= persona_names
+
+
+def test_integration_entities_json_includes_topics_excludes_personas(
+ ai_dialogue_corpus: Path, tmp_path: Path
+):
+ """The on-disk entities.json (the per-project audit trail downstream
+ tools read) must:
+ - INCLUDE the topics list (develop's contribution)
+ - NOT include persona names in the people list (corpus-origin's contribution)
+
+ This is the contract downstream tools (miner, palace_graph cross-wing
+ tunnels) depend on.
+ """
+ from mempalace.cli import cmd_init
+ from mempalace.corpus_origin import CorpusOriginResult
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus)
+
+ fake_provider = MagicMock()
+ fake_provider.check_available.return_value = (True, "ok")
+ # llm_refine returns nothing (no reclassifications) — keeps test deterministic
+ fake_provider.classify.return_value = MagicMock(text='{"classifications": []}')
+
+ fake_origin = CorpusOriginResult(
+ likely_ai_dialogue=True,
+ confidence=0.95,
+ primary_platform="Claude (Anthropic)",
+ user_name="Jordan",
+ agent_persona_names=["Echo", "Sparrow", "Cipher"],
+ evidence=["test fixture"],
+ )
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider", return_value=fake_provider),
+ patch("mempalace.cli.detect_origin_llm", return_value=fake_origin),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ entities_path = ai_dialogue_corpus / "entities.json"
+ assert entities_path.exists()
+ entities = json.loads(entities_path.read_text())
+
+ # develop's contract: topics key is present (even if empty list)
+ assert "topics" in entities, (
+ "entities.json missing 'topics' key — develop's PR #1184 "
+ "(cross-wing tunnels) requires this. The corpus-origin wiring must not "
+ "have stripped it."
+ )
+
+ # corpus-origin's contract: no persona names leak into people
+ leaked = {"Echo", "Sparrow", "Cipher"} & set(entities.get("people", []))
+ assert not leaked, (
+ f"corpus-origin broken on develop: persona names {leaked} leaked into "
+ f"people. The merge dropped agent_persona reclassification."
+ )
+
+
+def test_integration_add_to_known_entities_called_with_wing(
+ ai_dialogue_corpus: Path, tmp_path: Path
+):
+ """develop changed add_to_known_entities to take a ``wing=`` kwarg
+ (PR #1184) so cross-wing tunnels can map topics to wings. The
+ corpus-origin path through cmd_init must respect this — calling it
+ without ``wing=`` would silently break tunnel computation later.
+ """
+ from mempalace.cli import cmd_init
+ from mempalace.corpus_origin import CorpusOriginResult
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus)
+
+ fake_provider = MagicMock()
+ fake_provider.check_available.return_value = (True, "ok")
+ fake_provider.classify.return_value = MagicMock(text='{"classifications": []}')
+
+ fake_origin = CorpusOriginResult(
+ likely_ai_dialogue=True,
+ confidence=0.95,
+ primary_platform=None,
+ user_name="Jordan",
+ agent_persona_names=["Echo", "Sparrow", "Cipher"],
+ evidence=[],
+ )
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider", return_value=fake_provider),
+ patch("mempalace.cli.detect_origin_llm", return_value=fake_origin),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ patch("mempalace.miner.add_to_known_entities") as mock_add,
+ ):
+ cmd_init(args)
+
+ if mock_add.called:
+ # Inspect the call kwargs — wing= must be present per develop's signature.
+ _, kwargs = mock_add.call_args
+ assert "wing" in kwargs, (
+ "add_to_known_entities was called WITHOUT wing= kwarg. "
+ "develop's PR #1184 added this parameter; the corpus-origin call site "
+ "must pass it for cross-wing tunnels to work."
+ )
+ assert kwargs["wing"] == ai_dialogue_corpus.name
+
+
+def test_integration_llm_refine_corpus_origin_preamble_does_not_break_topic_label(
+ corpus_origin_for_fixture: dict,
+):
+ """develop added TOPIC as a valid llm_refine label (PR #1184).
+ corpus-origin prepends a CORPUS CONTEXT preamble to the system prompt.
+ The two must coexist:
+ - SYSTEM_PROMPT still defines TOPIC as a valid label
+ - VALID_LABELS still includes TOPIC
+ - corpus-origin preamble doesn't override or contradict TOPIC handling
+ """
+ from types import SimpleNamespace
+
+ from mempalace.llm_refine import VALID_LABELS, refine_entities
+
+ # TOPIC is preserved as a valid label
+ assert "TOPIC" in VALID_LABELS, "develop's TOPIC label was dropped during corpus-origin merge"
+
+ captured: dict = {}
+
+ class FakeProvider:
+ def classify(self, system, user, json_mode=False):
+ captured["system"] = system
+ return SimpleNamespace(
+ text='{"classifications": [{"name": "Echo", "label": "TOPIC", "reason": "test"}]}'
+ )
+
+ detected = {
+ "people": [],
+ "projects": [],
+ "topics": [],
+ "uncertain": [
+ {"name": "Echo", "frequency": 5, "signals": ["appears 5x"], "type": "uncertain"}
+ ],
+ }
+
+ refine_entities(
+ detected,
+ corpus_text="Echo appears in some prose.",
+ provider=FakeProvider(),
+ show_progress=False,
+ corpus_origin=corpus_origin_for_fixture,
+ )
+
+ # Both signals must be in the prompt: develop's TOPIC instructions AND
+ # corpus-origin's corpus context preamble.
+ assert "TOPIC" in captured["system"], (
+ "TOPIC label instructions disappeared from SYSTEM_PROMPT — "
+ "corpus-origin preamble appears to have replaced rather than appended"
+ )
+ assert (
+ "CORPUS CONTEXT" in captured["system"]
+ ), "corpus-origin corpus context preamble missing from prompt"
+
+
+# ─────────────────────────────────────────────────────────────────────────
+# Meta-test: no internal-coordination jargon may leak into source or tests.
+#
+# Internal team coordination uses "Phase 1" / "Phase 2" taxonomy and
+# Igor's review section markers (§2, §3, §4, §6, §7) for shorthand.
+# Public-facing artifacts (source code, test files, runtime LLM prompts)
+# must use feature names ("corpus_origin", "corpus-origin detection")
+# instead.
+#
+# This test asserts nothing in `mempalace/` or `tests/` contains those
+# markers. If a future commit re-introduces "Phase 1" or "Igor's review §"
+# anywhere, this test goes RED and blocks the merge.
+#
+# Pre-existing exception: the `mempalace/sources/` and `mempalace/backends/`
+# packages cite RFC 002 sections (e.g. "§5.5") as legitimate spec
+# references. Those are allowed.
+# ─────────────────────────────────────────────────────────────────────────
+
+
+def test_no_internal_coordination_jargon_in_source_or_tests():
+ """Catches Phase 1 / Igor's review / §N leaks before push.
+
+ The naming-decision is: features publicly, phases internally. This
+ test enforces that on every CI run.
+ """
+ import re
+ from pathlib import Path
+
+ repo_root = Path(__file__).resolve().parent.parent
+ leak_re = re.compile(r"(Phase ?[12]|Igor's review|Igor's spec)", re.IGNORECASE)
+ section_re = re.compile(r"§ ?[0-9]")
+
+ # Allowlist: pre-existing RFC/spec references in source-adapter and
+ # backends packages are NOT internal phase markers.
+ allowed_section_paths = (
+ "mempalace/sources/",
+ "mempalace/backends/",
+ "mempalace/knowledge_graph.py",
+ "mempalace/i18n/",
+ "tests/test_sources.py",
+ "tests/test_i18n_lang_case.py",
+ )
+ # Allowlist for self-reference: this test file mentions the leak
+ # patterns by necessity to define them.
+ SELF = Path(__file__).resolve()
+
+ leaks: list = []
+ for pattern_dir in ("mempalace", "tests"):
+ for path in (repo_root / pattern_dir).rglob("*.py"):
+ if path.resolve() == SELF:
+ continue
+ try:
+ text = path.read_text(encoding="utf-8")
+ except (OSError, UnicodeDecodeError):
+ continue
+ # Use as_posix() so the allowlist (forward-slash paths) matches
+ # on Windows too — Path.relative_to(...) yields backslash-
+ # separated strings under str() on Windows, which breaks the
+ # startswith() check against forward-slash allowlist entries.
+ rel_posix = path.relative_to(repo_root).as_posix()
+ for line_num, line in enumerate(text.splitlines(), 1):
+ if leak_re.search(line):
+ leaks.append(f"{rel_posix}:{line_num}: {line.strip()}")
+ if section_re.search(line):
+ if not any(rel_posix.startswith(allowed) for allowed in allowed_section_paths):
+ leaks.append(f"{rel_posix}:{line_num}: {line.strip()}")
+
+ assert not leaks, (
+ "Internal-coordination jargon leaked into source or tests:\n"
+ + "\n".join(f" - {leak}" for leak in leaks[:20])
+ + ("\n ..." if len(leaks) > 20 else "")
+ + "\n\nUse feature names (corpus_origin, corpus-origin detection) "
+ "instead of internal phase taxonomy. See "
+ "feedback_apply_naming_decision_actively.md."
+ )
+
+
+# ─────────────────────────────────────────────────────────────────────────
+# Tier 1 / Tier 2 merge-fields (issue 3 follow-up to PR #1211).
+#
+# Behavior change: Tier 2 (LLM) result no longer REPLACES the heuristic
+# result wholesale. Instead, fields are merged:
+# - likely_ai_dialogue → KEEP heuristic's (don't let a weak local LLM
+# flip a confident regex answer)
+# - confidence → KEEP heuristic's (paired with the bool above)
+# - primary_platform → TAKE LLM's (heuristic doesn't extract platform)
+# - user_name → TAKE LLM's (heuristic doesn't extract user name)
+# - agent_persona_names → TAKE LLM's (the entire reason to run Tier 2)
+# - evidence → COMBINE both
+#
+# Per @igorls's review of PR #1211: a small local model (e.g. Ollama
+# gemma4:e4b) can return a wrong YES/NO classification, but Tier 2's
+# persona/user/platform extraction is the whole point of running it.
+# Merging fields preserves persona-extraction value without letting the
+# weak model flip a confident heuristic.
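+#
+# A minimal sketch of that merge rule (illustrative only; the real logic
+# lives inside mempalace.cli's _run_pass_zero, _merge_tier_results is a
+# hypothetical name, and field names follow CorpusOriginResult):
+#
+#     def _merge_tier_results(heuristic, llm):
+#         return CorpusOriginResult(
+#             likely_ai_dialogue=heuristic.likely_ai_dialogue,  # heuristic owns bool
+#             confidence=heuristic.confidence,                  # and its confidence
+#             primary_platform=llm.primary_platform,            # LLM-only field
+#             user_name=llm.user_name,                          # LLM-only field
+#             agent_persona_names=llm.agent_persona_names,      # Tier 2's whole point
+#             evidence=[f"Tier-1 heuristic: {e}" for e in heuristic.evidence]
+#             + [f"Tier-2 LLM: {e}" for e in llm.evidence],     # combined, tier-tagged
+#         )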
+# ─────────────────────────────────────────────────────────────────────────
+
+
+def _ai_dialogue_samples() -> list[str]:
+    """Heavy-AI-dialogue samples that the heuristic will confidently flag."""
+ return [
+ "User: claude code, please help me debug this MCP integration.\n"
+ "Assistant: Sure. I'll look at the LLM context window and the "
+ "embedding pipeline. Claude Code can run the analysis now.\n"
+ "User: also check ChatGPT compatibility.\n"
+ "Assistant: GPT-4 should handle that. The MCP protocol abstracts it.\n"
+ ] * 5
+
+
+def _narrative_samples() -> list[str]:
+    """Pure-narrative samples that the heuristic confidently flags as NOT AI-dialogue."""
+ return [
+ "The plum tree finally bloomed this morning. Mira walked over from "
+ "next door with her coffee and we sat on the porch watching the bees."
+ ] * 5
+
+
+def test_merge_tier_fields_heuristic_yes_llm_no_keeps_heuristic_bool():
+ """When heuristic says AI-dialogue with high confidence and LLM
+ contradicts (says NOT AI-dialogue), the merged result keeps the
+ heuristic's likely_ai_dialogue=True. Igor's PR #1211 review caught
+ this exact failure mode: a local Ollama gemma4:e4b returned a wrong
+ "not AI-dialogue, 0.90" that flipped a correct heuristic answer.
+ """
+    from unittest.mock import MagicMock, patch
+
+    from mempalace.cli import _run_pass_zero
+    from mempalace.corpus_origin import CorpusOriginResult
+
+    # detect_origin_llm is called inside _run_pass_zero with this provider;
+    # patching detect_origin_llm directly is the easiest interception point,
+    # and the patched return value CONTRADICTS the heuristic.
+    fake_provider = MagicMock()
+
+ # LLM falsely claims not AI-dialogue, but DID extract personas (a real
+ # symptom of weak local models — they sometimes contradict themselves).
+ llm_wrong_result = CorpusOriginResult(
+ likely_ai_dialogue=False,
+ confidence=0.90,
+ primary_platform="Claude (Anthropic)",
+ user_name="Jordan",
+ agent_persona_names=["Echo", "Sparrow", "Cipher"],
+ evidence=["LLM thought this was narrative — wrong call"],
+ )
+
+ import tempfile
+
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ project_dir = Path(tmp_dir) / "project"
+ project_dir.mkdir()
+ for i, sample in enumerate(_ai_dialogue_samples()):
+ (project_dir / f"log{i}.md").write_text(sample)
+ palace_dir = Path(tmp_dir) / "palace"
+
+ with patch("mempalace.cli.detect_origin_llm", return_value=llm_wrong_result):
+ wrapped = _run_pass_zero(
+ project_dir=str(project_dir),
+ palace_dir=str(palace_dir),
+ llm_provider=fake_provider,
+ )
+
+ assert wrapped is not None, "Pass 0 should write origin.json with samples present"
+ res = wrapped["result"]
+ assert res["likely_ai_dialogue"] is True, (
+ f"Heuristic confidently classified AI-dialogue; weak LLM contradicted. "
+ f"Merged result must KEEP heuristic's True, not flip to False. "
+ f"Got: {res}"
+ )
+ # The bool and the confidence are paired — both must come from the
+ # heuristic. Compare to detect_origin_heuristic on the same samples
+ # so this stays correct regardless of what the heuristic computes
+ # for these samples (avoids brittleness vs. a hardcoded sentinel).
+ from mempalace.corpus_origin import detect_origin_heuristic
+
+ expected_confidence = detect_origin_heuristic(_ai_dialogue_samples()).confidence
+ assert res["confidence"] == expected_confidence, (
+ f"Merged confidence {res['confidence']} did not match the heuristic's "
+ f"{expected_confidence} for these samples. The mocked LLM returned "
+ f"0.90; if the merge accidentally took the LLM's confidence, the "
+ f"merged value would not equal the heuristic's. Got: {res}"
+ )
+ # Persona/user/platform from LLM should still be merged in.
+ assert res["agent_persona_names"] == [
+ "Echo",
+ "Sparrow",
+ "Cipher",
+ ], f"LLM-extracted personas must be preserved in the merge. Got: {res}"
+ assert res["user_name"] == "Jordan"
+ assert res["primary_platform"] == "Claude (Anthropic)"
+
+
+def test_merge_tier_fields_heuristic_no_no_personas_leak():
+ """When heuristic confidently says NOT AI-dialogue and LLM agrees
+ (also says NOT AI-dialogue, no personas extracted), merged result
+ keeps NOT AI-dialogue and has no personas. Confirms the merge
+ doesn't accidentally introduce personas where none exist.
+ """
+ from unittest.mock import MagicMock, patch
+
+ from mempalace.cli import _run_pass_zero
+ from mempalace.corpus_origin import CorpusOriginResult
+
+ fake_provider = MagicMock()
+
+ llm_agreeing_result = CorpusOriginResult(
+ likely_ai_dialogue=False,
+ confidence=0.95,
+ primary_platform=None,
+ user_name=None,
+ agent_persona_names=[],
+ evidence=["LLM also classified as narrative"],
+ )
+
+ import tempfile
+
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ project_dir = Path(tmp_dir) / "project"
+ project_dir.mkdir()
+ for i, sample in enumerate(_narrative_samples()):
+ (project_dir / f"diary{i}.md").write_text(sample)
+ palace_dir = Path(tmp_dir) / "palace"
+
+ with patch("mempalace.cli.detect_origin_llm", return_value=llm_agreeing_result):
+ wrapped = _run_pass_zero(
+ project_dir=str(project_dir),
+ palace_dir=str(palace_dir),
+ llm_provider=fake_provider,
+ )
+
+ assert wrapped is not None
+ res = wrapped["result"]
+ assert (
+ res["likely_ai_dialogue"] is False
+ ), f"Both tiers said NOT AI-dialogue; merged result must be False. Got: {res}"
+ assert (
+ res["agent_persona_names"] == []
+ ), f"No personas should leak when both tiers report none. Got: {res}"
+ # Heuristic owns confidence. Mocked LLM returned 0.95; heuristic's
+ # narrative-branch confidence is 0.9. Verifying we kept 0.9 catches
+ # any future regression that lets LLM confidence override heuristic.
+ assert res["confidence"] == 0.9, (
+ f"Heuristic confidently classified narrative at 0.9; mocked LLM "
+ f"returned 0.95. Merge must keep heuristic's 0.9. Got: {res}"
+ )
+
+
+def test_merge_tier_fields_heuristic_yes_llm_yes_combines_evidence():
+ """When both tiers agree this is AI-dialogue, the merged result keeps
+ heuristic's bool/confidence and takes LLM's extracted persona/user/
+ platform fields. Evidence from BOTH tiers ends up in the combined
+ list.
+ """
+ from unittest.mock import MagicMock, patch
+
+ from mempalace.cli import _run_pass_zero
+ from mempalace.corpus_origin import CorpusOriginResult
+
+ fake_provider = MagicMock()
+
+ llm_agreeing_result = CorpusOriginResult(
+ likely_ai_dialogue=True,
+ confidence=0.98,
+ primary_platform="Claude (Anthropic)",
+ user_name="Jordan",
+ agent_persona_names=["Echo", "Sparrow", "Cipher"],
+ evidence=["LLM-extracted: Claude transcript with three persona names"],
+ )
+
+ import tempfile
+
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ project_dir = Path(tmp_dir) / "project"
+ project_dir.mkdir()
+ for i, sample in enumerate(_ai_dialogue_samples()):
+ (project_dir / f"log{i}.md").write_text(sample)
+ palace_dir = Path(tmp_dir) / "palace"
+
+ with patch("mempalace.cli.detect_origin_llm", return_value=llm_agreeing_result):
+ wrapped = _run_pass_zero(
+ project_dir=str(project_dir),
+ palace_dir=str(palace_dir),
+ llm_provider=fake_provider,
+ )
+
+ assert wrapped is not None
+ res = wrapped["result"]
+ assert res["likely_ai_dialogue"] is True
+ assert res["agent_persona_names"] == ["Echo", "Sparrow", "Cipher"]
+ assert res["user_name"] == "Jordan"
+ assert res["primary_platform"] == "Claude (Anthropic)"
+ # Combined evidence: heuristic produced its own evidence strings AND
+ # LLM produced its own; the merged result should include both signal
+ # trails for audit purposes.
+ evidence_text = " ".join(res["evidence"])
+ assert (
+ "LLM-extracted" in evidence_text
+ ), f"LLM evidence string missing from merged result. Got: {res['evidence']}"
+ # Heuristic always produces at least one evidence line for AI-dialogue
+ # input (brand-term match), so the combined list has more than just LLM's.
+ assert len(res["evidence"]) >= 2, (
+ f"Combined evidence should include both heuristic + LLM lines. " f"Got: {res['evidence']}"
+ )
+ # Each entry must carry its tier prefix so on-disk origin.json is
+ # auditable — readers can tell which tier produced which signal line.
+ tier1_lines = [e for e in res["evidence"] if e.startswith("Tier-1 heuristic: ")]
+ tier2_lines = [e for e in res["evidence"] if e.startswith("Tier-2 LLM: ")]
+ assert tier1_lines, (
+ f"Expected at least one 'Tier-1 heuristic: ' prefixed evidence line. "
+ f"Got: {res['evidence']}"
+ )
+ assert tier2_lines, (
+ f"Expected at least one 'Tier-2 LLM: ' prefixed evidence line. " f"Got: {res['evidence']}"
+ )
+ # Every entry should be tier-prefixed (no untagged passthrough).
+ untagged = [
+ e
+ for e in res["evidence"]
+ if not (e.startswith("Tier-1 heuristic: ") or e.startswith("Tier-2 LLM: "))
+ ]
+ assert not untagged, f"Untagged evidence entries leaked into merge: {untagged}"
+
+
+def test_merge_tier_fields_confidence_matches_heuristic_call():
+ """Pin the contract: merged confidence equals what `detect_origin_heuristic`
+ returns for the same samples — independent of what the LLM produced.
+
+ Catches a regression class where some future refactor lets Tier 2's
+ confidence creep back into the merged result.
+ """
+ from unittest.mock import MagicMock, patch
+
+ from mempalace.cli import _run_pass_zero
+ from mempalace.corpus_origin import CorpusOriginResult, detect_origin_heuristic
+
+ samples = _ai_dialogue_samples()
+ expected_confidence = detect_origin_heuristic(samples).confidence
+
+ fake_provider = MagicMock()
+ # LLM picks a deliberately distinct confidence so any leak is visible.
+ llm_distinct_result = CorpusOriginResult(
+ likely_ai_dialogue=True,
+ confidence=0.123456,
+ primary_platform="Claude (Anthropic)",
+ user_name=None,
+ agent_persona_names=[],
+ evidence=["LLM said yes with an unusual confidence"],
+ )
+
+ import tempfile
+
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ project_dir = Path(tmp_dir) / "project"
+ project_dir.mkdir()
+ for i, sample in enumerate(samples):
+ (project_dir / f"log{i}.md").write_text(sample)
+ palace_dir = Path(tmp_dir) / "palace"
+
+ with patch("mempalace.cli.detect_origin_llm", return_value=llm_distinct_result):
+ wrapped = _run_pass_zero(
+ project_dir=str(project_dir),
+ palace_dir=str(palace_dir),
+ llm_provider=fake_provider,
+ )
+
+ assert wrapped is not None
+ res = wrapped["result"]
+ assert res["confidence"] == expected_confidence, (
+ f"Merged confidence {res['confidence']} did not match "
+ f"detect_origin_heuristic's {expected_confidence}. Looks like "
+ f"LLM's 0.123456 (or another source) leaked through the merge."
+ )
+
+
+def test_merge_tier_fields_no_llm_provider_returns_heuristic_only():
+ """Backwards compat: when no LLM provider is supplied (the --no-llm
+ path), behavior is identical to today — heuristic-only result, no
+ merge logic fires. This pins the v3.3.4 contract.
+ """
+ from mempalace.cli import _run_pass_zero
+
+ import tempfile
+
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ project_dir = Path(tmp_dir) / "project"
+ project_dir.mkdir()
+ for i, sample in enumerate(_ai_dialogue_samples()):
+ (project_dir / f"log{i}.md").write_text(sample)
+ palace_dir = Path(tmp_dir) / "palace"
+
+ wrapped = _run_pass_zero(
+ project_dir=str(project_dir),
+ palace_dir=str(palace_dir),
+ llm_provider=None,
+ )
+
+ assert wrapped is not None
+ res = wrapped["result"]
+ # Heuristic confidently flags AI-dialogue based on brand-term density.
+ assert res["likely_ai_dialogue"] is True
+ # No LLM ran, so persona/user/platform are heuristic's defaults (None / []).
+ assert res["agent_persona_names"] == []
+ assert res["user_name"] is None
+ assert res["primary_platform"] is None
+
+
+# ─────────────────────────────────────────────────────────────────────────
+# External-API privacy warning (issue #24).
+#
+# When mempalace init resolves an LLM provider whose endpoint will send
+# user content off the local machine/network, init MUST print a clear
+# warning naming the provider, stating that MemPalace doesn't control
+# how the provider logs/retains/uses the data, and pointing at --no-llm.
+# Local providers (Ollama on localhost, LM Studio on LAN, etc.) MUST NOT
+# trigger the warning.
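+#
+# Decision sketch (illustrative only; the exact wording lives in cmd_init):
+#
+#     if provider is not None and getattr(provider, "is_external_service", False):
+#         print(
+#             "WARNING: EXTERNAL API in use. Folder content will be sent to "
+#             "the provider; MemPalace does not control how it logs or "
+#             "retains that data. Pass --no-llm to opt out."
+#         )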
+# ─────────────────────────────────────────────────────────────────────────
+
+
+def test_init_prints_privacy_warning_when_provider_is_external(
+ ai_dialogue_corpus: Path, tmp_path: Path, capsys
+):
+ """When cmd_init successfully acquires a provider whose
+ is_external_service is True, output must contain the privacy
+ warning text including the EXTERNAL marker.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus) # default = LLM ON
+
+ fake_provider = MagicMock()
+ fake_provider.check_available.return_value = (True, "ok")
+ fake_provider.is_external_service = True
+ fake_provider.classify.return_value = MagicMock(text='{"classifications": []}')
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider", return_value=fake_provider),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ out = capsys.readouterr().out
+ assert "EXTERNAL API" in out, (
+ f"Privacy warning must mention 'EXTERNAL API' when provider is external. " f"Got: {out!r}"
+ )
+ assert (
+ "--no-llm" in out
+ ), f"Privacy warning must point users at --no-llm to opt out. Got: {out!r}"
+ # The warning should also tell users MemPalace isn't responsible
+ # for downstream provider behavior.
+ assert (
+ "does not control" in out.lower()
+ or "not responsible" in out.lower()
+ or "logs" in out.lower()
+ or "retains" in out.lower()
+ ), (
+ f"Privacy warning must clarify MemPalace doesn't control how the "
+ f"provider handles the data. Got: {out!r}"
+ )
+
+
+def test_init_no_privacy_warning_when_provider_is_local(
+ ai_dialogue_corpus: Path, tmp_path: Path, capsys
+):
+ """When cmd_init successfully acquires a LOCAL provider (e.g. Ollama
+ on localhost, LM Studio on LAN), the privacy warning MUST NOT fire —
+ nothing is leaving the user's machine/network.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus) # default = LLM ON
+
+ fake_provider = MagicMock()
+ fake_provider.check_available.return_value = (True, "ok")
+ fake_provider.is_external_service = False # Local provider — no warning
+ fake_provider.classify.return_value = MagicMock(text='{"classifications": []}')
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider", return_value=fake_provider),
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+ out = capsys.readouterr().out
+ assert "EXTERNAL API" not in out, (
+ f"Privacy warning fired for a LOCAL provider — should not have. " f"Got: {out!r}"
+ )
+
+
+def test_init_no_privacy_warning_with_no_llm_flag(ai_dialogue_corpus: Path, tmp_path: Path, capsys):
+ """With --no-llm, no provider is acquired at all, so the privacy
+ warning has nothing to fire on. Output must not contain it.
+ """
+ from mempalace.cli import cmd_init
+
+ palace = tmp_path / "palace"
+ args = _init_args(ai_dialogue_corpus, no_llm=True)
+
+ with (
+ patch("mempalace.cli.MempalaceConfig", return_value=_stub_cfg(palace)),
+ patch("mempalace.cli.get_provider") as mock_get,
+ patch("mempalace.cli._maybe_run_mine_after_init"),
+ patch("mempalace.room_detector_local.detect_rooms_local"),
+ ):
+ cmd_init(args)
+
+    assert not mock_get.called, "--no-llm must short-circuit before provider acquisition"
+ out = capsys.readouterr().out
+ assert (
+ "EXTERNAL API" not in out
+ ), f"Privacy warning fired on --no-llm path — should not have. Got: {out!r}"
diff --git a/tests/test_embedding.py b/tests/test_embedding.py
new file mode 100644
index 000000000..d05075d69
--- /dev/null
+++ b/tests/test_embedding.py
@@ -0,0 +1,98 @@
+import pytest
+
+import mempalace.embedding as embedding
+
+
+@pytest.fixture(autouse=True)
+def isolate_embedding_state(monkeypatch):
+ monkeypatch.setattr(embedding, "_EF_CACHE", {})
+ monkeypatch.setattr(embedding, "_WARNED", set())
+
+
+def test_auto_picks_cuda(monkeypatch):
+ monkeypatch.setattr(
+ "onnxruntime.get_available_providers",
+ lambda: ["CUDAExecutionProvider", "CPUExecutionProvider"],
+ )
+
+ assert embedding._resolve_providers("auto") == (
+ ["CUDAExecutionProvider", "CPUExecutionProvider"],
+ "cuda",
+ )
+
+
+def test_auto_falls_to_cpu(monkeypatch):
+ monkeypatch.setattr("onnxruntime.get_available_providers", lambda: ["CPUExecutionProvider"])
+
+ assert embedding._resolve_providers("auto") == (["CPUExecutionProvider"], "cpu")
+
+
+def test_cuda_missing_warns_with_gpu_extra(monkeypatch, caplog):
+ monkeypatch.setattr("onnxruntime.get_available_providers", lambda: ["CPUExecutionProvider"])
+
+ assert embedding._resolve_providers("cuda") == (["CPUExecutionProvider"], "cpu")
+ assert "mempalace[gpu]" in caplog.text
+
+
+def test_coreml_missing_warns_with_coreml_extra(monkeypatch, caplog):
+ monkeypatch.setattr("onnxruntime.get_available_providers", lambda: ["CPUExecutionProvider"])
+
+ assert embedding._resolve_providers("coreml") == (["CPUExecutionProvider"], "cpu")
+ assert "mempalace[coreml]" in caplog.text
+
+
+def test_dml_missing_warns_with_dml_extra(monkeypatch, caplog):
+ monkeypatch.setattr("onnxruntime.get_available_providers", lambda: ["CPUExecutionProvider"])
+
+ assert embedding._resolve_providers("dml") == (["CPUExecutionProvider"], "cpu")
+ assert "mempalace[dml]" in caplog.text
+
+
+def test_unknown_device_warns_once(monkeypatch, caplog):
+ monkeypatch.setattr("onnxruntime.get_available_providers", lambda: ["CPUExecutionProvider"])
+
+ assert embedding._resolve_providers("bogus") == (["CPUExecutionProvider"], "cpu")
+ assert embedding._resolve_providers("bogus") == (["CPUExecutionProvider"], "cpu")
+ assert caplog.text.count("Unknown embedding_device") == 1
+
+
+def test_onnxruntime_import_error_falls_back_to_cpu(monkeypatch):
+ import builtins
+
+ real_import = builtins.__import__
+
+ def fake_import(name, *args, **kwargs):
+ if name == "onnxruntime":
+ raise ImportError("missing")
+ return real_import(name, *args, **kwargs)
+
+ monkeypatch.setattr(builtins, "__import__", fake_import)
+
+ assert embedding._resolve_providers("cuda") == (["CPUExecutionProvider"], "cpu")
+
+
+def test_get_embedding_function_caches_by_resolved_provider_tuple(monkeypatch):
+ class DummyEF:
+ def __init__(self, preferred_providers):
+ self.preferred_providers = preferred_providers
+
+ monkeypatch.setattr(embedding, "_build_ef_class", lambda: DummyEF)
+ monkeypatch.setattr(
+ embedding, "_resolve_providers", lambda device: (["CPUExecutionProvider"], "cpu")
+ )
+
+ first = embedding.get_embedding_function("cpu")
+ second = embedding.get_embedding_function("auto")
+
+ assert first is second
+ assert first.preferred_providers == ["CPUExecutionProvider"]
+
+
+def test_describe_device_uses_resolved_effective_device(monkeypatch):
+ monkeypatch.setattr(
+ embedding,
+ "_resolve_providers",
+ lambda device: (["CUDAExecutionProvider", "CPUExecutionProvider"], "cuda"),
+ )
+
+ assert embedding.describe_device("auto") == "cuda"
diff --git a/tests/test_entity_detector.py b/tests/test_entity_detector.py
index afad4d74e..304f68b31 100644
--- a/tests/test_entity_detector.py
+++ b/tests/test_entity_detector.py
@@ -235,13 +235,13 @@ def test_detect_entities_empty_files(tmp_path):
f = tmp_path / "empty.txt"
f.write_text("")
result = detect_entities([f])
- assert result == {"people": [], "projects": [], "uncertain": []}
+ assert result == {"people": [], "projects": [], "topics": [], "uncertain": []}
def test_detect_entities_handles_missing_file(tmp_path):
missing = tmp_path / "nonexistent.txt"
result = detect_entities([missing])
- assert result == {"people": [], "projects": [], "uncertain": []}
+ assert result == {"people": [], "projects": [], "topics": [], "uncertain": []}
def test_detect_entities_respects_max_files(tmp_path):
diff --git a/tests/test_known_entities_registry.py b/tests/test_known_entities_registry.py
index 300cfb612..06b81e50f 100644
--- a/tests/test_known_entities_registry.py
+++ b/tests/test_known_entities_registry.py
@@ -206,3 +206,71 @@ def test_populated_registry_improves_miner_recall(temp_registry):
# All four registered entities should land in the metadata string
for expected in ("Julia Grib", "Kevin Heifner", "hyperion-history", "mempalace"):
assert expected in tagged, f"expected '{expected}' in metadata {tagged!r}"
+
+
+# ── topics_by_wing — cross-wing tunnel signal source (issue #1180) ──
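+#
+# Registry shape these tests pin (illustrative, mirroring the asserts below):
+#
+#     {
+#       "people": ["Alice"],
+#       "topics": ["Angular", "OpenAPI"],         # flat aggregate (existing)
+#       "topics_by_wing": {
+#         "wing_alpha": ["Angular", "OpenAPI"]    # per-wing tunnel signal
+#       }
+#     }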
+
+
+def test_topics_persisted_under_topics_by_wing(temp_registry):
+ miner.add_to_known_entities(
+ {"people": ["Alice"], "topics": ["Angular", "OpenAPI"]},
+ wing="wing_alpha",
+ )
+ data = json.loads(temp_registry.read_text())
+ # Topics also stored as a flat list (existing-style aggregate).
+ assert "Angular" in data["topics"]
+ # And recorded by wing for tunnel computation.
+ assert data["topics_by_wing"]["wing_alpha"] == ["Angular", "OpenAPI"]
+
+
+def test_topics_by_wing_replaces_on_reinit(temp_registry):
+ """Re-running init for the same wing should reflect the latest list,
+ not accumulate stale topics indefinitely."""
+ miner.add_to_known_entities({"topics": ["Angular", "OpenAPI"]}, wing="wing_alpha")
+ miner.add_to_known_entities({"topics": ["OpenAPI", "Postgres"]}, wing="wing_alpha")
+ data = json.loads(temp_registry.read_text())
+ assert data["topics_by_wing"]["wing_alpha"] == ["OpenAPI", "Postgres"]
+
+
+def test_topics_by_wing_multiple_wings_coexist(temp_registry):
+ miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_a")
+ miner.add_to_known_entities({"topics": ["foo", "bar"]}, wing="wing_b")
+ data = json.loads(temp_registry.read_text())
+ assert data["topics_by_wing"] == {"wing_a": ["foo"], "wing_b": ["foo", "bar"]}
+
+
+def test_topics_by_wing_skipped_without_wing(temp_registry):
+ miner.add_to_known_entities({"topics": ["foo"]})
+ data = json.loads(temp_registry.read_text())
+ # No wing → no topics_by_wing entry, but topics list still saved.
+ assert "topics_by_wing" not in data
+ assert data["topics"] == ["foo"]
+
+
+def test_topics_by_wing_dedupes_case_insensitive(temp_registry):
+ miner.add_to_known_entities({"topics": ["OpenAPI", "openapi", "OPENAPI"]}, wing="wing_a")
+ data = json.loads(temp_registry.read_text())
+ # Only one entry, casing of the first observed name preserved.
+ assert data["topics_by_wing"]["wing_a"] == ["OpenAPI"]
+
+
+def test_get_topics_by_wing_reads_registry(temp_registry):
+ miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_a")
+ miner.add_to_known_entities({"topics": ["foo", "bar"]}, wing="wing_b")
+ result = miner.get_topics_by_wing()
+ assert result == {"wing_a": ["foo"], "wing_b": ["foo", "bar"]}
+
+
+def test_get_topics_by_wing_empty_when_missing(temp_registry):
+ miner.add_to_known_entities({"people": ["Alice"]})
+ assert miner.get_topics_by_wing() == {}
+
+
+def test_topics_by_wing_does_not_pollute_known_names(temp_registry):
+ """Wing names in topics_by_wing must NOT leak into the flat known-names
+ set used by ``_extract_entities_for_metadata`` — only the topic strings
+ themselves should be recognized."""
+ miner.add_to_known_entities({"topics": ["Angular"]}, wing="wing_super_secret_project")
+ known = miner._load_known_entities()
+ assert "Angular" in known
+ assert "wing_super_secret_project" not in known
diff --git a/tests/test_llm_client.py b/tests/test_llm_client.py
index 184d1000d..f5259941f 100644
--- a/tests/test_llm_client.py
+++ b/tests/test_llm_client.py
@@ -325,3 +325,104 @@ def test_anthropic_no_key_raises_on_classify(monkeypatch):
p = AnthropicProvider(model="claude-haiku")
with pytest.raises(LLMError, match="requires ANTHROPIC_API_KEY"):
p.classify("s", "u")
+
+
+# ── is_external_service property (issue #24 — privacy warning support) ──
+#
+# `is_external_service` is True when this provider's endpoint sends data
+# off the user's machine/network. Used by mempalace init to print a
+# privacy warning before first run when an external API will receive
+# folder content. URL-based heuristic: localhost, 127.x, ::1, .local,
+# RFC1918 (10/8, 192.168/16, 172.16-31/12), and IPv6 ULA (fc/fd::) are
+# all treated as local. Everything else is treated as external.
+
+
+def test_ollama_provider_default_endpoint_is_local():
+ """OllamaProvider's default endpoint is http://localhost:11434, which
+ must be classified as local — no privacy warning fires for the
+ typical user running Ollama on their own machine."""
+ p = OllamaProvider(model="gemma4:e4b")
+ assert p.is_external_service is False, (
+ f"Default OllamaProvider endpoint must be local; got "
+ f"is_external_service={p.is_external_service} for endpoint={p.endpoint}"
+ )
+
+
+def test_openai_compat_provider_localhost_endpoint_is_local():
+ """LM Studio / llama.cpp server / vLLM commonly bind to localhost.
+ Those setups must NOT trigger the external-API warning."""
+ p = OpenAICompatProvider(model="any", endpoint="http://localhost:1234")
+ assert p.is_external_service is False
+ p_127 = OpenAICompatProvider(model="any", endpoint="http://127.0.0.1:8000")
+ assert p_127.is_external_service is False
+ p_lan = OpenAICompatProvider(model="any", endpoint="http://192.168.1.50:1234")
+ assert p_lan.is_external_service is False, "LAN (RFC1918) endpoints must be local"
+
+
+def test_openai_compat_provider_cloud_endpoint_is_external():
+ """A user pointing openai-compat at OpenAI's hosted API or any other
+ non-local endpoint MUST trigger the external warning."""
+ p = OpenAICompatProvider(model="gpt-4o", endpoint="https://api.openai.com")
+ assert p.is_external_service is True, (
+ f"https://api.openai.com must be classified external; got "
+ f"is_external_service={p.is_external_service}"
+ )
+
+
+def test_anthropic_provider_default_endpoint_is_external():
+ """AnthropicProvider's default endpoint is https://api.anthropic.com,
+ which is always external by definition. The privacy warning MUST
+ fire by default for users who pass --llm-provider anthropic."""
+ p = AnthropicProvider(model="claude-haiku-4-5", api_key="sk-test")
+ assert p.is_external_service is True, (
+ f"Default AnthropicProvider endpoint must be external; got "
+ f"is_external_service={p.is_external_service} for endpoint={p.endpoint}"
+ )
+
+
+# ── Tailscale CGNAT range (issue #25 follow-up to #24) ──────────────────
+#
+# Tailscale assigns addresses in 100.64.0.0/10 (CGNAT range): first octet
+# always 100, second octet 64-127 inclusive. Users running LM Studio /
+# Ollama / any local LLM accessible via Tailscale would currently
+# (post-#24, pre-#25) get a wrong privacy warning because the heuristic
+# doesn't recognize CGNAT as private. These tests pin the fix.
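+#
+# The extra membership check the fix needs, sketched with the stdlib
+# (illustrative only; the real check lives in the provider heuristic):
+#
+#     import ipaddress
+#
+#     _CGNAT = ipaddress.ip_network("100.64.0.0/10")
+#
+#     def _is_tailscale_cgnat(host: str) -> bool:
+#         try:
+#             return ipaddress.ip_address(host) in _CGNAT
+#         except ValueError:
+#             return False
+#
+# ipaddress treats 100.64.0.0/10 as neither private nor global (IANA
+# "shared address space"), which is exactly why a bare is_private check
+# misses it.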
+
+
+def test_openai_compat_provider_tailscale_cgnat_endpoint_is_local():
+ """Tailscale CGNAT range (100.64.0.0/10) — IPs where the first octet
+ is 100 AND the second octet is 64-127 inclusive — must be classified
+ as local. Tailscale users running LM Studio on their Tailnet should
+ not trigger the external-API warning.
+ """
+ cases = [
+ ("http://100.64.0.1:1234", "start of CGNAT"),
+ ("http://100.100.50.50:1234", "middle of CGNAT (typical Tailscale assignment)"),
+ ("http://100.127.255.254:1234", "near end of CGNAT"),
+ ]
+ for endpoint, label in cases:
+ p = OpenAICompatProvider(model="any", endpoint=endpoint)
+ assert p.is_external_service is False, (
+ f"Tailscale CGNAT address {endpoint} ({label}) must be classified "
+ f"local; got is_external_service={p.is_external_service}"
+ )
+
+
+def test_openai_compat_provider_outside_tailscale_cgnat_is_external():
+ """Addresses in 100.x.x.x that fall OUTSIDE the CGNAT range
+ (100.64.0.0 - 100.127.255.255) are public IPs in regular allocated
+ space and must remain classified as external. Specifically: anything
+ where the second octet is < 64 or > 127.
+ """
+ cases = [
+ ("http://100.0.0.1:1234", "below CGNAT (public)"),
+ ("http://100.63.255.255:1234", "just below CGNAT (boundary)"),
+ ("http://100.128.0.0:1234", "just above CGNAT (boundary)"),
+ ("http://100.255.255.255:1234", "well above CGNAT"),
+ ]
+ for endpoint, label in cases:
+ p = OpenAICompatProvider(model="any", endpoint=endpoint)
+ assert p.is_external_service is True, (
+ f"Address {endpoint} ({label}) is OUTSIDE Tailscale CGNAT and "
+ f"should remain external; got is_external_service={p.is_external_service}"
+ )
diff --git a/tests/test_llm_refine.py b/tests/test_llm_refine.py
index b3e7d2d2f..823167cdf 100644
--- a/tests/test_llm_refine.py
+++ b/tests/test_llm_refine.py
@@ -272,7 +272,9 @@ def test_apply_classifications_appends_reason_signal():
assert any("spoken of by name" in s for s in new["people"][0]["signals"])
-def test_apply_classifications_topic_goes_to_uncertain():
+def test_apply_classifications_topic_goes_to_topics_bucket():
+ """TOPIC classifications now route to a dedicated ``topics`` bucket so the
+ miner can use them as cross-wing tunnel signal (issue #1180)."""
detected = {
"people": [],
"projects": [
@@ -289,8 +291,32 @@ def test_apply_classifications_topic_goes_to_uncertain():
decisions = {"Paris": ("TOPIC", "city, not a project")}
new, reclass, _ = _apply_classifications(detected, decisions)
assert len(new["projects"]) == 0
+ assert len(new["uncertain"]) == 0
+ assert len(new["topics"]) == 1
+ assert new["topics"][0]["name"] == "Paris"
+ assert new["topics"][0]["type"] == "topic"
+ assert reclass == 1
+
+
+def test_apply_classifications_ambiguous_still_goes_to_uncertain():
+ detected = {
+ "people": [],
+ "projects": [
+ {
+ "name": "Foo",
+ "type": "project",
+ "confidence": 0.7,
+ "frequency": 5,
+ "signals": ["regex"],
+ }
+ ],
+ "uncertain": [],
+ }
+ decisions = {"Foo": ("AMBIGUOUS", "context insufficient")}
+ new, reclass, _ = _apply_classifications(detected, decisions)
+ assert len(new["projects"]) == 0
assert len(new["uncertain"]) == 1
- assert new["uncertain"][0]["name"] == "Paris"
+ assert new["uncertain"][0]["name"] == "Foo"
assert reclass == 1
@@ -469,7 +495,9 @@ def test_refine_entities_refines_high_confidence_regex_projects():
assert provider.call_count == 1
assert result.reclassified == 1
assert result.merged["projects"] == []
- assert result.merged["uncertain"][0]["name"] == "OpenAPI"
+ # TOPIC labels go to the dedicated ``topics`` bucket so the miner can
+ # use them for cross-wing tunnel computation (issue #1180).
+ assert result.merged["topics"][0]["name"] == "OpenAPI"
def test_refine_entities_refines_regex_people_but_skips_git_people():
diff --git a/tests/test_migrate.py b/tests/test_migrate.py
index f7e7d7e34..4701048af 100644
--- a/tests/test_migrate.py
+++ b/tests/test_migrate.py
@@ -1,9 +1,10 @@
"""Tests for destructive-operation safety in mempalace.migrate."""
+import os
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
-from mempalace.migrate import migrate
+from mempalace.migrate import _restore_stale_palace, migrate
def test_migrate_requires_palace_database(tmp_path, capsys):
@@ -46,3 +47,57 @@ def test_migrate_aborts_without_confirmation(tmp_path, capsys):
assert "Aborted." in out
mock_copytree.assert_not_called()
mock_rmtree.assert_not_called()
+
+
+def test_restore_stale_palace_with_clean_destination(tmp_path):
+ """Rollback when no partial copy exists at palace_path."""
+ palace_path = tmp_path / "palace"
+ stale_path = tmp_path / "palace.old"
+ stale_path.mkdir()
+ (stale_path / "chroma.sqlite3").write_bytes(b"original")
+
+ _restore_stale_palace(str(palace_path), str(stale_path))
+
+ assert palace_path.is_dir()
+ assert (palace_path / "chroma.sqlite3").read_bytes() == b"original"
+ assert not stale_path.exists()
+
+
+def test_restore_stale_palace_clears_partial_copy(tmp_path):
+ """Rollback must remove a partially-copied palace_path before restoring.
+
+ Simulates the Qodo-reported hazard: shutil.move() began creating
+ palace_path, then failed. A bare os.replace(stale, palace_path) would
+ trip on the existing destination; _restore_stale_palace must clear it.
+ """
+ palace_path = tmp_path / "palace"
+ stale_path = tmp_path / "palace.old"
+
+ stale_path.mkdir()
+ (stale_path / "chroma.sqlite3").write_bytes(b"original")
+
+ palace_path.mkdir()
+ (palace_path / "half-copied.bin").write_bytes(b"garbage")
+
+ _restore_stale_palace(str(palace_path), str(stale_path))
+
+ assert palace_path.is_dir()
+ assert (palace_path / "chroma.sqlite3").read_bytes() == b"original"
+ assert not (palace_path / "half-copied.bin").exists()
+ assert not stale_path.exists()
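+
+
+# For orientation, the rollback shape these two tests pin down, as a sketch
+# (the real _restore_stale_palace also logs what it restored):
+#
+#     if os.path.exists(palace_path):
+#         shutil.rmtree(palace_path)  # clear the partial copy first
+#     os.replace(stale_path, palace_path)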
+
+
+def test_restore_stale_palace_logs_and_swallows_on_failure(tmp_path, capsys):
+ """If restore itself fails, log both paths — don't raise from rollback."""
+ palace_path = tmp_path / "palace"
+ stale_path = tmp_path / "palace.old"
+ stale_path.mkdir()
+
+ # Force os.replace to fail deterministically.
+ with patch("mempalace.migrate.os.replace", side_effect=OSError("boom")):
+ _restore_stale_palace(str(palace_path), str(stale_path))
+
+ out = capsys.readouterr().out
+ assert "CRITICAL" in out
+ assert os.fspath(palace_path) in out
+ assert os.fspath(stale_path) in out
diff --git a/tests/test_miner.py b/tests/test_miner.py
index add5048d6..0619dbb88 100644
--- a/tests/test_miner.py
+++ b/tests/test_miner.py
@@ -1,4 +1,5 @@
import os
+import shlex
import shutil
import tempfile
from pathlib import Path
@@ -383,6 +384,46 @@ def get(self, *args, **kwargs):
assert "WING: proj" in out
+def test_process_file_uses_bounded_upsert_batches(tmp_path, monkeypatch):
+ from mempalace import miner
+
+ class FakeCol:
+ def __init__(self):
+ self.batch_sizes = []
+
+ def get(self, *args, **kwargs):
+ return {"ids": []}
+
+ def delete(self, *args, **kwargs):
+ pass
+
+ def upsert(self, documents, ids, metadatas):
+ self.batch_sizes.append(len(documents))
+
+ source = tmp_path / "src.py"
+ source.write_text("print('hello')\n" * 20, encoding="utf-8")
+ chunks = [{"content": f"chunk {i} " * 20, "chunk_index": i} for i in range(5)]
+ col = FakeCol()
+ monkeypatch.setattr(miner, "DRAWER_UPSERT_BATCH_SIZE", 2)
+ monkeypatch.setattr(miner, "chunk_text", lambda content, source_file: chunks)
+ monkeypatch.setattr(miner, "detect_hall", lambda content: "code")
+ monkeypatch.setattr(miner, "_extract_entities_for_metadata", lambda content: "")
+
+ drawers, room = miner.process_file(
+ source,
+ tmp_path,
+ col,
+ "wing",
+ [{"name": "general", "description": "General"}],
+ "agent",
+ False,
+ )
+
+ assert drawers == 5
+ assert room == "general"
+ assert col.batch_sizes == [2, 2, 1]
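+
+
+# For orientation, the bounded-batch loop the test above pins down, as a
+# sketch (assumed shape; the real process_file also assembles per-chunk ids
+# and metadatas before each call):
+#
+#     for start in range(0, len(chunks), DRAWER_UPSERT_BATCH_SIZE):
+#         batch = chunks[start : start + DRAWER_UPSERT_BATCH_SIZE]
+#         col.upsert(documents=[...], ids=[...], metadatas=[...])
+#
+# Five chunks with a batch size of 2 therefore upsert as 2, 2, 1.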
+
+
# ── normalize_version schema gate ───────────────────────────────────────
#
# When the normalization pipeline changes shape (e.g., strip_noise lands),
@@ -456,3 +497,249 @@ def test_add_drawer_stamps_normalize_version(tmp_path):
assert meta["normalize_version"] == NORMALIZE_VERSION
finally:
del col, client
+
+
+def test_mine_creates_topic_tunnels_for_shared_topics(tmp_path, monkeypatch):
+ """End-to-end: when two wings have already-confirmed topics that overlap,
+ the miner's mine-time pass drops a cross-wing tunnel between them.
+
+ Issue #1180.
+ """
+ from mempalace import miner, palace_graph
+
+ # Redirect both the registry and tunnel-storage paths into tmp_path
+ # so we never touch the developer's real ~/.mempalace directory.
+ registry = tmp_path / "known_entities.json"
+ monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry))
+ miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}})
+ tunnels_file = tmp_path / "tunnels.json"
+ monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnels_file))
+
+ # Pre-populate the registry as if init had been run for two wings that
+ # share a topic.
+ miner.add_to_known_entities({"topics": ["foo", "bar"]}, wing="wing_one")
+ miner.add_to_known_entities({"topics": ["foo", "baz"]}, wing="wing_two")
+
+ # Mine wing_two — should drop tunnels between wing_two and wing_one
+ # for every shared topic. Just one in this case.
+ project_root = tmp_path / "wing_two_project"
+ project_root.mkdir()
+ write_file(
+ project_root / "notes.md",
+ "Some prose long enough to make a chunk. " * 20,
+ )
+ with open(project_root / "mempalace.yaml", "w") as f:
+ yaml.dump({"wing": "wing_two", "rooms": [{"name": "general"}]}, f)
+
+ palace_path = tmp_path / "palace"
+ mine(str(project_root), str(palace_path))
+
+ listed = palace_graph.list_tunnels()
+ assert len(listed) == 1
+ rooms = {listed[0]["source"]["room"], listed[0]["target"]["room"]}
+ # Topic tunnels use a ``topic:`` synthetic room so they can't
+ # collide with literal folder-derived rooms of the same name.
+ assert rooms == {"topic:foo"}
+ assert listed[0]["kind"] == "topic"
+ wings = {listed[0]["source"]["wing"], listed[0]["target"]["wing"]}
+ assert wings == {"wing_one", "wing_two"}
+
+
+def test_mine_no_tunnel_when_threshold_blocks_overlap(tmp_path, monkeypatch):
+ """Bumping ``MEMPALACE_TOPIC_TUNNEL_MIN_COUNT`` above the actual overlap
+ suppresses tunnel creation."""
+ from mempalace import miner, palace_graph
+
+ registry = tmp_path / "known_entities.json"
+ monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry))
+ miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}})
+ tunnels_file = tmp_path / "tunnels.json"
+ monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnels_file))
+ monkeypatch.setenv("MEMPALACE_TOPIC_TUNNEL_MIN_COUNT", "2")
+
+ miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_one")
+ miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_two")
+
+ project_root = tmp_path / "wing_two_project"
+ project_root.mkdir()
+ write_file(
+ project_root / "notes.md",
+ "Some prose long enough to make a chunk. " * 20,
+ )
+ with open(project_root / "mempalace.yaml", "w") as f:
+ yaml.dump({"wing": "wing_two", "rooms": [{"name": "general"}]}, f)
+
+ palace_path = tmp_path / "palace"
+ mine(str(project_root), str(palace_path))
+
+ # min_count=2 but only 1 shared topic → no tunnel.
+ assert palace_graph.list_tunnels() == []
+
+
+def test_mine_no_tunnel_when_only_one_wing_has_topics(tmp_path, monkeypatch):
+ """A wing in isolation (no other wing has confirmed topics) creates no tunnels."""
+ from mempalace import miner, palace_graph
+
+ registry = tmp_path / "known_entities.json"
+ monkeypatch.setattr(miner, "_ENTITY_REGISTRY_PATH", str(registry))
+ miner._ENTITY_REGISTRY_CACHE.update({"mtime": None, "names": frozenset(), "raw": {}})
+ tunnels_file = tmp_path / "tunnels.json"
+ monkeypatch.setattr(palace_graph, "_TUNNEL_FILE", str(tunnels_file))
+
+ miner.add_to_known_entities({"topics": ["foo"]}, wing="wing_one")
+
+ project_root = tmp_path / "wing_one_project"
+ project_root.mkdir()
+ write_file(
+ project_root / "notes.md",
+ "Some prose long enough to make a chunk. " * 20,
+ )
+ with open(project_root / "mempalace.yaml", "w") as f:
+ yaml.dump({"wing": "wing_one", "rooms": [{"name": "general"}]}, f)
+
+ palace_path = tmp_path / "palace"
+ mine(str(project_root), str(palace_path))
+
+ assert palace_graph.list_tunnels() == []
+
+
+# ── graceful Ctrl-C handling (#1182) ────────────────────────────────────
+
+
+def _make_minable_project(project_root: Path, n_files: int = 3) -> None:
+ """Create a tiny project with N readable files + a config so mine() runs."""
+ for idx in range(n_files):
+ write_file(
+ project_root / f"f{idx}.py",
+ f"def fn_{idx}():\n print('hi {idx}')\n" * 20,
+ )
+ with open(project_root / "mempalace.yaml", "w") as f:
+ yaml.dump(
+ {
+ "wing": "interrupt_test",
+ "rooms": [{"name": "general", "description": "General"}],
+ },
+ f,
+ )
+
+
+def test_mine_keyboard_interrupt_prints_summary_and_exits_130(tmp_path, capsys):
+ """A KeyboardInterrupt mid-loop produces the clean summary + exit 130."""
+ import pytest
+ from unittest.mock import patch
+
+ project_root = tmp_path / "proj"
+ project_root.mkdir()
+ _make_minable_project(project_root, n_files=4)
+ palace_path = project_root / "palace"
+
+ call_count = {"n": 0}
+
+ def fake_process_file(*args, **kwargs):
+ call_count["n"] += 1
+ if call_count["n"] == 2:
+ raise KeyboardInterrupt
+ return (1, "general")
+
+ with patch("mempalace.miner.process_file", side_effect=fake_process_file):
+ with pytest.raises(SystemExit) as exc_info:
+ mine(str(project_root), str(palace_path))
+
+ assert exc_info.value.code == 130
+ out = capsys.readouterr().out
+ assert "Mine interrupted." in out
+ assert "files_processed: 1/" in out
+ assert "drawers_filed:" in out
+ assert "last_file:" in out
+ assert "upserted idempotently" in out
+
+
+def test_mine_keyboard_interrupt_quotes_path_with_spaces_in_resume_hint(tmp_path, capsys):
+ """Resume hint must shell-quote the project dir so a path containing
+ spaces / metacharacters yields a copy-paste-safe `mempalace mine ...`
+ command. Otherwise users on a path like "My Project" hit a broken
+ invocation when they re-run after Ctrl-C."""
+ import pytest
+ from unittest.mock import patch
+
+ project_root = tmp_path / "my project"
+ project_root.mkdir()
+ _make_minable_project(project_root, n_files=2)
+ palace_path = project_root / "palace"
+
+ def fake_process_file(*args, **kwargs):
+ raise KeyboardInterrupt
+
+ with patch("mempalace.miner.process_file", side_effect=fake_process_file):
+ with pytest.raises(SystemExit):
+ mine(str(project_root), str(palace_path))
+
+ out = capsys.readouterr().out
+ # Build the expectation with shlex.quote so the assertion matches
+ # whatever the production code emits: shlex.quote applies POSIX
+ # quoting on every platform, so a path with spaces (or Windows
+ # backslashes) comes back wrapped in single quotes either way.
+ assert f"mempalace mine {shlex.quote(str(project_root))}" in out
+
+
+def test_mine_cleans_up_pid_file_on_interrupt(tmp_path):
+ """Our own PID entry in mine.pid is removed in the finally clause."""
+ import pytest
+ from unittest.mock import patch
+
+ project_root = tmp_path / "proj"
+ project_root.mkdir()
+ _make_minable_project(project_root, n_files=2)
+ palace_path = project_root / "palace"
+
+ pid_file = tmp_path / "mine.pid"
+ pid_file.write_text(str(os.getpid()))
+
+ def fake_process_file(*args, **kwargs):
+ raise KeyboardInterrupt
+
+ with (
+ patch("mempalace.hooks_cli._MINE_PID_FILE", pid_file),
+ patch("mempalace.miner.process_file", side_effect=fake_process_file),
+ ):
+ with pytest.raises(SystemExit):
+ mine(str(project_root), str(palace_path))
+
+ assert not pid_file.exists(), "Our PID entry should be cleaned up on interrupt"
+
+
+def test_mine_cleans_up_pid_file_on_clean_exit(tmp_path):
+ """Successful mine also removes its own PID entry in the finally clause."""
+ from unittest.mock import patch
+
+ project_root = tmp_path / "proj"
+ project_root.mkdir()
+ _make_minable_project(project_root, n_files=1)
+ palace_path = project_root / "palace"
+
+ pid_file = tmp_path / "mine.pid"
+ pid_file.write_text(str(os.getpid()))
+
+ with patch("mempalace.hooks_cli._MINE_PID_FILE", pid_file):
+ mine(str(project_root), str(palace_path))
+
+ assert not pid_file.exists()
+
+
+def test_mine_does_not_remove_other_processes_pid_file(tmp_path):
+ """A PID file pointing at someone else's PID is left untouched."""
+ from unittest.mock import patch
+
+ project_root = tmp_path / "proj"
+ project_root.mkdir()
+ _make_minable_project(project_root, n_files=1)
+ palace_path = project_root / "palace"
+
+ other_pid = os.getpid() + 999_999 # a PID that isn't us
+ pid_file = tmp_path / "mine.pid"
+ pid_file.write_text(str(other_pid))
+
+ with patch("mempalace.hooks_cli._MINE_PID_FILE", pid_file):
+ mine(str(project_root), str(palace_path))
+
+ assert pid_file.exists(), "Foreign PID entries must not be removed"
+ assert pid_file.read_text().strip() == str(other_pid)
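+
+
+# For orientation, the finally-clause rule the three tests above pin down,
+# as a sketch (assumed shape of the real cleanup in the mine path):
+#
+#     try:
+#         if pid_file.read_text().strip() == str(os.getpid()):
+#             pid_file.unlink()
+#     except FileNotFoundError:
+#         pass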
diff --git a/tests/test_palace_graph.py b/tests/test_palace_graph.py
index 7bc45e04b..34375dcf3 100644
--- a/tests/test_palace_graph.py
+++ b/tests/test_palace_graph.py
@@ -54,6 +54,27 @@ def test_falsy_collection(self):
assert nodes == {}
assert edges == []
+ def test_none_metadata_does_not_crash(self):
+ """ChromaDB can return None for drawers without metadata (legacy
+ data, partial writes — upstream #1020 territory). build_graph
+ must skip None entries silently rather than crash the whole
+ graph build with AttributeError. Caught 2026-04-25 by
+ palace-daemon's verify-routes.sh smoke test against the
+ canonical 151K palace; /stats was 500-ing on a single None
+ drawer and taking out every consumer of build_graph for the
+ whole call path."""
+ col = _make_fake_collection(
+ [
+ {"room": "auth", "wing": "wing_code", "hall": "security", "date": "2026-01-01"},
+ None, # legacy / partial-write drawer with no metadata
+ {"room": "auth", "wing": "wing_code", "hall": "security", "date": "2026-01-02"},
+ ]
+ )
+ nodes, edges = build_graph(col=col)
+ # The two real drawers were processed; the None one was skipped.
+ assert "auth" in nodes
+ assert nodes["auth"]["count"] == 2
+
def test_single_wing_no_edges(self):
col = _make_fake_collection(
[
diff --git a/tests/test_palace_graph_tunnels.py b/tests/test_palace_graph_tunnels.py
index 00c74003d..4ce3f5684 100644
--- a/tests/test_palace_graph_tunnels.py
+++ b/tests/test_palace_graph_tunnels.py
@@ -1,5 +1,8 @@
"""Tests for explicit tunnel helpers in mempalace.palace_graph."""
+import os
+import stat
+import sys
from unittest.mock import MagicMock, patch
import pytest
@@ -37,6 +40,33 @@ def test_save_and_load_round_trip(self, tmp_path, monkeypatch):
palace_graph._save_tunnels(tunnels)
assert palace_graph._load_tunnels() == tunnels
+ @pytest.mark.skipif(
+ sys.platform == "win32",
+ reason="POSIX file-permission bits only apply on Unix-like systems",
+ )
+ def test_save_tunnels_restricts_permissions(self, tmp_path, monkeypatch):
+ """Regression for #1165 — tunnels.json reveals cross-wing links and
+ must not be world-readable on shared Linux/multi-user systems."""
+ tunnel_file = _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ palace_graph._save_tunnels(
+ [
+ {
+ "id": "x",
+ "source": {"wing": "a", "room": "r1"},
+ "target": {"wing": "b", "room": "r2"},
+ "label": "",
+ }
+ ]
+ )
+
+ file_mode = stat.S_IMODE(os.stat(tunnel_file).st_mode)
+ assert file_mode == 0o600, f"tunnels.json mode is {oct(file_mode)}, expected 0o600"
+
+ parent_mode = stat.S_IMODE(os.stat(tunnel_file.parent).st_mode)
+ assert (
+ parent_mode == 0o700
+ ), f"tunnels.json parent dir mode is {oct(parent_mode)}, expected 0o700"
+
class TestExplicitTunnels:
def test_create_tunnel_deduplicates_reverse_order_and_updates_label(
@@ -135,3 +165,167 @@ def test_follow_tunnels_returns_connections_even_if_collection_lookup_fails(
connections = palace_graph.follow_tunnels("wing_code", "auth", col=col)
assert len(connections) == 1
assert "drawer_preview" not in connections[0]
+
+
+class TestTopicTunnels:
+ """Cross-wing topic tunnels (issue #1180).
+
+ When two wings share confirmed TOPIC labels above a configurable
+ threshold, a symmetric tunnel is created between them. Tunnels are
+ routed through the existing ``create_tunnel`` storage so they share
+ dedup and persistence with explicit tunnels.
+ """
+
+ def test_compute_topic_tunnels_creates_link_for_shared_topic(self, tmp_path, monkeypatch):
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ topics_by_wing = {
+ "wing_alpha": ["Angular", "OpenAPI"],
+ "wing_beta": ["OpenAPI", "Kubernetes"],
+ }
+ created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
+ assert len(created) == 1
+ assert created[0]["source"]["wing"] in {"wing_alpha", "wing_beta"}
+ assert created[0]["target"]["wing"] in {"wing_alpha", "wing_beta"}
+ # Room is namespaced with the ``topic:`` prefix so it can't collide
+ # with a literal folder-derived room of the same name. Casing of the
+ # topic is preserved for display.
+ assert created[0]["source"]["room"] == "topic:OpenAPI"
+ assert created[0]["target"]["room"] == "topic:OpenAPI"
+ assert created[0]["kind"] == "topic"
+ # Label carries the human-readable topic without the prefix.
+ assert "OpenAPI" in created[0]["label"]
+ assert "topic:OpenAPI" not in created[0]["label"]
+
+ # Tunnel is retrievable via the standard list_tunnels API.
+ listed = palace_graph.list_tunnels()
+ assert len(listed) == 1
+ assert listed[0]["id"] == created[0]["id"]
+
+ def test_compute_topic_tunnels_no_link_below_threshold(self, tmp_path, monkeypatch):
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ topics_by_wing = {
+ "wing_alpha": ["Angular", "OpenAPI"],
+ "wing_beta": ["OpenAPI", "Kubernetes"],
+ }
+ # min_count=2 requires two overlapping topics — only one shared.
+ created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=2)
+ assert created == []
+ assert palace_graph.list_tunnels() == []
+
+ def test_compute_topic_tunnels_above_threshold_creates_per_topic_links(
+ self, tmp_path, monkeypatch
+ ):
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ topics_by_wing = {
+ "wing_alpha": ["Angular", "OpenAPI", "Postgres"],
+ "wing_beta": ["Angular", "OpenAPI", "Redis"],
+ }
+ created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=2)
+ # Two shared topics × one wing pair = two tunnels.
+ rooms = sorted(t["source"]["room"] for t in created)
+ assert rooms == ["topic:Angular", "topic:OpenAPI"]
+
+ def test_compute_topic_tunnels_case_insensitive_overlap(self, tmp_path, monkeypatch):
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ topics_by_wing = {
+ "wing_alpha": ["openapi"],
+ "wing_beta": ["OpenAPI"],
+ }
+ created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
+ assert len(created) == 1
+
+ def test_compute_topic_tunnels_empty_input_is_noop(self, tmp_path, monkeypatch):
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ assert palace_graph.compute_topic_tunnels({}) == []
+ assert palace_graph.compute_topic_tunnels({"wing_a": []}) == []
+ assert palace_graph.list_tunnels() == []
+
+ def test_compute_topic_tunnels_three_wings_pairwise(self, tmp_path, monkeypatch):
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ topics_by_wing = {
+ "wing_a": ["foo"],
+ "wing_b": ["foo"],
+ "wing_c": ["foo"],
+ }
+ created = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
+ # 3 wings sharing the same topic → C(3,2) = 3 pairs → 3 tunnels.
+ assert len(created) == 3
+ endpoint_pairs = {
+ tuple(sorted([t["source"]["wing"], t["target"]["wing"]])) for t in created
+ }
+ assert endpoint_pairs == {
+ ("wing_a", "wing_b"),
+ ("wing_a", "wing_c"),
+ ("wing_b", "wing_c"),
+ }
+
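+ # For orientation, the pairwise rule the tests above pin down, as a
+ # sketch (assumed shape; the real compute_topic_tunnels intersects
+ # case-insensitively while preserving one wing's original casing for
+ # the ``topic:`` room and the label, and also records kind="topic"):
+ #
+ #     for a, b in itertools.combinations(sorted(topics_by_wing), 2):
+ #         shared = {t.lower() for t in topics_by_wing[a]} & {
+ #             t.lower() for t in topics_by_wing[b]
+ #         }
+ #         if len(shared) >= min_count:
+ #             for t in shared:
+ #                 create_tunnel(a, f"topic:{t}", b, f"topic:{t}", label=t)
+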
+ def test_topic_tunnels_for_wing_only_links_that_wing(self, tmp_path, monkeypatch):
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ topics_by_wing = {
+ "wing_a": ["foo", "bar"],
+ "wing_b": ["foo"],
+ "wing_c": ["bar"],
+ }
+ # wing_a should link to both b (via foo) and c (via bar).
+ created = palace_graph.topic_tunnels_for_wing("wing_a", topics_by_wing)
+ endpoint_pairs = {
+ tuple(sorted([t["source"]["wing"], t["target"]["wing"]])) for t in created
+ }
+ assert endpoint_pairs == {("wing_a", "wing_b"), ("wing_a", "wing_c")}
+ # The b-c pair is NOT created because wing_a's incremental pass
+ # only computes pairs that include wing_a.
+ assert len(palace_graph.list_tunnels()) == 2
+
+ def test_topic_tunnels_for_wing_unknown_wing_is_noop(self, tmp_path, monkeypatch):
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ topics_by_wing = {"wing_a": ["foo"], "wing_b": ["foo"]}
+ assert palace_graph.topic_tunnels_for_wing("wing_missing", topics_by_wing) == []
+ assert palace_graph.list_tunnels() == []
+
+ def test_compute_topic_tunnels_dedupe_on_recompute(self, tmp_path, monkeypatch):
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ topics_by_wing = {
+ "wing_alpha": ["OpenAPI"],
+ "wing_beta": ["OpenAPI"],
+ }
+ first = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
+ second = palace_graph.compute_topic_tunnels(topics_by_wing, min_count=1)
+ # create_tunnel deduplicates symmetric pairs, so recomputing must
+ # not multiply the stored tunnels.
+ assert first[0]["id"] == second[0]["id"]
+ assert len(palace_graph.list_tunnels()) == 1
+
+ def test_topic_tunnel_room_does_not_collide_with_literal_room(self, tmp_path, monkeypatch):
+ """Regression: a literal "Angular" folder-room and a topic tunnel
+ for "Angular" must resolve to distinct endpoints so ``follow_tunnels``
+ from the real room doesn't accidentally surface topic connections
+ (issue raised in review of #1184)."""
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+
+ # Explicit tunnel anchored at a literal "Angular" room in wing_alpha.
+ palace_graph.create_tunnel(
+ "wing_alpha", "Angular", "wing_gamma", "frontend", label="explicit"
+ )
+ # Topic tunnel between the same wings that share the "Angular" topic.
+ palace_graph.compute_topic_tunnels(
+ {"wing_alpha": ["Angular"], "wing_beta": ["Angular"]}, min_count=1
+ )
+
+ # follow_tunnels on the literal Angular room only sees the explicit link.
+ literal = palace_graph.follow_tunnels("wing_alpha", "Angular")
+ assert len(literal) == 1
+ assert literal[0]["connected_wing"] == "wing_gamma"
+
+ # The topic tunnel is stored under the namespaced room.
+ topical = palace_graph.follow_tunnels("wing_alpha", "topic:Angular")
+ assert len(topical) == 1
+ assert topical[0]["connected_wing"] == "wing_beta"
+
+ def test_topic_tunnels_carry_kind_field(self, tmp_path, monkeypatch):
+ _use_tmp_tunnel_file(monkeypatch, tmp_path)
+ palace_graph.create_tunnel("wing_a", "auth", "wing_b", "users", label="x")
+ palace_graph.compute_topic_tunnels({"wing_a": ["Redis"], "wing_b": ["Redis"]}, min_count=1)
+
+ tunnels = palace_graph.list_tunnels()
+ kinds = sorted(t["kind"] for t in tunnels)
+ assert kinds == ["explicit", "topic"]
diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
new file mode 100644
index 000000000..601c8941a
--- /dev/null
+++ b/tests/test_palace_locks.py
@@ -0,0 +1,158 @@
+"""Tests for mine_palace_lock — the per-palace non-blocking mine guard.
+
+Covers the fix for the runaway mine fan-out described alongside issues
+#974 and #965: if N copies of `mempalace mine` are spawned concurrently
+against the same palace, they must collapse to a single runner. Later
+arrivals fail fast with MineAlreadyRunning instead of queueing behind a
+blocking lock and re-running the same mine back to back (or, with no
+lock at all, driving parallel HNSW inserts). Mines against *different*
+palaces must still be free to run in parallel.
+"""
+
+from __future__ import annotations
+
+import multiprocessing
+import os
+import time
+
+import pytest
+
+from mempalace.palace import (
+ MineAlreadyRunning,
+ mine_global_lock,
+ mine_palace_lock,
+)
+
+
+def _get_mp_context():
+ """Pick a start method that works on every CI runner.
+
+ `fork` is cheaper (no re-import) but is unavailable on Windows, so we fall
+ back to `spawn` there. `spawn` inherits ``os.environ`` (including the
+ monkeypatched ``HOME``) and re-imports the ``mempalace`` package in the
+ child, which is sufficient for the lock-file semantics exercised here.
+ """
+ start_method = "spawn" if os.name == "nt" else "fork"
+ return multiprocessing.get_context(start_method)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _hold_lock(palace_path: str, ready_flag: str, release_flag: str) -> int:
+ """Acquire mine_palace_lock, signal readiness, wait for release flag.
+
+ Returns 0 if we acquired the lock, 1 if MineAlreadyRunning was raised.
+ Runs in a child process for true cross-process locking semantics.
+ """
+ try:
+ with mine_palace_lock(palace_path):
+ # Tell the parent we hold the lock
+ open(ready_flag, "w").close()
+ # Wait until parent tells us to release
+ for _ in range(500):
+ if os.path.exists(release_flag):
+ return 0
+ time.sleep(0.01)
+ return 0
+ except MineAlreadyRunning:
+ # A plain `return 1` would still produce exitcode 0 (a Process
+ # target's return value is discarded), so raise SystemExit to make
+ # the parent's exitcode assertions meaningful.
+ raise SystemExit(1)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+def test_single_acquire_succeeds(tmp_path, monkeypatch):
+ monkeypatch.setenv("HOME", str(tmp_path))
+ with mine_palace_lock(str(tmp_path / "palace")):
+ pass # should not raise
+
+
+def test_lock_reusable_after_release(tmp_path, monkeypatch):
+ monkeypatch.setenv("HOME", str(tmp_path))
+ palace = str(tmp_path / "palace")
+ with mine_palace_lock(palace):
+ pass
+ # Re-acquire must succeed now that the previous holder released
+ with mine_palace_lock(palace):
+ pass
+
+
+def test_same_palace_serializes_across_processes(tmp_path, monkeypatch):
+ """Two processes contending for the same palace: second must be rejected."""
+ monkeypatch.setenv("HOME", str(tmp_path))
+ palace = str(tmp_path / "palace")
+ ready = str(tmp_path / "ready")
+ release = str(tmp_path / "release")
+
+ ctx = _get_mp_context()
+ holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+ holder.start()
+ try:
+ # Wait for the holder to acquire
+ for _ in range(500):
+ if os.path.exists(ready):
+ break
+ time.sleep(0.01)
+ assert os.path.exists(ready), "holder failed to acquire lock in time"
+
+ # From the parent, we must not be able to acquire the same palace lock
+ with pytest.raises(MineAlreadyRunning):
+ with mine_palace_lock(palace):
+ pytest.fail("second acquire of same palace should have raised")
+ finally:
+ open(release, "w").close()
+ holder.join(timeout=5)
+ assert holder.exitcode == 0
+
+
+def test_different_palaces_dont_conflict(tmp_path, monkeypatch):
+ """Mines against different palaces must NOT block each other."""
+ monkeypatch.setenv("HOME", str(tmp_path))
+ palace_a = str(tmp_path / "palace_a")
+ palace_b = str(tmp_path / "palace_b")
+ ready = str(tmp_path / "ready_a")
+ release = str(tmp_path / "release_a")
+
+ ctx = _get_mp_context()
+ holder = ctx.Process(target=_hold_lock, args=(palace_a, ready, release))
+ holder.start()
+ try:
+ for _ in range(500):
+ if os.path.exists(ready):
+ break
+ time.sleep(0.01)
+ assert os.path.exists(ready), "holder failed to acquire lock in time"
+
+ # Different palace — must succeed even while palace_a is held
+ with mine_palace_lock(palace_b):
+ pass # no exception expected
+ finally:
+ open(release, "w").close()
+ holder.join(timeout=5)
+
+
+def test_palace_path_is_normalized(tmp_path, monkeypatch):
+ """Relative and absolute forms of the same path must use the same lock."""
+ monkeypatch.setenv("HOME", str(tmp_path))
+ monkeypatch.chdir(tmp_path)
+ os.makedirs(tmp_path / "palace", exist_ok=True)
+ absolute = str(tmp_path / "palace")
+ relative = "palace"
+
+ # Hold the lock with the absolute form; attempting to re-acquire with
+ # the relative form (which resolves to the same absolute path) must fail.
+ with mine_palace_lock(absolute):
+ with pytest.raises(MineAlreadyRunning):
+ with mine_palace_lock(relative):
+ pytest.fail("normalized path collision should have raised")
+
+
+def test_mine_global_lock_is_alias_for_back_compat(tmp_path, monkeypatch):
+ """Old callers of `mine_global_lock` should still work."""
+ monkeypatch.setenv("HOME", str(tmp_path))
+ assert mine_global_lock is mine_palace_lock
+ with mine_global_lock(str(tmp_path / "palace")):
+ pass # the alias accepts the same palace_path argument
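+
+
+# For orientation, a sketch of the non-blocking acquire these tests exercise,
+# assuming a POSIX flock-based implementation (the real mine_palace_lock
+# lives in mempalace.palace and must also cover Windows):
+#
+#     fd = os.open(lock_path, os.O_CREAT | os.O_RDWR)
+#     try:
+#         fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+#     except BlockingIOError:
+#         os.close(fd)
+#         raise MineAlreadyRunning(palace_path)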
diff --git a/tests/test_project_scanner.py b/tests/test_project_scanner.py
index 49126b44c..45dc8027f 100644
--- a/tests/test_project_scanner.py
+++ b/tests/test_project_scanner.py
@@ -363,11 +363,14 @@ def test_to_detected_dict_shape():
projects = [ProjectInfo(name="p", repo_root=Path("."), is_mine=True, manifest="package.json")]
people = [PersonInfo(name="Jane Doe", total_commits=5, repos={"r"})]
d = to_detected_dict(projects, people)
- assert set(d.keys()) == {"people", "projects", "uncertain"}
+ # ``topics`` is the LLM-refine bucket for cross-wing tunnel signal —
+ # always present even when empty so callers can rely on the shape.
+ assert set(d.keys()) == {"people", "projects", "topics", "uncertain"}
assert d["projects"][0]["name"] == "p"
assert d["projects"][0]["type"] == "project"
assert d["people"][0]["name"] == "Jane Doe"
assert d["people"][0]["type"] == "person"
+ assert d["topics"] == []
assert d["uncertain"] == []
diff --git a/tests/test_repair.py b/tests/test_repair.py
index 9ae18124a..00bcb02b2 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -254,3 +254,123 @@ def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path):
repair.rebuild_index(palace_path=str(tmp_path))
mock_backend.delete_collection.assert_not_called()
+
+
+# ── #1208 truncation safety ───────────────────────────────────────────
+
+
+def test_check_extraction_safety_passes_when_counts_match(tmp_path):
+ """SQLite reports same count as extracted → no exception."""
+ with patch("mempalace.repair.sqlite_drawer_count", return_value=500):
+ repair.check_extraction_safety(str(tmp_path), 500)
+
+
+def test_check_extraction_safety_passes_when_sqlite_unreadable_and_under_cap(tmp_path):
+ """SQLite check fails (None) but extraction is well under the cap → safe."""
+ with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
+ repair.check_extraction_safety(str(tmp_path), 5_000)
+
+
+def test_check_extraction_safety_aborts_when_sqlite_higher(tmp_path):
+ """SQLite reports more than extracted — the user-reported #1208 case."""
+ import pytest
+
+ with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
+ with pytest.raises(repair.TruncationDetected) as exc_info:
+ repair.check_extraction_safety(str(tmp_path), 10_000)
+ e = exc_info.value
+ assert e.sqlite_count == 67_580
+ assert e.extracted == 10_000
+ assert "67,580" in e.message
+ assert "10,000" in e.message
+ assert "57,580" in e.message # the loss number
+
+
+def test_check_extraction_safety_aborts_when_unreadable_and_at_cap(tmp_path):
+ """SQLite unreadable but extraction == default get() cap → suspicious."""
+ import pytest
+
+ with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
+ with pytest.raises(repair.TruncationDetected) as exc_info:
+ repair.check_extraction_safety(str(tmp_path), repair.CHROMADB_DEFAULT_GET_LIMIT)
+ e = exc_info.value
+ assert e.sqlite_count is None
+ assert e.extracted == repair.CHROMADB_DEFAULT_GET_LIMIT
+ assert "10,000" in e.message
+
+
+def test_check_extraction_safety_override_skips_check(tmp_path):
+ """``confirm_truncation_ok=True`` short-circuits both signals."""
+ with patch("mempalace.repair.sqlite_drawer_count", return_value=99_999):
+ # Would normally abort — override allows through
+ repair.check_extraction_safety(str(tmp_path), 10_000, confirm_truncation_ok=True)
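+
+
+# For orientation, the decision rule the tests above pin down, as a sketch
+# (the real check_extraction_safety also builds the human-readable message):
+#
+#     if confirm_truncation_ok:
+#         return
+#     if sqlite_count is not None and sqlite_count > extracted:
+#         raise TruncationDetected(...)
+#     if sqlite_count is None and extracted == CHROMADB_DEFAULT_GET_LIMIT:
+#         raise TruncationDetected(...)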
+
+
+def test_sqlite_drawer_count_returns_none_on_missing_file(tmp_path):
+ """Palace dir exists but no chroma.sqlite3 → None, not crash."""
+ assert repair.sqlite_drawer_count(str(tmp_path)) is None
+
+
+def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path):
+ """File exists but isn't a chromadb sqlite → None, not crash."""
+ sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3")
+ with open(sqlite_path, "wb") as f:
+ f.write(b"not a sqlite file at all")
+ assert repair.sqlite_drawer_count(str(tmp_path)) is None
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_aborts_on_truncation_signal(mock_backend_cls, mock_shutil, tmp_path):
+ """rebuild_index honors the safety guard: SQLite says 67k, get() returns
+ 10k → no delete_collection, no upsert, no backup."""
+ mock_backend = MagicMock()
+ mock_col = MagicMock()
+ mock_col.count.return_value = 10_000
+ # Single page comes back with 10_000 ids
+ mock_col.get.side_effect = [
+ {
+ "ids": [f"id{i}" for i in range(10_000)],
+ "documents": ["x"] * 10_000,
+ "metadatas": [{}] * 10_000,
+ },
+ {"ids": [], "documents": [], "metadatas": []},
+ ]
+ mock_backend.get_collection.return_value = mock_col
+ mock_backend_cls.return_value = mock_backend
+
+ with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
+ repair.rebuild_index(palace_path=str(tmp_path))
+
+ # Guard fired: nothing destructive happened
+ mock_backend.delete_collection.assert_not_called()
+ mock_backend.create_collection.assert_not_called()
+ mock_shutil.copy2.assert_not_called()
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp_path):
+ """Override flag lets repair proceed even when the guard would fire."""
+ mock_backend = MagicMock()
+ mock_col = MagicMock()
+ mock_col.count.return_value = 10_000
+ mock_col.get.side_effect = [
+ {
+ "ids": [f"id{i}" for i in range(10_000)],
+ "documents": ["x"] * 10_000,
+ "metadatas": [{}] * 10_000,
+ },
+ {"ids": [], "documents": [], "metadatas": []},
+ ]
+ mock_new_col = MagicMock()
+ mock_backend.get_collection.return_value = mock_col
+ mock_backend.create_collection.return_value = mock_new_col
+ mock_backend_cls.return_value = mock_backend
+
+ with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
+ repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True)
+
+ mock_backend.delete_collection.assert_called_once()
+ mock_backend.create_collection.assert_called_once()
+ mock_new_col.upsert.assert_called()
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 3ccfb2d91..6b85832dc 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -121,6 +121,44 @@ def mock_get_collection(path, create=False):
assert none_hit["room"] == "unknown"
+# ── BM25 internals: None / empty document safety ─────────────────────
+
+
+class TestBM25NoneSafety:
+ """Regression tests for the AttributeError observed in production when
+ Chroma returned ``None`` documents inside a hybrid-rerank pass.
+
+ Trace from the daemon log (2026-04-24 21:07:05):
+ File "mempalace/searcher.py", line 81, in _bm25_scores
+ tokenized = [_tokenize(d) for d in documents]
+ File "mempalace/searcher.py", line 52, in _tokenize
+ return _TOKEN_RE.findall(text.lower())
+ AttributeError: 'NoneType' object has no attribute 'lower'
+ """
+
+ def test_tokenize_handles_none(self):
+ from mempalace.searcher import _tokenize
+
+ assert _tokenize(None) == []
+
+ def test_tokenize_handles_empty_string(self):
+ from mempalace.searcher import _tokenize
+
+ assert _tokenize("") == []
+
+ def test_bm25_scores_does_not_crash_on_none_documents(self):
+ """A ``None`` mixed into the corpus must yield score 0.0 for that doc
+ and finite scores for the rest, not raise AttributeError."""
+ from mempalace.searcher import _bm25_scores
+
+ scores = _bm25_scores(
+ "postgres migration", ["postgres migration done", None, "kafka rebalance"]
+ )
+ assert len(scores) == 3
+ assert scores[1] == 0.0
+ assert scores[0] > 0.0
+
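+# For orientation, the guard these tests pin down, as a sketch of the fixed
+# _tokenize (assumed shape, reconstructed from the traceback above):
+#
+#     def _tokenize(text):
+#         if not text:  # None or "" -> no tokens
+#             return []
+#         return _TOKEN_RE.findall(text.lower())
+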
+
# ── search() (CLI print function) ─────────────────────────────────────
@@ -173,6 +211,85 @@ def test_search_n_results(self, palace_path, seeded_collection, capsys):
# Should have output with at least one result block
assert "[1]" in captured.out
+ def test_search_applies_bm25_hybrid_rerank(self, capsys):
+ """CLI search must call the same hybrid rerank that the MCP path uses.
+
+ Regression for a bug where the CLI only consulted ChromaDB cosine
+ distance: a drawer whose body contained every query term still
+ scored zero similarity if its embedding happened to be far from
+ the query (e.g. the drawer was a shell-output fragment that
+ embeds as "file tree noise"). Hybrid rerank fixes this by
+ combining BM25 with cosine — lexical matches rise above pure
+ vector noise.
+
+ Simulates: three candidates, all with distance >= 1.0 (cosine = 0);
+ candidate 2 contains every query term. After the fix, candidate 2
+ should rank first and display a non-zero bm25 score.
+ """
+ mock_col = MagicMock()
+ mock_col.metadata = {"hnsw:space": "cosine"}
+ mock_col.query.return_value = {
+ "documents": [
+ [
+ "unrelated directory listing -rw-rw-r-- file.txt",
+ "foo bar baz is a multi-word phrase",
+ "another unrelated chunk about colors",
+ ]
+ ],
+ "metadatas": [
+ [
+ {"source_file": "a.md", "wing": "w", "room": "r"},
+ {"source_file": "b.md", "wing": "w", "room": "r"},
+ {"source_file": "c.md", "wing": "w", "room": "r"},
+ ]
+ ],
+ "distances": [[1.5, 1.5, 1.5]],
+ }
+ with patch("mempalace.searcher.get_collection", return_value=mock_col):
+ search("foo bar baz", "/fake/path")
+ captured = capsys.readouterr()
+ first_block, _, _ = captured.out.partition("[2]")
+ # Lexical match must rank first
+ assert (
+ "b.md" in first_block
+ ), f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}"
+ # Non-zero bm25 reported. Parse the value instead of asserting
+ # `"bm25=0.0" not in first_block`, which would also reject
+ # legitimate non-zero scores such as "bm25=0.05".
+ import re
+
+ bm25_match = re.search(r"bm25=([0-9.]+)", first_block)
+ assert bm25_match is not None, f"no bm25 score shown:\n{captured.out}"
+ assert float(bm25_match.group(1)) > 0.0
+ # Cosine still reported for transparency
+ assert "cosine=" in first_block
+
+ def test_search_warns_when_palace_uses_wrong_distance_metric(self, capsys):
+ """Legacy palaces created without `hnsw:space=cosine` silently
+ use L2, which breaks similarity interpretation. CLI must warn
+ the user and point them at `mempalace repair` rather than
+ pretending the `Match` scores are meaningful."""
+ mock_col = MagicMock()
+ mock_col.metadata = {} # legacy: no hnsw:space set
+ mock_col.query.return_value = {
+ "documents": [["some drawer content"]],
+ "metadatas": [[{"source_file": "a.md", "wing": "w", "room": "r"}]],
+ "distances": [[1.2]],
+ }
+ with patch("mempalace.searcher.get_collection", return_value=mock_col):
+ search("anything", "/fake/path")
+ captured = capsys.readouterr()
+ assert "mempalace repair" in captured.err
+ assert "cosine" in captured.err.lower()
+
+ def test_search_does_not_warn_when_palace_is_correctly_configured(self, capsys):
+ mock_col = MagicMock()
+ mock_col.metadata = {"hnsw:space": "cosine"}
+ mock_col.query.return_value = {
+ "documents": [["some drawer content"]],
+ "metadatas": [[{"source_file": "a.md", "wing": "w", "room": "r"}]],
+ "distances": [[0.3]],
+ }
+ with patch("mempalace.searcher.get_collection", return_value=mock_col):
+ search("anything", "/fake/path")
+ captured = capsys.readouterr()
+ assert "mempalace repair" not in captured.err
+
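+ # For orientation, the guard the two tests above pin down, as a sketch
+ # (assumed shape; the real warning's wording may differ):
+ #
+ #     if (col.metadata or {}).get("hnsw:space") != "cosine":
+ #         print(
+ #             "Warning: palace index is not using cosine distance; "
+ #             "run `mempalace repair` to rebuild it.",
+ #             file=sys.stderr,
+ #         )
+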
def test_search_handles_none_metadata_without_crash(self, palace_path, capsys):
"""ChromaDB can return `None` entries in the metadatas list when a
drawer has no metadata. The CLI print path must not crash on them
diff --git a/uv.lock b/uv.lock
index f102d434f..ef1a7061d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1178,6 +1178,11 @@ dependencies = [
]
[package.optional-dependencies]
+coreml = [
+ { name = "onnxruntime", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
+ { name = "onnxruntime", version = "1.24.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+ { name = "onnxruntime", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+]
dev = [
{ name = "psutil" },
{ name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
@@ -1185,6 +1190,16 @@ dev = [
{ name = "pytest-cov" },
{ name = "ruff" },
]
+dml = [
+ { name = "onnxruntime-directml", version = "1.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
+ { name = "onnxruntime-directml", version = "1.24.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+ { name = "onnxruntime-directml", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+]
+gpu = [
+ { name = "onnxruntime-gpu", version = "1.20.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
+ { name = "onnxruntime-gpu", version = "1.24.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+ { name = "onnxruntime-gpu", version = "1.25.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+]
spellcheck = [
{ name = "autocorrect" },
]
@@ -1202,6 +1217,9 @@ dev = [
requires-dist = [
{ name = "autocorrect", marker = "extra == 'spellcheck'", specifier = ">=2.0" },
{ name = "chromadb", specifier = ">=1.5.4,<2" },
+ { name = "onnxruntime", marker = "extra == 'coreml'", specifier = ">=1.16" },
+ { name = "onnxruntime-directml", marker = "extra == 'dml'", specifier = ">=1.16" },
+ { name = "onnxruntime-gpu", marker = "extra == 'gpu'", specifier = ">=1.16" },
{ name = "psutil", marker = "extra == 'dev'", specifier = ">=5.9" },
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" },
{ name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" },
@@ -1209,7 +1227,7 @@ requires-dist = [
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4.0" },
{ name = "tomli", marker = "python_full_version < '3.11'", specifier = ">=2.0.0" },
]
-provides-extras = ["dev", "spellcheck"]
+provides-extras = ["dev", "spellcheck", "gpu", "dml", "coreml"]
[package.metadata.requires-dev]
dev = [
@@ -1815,6 +1833,154 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6c/1d/1666dc64e78d8587d168fec4e3b7922b92eb286a2ddeebcf6acb55c7dc82/onnxruntime-1.24.4-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e1cc6a518255f012134bc791975a6294806be9a3b20c4a54cca25194c90cf731", size = 17247021, upload-time = "2026-03-17T22:04:52.377Z" },
]
+[[package]]
+name = "onnxruntime-directml"
+version = "1.20.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version < '3.10'",
+]
+dependencies = [
+ { name = "coloredlogs", marker = "python_full_version < '3.10'" },
+ { name = "flatbuffers", marker = "python_full_version < '3.10'" },
+ { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
+ { name = "packaging", marker = "python_full_version < '3.10'" },
+ { name = "protobuf", marker = "python_full_version < '3.10'" },
+ { name = "sympy", marker = "python_full_version < '3.10'" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/3c/4f/f433239b05304aa9af0217da20508abbbcec1dcd58ee821e3dab8939ecfe/onnxruntime_directml-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:d4aa43694799559fb5570fdf0e96a154d4b4d0bb9b73c3e81744eb7fe0c0de8d", size = 22760521, upload-time = "2024-11-21T00:49:40.179Z" },
+ { url = "https://files.pythonhosted.org/packages/df/5f/16337318bd99d2d837cbb2e91e8a12b0915cb80d7c1ae8f80ca2f5d47a09/onnxruntime_directml-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:c7861057ad4caa64186c910efb3b54c1f575cd0e64732509c9bd927d2d20187b", size = 22762384, upload-time = "2024-11-21T00:49:44.01Z" },
+ { url = "https://files.pythonhosted.org/packages/8f/50/4599c6573bd71cc0c80820c63dea599a0b489ce874f93a5e021ca20a9e1f/onnxruntime_directml-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:4b9a9f8349d68eef947fc692b3572e7a6490cb95effb151ace1a6ffc15884940", size = 22764330, upload-time = "2024-11-21T00:49:47.264Z" },
+ { url = "https://files.pythonhosted.org/packages/60/40/7d8489d9101b4aa7bae29227075ce31bc5764cbe87b78c995fdb296e3eff/onnxruntime_directml-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:86a8c4b69e377bb18ed2a18aaf2337baa83a57ff87a97224d027e546dfa99fde", size = 22764517, upload-time = "2024-11-21T00:49:50.213Z" },
+]
+
+[[package]]
+name = "onnxruntime-directml"
+version = "1.24.3"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version == '3.10.*'",
+]
+dependencies = [
+ { name = "flatbuffers", marker = "python_full_version == '3.10.*'" },
+ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+ { name = "packaging", marker = "python_full_version == '3.10.*'" },
+ { name = "protobuf", marker = "python_full_version == '3.10.*'" },
+ { name = "sympy", marker = "python_full_version == '3.10.*'" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ed/65/36ce5a5e79fb5d7b4d7636bc6e6c4024f3ff0571789e8eedb7149bb7c538/onnxruntime_directml-1.24.3-cp311-cp311-win_amd64.whl", hash = "sha256:442fecea5d52df315b6cecfbcbb44aff6681880b6bbf23546a6c00125fec66f1", size = 25106769, upload-time = "2026-03-05T16:27:07.495Z" },
+ { url = "https://files.pythonhosted.org/packages/05/40/c948c0ee42b7b6297dd45956092f5a53a6954610c3911a5847c7555b4930/onnxruntime_directml-1.24.3-cp312-cp312-win_amd64.whl", hash = "sha256:d889010e6ed2f30026522308173d295bcfdaf6f28d1df6054c748ffa750a7ad5", size = 25114531, upload-time = "2026-03-05T16:27:11.256Z" },
+ { url = "https://files.pythonhosted.org/packages/56/f0/9de329f39a66142aab4c1d9a48edc0e432de27c6ba09e8039e0dc51885e7/onnxruntime_directml-1.24.3-cp313-cp313-win_amd64.whl", hash = "sha256:f684adcb29dd48ee172b52fcf1d19a1da1a67a051384ac3418b36d200d0d105c", size = 25114902, upload-time = "2026-03-05T16:27:13.925Z" },
+ { url = "https://files.pythonhosted.org/packages/fe/7a/8b3014ca4065a32bd6672221bf4cb0b5b9a726d28a9caafdb86a076a5981/onnxruntime_directml-1.24.3-cp314-cp314-win_amd64.whl", hash = "sha256:42b17de7030445e75a7e83a4a317f9c655ed2dd7045fe79a7a21dce7b60103b6", size = 25570589, upload-time = "2026-03-05T16:27:17.278Z" },
+]
+
+[[package]]
+name = "onnxruntime-directml"
+version = "1.24.4"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version >= '3.14'",
+ "python_full_version == '3.13.*'",
+ "python_full_version >= '3.11' and python_full_version < '3.13'",
+]
+dependencies = [
+ { name = "flatbuffers", marker = "python_full_version >= '3.11'" },
+ { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+ { name = "packaging", marker = "python_full_version >= '3.11'" },
+ { name = "protobuf", marker = "python_full_version >= '3.11'" },
+ { name = "sympy", marker = "python_full_version >= '3.11'" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/be/90/99566dc6398028e7691a5b12720fd85f757a0901818b84599d28abb3f085/onnxruntime_directml-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:96642a787e5a6f33bf043521c0f06eb1eb663f6b830e5862a2026d03f9c90543", size = 25106000, upload-time = "2026-03-17T21:47:15.438Z" },
+ { url = "https://files.pythonhosted.org/packages/88/ea/33814eb0ec96775eda4c1d30b0d86e91d7d2cd0d84c66d3915aef0e06fa3/onnxruntime_directml-1.24.4-cp312-cp312-win_amd64.whl", hash = "sha256:f2ecb68b7b7b259d2ef3112ae760149f9b5a1e7c0fbb73d539da6250a648a614", size = 25111930, upload-time = "2026-03-17T21:47:18.419Z" },
+ { url = "https://files.pythonhosted.org/packages/60/53/2bd2696fac19cf8ca55496a0bcfe431f3aff9579eabbb0e231dc238acf6f/onnxruntime_directml-1.24.4-cp313-cp313-win_amd64.whl", hash = "sha256:2f1031cb2281e5b27cca9efe0b9399317c7286e4d226f7a79d4ab79bbd94d19e", size = 25112253, upload-time = "2026-03-17T21:47:22.043Z" },
+ { url = "https://files.pythonhosted.org/packages/b7/04/816932a3ade867a687e406716ca76e0774c6b921545b45818e3ebfcc54ce/onnxruntime_directml-1.24.4-cp314-cp314-win_amd64.whl", hash = "sha256:51d86bb949488e572b00422f344990a4a81d982416d73b6c0e4ced2bcd423d19", size = 25571098, upload-time = "2026-03-17T21:47:25.461Z" },
+]
+
+[[package]]
+name = "onnxruntime-gpu"
+version = "1.20.2"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version < '3.10'",
+]
+dependencies = [
+ { name = "coloredlogs", marker = "python_full_version < '3.10'" },
+ { name = "flatbuffers", marker = "python_full_version < '3.10'" },
+ { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
+ { name = "packaging", marker = "python_full_version < '3.10'" },
+ { name = "protobuf", marker = "python_full_version < '3.10'" },
+ { name = "sympy", marker = "python_full_version < '3.10'" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/04/ad/4e5534dcaafe36f596792ebd0049177f7f0b7afa0f696505974ed1d6f72c/onnxruntime_gpu-1.20.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dfba508f110ec062dedfd3032e6eee8cde325026e9d7c5792884e8b9d4ebb9c3", size = 291522233, upload-time = "2025-03-07T05:46:08.901Z" },
+ { url = "https://files.pythonhosted.org/packages/a5/2a/8afc5aee996fd33fb816bc3067fdbde96a2a7520d4c275fa502f3aef7e54/onnxruntime_gpu-1.20.2-cp310-cp310-win_amd64.whl", hash = "sha256:75a7557292b2741e63fb73236ee84faa08075cead52d9a8d302a67036fc64f16", size = 279696089, upload-time = "2025-03-07T05:39:24.924Z" },
+ { url = "https://files.pythonhosted.org/packages/5e/53/9341b875b0ed29953485b43713e94b335a449c3770fed67dddb3c9b84af0/onnxruntime_gpu-1.20.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85057c7006457bee14fc2a57417b7e4f396f10d9c1b08b11aae08ac2b825eeda", size = 291518407, upload-time = "2025-03-07T05:46:22.943Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/7a/0999993ceae7bf191d5d63a4e1b2208596763d8e586aa7dc5cc091f960c0/onnxruntime_gpu-1.20.2-cp311-cp311-win_amd64.whl", hash = "sha256:d0eafd873e4336949c89e6c7429a68e7e1d0233d9cb363e9780ca76c3c6f865c", size = 279697437, upload-time = "2025-03-07T05:39:38.418Z" },
+ { url = "https://files.pythonhosted.org/packages/5b/db/c1fcdf45cad147d3b3609cf66a1c6083b54382f58a41d7fc526cd5909090/onnxruntime_gpu-1.20.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aa66d2e6de13fe6f4d1554b1c219bd2e4778b540ed9d3dc62957c95a8af43d66", size = 291510804, upload-time = "2025-03-07T05:46:36.178Z" },
+ { url = "https://files.pythonhosted.org/packages/27/67/4f979650557738a8b148dd7e0b82522d20ffcfb2c3964141c861a61e82c7/onnxruntime_gpu-1.20.2-cp312-cp312-win_amd64.whl", hash = "sha256:564a6a1187b208012f57c3bb3723ba65f6bc5cddff6e6b917ac96865768b39f5", size = 279699596, upload-time = "2025-03-07T05:39:50.858Z" },
+ { url = "https://files.pythonhosted.org/packages/48/a4/60f0cf16b24f05d123f90525408a705741fa92e0c38ab122cdf1d239e3fe/onnxruntime_gpu-1.20.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6af5b30b9b0e729d3ca1dfff493a39771f143cfc22af1d77d487022033cae284", size = 291511859, upload-time = "2025-03-07T05:46:49.302Z" },
+ { url = "https://files.pythonhosted.org/packages/ab/a2/0eb7a3fa417adc7af0be73b0ea35f1f0d6f92e3722eb6468e36dfe2e762d/onnxruntime_gpu-1.20.2-cp313-cp313-win_amd64.whl", hash = "sha256:6ffe5108d2dbd96a9a40bf76573219e04b67d0330aa93ca5114f1478185ade19", size = 279697061, upload-time = "2025-03-07T05:40:03.559Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/de/6c692ac8604a451011a2a01e35e94f84bea8775ef97f6830985bbe8de172/onnxruntime_gpu-1.20.2-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:407e5b7a21d656aac6f994d2e329f5577eb3d7f98b63aa1e49e71a702ffa1da1", size = 291502464, upload-time = "2025-03-07T05:47:03.191Z" },
+]
+
+[[package]]
+name = "onnxruntime-gpu"
+version = "1.24.3"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version == '3.10.*'",
+]
+dependencies = [
+ { name = "flatbuffers", marker = "python_full_version == '3.10.*'" },
+ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" },
+ { name = "packaging", marker = "python_full_version == '3.10.*'" },
+ { name = "protobuf", marker = "python_full_version == '3.10.*'" },
+ { name = "sympy", marker = "python_full_version == '3.10.*'" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/28/f4/c8050f3f4916ab6c75432724f0ba51c1548dc1c3d66d40c0f8a9611e370f/onnxruntime_gpu-1.24.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ac922633819e1cdc81c9b3a28b5e37d788805307bbaa708a01a3d7150e345625", size = 252750845, upload-time = "2026-03-05T16:35:33.604Z" },
+ { url = "https://files.pythonhosted.org/packages/07/b7/81e8936354651915192a362a1718253c6d03da6b902a95237aa392b1d260/onnxruntime_gpu-1.24.3-cp311-cp311-win_amd64.whl", hash = "sha256:0fe6ece3042db149f36f4991cbebd19a690b7ffd82af89450a261b47f4704a37", size = 207192429, upload-time = "2026-03-05T16:39:57.015Z" },
+ { url = "https://files.pythonhosted.org/packages/24/fa/58ceca812214c9c1a286407c376e42e0b7de3e2c6e14b61cdf3caf6d6d9c/onnxruntime_gpu-1.24.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:537bdd6d95006a9200ae81f2e73ba9e621e723fdf0deb5901e2e62fb2cccf876", size = 252756089, upload-time = "2026-03-05T16:35:46.004Z" },
+ { url = "https://files.pythonhosted.org/packages/3c/07/2f36920b513bd8939e25591153e37d9cfda94115bd119f2874da0750fce2/onnxruntime_gpu-1.24.3-cp312-cp312-win_amd64.whl", hash = "sha256:d72065b3ab5fdaef74d8b6b8f39b7ce20d89731610e3e63cb40e997d3dce177e", size = 207197001, upload-time = "2026-03-05T16:40:05.691Z" },
+ { url = "https://files.pythonhosted.org/packages/49/57/9e6206dac76e08f028d2ae95f2ab1b3a7c3317fb6c0374a530aad48dab5c/onnxruntime_gpu-1.24.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3242a70010934e5bb0aeaa9dde4c25c6c2da577b55c6308c0caa828ba3b7be23", size = 252753349, upload-time = "2026-03-05T16:35:58.09Z" },
+ { url = "https://files.pythonhosted.org/packages/4e/ae/f0be395602c13a3a8d22fa6632133550a64536c58bc3623abbba5d0a575e/onnxruntime_gpu-1.24.3-cp313-cp313-win_amd64.whl", hash = "sha256:a423b164dbc26cb7f8736367b11698c2a7294748d3c144c39542ecac28d225c9", size = 207197331, upload-time = "2026-03-05T16:40:14.944Z" },
+ { url = "https://files.pythonhosted.org/packages/b4/af/a64c9789769d8d7fabc6d35dcce2f2897b2d9e0fe113044efc2903f7cd07/onnxruntime_gpu-1.24.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9696d54974a1313ef0d87f4cbd04f9abfd13839194638d52bb5967a15615341d", size = 252762923, upload-time = "2026-03-05T16:36:10.043Z" },
+ { url = "https://files.pythonhosted.org/packages/c1/bb/1cf7dffac2fb01e8de9f0882438165f7543f0aab57f86d1f587e6faa8528/onnxruntime_gpu-1.24.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8ca744f40b33380bc9136988213e574c927d2b919ed42149977e006b138f74f", size = 252754914, upload-time = "2026-03-05T16:36:30.739Z" },
+ { url = "https://files.pythonhosted.org/packages/cf/39/3949d56103bd9cd9381de59b060f9bce8dc2c7363f465bf207ebd0c7a5d0/onnxruntime_gpu-1.24.3-cp314-cp314-win_amd64.whl", hash = "sha256:c60c44e2b388720e6670a948b52626f3d089e960ef7da66e4fa6b2b33a11116f", size = 209599131, upload-time = "2026-03-05T16:40:24.074Z" },
+ { url = "https://files.pythonhosted.org/packages/f3/60/51bfbcf2d0540dbfa426a73a9b80046b71a63de7303d16c0f2682c8edfd2/onnxruntime_gpu-1.24.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:29048407a2398361d93de5537c2d2079d79d720337a0743d4a2cc28db981e776", size = 252764115, upload-time = "2026-03-05T16:36:44.681Z" },
+]
+
+[[package]]
+name = "onnxruntime-gpu"
+version = "1.25.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version >= '3.14'",
+ "python_full_version == '3.13.*'",
+ "python_full_version >= '3.11' and python_full_version < '3.13'",
+]
+dependencies = [
+ { name = "flatbuffers", marker = "python_full_version >= '3.11'" },
+ { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+ { name = "packaging", marker = "python_full_version >= '3.11'" },
+ { name = "protobuf", marker = "python_full_version >= '3.11'" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/2d/7e/f58f8fc505a876b31fd2a34c1eb8f9863b75bf1589c3297c8efd48b93151/onnxruntime_gpu-1.25.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8625bb31ee2d88524414e7458cc604f4f958f323ef8832cc00882f6cd42b9a1", size = 270337732, upload-time = "2026-04-22T17:27:59.993Z" },
+ { url = "https://files.pythonhosted.org/packages/55/5d/2561b3aa667d87a4ae9cd01c5a565955aab5a3d44a6076f723beb9cdde0a/onnxruntime_gpu-1.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:2e702159a025aa5c69f0b747adf9a451e0c9e4b20120163a918c8459d3171b87", size = 220845585, upload-time = "2026-04-22T17:20:38.939Z" },
+ { url = "https://files.pythonhosted.org/packages/1d/6d/2c13d3eff74caa9e59820a044a75becd34e9cbeeaf7617ad7679cdb1fdb7/onnxruntime_gpu-1.25.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2f0c36c63c8b0eb4091f2567067f480f66f0aedc189eb009545c98ce7e919056", size = 270342429, upload-time = "2026-04-22T17:28:10.526Z" },
+ { url = "https://files.pythonhosted.org/packages/8c/2e/9fc303ae59d4caeb85ec3cea6881b7de8ca1d2a07140fade39913cd7ff10/onnxruntime_gpu-1.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:61178cc4d84f59861714554531e01cccbd33ddf13cc0e87a3adea13b24d297ce", size = 220847708, upload-time = "2026-04-22T17:20:47.993Z" },
+ { url = "https://files.pythonhosted.org/packages/f5/15/e63fe7b1abad6884bed07e9bb333e9f0ea48fbb8cbc1ea4a67ee6019d5d0/onnxruntime_gpu-1.25.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e462eb13ee9955117baec4f518916c1e7cb1a96001114105632bc6d454c6aee6", size = 270342324, upload-time = "2026-04-22T17:28:21.142Z" },
+ { url = "https://files.pythonhosted.org/packages/21/10/b3533243d062b589d4b1f3ae26584af332c5cde618e7f6f5ff6fabbfd5f2/onnxruntime_gpu-1.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:9a3682158e5e911385252eb95d6332b6f525972746c582e10f8a78213b39e624", size = 220848188, upload-time = "2026-04-22T17:20:56.946Z" },
+ { url = "https://files.pythonhosted.org/packages/35/6c/d7706dd1d0eaafdba44d5c89f8d952de41e425a1b0cbd3ecfa60f918c249/onnxruntime_gpu-1.25.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8514b92c5929c953850090d823d018770cba2a971efab5f8f69a3c4280cdc632", size = 270364210, upload-time = "2026-04-22T17:28:33.568Z" },
+ { url = "https://files.pythonhosted.org/packages/37/01/9f1b16ea857e3a4b5e82a2d70b52ea46a0083569f737d840f74a1b86818f/onnxruntime_gpu-1.25.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ffe9df4016b061ec3a5565a4fc08cdb86808cd8b9c255c42301066c0c24a81b5", size = 270345126, upload-time = "2026-04-22T17:28:44.416Z" },
+ { url = "https://files.pythonhosted.org/packages/56/c8/aae22f3c9cea9160d8d969734a1927720fcb4d4ad4abe269c407c1d2b63c/onnxruntime_gpu-1.25.0-cp314-cp314-win_amd64.whl", hash = "sha256:2173b71631208177fe704ce2d92eac3acbf758285327247ea40a31a9f0bcc073", size = 223385369, upload-time = "2026-04-22T17:21:06.026Z" },
+ { url = "https://files.pythonhosted.org/packages/ed/0a/79fba6a1a32803a2bf8b99187e0ea5d5d69ffe0c5c0f469bde232ceb8327/onnxruntime_gpu-1.25.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8576c721c600cc669717a2ae49af30fdfff230480099653adc7b79d58a240852", size = 270364130, upload-time = "2026-04-22T17:28:54.708Z" },
+]
+
[[package]]
name = "opentelemetry-api"
version = "1.40.0"