Skip to content

Commit 924d078

Browse files
jphein and claude authored
feat(mine): queue requests during repair-rebuild + drain after (#4)
Mirrors the existing /silent-save queue pattern for /mine. Closes a gap JP noticed: when the daemon is in repair-mode rebuild, hook fires that POST /mine fail outright (the rebuild replaces the collection mid-flight; running a concurrent mine subprocess would race the swap). The /silent-save queue covered diary writes; /mine had no equivalent, so transcript-ingest requests during a rebuild window were lost. Adds three pieces, all parallel to the silent-save infrastructure: * `_pending_mines_path()` — separate jsonl queue file (next to the silent-save pending file). * `_enqueue_pending_mine(payload)` — appends a /mine request body to the queue, off-loop via asyncio.to_thread. * `_drain_pending_mines()` — replays queued mines after rebuild via the same subprocess pattern the live /mine endpoint uses, gated by `_mine_sem`. Same rename-then-read pattern as `_drain_pending_writes` so concurrent /mine POSTs landing during the drain go to a fresh queue file. Dedup by (dir, wing, mode) before replay — a storm of hook fires queues the same target many times, but one mine catches up all of them via convo_miner's mtime-based dedup, so we don't need to run it N times. The /mine endpoint checks `_repair_state` and queues if in rebuild mode, returning `{"queued": true, "reason": "repair-in-progress"}` to signal the caller. After the rebuild completes, `_drain_pending_mines()` runs alongside `_drain_pending_writes()`. Also extends `/repair/status` to surface `pending_mines` count alongside `pending_writes`. Tests: 5 new unittest cases — pending-mines path is distinct from writes, enqueue→drain replays each target, drain dedups repeated targets (hook-storm scenario), failed replays quarantine to .failed-* files instead of getting lost, empty queue returns 0. NOT a fix for the HNSW corruption that prompted the current incident — that came from concurrent update_drawer calls hitting chromadb's HNSW concurrency hazards (CLAUDE.md row 15). 
The corruption-side fix is `PALACE_MAX_WRITE_CONCURRENCY=1` in the daemon env. This PR covers a separate failure mode: hook writes during repair windows. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent dedface commit 924d078

2 files changed

Lines changed: 282 additions & 9 deletions

File tree

main.py

Lines changed: 148 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,108 @@ def _append():
320320
await asyncio.to_thread(_append)
321321

322322

323+
def _pending_mines_path() -> str:
    """Return the path of the jsonl queue file holding /mine requests that
    arrived while a rebuild was in progress.

    Kept distinct from the silent-save queue: queued mines are replayed as
    fire-and-forget subprocess invocations (the same pattern the live /mine
    endpoint uses), not through _do_silent_save_write.
    """
    # Queue lives next to the palace directory, i.e. in its parent.
    base_dir = os.path.dirname(_mp._config.palace_path.rstrip("/"))
    if not base_dir:
        # Degenerate palace_path (no parent component) — fall back to $HOME.
        base_dir = os.path.expanduser("~")
    return os.path.join(base_dir, "palace-daemon-pending-mines.jsonl")
333+
334+
335+
async def _enqueue_pending_mine(payload: dict) -> None:
    """Record a /mine request body in the pending-mines jsonl queue.

    The file append runs in a worker thread via asyncio.to_thread so the
    event loop never blocks on disk I/O.
    """
    queue_file = _pending_mines_path()
    record = {"payload": payload, "enqueued_at": datetime.now().isoformat()}
    serialized = json.dumps(record)

    def _write_record() -> None:
        # Append-only jsonl: one request per line.
        with open(queue_file, "a", encoding="utf-8") as fh:
            fh.write(serialized + "\n")

    await asyncio.to_thread(_write_record)
345+
346+
347+
async def _drain_pending_mines() -> int:
    """Replay queued /mine requests after a rebuild completes.

    Same rename-then-read pattern as _drain_pending_writes — the live queue
    file is renamed to a ``.processing`` sibling first, so concurrent /mine
    POSTs landing during the drain go to a fresh pending file instead of the
    one being consumed. Each surviving entry is replayed by spawning the
    same ``mempalace mine`` subprocess the live /mine endpoint would, gated
    by _mine_sem (presumably a semaphore bounding concurrent mine
    subprocesses, mirroring the live endpoint — confirm). Entries that fail
    to parse or whose replay fails are quarantined to a timestamped
    ``.failed-*`` file rather than lost.

    Returns:
        Number of entries successfully replayed (after dedup, so at most
        one per unique (dir, wing, mode) target).
    """
    path = _pending_mines_path()
    if not os.path.isfile(path):
        # Nothing was queued during the rebuild.
        return 0
    proc_path = path + ".processing"
    try:
        # Claim the queue by renaming it; an OSError (e.g. another drain
        # already took it) means there is nothing for us to do.
        os.rename(path, proc_path)
    except OSError:
        return 0
    count = 0
    # Raw jsonl lines (newline-terminated) destined for the quarantine file.
    failed_lines: list[str] = []
    try:
        with open(proc_path, encoding="utf-8") as f:
            lines = [ln for ln in f.readlines() if ln.strip()]
        # Dedup queued mines by (dir, wing, mode) — re-mining the same dir
        # is the goal, not running it N times. A storm of hook fires during
        # rebuild may have queued the same target dozens of times; one
        # successful drain replay covers them all.
        seen: set = set()
        unique_entries: list = []
        for line in reversed(lines):  # keep newest of each (dir, wing, mode)
            try:
                entry = json.loads(line)
                payload = entry.get("payload", {})
                key = (payload.get("dir"), payload.get("wing"), payload.get("mode", "convos"))
                if key in seen:
                    continue
                seen.add(key)
                unique_entries.append((line, entry))
            except json.JSONDecodeError:
                # Corrupt line — quarantine rather than silently drop.
                failed_lines.append(line)
        # Replay in original order (the reversed scan above collected
        # newest-first, so flip back).
        unique_entries.reverse()
        for line, entry in unique_entries:
            try:
                payload = entry["payload"]
                # "dir" was stored untranslated so the replay re-translates
                # against the current client-path mapping.
                directory = _translate_client_path(payload["dir"])
                if not Path(directory).is_dir():
                    # Target vanished since enqueue; skip without quarantining.
                    _log.warning("drain-mine: skipping %s — not a directory", directory)
                    continue
                wing = payload.get("wing", "general")
                mode = payload.get("mode", "convos")
                # Same binary the live /mine endpoint shells out to.
                mempalace_bin = os.path.join(os.path.dirname(sys.executable), "mempalace")
                # NOTE(review): queued payloads carry "extract" and "limit"
                # fields, but they are not forwarded here — confirm whether
                # dropping them on replay is intentional.
                cmd = [mempalace_bin, "mine", directory, "--mode", mode, "--wing", wing]
                async with _mine_sem:
                    proc = await asyncio.create_subprocess_exec(
                        *cmd,
                        stdout=asyncio.subprocess.PIPE,
                        stderr=asyncio.subprocess.PIPE,
                    )
                    stdout, stderr = await proc.communicate()
                if proc.returncode == 0:
                    count += 1
                else:
                    _log.warning("drain-mine: replay returned %s for %s", proc.returncode, directory)
                    failed_lines.append(line)
            except Exception:
                # Keep draining the remaining entries; quarantine this one.
                _log.exception("drain-mine: entry replay raised")
                failed_lines.append(line)
        if failed_lines:
            # Quarantine instead of re-queueing so a permanently broken
            # entry is not retried on every future drain.
            qpath = proc_path + ".failed-" + datetime.now().strftime("%Y%m%d%H%M%S")
            with open(qpath, "w", encoding="utf-8") as f:
                f.writelines(failed_lines)
            _log.warning("drain-mine: %d entries quarantined at %s", len(failed_lines), qpath)
        os.remove(proc_path)
    except Exception:
        # NOTE(review): a .processing file left behind here is never picked
        # up by later drains (they look only at the live queue path), so it
        # needs manual cleanup — confirm this is intended.
        _log.exception("drain-mine: read failed; leaving %s in place", proc_path)
    return count
423+
424+
323425
async def _drain_pending_writes() -> int:
324426
"""Replay queued silent-saves after a rebuild completes.
325427
@@ -1164,6 +1266,32 @@ async def mine(request: Request, x_api_key: str | None = Header(default=None)):
11641266
except (TypeError, ValueError):
11651267
raise HTTPException(status_code=400, detail="'limit' must be an integer")
11661268

1269+
# During /repair mode=rebuild, queue the mine instead of executing it.
1270+
# Mirrors the /silent-save queue pattern — the rebuild replaces the
1271+
# collection mid-flight, so any concurrent mine subprocess would race
1272+
# the swap. After repair completes, _drain_pending_mines() replays
1273+
# queued mines through the same code path. Pass-through fields preserve
1274+
# extract/limit on replay.
1275+
if (
1276+
_repair_state["in_progress"]
1277+
and _repair_state.get("mode") == "rebuild"
1278+
):
1279+
await _enqueue_pending_mine({
1280+
"dir": body.get("dir"), # original (untranslated) path so replay translates fresh
1281+
"wing": wing,
1282+
"mode": mode,
1283+
"extract": extract,
1284+
"limit": limit,
1285+
})
1286+
return {
1287+
"queued": True,
1288+
"reason": "repair-in-progress",
1289+
"systemMessage": (
1290+
"Mine queued — palace is rebuilding. Will replay automatically "
1291+
"when repair completes."
1292+
),
1293+
}
1294+
11671295
mempalace_bin = os.path.join(os.path.dirname(sys.executable), "mempalace")
11681296
cmd = [mempalace_bin, "mine", directory, "--mode", mode, "--wing", wing]
11691297
if extract:
@@ -1381,37 +1509,48 @@ async def repair(request: Request, x_api_key: str | None = Header(default=None))
13811509
_repair_state["mode"] = None
13821510
_repair_state["started_at"] = None
13831511

1512+
drained_mines = 0
13841513
if mode == "rebuild":
13851514
drained = await _drain_pending_writes()
1515+
# Also replay any /mine requests queued during the rebuild. Mirrors
1516+
# _drain_pending_writes — same rename-then-read, dedup-by-target,
1517+
# subprocess re-execution.
1518+
drained_mines = await _drain_pending_mines()
13861519

13871520
duration = (datetime.now() - start).total_seconds()
13881521
_log.info(messages.repair_complete(mode, drained, duration))
13891522
return {
13901523
"mode": mode,
13911524
"result": result,
13921525
"drained": drained,
1526+
"drained_mines": drained_mines,
13931527
"duration_s": round(duration, 3),
13941528
"systemMessage": messages.repair_complete(mode, drained, duration),
13951529
}
13961530

13971531

13981532
@app.get("/repair/status")
13991533
async def repair_status():
1400-
"""Current repair state + pending-writes queue depth."""
1401-
queue_path = _pending_writes_path()
1402-
pending = 0
1403-
if os.path.isfile(queue_path):
1534+
"""Current repair state + pending-writes + pending-mines queue depths."""
1535+
def _count_lines(path: str) -> int:
1536+
if not os.path.isfile(path):
1537+
return 0
14041538
try:
1405-
with open(queue_path, encoding="utf-8") as f:
1406-
pending = sum(1 for ln in f if ln.strip())
1539+
with open(path, encoding="utf-8") as f:
1540+
return sum(1 for ln in f if ln.strip())
14071541
except OSError:
1408-
pending = -1
1542+
return -1
1543+
1544+
writes_path = _pending_writes_path()
1545+
mines_path = _pending_mines_path()
14091546
return {
14101547
"in_progress": _repair_state["in_progress"],
14111548
"mode": _repair_state["mode"],
14121549
"started_at": _repair_state["started_at"],
1413-
"pending_writes": pending,
1414-
"pending_writes_path": queue_path,
1550+
"pending_writes": _count_lines(writes_path),
1551+
"pending_writes_path": writes_path,
1552+
"pending_mines": _count_lines(mines_path),
1553+
"pending_mines_path": mines_path,
14151554
}
14161555

14171556

tests/test_mine_queue.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
"""Tests for /mine queue + drain during repair=rebuild.
2+
3+
Mirrors the silent-save queue contract: while a rebuild is in progress,
4+
/mine requests are appended to a jsonl file at ``_pending_mines_path()``
5+
and replayed by ``_drain_pending_mines()`` after the rebuild completes.
6+
7+
Run with::
8+
9+
python -m unittest tests.test_mine_queue -v
10+
11+
Pure-function and pure-IO tests; no live daemon required. The drain
12+
test stubs the chromadb subprocess invocation by monkey-patching
13+
``asyncio.create_subprocess_exec``.
14+
"""
15+
import asyncio
16+
import json
17+
import os
18+
import sys
19+
import tempfile
20+
import unittest
21+
from unittest.mock import patch, AsyncMock, MagicMock
22+
23+
_HERE = os.path.dirname(os.path.abspath(__file__))
24+
_ROOT = os.path.dirname(_HERE)
25+
if _ROOT not in sys.path:
26+
sys.path.insert(0, _ROOT)
27+
28+
import main # noqa: E402
29+
30+
31+
class TestPendingMinesPath(unittest.TestCase):
    """Both queue files share the silent-save file's parent directory but
    have distinct basenames, so a daemon-busy save and a daemon-busy mine
    can queue side by side without clobbering each other."""

    def test_path_is_separate_from_writes_path(self):
        writes_queue = main._pending_writes_path()
        mines_queue = main._pending_mines_path()
        # Same parent (both derive from _config.palace_path), different file.
        self.assertNotEqual(writes_queue, mines_queue)
        self.assertTrue(mines_queue.endswith("palace-daemon-pending-mines.jsonl"))
42+
43+
44+
class TestEnqueueAndDrain(unittest.IsolatedAsyncioTestCase):
    """End-to-end: enqueue a few payloads, then drain — verify dedup + replay.

    Each drain test stubs ``asyncio.create_subprocess_exec`` so no real
    mempalace subprocess is spawned; the stub's returncode drives the
    success/failure paths of ``_drain_pending_mines``.
    """

    async def asyncSetUp(self):
        # Redirect the queue file into a throwaway tmp dir so tests never
        # touch the real palace-daemon-pending-mines.jsonl.
        self.tmp = tempfile.TemporaryDirectory()
        # Point the queue file inside the tmp dir
        self._queue_path = os.path.join(self.tmp.name, "pending-mines.jsonl")
        self._patches = [
            patch.object(main, "_pending_mines_path", return_value=self._queue_path),
            # Skip path translation for tests — identity function.
            patch.object(main, "_translate_client_path", side_effect=lambda p: p),
        ]
        for p in self._patches:
            p.start()
        # Make every replayed dir "exist" so the drain doesn't skip entries.
        self._is_dir_patch = patch("pathlib.Path.is_dir", return_value=True)
        self._is_dir_patch.start()

    async def asyncTearDown(self):
        for p in self._patches:
            p.stop()
        self._is_dir_patch.stop()
        self.tmp.cleanup()

    async def test_enqueue_then_drain_replays_each_target(self):
        """Two distinct targets queued → both replayed, queue file removed."""
        await main._enqueue_pending_mine({"dir": "/a", "wing": "wa", "mode": "convos"})
        await main._enqueue_pending_mine({"dir": "/b", "wing": "wb", "mode": "convos"})
        self.assertTrue(os.path.isfile(self._queue_path))

        # Stub the subprocess: each call returns rc=0 (successful mine).
        async def _fake_subprocess(*args, **kwargs):
            proc = MagicMock()
            proc.communicate = AsyncMock(return_value=(b"", b""))
            proc.returncode = 0
            return proc

        with patch("asyncio.create_subprocess_exec", side_effect=_fake_subprocess) as spawn:
            count = await main._drain_pending_mines()
        self.assertEqual(count, 2)
        # Queue file is gone after a clean drain
        self.assertFalse(os.path.isfile(self._queue_path))
        # Subprocess was invoked twice (once per unique target)
        self.assertEqual(spawn.call_count, 2)

    async def test_drain_dedups_repeated_target(self):
        """A storm of hook fires queues the same (dir, wing, mode) many times.
        Drain replays once per unique target — a single mine catches up all
        the queued requests via convo_miner's mtime-based dedup anyway."""
        for _ in range(10):
            await main._enqueue_pending_mine({"dir": "/a", "wing": "wa", "mode": "convos"})

        async def _fake_subprocess(*args, **kwargs):
            proc = MagicMock()
            proc.communicate = AsyncMock(return_value=(b"", b""))
            proc.returncode = 0
            return proc

        with patch("asyncio.create_subprocess_exec", side_effect=_fake_subprocess) as spawn:
            count = await main._drain_pending_mines()
        # Ten queued entries collapse into one replay of the unique target.
        self.assertEqual(count, 1)
        self.assertEqual(spawn.call_count, 1)

    async def test_drain_quarantines_failed_replays(self):
        """A non-zero subprocess exit doesn't lose the queue entry — it
        moves to a timestamped .failed-* file so the next drain doesn't
        replay it again."""
        await main._enqueue_pending_mine({"dir": "/a", "wing": "wa", "mode": "convos"})

        # rc=1 with stderr output simulates a failing mine subprocess.
        async def _fake_subprocess(*args, **kwargs):
            proc = MagicMock()
            proc.communicate = AsyncMock(return_value=(b"", b"boom"))
            proc.returncode = 1
            return proc

        with patch("asyncio.create_subprocess_exec", side_effect=_fake_subprocess):
            count = await main._drain_pending_mines()
        self.assertEqual(count, 0)
        # Original queue file was removed; a .failed-* sibling exists
        self.assertFalse(os.path.isfile(self._queue_path))
        siblings = os.listdir(self.tmp.name)
        failed = [s for s in siblings if ".failed-" in s]
        self.assertEqual(len(failed), 1)

    async def test_drain_empty_queue_returns_zero(self):
        """No queue file → no work → return 0, no error."""
        count = await main._drain_pending_mines()
        self.assertEqual(count, 0)
131+
132+
133+
if __name__ == "__main__":
134+
unittest.main()

0 commit comments

Comments
 (0)