Skip to content

Commit 74a31b7

Browse files
authored
Merge pull request #998 from MemPalace/fix/silent-transcript-drop
Fix silent transcript drop: .jsonl ingestion + 500 MB cap + tandem sweeper
2 parents e4a2cd4 + 4a088ea commit 74a31b7

7 files changed

Lines changed: 893 additions & 2 deletions

File tree

mempalace/cli.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,48 @@ def cmd_mine(args):
146146
)
147147

148148

149+
def cmd_sweep(args):
    """Sweep a transcript file or directory into the palace.

    The sweeper deduplicates against its own prior writes via
    deterministic drawer IDs + a timestamp cursor. It does NOT currently
    coordinate with the file-level miners (miner.py / convo_miner.py) —
    those produce char-chunked drawers without compatible message
    metadata, so running both miners may store overlapping content under
    different IDs.

    Exit codes: 2 when a directory sweep had per-file failures; 1 when
    *target* is neither a file nor a directory; 0 otherwise.
    """
    from .sweeper import sweep, sweep_directory

    # NOTE(review): the `sweep` subparser registers only `target`; `--palace`
    # is presumably a global flag on the main parser shared by all
    # subcommands — confirm. getattr() keeps this command from dying with
    # AttributeError if it is not, falling back to the configured default.
    palace_arg = getattr(args, "palace", None)
    palace_path = os.path.expanduser(palace_arg) if palace_arg else MempalaceConfig().palace_path
    target = os.path.expanduser(args.target)

    if os.path.isfile(target):
        result = sweep(target, palace_path)
        print(
            f" Swept {target}: +{result['drawers_added']} new, "
            f"{result['drawers_already_present']} already present, "
            f"{result['drawers_skipped']} skipped (< cursor)."
        )
    elif os.path.isdir(target):
        result = sweep_directory(target, palace_path)
        print(
            f" Swept {result['files_succeeded']}/{result['files_attempted']} "
            f"files from {target}: +{result['drawers_added']} new, "
            f"{result['drawers_already_present']} already present, "
            f"{result['drawers_skipped']} skipped (< cursor)."
        )
        # Partial failure is reported but non-zero so scripts/cron notice it.
        failures = result.get("failures") or []
        if failures:
            print(
                f" ⚠ {len(failures)} file(s) failed to sweep — see stderr / logs for details.",
                file=sys.stderr,
            )
            sys.exit(2)
    else:
        print(f" ✗ Not a file or directory: {target}", file=sys.stderr)
        sys.exit(1)
189+
190+
149191
def cmd_search(args):
150192
from .searcher import search, SearchError
151193

@@ -547,6 +589,17 @@ def main():
547589
help="Extraction strategy for convos mode: 'exchange' (default) or 'general' (5 memory types)",
548590
)
549591

592+
# sweep
593+
# sweep — the tandem miner subcommand (see cmd_sweep).
p_sweep = sub.add_parser(
    "sweep",
    help="Tandem miner: catch anything the primary miner missed "
    "(message-level, timestamp-coordinated, idempotent)",
)
p_sweep.add_argument(
    "target",
    help="A .jsonl transcript file, or a directory to scan recursively",
)
# NOTE(review): cmd_sweep reads args.palace, but no --palace option is
# registered on this subparser — presumably --palace is a flag on the
# main parser shared by all subcommands; confirm, otherwise
# `mempalace sweep <target>` raises AttributeError.
602+
550603
# search
551604
p_search = sub.add_parser("search", help="Find anything, exact words")
552605
p_search.add_argument("query", help="What to search for")
@@ -679,6 +732,7 @@ def main():
679732
"mine": cmd_mine,
680733
"split": cmd_split,
681734
"search": cmd_search,
735+
"sweep": cmd_sweep,
682736
"mcp": cmd_mcp,
683737
"compress": cmd_compress,
684738
"wake-up": cmd_wakeup,

mempalace/convo_miner.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,14 @@ def _detect_hall_cached(content: str) -> str:
5555

5656
MIN_CHUNK_SIZE = 30  # chars — drop chunks smaller than this
CHUNK_SIZE = 800  # chars per drawer — align with miner.py
MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB — skip files larger than this.
# Matches miner.py at 500 MB. Long Claude Code sessions, multi-year
# ChatGPT exports, and lifetime Slack dumps routinely exceed 10 MB; the
# cap at that level silently dropped them with `continue`. Per-drawer
# size is bounded by CHUNK_SIZE, but larger source files still produce
# more drawers and therefore more embedding/storage work — and content
# is normalized and loaded fully into memory before chunking, so memory
# use also scales with source size.
5966

6067

6168
def _register_file(collection, source_file: str, wing: str, agent: str):

mempalace/miner.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
".jsx",
3737
".tsx",
3838
".json",
39+
".jsonl",
3940
".yaml",
4041
".yml",
4142
".html",
@@ -62,7 +63,14 @@
6263
CHUNK_SIZE = 800 # chars per drawer
6364
CHUNK_OVERLAP = 100 # overlap between chunks
6465
MIN_CHUNK_SIZE = 50 # skip tiny chunks
65-
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
66+
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
67+
# Long Claude Code sessions and large transcript exports routinely exceed
68+
# 10 MB. The cap exists as a defensive rail against pathological binary
69+
# files, not as a limit on legitimate text. Per-drawer size is bounded
70+
# by CHUNK_SIZE, but larger sources still produce proportionally more
71+
# drawers and therefore more storage, embedding, and processing work —
72+
# and file reads are not streamed (the whole content is loaded into
73+
# memory before chunking), so memory use scales with source size too.
6674

6775

6876
# =============================================================================

0 commit comments

Comments
 (0)