Skip to content

Commit 74a31b7

Browse files
authored
Merge pull request #998 from MemPalace/fix/silent-transcript-drop
Fix silent transcript drop: .jsonl ingestion + 500 MB cap + tandem sweeper
2 parents e4a2cd4 + 4a088ea commit 74a31b7

7 files changed

Lines changed: 893 additions & 2 deletions

File tree

mempalace/cli.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,48 @@ def cmd_mine(args):
146146
)
147147

148148

149+
def cmd_sweep(args):
    """Sweep a transcript file or directory into the palace.

    The sweeper deduplicates against its own prior writes via
    deterministic drawer IDs + a timestamp cursor. It does NOT currently
    coordinate with the file-level miners (miner.py / convo_miner.py) —
    those produce char-chunked drawers without compatible message
    metadata, so running both miners may store overlapping content under
    different IDs.

    Exit codes: 2 when a directory sweep had per-file failures; 1 when
    *target* is neither a file nor a directory; 0 otherwise.
    """
    from .sweeper import sweep, sweep_directory

    # NOTE(review): the `sweep` subparser registers only `target`; `--palace`
    # is presumably a global flag on the main parser shared by all
    # subcommands — confirm. getattr() keeps this command from dying with
    # AttributeError if it is not, falling back to the configured default.
    palace_arg = getattr(args, "palace", None)
    palace_path = os.path.expanduser(palace_arg) if palace_arg else MempalaceConfig().palace_path
    target = os.path.expanduser(args.target)

    if os.path.isfile(target):
        result = sweep(target, palace_path)
        print(
            f" Swept {target}: +{result['drawers_added']} new, "
            f"{result['drawers_already_present']} already present, "
            f"{result['drawers_skipped']} skipped (< cursor)."
        )
    elif os.path.isdir(target):
        result = sweep_directory(target, palace_path)
        print(
            f" Swept {result['files_succeeded']}/{result['files_attempted']} "
            f"files from {target}: +{result['drawers_added']} new, "
            f"{result['drawers_already_present']} already present, "
            f"{result['drawers_skipped']} skipped (< cursor)."
        )
        # Partial failure is reported but non-zero so scripts/cron notice it.
        failures = result.get("failures") or []
        if failures:
            print(
                f" ⚠ {len(failures)} file(s) failed to sweep — see stderr / logs for details.",
                file=sys.stderr,
            )
            sys.exit(2)
    else:
        print(f" ✗ Not a file or directory: {target}", file=sys.stderr)
        sys.exit(1)
189+
190+
149191
def cmd_search(args):
150192
from .searcher import search, SearchError
151193

@@ -547,6 +589,17 @@ def main():
547589
help="Extraction strategy for convos mode: 'exchange' (default) or 'general' (5 memory types)",
548590
)
549591

592+
# sweep
593+
# sweep — the tandem miner subcommand (see cmd_sweep).
p_sweep = sub.add_parser(
    "sweep",
    help="Tandem miner: catch anything the primary miner missed "
    "(message-level, timestamp-coordinated, idempotent)",
)
p_sweep.add_argument(
    "target",
    help="A .jsonl transcript file, or a directory to scan recursively",
)
# NOTE(review): cmd_sweep reads args.palace, but no --palace option is
# registered on this subparser — presumably --palace is a flag on the
# main parser shared by all subcommands; confirm, otherwise
# `mempalace sweep <target>` raises AttributeError.
602+
550603
# search
551604
p_search = sub.add_parser("search", help="Find anything, exact words")
552605
p_search.add_argument("query", help="What to search for")
@@ -679,6 +732,7 @@ def main():
679732
"mine": cmd_mine,
680733
"split": cmd_split,
681734
"search": cmd_search,
735+
"sweep": cmd_sweep,
682736
"mcp": cmd_mcp,
683737
"compress": cmd_compress,
684738
"wake-up": cmd_wakeup,

mempalace/convo_miner.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,14 @@ def _detect_hall_cached(content: str) -> str:
5555

5656
MIN_CHUNK_SIZE = 30  # chars — drop chunks smaller than this
CHUNK_SIZE = 800  # chars per drawer — align with miner.py
MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB — skip files larger than this.
# Matches miner.py at 500 MB. Long Claude Code sessions, multi-year
# ChatGPT exports, and lifetime Slack dumps routinely exceed 10 MB; the
# cap at that level silently dropped them with `continue`. Per-drawer
# size is bounded by CHUNK_SIZE, but larger source files still produce
# more drawers and therefore more embedding/storage work — and content
# is normalized and loaded fully into memory before chunking, so memory
# use also scales with source size.
5966

6067

6168
def _register_file(collection, source_file: str, wing: str, agent: str):

mempalace/miner.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
".jsx",
3737
".tsx",
3838
".json",
39+
".jsonl",
3940
".yaml",
4041
".yml",
4142
".html",
@@ -62,7 +63,14 @@
6263
CHUNK_SIZE = 800 # chars per drawer
6364
CHUNK_OVERLAP = 100 # overlap between chunks
6465
MIN_CHUNK_SIZE = 50 # skip tiny chunks
65-
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
66+
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500 MB — skip files larger than this.
67+
# Long Claude Code sessions and large transcript exports routinely exceed
68+
# 10 MB. The cap exists as a defensive rail against pathological binary
69+
# files, not as a limit on legitimate text. Per-drawer size is bounded
70+
# by CHUNK_SIZE, but larger sources still produce proportionally more
71+
# drawers and therefore more storage, embedding, and processing work —
72+
# and file reads are not streamed (the whole content is loaded into
73+
# memory before chunking), so memory use scales with source size too.
6674

6775

6876
# =============================================================================

0 commit comments

Comments
 (0)