fix: restore mempalace compress after stats rename (MemPalace#159)

mvalentsev · mvalentsev · commit 65788516b67f · 2026-04-11T16:24:59.000+05:00
The honest-stats rename in PR MemPalace#147 changed the keys returned by Dialect.compression_stats() (ratio -> size_ratio, compressed_chars -> summary_chars, original_tokens / compressed_tokens -> original_tokens_est / summary_tokens_est). cmd_compress still reads the old names, so "mempalace compress" raises KeyError on the first drawer it touches and the feature is effectively dead. Also fix the summary line at the bottom of cmd_compress. It called Dialect.count_tokens("x" * total_original), but count_tokens is word-based (max(1, int(len(text.split()) * 1.3))), and a string of repeated "x" characters is a single "word", so both totals were always 1. Accumulate the per-drawer estimates during the main loop instead, and use a token-based ratio so the summary line is self-consistent with the per-drawer dry-run output. The storage metadata key names on the compressed collection (compression_ratio, original_tokens) stay the same for compatibility with anything already reading them; only the source of the values is updated. Fixes MemPalace#159 (points 1 and 2).
diff --git a/mempalace/cli.py b/mempalace/cli.py
@@ -340,16 +340,16 @@ def cmd_compress(args):
     )
     print()
 
-    total_original = 0
-    total_compressed = 0
+    total_orig_tokens = 0
+    total_comp_tokens = 0
     compressed_entries = []
 
     for doc, meta, doc_id in zip(docs, metas, ids):
         compressed = dialect.compress(doc, metadata=meta)
         stats = dialect.compression_stats(doc, compressed)
 
-        total_original += stats["original_chars"]
-        total_compressed += stats["compressed_chars"]
+        total_orig_tokens += stats["original_tokens_est"]
+        total_comp_tokens += stats["summary_tokens_est"]
 
         compressed_entries.append((doc_id, compressed, meta, stats))
 
@@ -359,7 +359,8 @@ def cmd_compress(args):
             source = Path(meta.get("source_file", "?")).name
             print(f"  [{wing_name}/{room_name}] {source}")
             print(
-                f"    {stats['original_tokens']}t -> {stats['compressed_tokens']}t ({stats['ratio']:.1f}x)"
+                f"    {stats['original_tokens_est']}t -> {stats['summary_tokens_est']}t "
+                f"({stats['size_ratio']:.1f}x)"
             )
             print(f"    {compressed}")
             print()
@@ -370,8 +371,8 @@ def cmd_compress(args):
             comp_col = client.get_or_create_collection("mempalace_compressed")
             for doc_id, compressed, meta, stats in compressed_entries:
                 comp_meta = dict(meta)
-                comp_meta["compression_ratio"] = round(stats["ratio"], 1)
-                comp_meta["original_tokens"] = stats["original_tokens"]
+                comp_meta["compression_ratio"] = round(stats["size_ratio"], 1)
+                comp_meta["original_tokens"] = stats["original_tokens_est"]
                 comp_col.upsert(
                     ids=[doc_id],
                     documents=[compressed],
@@ -384,11 +385,9 @@ def cmd_compress(args):
             print(f"  Error storing compressed drawers: {e}")
             sys.exit(1)
 
-    # Summary
-    ratio = total_original / max(total_compressed, 1)
-    orig_tokens = Dialect.count_tokens("x" * total_original)
-    comp_tokens = Dialect.count_tokens("x" * total_compressed)
-    print(f"  Total: {orig_tokens:,}t -> {comp_tokens:,}t ({ratio:.1f}x compression)")
+    # Summary: token-based ratio stays consistent with the per-drawer line.
+    ratio = total_orig_tokens / max(total_comp_tokens, 1)
+    print(f"  Total: {total_orig_tokens:,}t -> {total_comp_tokens:,}t ({ratio:.1f}x compression)")
     if args.dry_run:
         print("  (dry run -- nothing stored)")