Skip to content

Commit 68e3414

Browse files
authored
Merge pull request #147 from milla-jovovich/fix/aaak-honest-stats
fix: honest AAAK stats — word-based token estimator, lossy labels
2 parents 27623a3 + 39c14be commit 68e3414

1 file changed

Lines changed: 47 additions & 22 deletions

File tree

mempalace/dialect.py

Lines changed: 47 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
11
#!/usr/bin/env python3
22
"""
3-
AAAK Dialect -- Compressed Symbolic Memory Language
3+
AAAK Dialect -- Structured Symbolic Summary Format
44
====================================================
55
6-
A structured symbolic format that ANY LLM reads natively at ~30x compression.
7-
Not latent vectors. Not English prose. A universal memory compression dialect.
6+
A lossy summarization format that extracts entities, topics, key sentences,
7+
emotions, and flags from plain text into a compact structured representation.
8+
Any LLM reads it natively — no decoder required.
89
910
Works with: Claude, ChatGPT, Gemini, Llama, Mistral -- any model that reads text.
1011
12+
NOTE: AAAK is NOT lossless compression. The original text cannot be reconstructed
13+
from AAAK output. It is a structured summary layer (closets) that points to the
14+
original verbatim content (drawers). The 96.6% benchmark score is from raw mode,
15+
not AAAK mode.
16+
1117
Adapted for mempalace: works standalone on plain text and ChromaDB drawers.
1218
No dependency on palace.py or layers.py.
1319
@@ -538,19 +544,19 @@ def _detect_entities_in_text(self, text: str) -> List[str]:
538544

539545
def compress(self, text: str, metadata: dict = None) -> str:
540546
"""
541-
Compress plain text into AAAK Dialect format.
547+
Summarize plain text into AAAK Dialect format.
542548
543-
This is the primary method for mempalace: takes any text content
544-
(drawer content, transcript chunk, note) and returns a compressed
545-
symbolic representation.
549+
Extracts entities, topics, a key sentence, emotions, and flags
550+
from the input text. This is lossy — the original text cannot be
551+
reconstructed from the output.
546552
547553
Args:
548-
text: Plain text content to compress
554+
text: Plain text content to summarize
549555
metadata: Optional dict with keys like 'source_file', 'wing',
550556
'room', 'date', etc.
551557
552558
Returns:
553-
AAAK-compressed string (~30x smaller than input)
559+
AAAK-formatted summary string
554560
"""
555561
metadata = metadata or {}
556562

@@ -930,19 +936,34 @@ def decode(self, dialect_text: str) -> dict:
930936

931937
@staticmethod
932938
def count_tokens(text: str) -> int:
933-
"""Rough token count (1 token ~ 3 chars for structured text)."""
934-
return len(text) // 3
939+
"""Estimate token count using word-based heuristic (~1.3 tokens per word).
940+
941+
This is an approximation. For accurate counts, use a real tokenizer
942+
like tiktoken. The old len(text)//3 heuristic was wildly inaccurate
943+
and made AAAK compression ratios look much better than reality.
944+
"""
945+
words = text.split()
946+
# Most English words tokenize to 1-2 tokens; punctuation and
947+
# special chars in AAAK (|, +, :) each cost a token.
948+
# ~1.3 tokens/word is a conservative average.
949+
return max(1, int(len(words) * 1.3))
935950

936951
def compression_stats(self, original_text: str, compressed: str) -> dict:
    """Return size-comparison stats for a text->AAAK conversion.

    NOTE: AAAK is lossy summarization, not compression. The "ratio"
    reflects how much shorter the summary is, not a compression ratio
    in the traditional sense — information is lost.

    Args:
        original_text: The source text that was summarized.
        compressed: The AAAK-formatted summary produced from it.

    Returns:
        Dict of estimated token counts, character counts, the size
        ratio (rounded to one decimal), and a caveat note.
    """
    source_tokens = self.count_tokens(original_text)
    summary_tokens = self.count_tokens(compressed)
    # count_tokens never returns 0, but keep the divide-by-zero guard
    # so this method stays safe regardless of the estimator's contract.
    shrink_factor = round(source_tokens / max(summary_tokens, 1), 1)
    return {
        "original_tokens_est": source_tokens,
        "summary_tokens_est": summary_tokens,
        "size_ratio": shrink_factor,
        "original_chars": len(original_text),
        "summary_chars": len(compressed),
        "note": "Estimates only. Use tiktoken for accurate counts. AAAK is lossy.",
    }
947968

948969

@@ -1021,9 +1042,9 @@ def usage():
10211042
encoded = dialect.encode_file(data)
10221043
stats = dialect.compression_stats(json_str, encoded)
10231044
print("=== COMPRESSION STATS ===")
1024-
print(f"JSON: ~{stats['original_tokens']:,} tokens")
1025-
print(f"AAAK: ~{stats['compressed_tokens']:,} tokens")
1026-
print(f"Ratio: {stats['ratio']:.0f}x")
1045+
print(f"JSON: ~{stats['original_tokens_est']:,} tokens (est)")
1046+
print(f"AAAK: ~{stats['summary_tokens_est']:,} tokens (est)")
1047+
print(f"Ratio: {stats['size_ratio']}x (lossy — information is lost)")
10271048
print()
10281049
print("=== AAAK DIALECT OUTPUT ===")
10291050
print(encoded)
@@ -1043,8 +1064,12 @@ def usage():
10431064
text = " ".join(args)
10441065
compressed = dialect.compress(text)
10451066
stats = dialect.compression_stats(text, compressed)
1046-
print(f"Original: ~{stats['original_tokens']} tokens ({stats['original_chars']} chars)")
1047-
print(f"AAAK: ~{stats['compressed_tokens']} tokens ({stats['compressed_chars']} chars)")
1048-
print(f"Ratio: {stats['ratio']:.1f}x")
1067+
print(
1068+
f"Original: ~{stats['original_tokens_est']} tokens est ({stats['original_chars']} chars)"
1069+
)
1070+
print(
1071+
f"AAAK: ~{stats['summary_tokens_est']} tokens est ({stats['summary_chars']} chars)"
1072+
)
1073+
print(f"Ratio: {stats['size_ratio']}x (lossy summary, not lossless compression)")
10491074
print()
10501075
print(compressed)

0 commit comments

Comments
 (0)