11#!/usr/bin/env python3
22"""
3- AAAK Dialect -- Compressed Symbolic Memory Language
3+ AAAK Dialect -- Structured Symbolic Summary Format
44====================================================
55
6- A structured symbolic format that ANY LLM reads natively at ~30x compression.
7- Not latent vectors. Not English prose. A universal memory compression dialect.
6+ A lossy summarization format that extracts entities, topics, key sentences,
7+ emotions, and flags from plain text into a compact structured representation.
8+ Any LLM reads it natively — no decoder required.
89
910Works with: Claude, ChatGPT, Gemini, Llama, Mistral -- any model that reads text.
1011
12+ NOTE: AAAK is NOT lossless compression. The original text cannot be reconstructed
13+ from AAAK output. It is a structured summary layer (closets) that points to the
14+ original verbatim content (drawers). The 96.6% benchmark score is from raw mode,
15+ not AAAK mode.
16+
1117Adapted for mempalace: works standalone on plain text and ChromaDB drawers.
1218No dependency on palace.py or layers.py.
1319
@@ -538,19 +544,19 @@ def _detect_entities_in_text(self, text: str) -> List[str]:
538544
539545 def compress (self , text : str , metadata : dict = None ) -> str :
540546 """
541- Compress plain text into AAAK Dialect format.
547+ Summarize plain text into AAAK Dialect format.
542548
543- This is the primary method for mempalace: takes any text content
544- (drawer content, transcript chunk, note) and returns a compressed
545- symbolic representation .
549+ Extracts entities, topics, a key sentence, emotions, and flags
550+ from the input text. This is lossy — the original text cannot be
551+ reconstructed from the output .
546552
547553 Args:
548- text: Plain text content to compress
554+ text: Plain text content to summarize
549555 metadata: Optional dict with keys like 'source_file', 'wing',
550556 'room', 'date', etc.
551557
552558 Returns:
553- AAAK-compressed string (~30x smaller than input)
559+ AAAK-formatted summary string
554560 """
555561 metadata = metadata or {}
556562
@@ -930,19 +936,34 @@ def decode(self, dialect_text: str) -> dict:
930936
931937 @staticmethod
932938 def count_tokens (text : str ) -> int :
933- """Rough token count (1 token ~ 3 chars for structured text)."""
934- return len (text ) // 3
939+ """Estimate token count using a word-based heuristic (~1.3 tokens per word).
940+
941+ This is an approximation. For accurate counts, use a real tokenizer
942+ like tiktoken. The old len(text)//3 heuristic was wildly inaccurate
943+ and made AAAK compression ratios look much better than reality.
944+ """
945+ words = text .split ()
946+ # Most English words tokenize to 1-2 tokens (~1.3 tokens/word on average).
947+ # Punctuation and special chars in AAAK (|, +, :) each cost a token, but the
948+ # whitespace split above does not count them, so AAAK text may be underestimated.
949+ return max (1 , int (len (words ) * 1.3 ))
935950
936951 def compression_stats (self , original_text : str , compressed : str ) -> dict :
937- """Get compression statistics for a text->AAAK conversion."""
952+ """Get size comparison stats for a text->AAAK conversion.
953+
954+ NOTE: AAAK is lossy summarization, not compression. The "ratio"
955+ reflects how much shorter the summary is, not a compression ratio
956+ in the traditional sense — information is lost.
957+ """
938958 orig_tokens = self .count_tokens (original_text )
939959 comp_tokens = self .count_tokens (compressed )
940960 return {
941- "original_tokens " : orig_tokens ,
942- "compressed_tokens " : comp_tokens ,
943- "ratio " : orig_tokens / max (comp_tokens , 1 ),
961+ "original_tokens_est " : orig_tokens ,
962+ "summary_tokens_est " : comp_tokens ,
963+ "size_ratio " : round ( orig_tokens / max (comp_tokens , 1 ) , 1 ),
944964 "original_chars" : len (original_text ),
945- "compressed_chars" : len (compressed ),
965+ "summary_chars" : len (compressed ),
966+ "note" : "Estimates only. Use tiktoken for accurate counts. AAAK is lossy." ,
946967 }
947968
948969
@@ -1021,9 +1042,9 @@ def usage():
10211042 encoded = dialect .encode_file (data )
10221043 stats = dialect .compression_stats (json_str , encoded )
10231044 print ("=== COMPRESSION STATS ===" )
1024- print (f"JSON: ~{ stats ['original_tokens ' ]:,} tokens" )
1025- print (f"AAAK: ~{ stats ['compressed_tokens ' ]:,} tokens" )
1026- print (f"Ratio: { stats ['ratio' ]:.0f } x " )
1045+ print (f"JSON: ~{ stats ['original_tokens_est ' ]:,} tokens (est) " )
1046+ print (f"AAAK: ~{ stats ['summary_tokens_est ' ]:,} tokens (est) " )
1047+ print (f"Ratio: { stats ['size_ratio' ] } x (lossy — information is lost) " )
10271048 print ()
10281049 print ("=== AAAK DIALECT OUTPUT ===" )
10291050 print (encoded )
@@ -1043,8 +1064,12 @@ def usage():
10431064 text = " " .join (args )
10441065 compressed = dialect .compress (text )
10451066 stats = dialect .compression_stats (text , compressed )
1046- print (f"Original: ~{ stats ['original_tokens' ]} tokens ({ stats ['original_chars' ]} chars)" )
1047- print (f"AAAK: ~{ stats ['compressed_tokens' ]} tokens ({ stats ['compressed_chars' ]} chars)" )
1048- print (f"Ratio: { stats ['ratio' ]:.1f} x" )
1067+ print (
1068+ f"Original: ~{ stats ['original_tokens_est' ]} tokens est ({ stats ['original_chars' ]} chars)"
1069+ )
1070+ print (
1071+ f"AAAK: ~{ stats ['summary_tokens_est' ]} tokens est ({ stats ['summary_chars' ]} chars)"
1072+ )
1073+ print (f"Ratio: { stats ['size_ratio' ]} x (lossy summary, not lossless compression)" )
10491074 print ()
10501075 print (compressed )
0 commit comments