Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 47 additions & 22 deletions mempalace/dialect.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
#!/usr/bin/env python3
"""
AAAK Dialect -- Compressed Symbolic Memory Language
AAAK Dialect -- Structured Symbolic Summary Format
====================================================

A structured symbolic format that ANY LLM reads natively at ~30x compression.
Not latent vectors. Not English prose. A universal memory compression dialect.
A lossy summarization format that extracts entities, topics, key sentences,
emotions, and flags from plain text into a compact structured representation.
Any LLM reads it natively — no decoder required.

Works with: Claude, ChatGPT, Gemini, Llama, Mistral -- any model that reads text.

NOTE: AAAK is NOT lossless compression. The original text cannot be reconstructed
from AAAK output. It is a structured summary layer (closets) that points to the
original verbatim content (drawers). The 96.6% benchmark score is from raw mode,
not AAAK mode.

Adapted for mempalace: works standalone on plain text and ChromaDB drawers.
No dependency on palace.py or layers.py.

Expand Down Expand Up @@ -538,19 +544,19 @@ def _detect_entities_in_text(self, text: str) -> List[str]:

def compress(self, text: str, metadata: dict = None) -> str:
"""
Compress plain text into AAAK Dialect format.
Summarize plain text into AAAK Dialect format.

This is the primary method for mempalace: takes any text content
(drawer content, transcript chunk, note) and returns a compressed
symbolic representation.
Extracts entities, topics, a key sentence, emotions, and flags
from the input text. This is lossy — the original text cannot be
reconstructed from the output.

Args:
text: Plain text content to compress
text: Plain text content to summarize
metadata: Optional dict with keys like 'source_file', 'wing',
'room', 'date', etc.

Returns:
AAAK-compressed string (~30x smaller than input)
AAAK-formatted summary string
"""
metadata = metadata or {}

Expand Down Expand Up @@ -930,19 +936,34 @@ def decode(self, dialect_text: str) -> dict:

@staticmethod
def count_tokens(text: str) -> int:
"""Rough token count (1 token ~ 3 chars for structured text)."""
return len(text) // 3
"""Estimate token count using word-based heuristic (~1.3 tokens per word).

This is an approximation. For accurate counts, use a real tokenizer
like tiktoken. The old len(text)//3 heuristic was wildly inaccurate
and made AAAK compression ratios look much better than reality.
"""
words = text.split()
# Most English words tokenize to 1-2 tokens; punctuation and
# special chars in AAAK (|, +, :) each cost a token.
# ~1.3 tokens/word is a conservative average.
return max(1, int(len(words) * 1.3))

def compression_stats(self, original_text: str, compressed: str) -> dict:
"""Get compression statistics for a text->AAAK conversion."""
"""Get size comparison stats for a text->AAAK conversion.

NOTE: AAAK is lossy summarization, not compression. The "ratio"
reflects how much shorter the summary is, not a compression ratio
in the traditional sense — information is lost.
"""
orig_tokens = self.count_tokens(original_text)
comp_tokens = self.count_tokens(compressed)
return {
"original_tokens": orig_tokens,
"compressed_tokens": comp_tokens,
"ratio": orig_tokens / max(comp_tokens, 1),
"original_tokens_est": orig_tokens,
"summary_tokens_est": comp_tokens,
"size_ratio": round(orig_tokens / max(comp_tokens, 1), 1),
"original_chars": len(original_text),
"compressed_chars": len(compressed),
"summary_chars": len(compressed),
"note": "Estimates only. Use tiktoken for accurate counts. AAAK is lossy.",
}


Expand Down Expand Up @@ -1021,9 +1042,9 @@ def usage():
encoded = dialect.encode_file(data)
stats = dialect.compression_stats(json_str, encoded)
print("=== COMPRESSION STATS ===")
print(f"JSON: ~{stats['original_tokens']:,} tokens")
print(f"AAAK: ~{stats['compressed_tokens']:,} tokens")
print(f"Ratio: {stats['ratio']:.0f}x")
print(f"JSON: ~{stats['original_tokens_est']:,} tokens (est)")
print(f"AAAK: ~{stats['summary_tokens_est']:,} tokens (est)")
print(f"Ratio: {stats['size_ratio']}x (lossy — information is lost)")
print()
print("=== AAAK DIALECT OUTPUT ===")
print(encoded)
Expand All @@ -1043,8 +1064,12 @@ def usage():
text = " ".join(args)
compressed = dialect.compress(text)
stats = dialect.compression_stats(text, compressed)
print(f"Original: ~{stats['original_tokens']} tokens ({stats['original_chars']} chars)")
print(f"AAAK: ~{stats['compressed_tokens']} tokens ({stats['compressed_chars']} chars)")
print(f"Ratio: {stats['ratio']:.1f}x")
print(
f"Original: ~{stats['original_tokens_est']} tokens est ({stats['original_chars']} chars)"
)
print(
f"AAAK: ~{stats['summary_tokens_est']} tokens est ({stats['summary_chars']} chars)"
)
print(f"Ratio: {stats['size_ratio']}x (lossy summary, not lossless compression)")
print()
print(compressed)