test(dialect): strengthen compression_stats and count_tokens assertions

mvalentsev · mvalentsev · commit 794dc81a9ee1 · 2026-04-08T04:28:32.000+05:00
Adds coverage on top of the honest-stats test fix that already landed
in main:

- test_stats now also asserts original_tokens_est is strictly greater
  than summary_tokens_est, which catches a class of regressions where
  the token estimator flattens to a constant.
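- test_count_tokens gains edge cases for the empty string and the
  single-word input. The empty string exercises the max(1, ...) guard
  in Dialect.count_tokens, so a future refactor that drops the guard
  fails loudly instead of silently returning 0. The single-word input
  pins the smallest non-empty case, where int(1 * 1.3) is already 1
  without the guard.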
diff --git a/tests/test_dialect.py b/tests/test_dialect.py
@@ -111,9 +111,14 @@ def test_stats(self):
         stats = d.compression_stats(original, compressed)
         assert stats["size_ratio"] > 1
         assert stats["original_chars"] > stats["summary_chars"]
+        assert stats["original_tokens_est"] > stats["summary_tokens_est"]
 
     def test_count_tokens(self):
+        # count_tokens uses a word-based heuristic (~1.3 tokens per word).
+        # "hello world" is 2 words -> max(1, int(2 * 1.3)) == 2.
         assert Dialect.count_tokens("hello world") == 2
+        assert Dialect.count_tokens("") == 1
+        assert Dialect.count_tokens("one") == 1
 
 
 class TestZettelEncoding: