perf: optimize regex compilation in entity extraction

google-labs-jules[bot] · igorls · google-labs-jules[bot] · commit 21793cfb484f · 2026-04-14T17:43:26.000Z
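Precompile the regular expression used by `_detect_entities_in_text` at module level in `dialect.py`, so the per-word loop calls the compiled pattern's `sub` directly instead of going through `re.sub` and its pattern-cache lookup on every iteration.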

Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>
diff --git a/mempalace/dialect.py b/mempalace/dialect.py
@@ -158,6 +158,8 @@
 }
 
+_ALPHA_RE = re.compile(r"[^a-zA-Z]")  # any character that is not an ASCII letter
+
 # Common filler/stop words to strip from topic extraction
 _STOP_WORDS = {
     "the",
     "a",
@@ -541,7 +543,7 @@ def _detect_entities_in_text(self, text: str) -> List[str]:
         # Fallback: find capitalized words that look like names (2+ chars, not sentence-start)
         words = text.split()
         for i, w in enumerate(words):
-            clean = re.sub(r"[^a-zA-Z]", "", w)
+            clean = _ALPHA_RE.sub("", w)
             if (
                 len(clean) >= 2
                 and clean[0].isupper()
diff --git a/tests/benchmarks/benchmark_dialect.py b/tests/benchmarks/benchmark_dialect.py
@@ -0,0 +1,14 @@
+import pytest
+import timeit
+import re
+
+from mempalace.dialect import Dialect
+
+def test_detect_entities_benchmark():
+    """Print how long repeated `_detect_entities_in_text` calls take on a fixed sample; asserts nothing."""
+    detector = Dialect()
+    sample = "Alice went to the market and met Bob who is a nice guy. They both discussed about Dr. Chen and how he solved the big issue. Another sentence with Name and Name2 and SomeName"
+    iterations = 10000
+    # timeit invokes the callable `iterations` times and returns the total elapsed seconds
+    elapsed = timeit.timeit(lambda: detector._detect_entities_in_text(sample), number=iterations)
+    print(f"\nDialect._detect_entities_in_text benchmark: {elapsed:.4f} seconds for {iterations} iterations")