Skip to content

Commit 21793cf

Browse files
perf: optimize regex compilation in entity extraction
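Move regular expression compilation to the module level in `dialect.py` so the pattern is compiled once at import time, instead of being passed to `re.sub` as a string (and looked up in the `re` module's compiled-pattern cache) on every loop iteration.
Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>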
1 parent 4741bc0 commit 21793cf

2 files changed

Lines changed: 17 additions & 1 deletion

File tree

mempalace/dialect.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -158,6 +158,8 @@
158158
}
159159

160160
# Common filler/stop words to strip from topic extraction
161+
_ALPHA_RE = re.compile(r"[^a-zA-Z]")
162+
161163
_STOP_WORDS = {
162164
"the",
163165
"a",
@@ -541,7 +543,7 @@ def _detect_entities_in_text(self, text: str) -> List[str]:
541543
# Fallback: find capitalized words that look like names (2+ chars, not sentence-start)
542544
words = text.split()
543545
for i, w in enumerate(words):
544-
clean = re.sub(r"[^a-zA-Z]", "", w)
546+
clean = _ALPHA_RE.sub("", w)
545547
if (
546548
len(clean) >= 2
547549
and clean[0].isupper()
Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,14 @@
1+
import pytest
2+
import timeit
3+
import re
4+
5+
from mempalace.dialect import Dialect
6+
7+
def test_detect_entities_benchmark():
8+
dialect = Dialect()
9+
text = "Alice went to the market and met Bob who is a nice guy. They both discussed about Dr. Chen and how he solved the big issue. Another sentence with Name and Name2 and SomeName"
10+
11+
# Run the function multiple times to measure the performance
12+
number = 10000
13+
time = timeit.timeit(lambda: dialect._detect_entities_in_text(text), number=number)
14+
print(f"\nDialect._detect_entities_in_text benchmark: {time:.4f} seconds for {number} iterations")

0 commit comments

Comments
 (0)