Skip to content

Commit 41e43b5

Browse files
committed
fix: escape regex metacharacters in extract_people and sanitize session_id in precompact hook
- Add re.escape() to extract_people() in split_mega_files.py to prevent false matches and crashes when known names contain regex metacharacters (e.g. "Dr. Smith", "C++"). Matches the pattern already used in entity_detector.py and entity_registry.py.
- Add session_id sanitization to mempal_precompact_hook.sh, matching the safe() pattern already present in mempal_save_hook.sh. Previously the precompact hook used raw unsanitized session_id from JSON input.
- Add tests covering regex metacharacter edge cases and precompact session_id sanitization.
1 parent 2981433 commit 41e43b5

4 files changed

Lines changed: 47 additions & 2 deletions

File tree

hooks/mempal_precompact_hook.sh

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,14 @@ MEMPAL_DIR=""
5757
# Read JSON input from stdin
5858
INPUT=$(cat)
5959

60-
SESSION_ID=$(echo "$INPUT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('session_id','unknown'))" 2>/dev/null)
60+
# Parse session_id with sanitization (match save hook's safe() pattern)
61+
SESSION_ID=$(echo "$INPUT" | python3 -c "
62+
import sys, json, re
63+
data = json.load(sys.stdin)
64+
sid = re.sub(r'[^a-zA-Z0-9_\-]', '', str(data.get('session_id', 'unknown')))
65+
print(sid or 'unknown')
66+
" 2>/dev/null)
67+
[ -z "$SESSION_ID" ] && SESSION_ID="unknown"
6168

6269
echo "[$(date '+%H:%M:%S')] PRE-COMPACT triggered for session $SESSION_ID" >> "$STATE_DIR/hook.log"
6370

mempalace/split_mega_files.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -141,7 +141,7 @@ def extract_people(lines):
141141

142142
# Speaker tags: "Alice:", "Ben:", etc.
143143
for person in KNOWN_PEOPLE:
144-
if re.search(rf"\b{person}\b", text, re.IGNORECASE):
144+
if re.search(rf"\b{re.escape(person)}\b", text, re.IGNORECASE):
145145
found.add(person)
146146

147147
# Working directory username hint — map to known people if configured

tests/test_hooks_cli.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -204,6 +204,20 @@ def test_session_start_passes_through(tmp_path):
204204
# --- hook_precompact ---
205205

206206

207+
def test_precompact_sanitizes_session_id(tmp_path):
208+
"""Precompact hook must sanitize session_id the same way stop hook does."""
209+
result = _capture_hook_output(
210+
hook_precompact,
211+
{"session_id": "../../etc/passwd"},
212+
state_dir=tmp_path,
213+
)
214+
assert result["decision"] == "block"
215+
# Verify the state dir has no path traversal artifacts
216+
state_files = list(tmp_path.iterdir())
217+
for f in state_files:
218+
assert ".." not in f.name
219+
220+
207221
def test_precompact_always_blocks(tmp_path):
208222
result = _capture_hook_output(
209223
hook_precompact,

tests/test_split_mega_files.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,30 @@ def test_extract_people_detects_names_from_content(monkeypatch):
5151
assert people == ["Alice", "Ben"]
5252

5353

54+
def test_extract_people_escapes_regex_metacharacters(monkeypatch):
55+
"""Names with regex metacharacters must not crash or false-match."""
56+
# Dr. Smith: dot must match literal dot, not any character
57+
monkeypatch.setattr(smf, "KNOWN_PEOPLE", ["Dr. Smith"])
58+
smf._KNOWN_NAMES_CACHE = None
59+
60+
lines_exact = ["Talked with Dr. Smith about plans\n"]
61+
assert "Dr. Smith" in smf.extract_people(lines_exact)
62+
63+
lines_false = ["Talked with Dr_ Smith about plans\n"]
64+
assert "Dr. Smith" not in smf.extract_people(lines_false)
65+
66+
67+
def test_extract_people_no_crash_on_regex_metacharacters(monkeypatch):
68+
"""Names containing brackets, parens, or plus signs must not crash."""
69+
dangerous_names = ["C++", "Mary (Smith)", "[Admin]", "A*B"]
70+
monkeypatch.setattr(smf, "KNOWN_PEOPLE", dangerous_names)
71+
smf._KNOWN_NAMES_CACHE = None
72+
73+
# Should not raise re.error — just run without crashing
74+
result = smf.extract_people(["Some text with C++ mentioned\n"])
75+
assert isinstance(result, list)
76+
77+
5478
# ── Config: force_reload and invalid JSON ──────────────────────────────
5579

5680

0 commit comments

Comments
 (0)