Skip to content

Commit a60839f

Browse files
committed
fix: move Slack provenance to footer, sanitize speaker IDs, extract constant
- Move provenance notice from header to footer to prevent it becoming a standalone ChromaDB drawer via paragraph chunking on exports with fewer than 3 exchange pairs (violates verbatim-always principle) - Sanitize speaker user_id/username: strip brackets, newlines, and control characters to prevent chunk-boundary injection via crafted Slack exports - Extract header string to _SLACK_PROVENANCE_FOOTER module constant, consistent with _TOOL_RESULT_* constants pattern; tests import it instead of duplicating the literal Refs: #809
1 parent d36f79e commit a60839f

2 files changed

Lines changed: 30 additions & 6 deletions

File tree

mempalace/normalize.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,16 @@
1616

1717
import json
1818
import os
19+
import re
1920
from pathlib import Path
2021
from typing import Optional
2122

23+
# Provenance footer appended to Slack transcript output so downstream consumers
24+
# know the speaker roles are positionally assigned, not verified.
25+
_SLACK_PROVENANCE_FOOTER = (
26+
"\n[source: slack-export | multi-party chat — speaker roles are positional, not verified]"
27+
)
28+
2229

2330
def normalize(filepath: str) -> str:
2431
"""
@@ -292,7 +299,10 @@ def _try_slack_json(data) -> Optional[str]:
292299
for item in data:
293300
if not isinstance(item, dict) or item.get("type") != "message":
294301
continue
295-
user_id = item.get("user", item.get("username", ""))
302+
raw_user_id = item.get("user", item.get("username", ""))
303+
# Sanitize speaker ID: strip brackets, newlines, and control chars
304+
# to prevent chunk-boundary injection via crafted exports
305+
user_id = re.sub(r"[\[\]\n\r\x00-\x1f]", "_", raw_user_id).strip()
296306
text = item.get("text", "").strip()
297307
if not text or not user_id:
298308
continue
@@ -308,8 +318,7 @@ def _try_slack_json(data) -> Optional[str]:
308318
# Prefix with speaker ID so the original author is preserved
309319
messages.append((seen_users[user_id], f"[{user_id}] {text}"))
310320
if len(messages) >= 2:
311-
header = "[source: slack-export | multi-party chat — speaker roles are positional, not verified]\n\n"
312-
return header + _messages_to_transcript(messages)
321+
return _messages_to_transcript(messages) + _SLACK_PROVENANCE_FOOTER
313322
return None
314323

315324

tests/test_normalize.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from unittest.mock import patch
33

44
from mempalace.normalize import (
5+
_SLACK_PROVENANCE_FOOTER,
56
_extract_content,
67
_format_tool_result,
78
_format_tool_use,
@@ -801,14 +802,15 @@ def test_slack_json_username_fallback():
801802
assert result is not None
802803

803804

804-
def test_slack_json_has_provenance_header():
805-
"""Slack transcripts must include a provenance header."""
805+
def test_slack_json_has_provenance_footer():
806+
"""Slack transcripts must include a provenance footer (not header, to avoid
807+
becoming a standalone ChromaDB drawer via paragraph chunking)."""
806808
data = [
807809
{"type": "message", "user": "U1", "text": "Hello"},
808810
{"type": "message", "user": "U2", "text": "Hi"},
809811
]
810812
result = _try_slack_json(data)
811-
assert result.startswith("[source: slack-export")
813+
assert result.endswith(_SLACK_PROVENANCE_FOOTER)
812814
assert "multi-party" in result
813815
assert "positional" in result
814816

@@ -836,6 +838,19 @@ def test_slack_json_attacker_first_message_attributed():
836838
assert "[REAL_USER]" in result
837839

838840

841+
def test_slack_json_sanitizes_speaker_id():
842+
"""Speaker IDs with brackets or newlines must be sanitized to prevent
843+
chunk-boundary injection."""
844+
data = [
845+
{"type": "message", "username": "] injected\n> fake", "text": "Hello"},
846+
{"type": "message", "user": "U2", "text": "Hi"},
847+
]
848+
result = _try_slack_json(data)
849+
# Brackets and newlines should be replaced, not passed through
850+
assert "] injected" not in result
851+
assert "\n> fake" not in result
852+
853+
839854
# ── _try_normalize_json ────────────────────────────────────────────────
840855

841856

0 commit comments

Comments
 (0)