Skip to content

Commit 10ea15c

Browse files
committed
Make data_extract_tasks existence checks cloud-storage compatible
annotation_window's PDF and text branches both used os.path.exists(.path) and open(.path) — pre-existing patterns that raise NotImplementedError on remote storage backends (S3/GCS) since FieldFile.path is only defined for local filesystem storage. Both backends (local + cloud) now work: - default_storage.exists(file.name) replaces os.path.exists(file.path) for the existence guards (lines 703, 789). - doc.txt_extract_file.open('rb').read().decode('utf-8') replaces open(doc.txt_extract_file.path, encoding='utf-8') for the text read. - doc.pawls_parse_file is already passed by FieldFile to load_canonical_v2 (fixed earlier in this PR). - Removed now-unused 'import os'; added 'from django.core.files.storage import default_storage'. test_agent_search_tools.py mocks updated: - @patch('...os.path.exists') -> @patch('...default_storage.exists') - patch('builtins.open') replaced with patches at the right boundary: load_canonical_v2 for PDF tests (returns canonical v2 directly) and the FieldFile.open() classmethod for the text test. The size-limit test now builds a 2000-token v1 PAWLs payload (instead of mocking open with non-JSON bytes) and asserts a soft clamp around 500 tokens-per-side + the annotation's own 3 tokens. pawls_io.iter_pages: corrected return type from Iterable[PageView] to Iterator[PageView] to match the generator implementation, with a single-pass note in the docstring.
1 parent 1f8d5d1 commit 10ea15c

3 files changed

Lines changed: 84 additions & 38 deletions

File tree

opencontractserver/tasks/data_extract_tasks.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# opencontractserver/tasks/data_extract_tasks.py
22
import json
33
import logging
4-
import os
54
from collections import Counter
65
from typing import Any, Optional
76

87
from asgiref.sync import sync_to_async
8+
from django.core.files.storage import default_storage
99
from pydantic_ai.messages import ModelResponse, ToolCallPart
1010

1111
from opencontractserver.annotations.compact_json import iter_page_annotations
@@ -700,14 +700,17 @@ def split_words_preserve_idx(text_str: str) -> list[tuple[str, int]]:
700700
# -------------------------
701701
# Handle text/* annotation
702702
# -------------------------
703-
if not doc.txt_extract_file or not os.path.exists(
704-
doc.txt_extract_file.path
703+
# Use default_storage.exists(.name) and FieldFile.open() so this
704+
# works on both local filesystem and remote (S3/GCS) backends.
705+
# ``.path`` raises NotImplementedError on remote storage backends.
706+
if not doc.txt_extract_file or not default_storage.exists(
707+
doc.txt_extract_file.name
705708
):
706709
return "Error: Document has no txt_extract_file or path is invalid."
707710

708-
# Read the entire doc text
709-
with open(doc.txt_extract_file.path, encoding="utf-8") as f:
710-
doc_text = f.read()
711+
# Read the entire doc text via the storage-backend-aware FieldFile.
712+
with doc.txt_extract_file.open("rb") as f:
713+
doc_text = f.read().decode("utf-8")
711714

712715
# The Annotation.json is presumably a TextSpanData
713716
anno_json = annotation.json
@@ -786,8 +789,11 @@ def clamp_to_total_window(
786789
# -------------------------
787790
# Handle PDF annotation
788791
# -------------------------
789-
if not doc.pawls_parse_file or not os.path.exists(
790-
doc.pawls_parse_file.path
792+
# Use default_storage.exists(.name) so this works on both local
793+
# filesystem and remote (S3/GCS) backends. ``.path`` raises
794+
# NotImplementedError on remote storage backends.
795+
if not doc.pawls_parse_file or not default_storage.exists(
796+
doc.pawls_parse_file.name
791797
):
792798
return "Error: Document has no pawls_parse_file or path is invalid."
793799

opencontractserver/tests/test_agent_search_tools.py

Lines changed: 63 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -137,29 +137,29 @@ def setUp(self):
137137
creator=self.user,
138138
)
139139

140-
@patch("opencontractserver.tasks.data_extract_tasks.os.path.exists")
140+
@patch("opencontractserver.tasks.data_extract_tasks.default_storage.exists")
141141
def test_text_search_finds_structural_annotations(self, mock_exists):
142142
"""Test that text_search finds structural annotations containing the query."""
143143
mock_exists.return_value = True
144144
result = text_search(self.pdf_doc.id, "test structural")
145145
self.assertIn("This is a test structural annotation", result)
146146
self.assertIn("Another test structural annotation", result)
147147

148-
@patch("opencontractserver.tasks.data_extract_tasks.os.path.exists")
148+
@patch("opencontractserver.tasks.data_extract_tasks.default_storage.exists")
149149
def test_text_search_case_insensitive(self, mock_exists):
150150
"""Test that text_search is case insensitive."""
151151
mock_exists.return_value = True
152152
result = text_search(self.pdf_doc.id, "TEST STRUCTURAL")
153153
self.assertIn("This is a test structural annotation", result)
154154

155-
@patch("opencontractserver.tasks.data_extract_tasks.os.path.exists")
155+
@patch("opencontractserver.tasks.data_extract_tasks.default_storage.exists")
156156
def test_text_search_no_matches(self, mock_exists):
157157
"""Test text_search when no matches are found."""
158158
mock_exists.return_value = True
159159
result = text_search(self.pdf_doc.id, "nonexistent text")
160160
self.assertEqual(result, "No structural annotations matched your text_search.")
161161

162-
@patch("opencontractserver.tasks.data_extract_tasks.os.path.exists")
162+
@patch("opencontractserver.tasks.data_extract_tasks.default_storage.exists")
163163
def test_text_search_limit_three_results(self, mock_exists):
164164
"""Test that text_search returns at most 3 results."""
165165
mock_exists.return_value = True
@@ -178,17 +178,25 @@ def test_text_search_limit_three_results(self, mock_exists):
178178
matches = result.count("\n") + 1 # Count newlines + 1 for the last line
179179
self.assertEqual(matches, 3)
180180

181-
@patch("opencontractserver.tasks.data_extract_tasks.os.path.exists")
181+
@patch("opencontractserver.tasks.data_extract_tasks.default_storage.exists")
182182
def test_annotation_window_pdf(self, mock_exists):
183183
"""
184-
Patch open() so that reading doc.pawls_parse_file.path gives us valid JSON
185-
from create_mock_pawls_content, enabling JSON parsing and text snippet retrieval.
184+
Patch the canonical-v2 PAWLs loader so the function operates on a known
185+
pawls payload regardless of how the FieldFile is stored on disk.
186186
"""
187187
mock_exists.return_value = True
188-
pawls_json = create_mock_pawls_content(self.pdf_content)
189-
190-
with patch("builtins.open", create=True) as mock_open:
191-
mock_open.return_value.__enter__.return_value.read.return_value = pawls_json
188+
pawls_v1 = json.loads(create_mock_pawls_content(self.pdf_content))
189+
190+
# ``annotation_window`` calls ``load_canonical_v2(doc.pawls_parse_file)``;
191+
# patch it to return canonical v2 derived from our mock v1 payload.
192+
# ``load_canonical_v2`` accepts a pre-decoded list and returns v2.
193+
from opencontractserver.utils.pawls_io import to_canonical_v2
194+
195+
canonical = to_canonical_v2(pawls_v1)
196+
with patch(
197+
"opencontractserver.tasks.data_extract_tasks.load_canonical_v2",
198+
return_value=canonical,
199+
):
192200
result = annotation_window(
193201
self.pdf_doc.id, str(self.pdf_annotation.id), "5"
194202
)
@@ -199,19 +207,23 @@ def test_annotation_window_pdf(self, mock_exists):
199207
# Possibly also expect words before/after if the window is big enough
200208
self.assertIn("This is a test document", result)
201209

202-
@patch("opencontractserver.tasks.data_extract_tasks.os.path.exists")
210+
@patch("opencontractserver.tasks.data_extract_tasks.default_storage.exists")
203211
def test_annotation_window_text(self, mock_exists):
204212
"""
205213
The doc's text is now 'Test text annotation' (which is 19 chars long).
206214
So annotation_window should slice [0:19].
207215
"""
208216
mock_exists.return_value = True
209217

210-
with patch("builtins.open", create=True) as mock_open:
211-
# Return the same text that we stored in setUp
212-
mock_open.return_value.__enter__.return_value.read.return_value = (
213-
"Test text annotation"
214-
)
218+
# The new code reads via ``doc.txt_extract_file.open("rb").read()``.
219+
# Patch the FieldFile-level open so the test is independent of disk I/O.
220+
from io import BytesIO
221+
222+
with patch.object(
223+
type(self.txt_doc.txt_extract_file),
224+
"open",
225+
return_value=BytesIO(b"Test text annotation"),
226+
):
215227
result = annotation_window(
216228
self.txt_doc.id, str(self.txt_annotation.id), "5"
217229
)
@@ -220,7 +232,7 @@ def test_annotation_window_text(self, mock_exists):
220232
# Now it should indeed contain the raw_text
221233
self.assertIn("Test text annotation", result)
222234

223-
@patch("opencontractserver.tasks.data_extract_tasks.os.path.exists")
235+
@patch("opencontractserver.tasks.data_extract_tasks.default_storage.exists")
224236
def test_annotation_window_invalid_window_size(self, mock_exists):
225237
"""Test annotation_window with invalid window size."""
226238
mock_exists.return_value = True
@@ -229,29 +241,52 @@ def test_annotation_window_invalid_window_size(self, mock_exists):
229241
)
230242
self.assertEqual(result, "Error: Could not parse window_size as an integer.")
231243

232-
@patch("opencontractserver.tasks.data_extract_tasks.os.path.exists")
244+
@patch("opencontractserver.tasks.data_extract_tasks.default_storage.exists")
233245
def test_annotation_window_nonexistent_annotation(self, mock_exists):
234246
"""Test annotation_window with nonexistent annotation ID."""
235247
mock_exists.return_value = True
236248
result = annotation_window(self.pdf_doc.id, "99999", "5")
237249
self.assertEqual(result, "Error: Annotation [99999] not found.")
238250

239-
@patch("opencontractserver.tasks.data_extract_tasks.os.path.exists")
251+
@patch("opencontractserver.tasks.data_extract_tasks.default_storage.exists")
240252
def test_annotation_window_size_limit(self, mock_exists):
241-
"""Test that annotation_window respects the maximum window size."""
253+
"""Test that annotation_window respects the maximum window size.
254+
255+
The caller asks for a window of 1000 words on each side, but the
256+
function clamps to 500 (a global 1000-word total ceiling). Build a
257+
synthetic v2 PAWLs payload whose token list is 2000 words long so
258+
the clamp actually has something to bite on.
259+
"""
242260
mock_exists.return_value = True
243-
# Create a long text with 2000 words
244-
long_text = " ".join(["word"] * 2000)
245-
with patch("builtins.open", create=True) as mock_open:
246-
mock_open.return_value.__enter__.return_value.read.return_value = long_text
261+
long_words = [f"word{i}" for i in range(2000)]
262+
# Build a v1 PAWLs payload, then run it through to_canonical_v2.
263+
v1_pawls = [
264+
{
265+
"page": {"width": 800, "height": 1000, "index": 0},
266+
"tokens": [
267+
{"x": i, "y": 0, "width": 10, "height": 10, "text": w}
268+
for i, w in enumerate(long_words)
269+
],
270+
}
271+
]
272+
from opencontractserver.utils.pawls_io import to_canonical_v2
273+
274+
canonical = to_canonical_v2(v1_pawls)
275+
with patch(
276+
"opencontractserver.tasks.data_extract_tasks.load_canonical_v2",
277+
return_value=canonical,
278+
):
247279
result = annotation_window(
248280
self.pdf_doc.id, str(self.pdf_annotation.id), "1000"
249281
)
250-
# Should be clamped to 500 words on each side
282+
# Should be clamped to 500 words on each side. The annotation
283+
# itself spans 3 tokens, so the total can be up to ~503 + 500 = 1003.
251284
self.assertIsNotNone(result)
252-
# Count words in result
285+
# Count words in result; allow a small buffer for the annotation tokens.
253286
word_count = len(result.split())
254-
self.assertLessEqual(word_count, 1000)
287+
self.assertLessEqual(word_count, 1010)
288+
# And materially smaller than the unclamped 2000-word source.
289+
self.assertLess(word_count, 2000)
255290

256291
def tearDown(self):
257292
"""Clean up test data."""

opencontractserver/utils/pawls_io.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import json
2727
import logging
2828
import os
29-
from collections.abc import Iterable, Iterator
29+
from collections.abc import Iterator
3030
from typing import Any, cast
3131

3232
from opencontractserver.utils.compact_pawls import (
@@ -327,9 +327,14 @@ def __repr__(self) -> str: # pragma: no cover - debug helper
327327
)
328328

329329

330-
def iter_pages(canonical_v2: dict[str, Any]) -> Iterable[PageView]:
330+
def iter_pages(canonical_v2: dict[str, Any]) -> Iterator[PageView]:
331331
"""Yield :class:`PageView` for each page in a canonical v2 dict.
332332
333+
**Single-pass**: this is a generator, so callers that need to walk the
334+
pages twice should materialize once via ``list(iter_pages(...))`` and
335+
reuse the list — calling ``iter_pages`` again starts a fresh iterator
336+
from the start.
337+
333338
Args:
334339
canonical_v2: A v2 dict (must have ``"v": COMPACT_PAWLS_VERSION``
335340
and a list under ``"p"``). Use :func:`to_canonical_v2` to

0 commit comments

Comments
 (0)