@@ -137,29 +137,29 @@ def setUp(self):
137137 creator = self .user ,
138138 )
139139
140- @patch ("opencontractserver.tasks.data_extract_tasks.os.path .exists" )
140+ @patch ("opencontractserver.tasks.data_extract_tasks.default_storage .exists" )
141141 def test_text_search_finds_structural_annotations (self , mock_exists ):
142142 """Test that text_search finds structural annotations containing the query."""
143143 mock_exists .return_value = True
144144 result = text_search (self .pdf_doc .id , "test structural" )
145145 self .assertIn ("This is a test structural annotation" , result )
146146 self .assertIn ("Another test structural annotation" , result )
147147
148- @patch ("opencontractserver.tasks.data_extract_tasks.os.path .exists" )
148+ @patch ("opencontractserver.tasks.data_extract_tasks.default_storage .exists" )
149149 def test_text_search_case_insensitive (self , mock_exists ):
150150 """Test that text_search is case insensitive."""
151151 mock_exists .return_value = True
152152 result = text_search (self .pdf_doc .id , "TEST STRUCTURAL" )
153153 self .assertIn ("This is a test structural annotation" , result )
154154
155- @patch ("opencontractserver.tasks.data_extract_tasks.os.path .exists" )
155+ @patch ("opencontractserver.tasks.data_extract_tasks.default_storage .exists" )
156156 def test_text_search_no_matches (self , mock_exists ):
157157 """Test text_search when no matches are found."""
158158 mock_exists .return_value = True
159159 result = text_search (self .pdf_doc .id , "nonexistent text" )
160160 self .assertEqual (result , "No structural annotations matched your text_search." )
161161
162- @patch ("opencontractserver.tasks.data_extract_tasks.os.path .exists" )
162+ @patch ("opencontractserver.tasks.data_extract_tasks.default_storage .exists" )
163163 def test_text_search_limit_three_results (self , mock_exists ):
164164 """Test that text_search returns at most 3 results."""
165165 mock_exists .return_value = True
@@ -178,17 +178,25 @@ def test_text_search_limit_three_results(self, mock_exists):
178178 matches = result .count ("\n " ) + 1 # Count newlines + 1 for the last line
179179 self .assertEqual (matches , 3 )
180180
181- @patch ("opencontractserver.tasks.data_extract_tasks.os.path .exists" )
181+ @patch ("opencontractserver.tasks.data_extract_tasks.default_storage .exists" )
182182 def test_annotation_window_pdf (self , mock_exists ):
183183 """
184- Patch open() so that reading doc.pawls_parse_file.path gives us valid JSON
185- from create_mock_pawls_content, enabling JSON parsing and text snippet retrieval .
184+ Patch the canonical-v2 PAWLs loader so the function operates on a known
185+ pawls payload regardless of how the FieldFile is stored on disk .
186186 """
187187 mock_exists .return_value = True
188- pawls_json = create_mock_pawls_content (self .pdf_content )
189-
190- with patch ("builtins.open" , create = True ) as mock_open :
191- mock_open .return_value .__enter__ .return_value .read .return_value = pawls_json
188+ pawls_v1 = json .loads (create_mock_pawls_content (self .pdf_content ))
189+
190+ # ``annotation_window`` calls ``load_canonical_v2(doc.pawls_parse_file)``;
191+ # patch it to return canonical v2 derived from our mock v1 payload.
192+ # ``load_canonical_v2`` accepts a pre-decoded list and returns v2.
193+ from opencontractserver .utils .pawls_io import to_canonical_v2
194+
195+ canonical = to_canonical_v2 (pawls_v1 )
196+ with patch (
197+ "opencontractserver.tasks.data_extract_tasks.load_canonical_v2" ,
198+ return_value = canonical ,
199+ ):
192200 result = annotation_window (
193201 self .pdf_doc .id , str (self .pdf_annotation .id ), "5"
194202 )
@@ -199,19 +207,23 @@ def test_annotation_window_pdf(self, mock_exists):
199207 # Possibly also expect words before/after if the window is big enough
200208 self .assertIn ("This is a test document" , result )
201209
202- @patch ("opencontractserver.tasks.data_extract_tasks.os.path .exists" )
210+ @patch ("opencontractserver.tasks.data_extract_tasks.default_storage .exists" )
203211 def test_annotation_window_text (self , mock_exists ):
204212 """
205213 The doc's text is now 'Test text annotation' (which is 19 chars long).
206214 So annotation_window should slice [0:19].
207215 """
208216 mock_exists .return_value = True
209217
210- with patch ("builtins.open" , create = True ) as mock_open :
211- # Return the same text that we stored in setUp
212- mock_open .return_value .__enter__ .return_value .read .return_value = (
213- "Test text annotation"
214- )
218+ # The new code reads via ``doc.txt_extract_file.open("rb").read()``.
219+ # Patch the FieldFile-level open so the test is independent of disk I/O.
220+ from io import BytesIO
221+
222+ with patch .object (
223+ type (self .txt_doc .txt_extract_file ),
224+ "open" ,
225+ return_value = BytesIO (b"Test text annotation" ),
226+ ):
215227 result = annotation_window (
216228 self .txt_doc .id , str (self .txt_annotation .id ), "5"
217229 )
@@ -220,7 +232,7 @@ def test_annotation_window_text(self, mock_exists):
220232 # Now it should indeed contain the raw_text
221233 self .assertIn ("Test text annotation" , result )
222234
223- @patch ("opencontractserver.tasks.data_extract_tasks.os.path .exists" )
235+ @patch ("opencontractserver.tasks.data_extract_tasks.default_storage .exists" )
224236 def test_annotation_window_invalid_window_size (self , mock_exists ):
225237 """Test annotation_window with invalid window size."""
226238 mock_exists .return_value = True
@@ -229,29 +241,52 @@ def test_annotation_window_invalid_window_size(self, mock_exists):
229241 )
230242 self .assertEqual (result , "Error: Could not parse window_size as an integer." )
231243
232- @patch ("opencontractserver.tasks.data_extract_tasks.os.path .exists" )
244+ @patch ("opencontractserver.tasks.data_extract_tasks.default_storage .exists" )
233245 def test_annotation_window_nonexistent_annotation (self , mock_exists ):
234246 """Test annotation_window with nonexistent annotation ID."""
235247 mock_exists .return_value = True
236248 result = annotation_window (self .pdf_doc .id , "99999" , "5" )
237249 self .assertEqual (result , "Error: Annotation [99999] not found." )
238250
239- @patch ("opencontractserver.tasks.data_extract_tasks.os.path .exists" )
251+ @patch ("opencontractserver.tasks.data_extract_tasks.default_storage .exists" )
240252 def test_annotation_window_size_limit (self , mock_exists ):
241- """Test that annotation_window respects the maximum window size."""
253+ """Test that annotation_window respects the maximum window size.
254+
255+ The caller asks for a window of 1000 words on each side, but the
256+ function clamps to 500 (a global 1000-word total ceiling). Build a
257+ synthetic v2 PAWLs payload whose token list is 2000 words long so
258+ the clamp actually has something to bite on.
259+ """
242260 mock_exists .return_value = True
243- # Create a long text with 2000 words
244- long_text = " " .join (["word" ] * 2000 )
245- with patch ("builtins.open" , create = True ) as mock_open :
246- mock_open .return_value .__enter__ .return_value .read .return_value = long_text
261+ long_words = [f"word{ i } " for i in range (2000 )]
262+ # Build a v1 PAWLs payload, then run it through to_canonical_v2.
263+ v1_pawls = [
264+ {
265+ "page" : {"width" : 800 , "height" : 1000 , "index" : 0 },
266+ "tokens" : [
267+ {"x" : i , "y" : 0 , "width" : 10 , "height" : 10 , "text" : w }
268+ for i , w in enumerate (long_words )
269+ ],
270+ }
271+ ]
272+ from opencontractserver .utils .pawls_io import to_canonical_v2
273+
274+ canonical = to_canonical_v2 (v1_pawls )
275+ with patch (
276+ "opencontractserver.tasks.data_extract_tasks.load_canonical_v2" ,
277+ return_value = canonical ,
278+ ):
247279 result = annotation_window (
248280 self .pdf_doc .id , str (self .pdf_annotation .id ), "1000"
249281 )
250- # Should be clamped to 500 words on each side
282+ # Should be clamped to 500 words on each side. The annotation
283+ # itself spans 3 tokens, so the total can be up to ~503 + 500 = 1003.
251284 self .assertIsNotNone (result )
252- # Count words in result
285+ # Count words in result; allow a small buffer for the annotation tokens.
253286 word_count = len (result .split ())
254- self .assertLessEqual (word_count , 1000 )
287+ self .assertLessEqual (word_count , 1010 )
288+ # And materially smaller than the unclamped 2000-word source.
289+ self .assertLess (word_count , 2000 )
255290
256291 def tearDown (self ):
257292 """Clean up test data."""
0 commit comments