Skip to content

Commit f783c97

Browse files
Martin Dědek and claude
committed
Merge upstream + cherry-pick PR VectifyAI#188 (TOC crash fix)
- Upstream main (28542de) — latest VectifyAI/PageIndex - PR VectifyAI#188: fix: prevent KeyError crash and context exhaustion in TOC processing - Fix list_index shadowing (VectifyAI#167) already in upstream - LiteLLM integration (VectifyAI#168) already in upstream Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 28542de commit f783c97

File tree

2 files changed

+168
-45
lines changed

2 files changed

+168
-45
lines changed

pageindex/page_index.py

Lines changed: 33 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,8 @@ def toc_detector_single_page(content, model=None):
117117
Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents."""
118118

119119
response = llm_completion(model=model, prompt=prompt)
120-
# print('response', response)
121120
json_content = extract_json(response)
122-
return json_content['toc_detected']
121+
return json_content.get('toc_detected', 'no')
123122

124123

125124
def check_if_toc_extraction_is_complete(content, toc, model=None):
@@ -137,7 +136,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
137136
prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
138137
response = llm_completion(model=model, prompt=prompt)
139138
json_content = extract_json(response)
140-
return json_content['completed']
139+
return json_content.get('completed', 'no')
141140

142141

143142
def check_if_toc_transformation_is_complete(content, toc, model=None):
@@ -155,7 +154,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
155154
prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
156155
response = llm_completion(model=model, prompt=prompt)
157156
json_content = extract_json(response)
158-
return json_content['completed']
157+
return json_content.get('completed', 'no')
159158

160159
def extract_toc_content(content, model=None):
161160
prompt = f"""
@@ -175,27 +174,19 @@ def extract_toc_content(content, model=None):
175174
{"role": "user", "content": prompt},
176175
{"role": "assistant", "content": response},
177176
]
178-
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
179-
new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True)
180-
response = response + new_response
181-
if_complete = check_if_toc_transformation_is_complete(content, response, model)
177+
continue_prompt = "please continue the generation of table of contents, directly output the remaining part of the structure"
182178

183-
attempt = 0
184179
max_attempts = 5
185-
186-
while not (if_complete == "yes" and finish_reason == "finished"):
187-
attempt += 1
188-
if attempt > max_attempts:
189-
raise Exception('Failed to complete table of contents after maximum retries')
190-
191-
chat_history = [
192-
{"role": "user", "content": prompt},
193-
{"role": "assistant", "content": response},
194-
]
195-
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
196-
new_response, finish_reason = llm_completion(model=model, prompt=prompt, chat_history=chat_history, return_finish_reason=True)
180+
for attempt in range(max_attempts):
181+
new_response, finish_reason = llm_completion(model=model, prompt=continue_prompt, chat_history=chat_history, return_finish_reason=True)
197182
response = response + new_response
183+
chat_history.append({"role": "user", "content": continue_prompt})
184+
chat_history.append({"role": "assistant", "content": new_response})
198185
if_complete = check_if_toc_transformation_is_complete(content, response, model)
186+
if if_complete == "yes" and finish_reason == "finished":
187+
break
188+
else:
189+
logging.warning('extract_toc_content: max retries reached, returning best effort result')
199190

200191
return response
201192

@@ -217,7 +208,7 @@ def detect_page_index(toc_content, model=None):
217208

218209
response = llm_completion(model=model, prompt=prompt)
219210
json_content = extract_json(response)
220-
return json_content['page_index_given_in_toc']
211+
return json_content.get('page_index_given_in_toc', 'no')
221212

222213
def toc_extractor(page_list, toc_page_list, model):
223214
def transform_dots_to_colon(text):
@@ -296,43 +287,40 @@ def toc_transformer(toc_content, model=None):
296287
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
297288
if if_complete == "yes" and finish_reason == "finished":
298289
last_complete = extract_json(last_complete)
299-
cleaned_response=convert_page_to_int(last_complete['table_of_contents'])
290+
cleaned_response = convert_page_to_int(last_complete.get('table_of_contents', []))
300291
return cleaned_response
301292

302293
last_complete = get_json_content(last_complete)
303-
attempt = 0
294+
chat_history = [
295+
{"role": "user", "content": prompt},
296+
{"role": "assistant", "content": last_complete},
297+
]
298+
continue_prompt = "Please continue the table of contents JSON structure from where you left off. Directly output only the remaining part."
299+
304300
max_attempts = 5
305-
while not (if_complete == "yes" and finish_reason == "finished"):
306-
attempt += 1
307-
if attempt > max_attempts:
308-
raise Exception('Failed to complete toc transformation after maximum retries')
301+
for attempt in range(max_attempts):
309302
position = last_complete.rfind('}')
310303
if position != -1:
311304
last_complete = last_complete[:position+2]
312-
prompt = f"""
313-
Your task is to continue the table of contents json structure, directly output the remaining part of the json structure.
314-
The response should be in the following JSON format:
315-
316-
The raw table of contents json structure is:
317-
{toc_content}
318-
319-
The incomplete transformed table of contents json structure is:
320-
{last_complete}
321-
322-
Please continue the json structure, directly output the remaining part of the json structure."""
323305

324-
new_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)
306+
new_complete, finish_reason = llm_completion(model=model, prompt=continue_prompt, chat_history=chat_history, return_finish_reason=True)
325307

326308
if new_complete.startswith('```json'):
327-
new_complete = get_json_content(new_complete)
328-
last_complete = last_complete+new_complete
309+
new_complete = get_json_content(new_complete)
310+
last_complete = last_complete + new_complete
311+
312+
chat_history.append({"role": "user", "content": continue_prompt})
313+
chat_history.append({"role": "assistant", "content": new_complete})
329314

330315
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
331-
316+
if if_complete == "yes" and finish_reason == "finished":
317+
break
318+
else:
319+
logging.warning('toc_transformer: max retries reached, returning best effort result')
332320

333321
last_complete = extract_json(last_complete)
334322

335-
cleaned_response=convert_page_to_int(last_complete['table_of_contents'])
323+
cleaned_response = convert_page_to_int(last_complete.get('table_of_contents', []))
336324
return cleaned_response
337325

338326

@@ -753,7 +741,7 @@ async def single_toc_item_index_fixer(section_title, content, model=None):
753741
prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
754742
response = await llm_acompletion(model=model, prompt=prompt)
755743
json_content = extract_json(response)
756-
return convert_physical_index_to_int(json_content['physical_index'])
744+
return convert_physical_index_to_int(json_content.get('physical_index'))
757745

758746

759747

tests/test_issue_163.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import pytest
2+
import sys
3+
import os
4+
from unittest.mock import patch, MagicMock
5+
6+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
7+
8+
from pageindex.page_index import (
9+
check_if_toc_extraction_is_complete,
10+
check_if_toc_transformation_is_complete,
11+
toc_detector_single_page,
12+
detect_page_index,
13+
extract_toc_content,
14+
toc_transformer,
15+
)
16+
17+
18+
class TestRobustKeyAccess:
19+
@patch("pageindex.page_index.llm_completion", return_value="")
20+
def test_toc_detector_empty_response(self, mock_llm):
21+
result = toc_detector_single_page("some content", model="test")
22+
assert result == "no"
23+
24+
@patch("pageindex.page_index.llm_completion", return_value='{"toc_detected": "yes"}')
25+
def test_toc_detector_valid_response(self, mock_llm):
26+
result = toc_detector_single_page("some content", model="test")
27+
assert result == "yes"
28+
29+
@patch("pageindex.page_index.llm_completion", return_value="not json at all")
30+
def test_toc_detector_malformed_response(self, mock_llm):
31+
result = toc_detector_single_page("some content", model="test")
32+
assert result == "no"
33+
34+
@patch("pageindex.page_index.llm_completion", return_value="")
35+
def test_extraction_complete_empty_response(self, mock_llm):
36+
result = check_if_toc_extraction_is_complete("doc", "toc", model="test")
37+
assert result == "no"
38+
39+
@patch("pageindex.page_index.llm_completion", return_value='{"completed": "yes"}')
40+
def test_extraction_complete_valid_response(self, mock_llm):
41+
result = check_if_toc_extraction_is_complete("doc", "toc", model="test")
42+
assert result == "yes"
43+
44+
@patch("pageindex.page_index.llm_completion", return_value="")
45+
def test_transformation_complete_empty_response(self, mock_llm):
46+
result = check_if_toc_transformation_is_complete("raw", "cleaned", model="test")
47+
assert result == "no"
48+
49+
@patch("pageindex.page_index.llm_completion", return_value='{"thinking": "looks fine", "completed": "yes"}')
50+
def test_transformation_complete_valid_response(self, mock_llm):
51+
result = check_if_toc_transformation_is_complete("raw", "cleaned", model="test")
52+
assert result == "yes"
53+
54+
@patch("pageindex.page_index.llm_completion", return_value="")
55+
def test_detect_page_index_empty_response(self, mock_llm):
56+
result = detect_page_index("toc text", model="test")
57+
assert result == "no"
58+
59+
60+
class TestExtractTocContentRetryLoop:
61+
@patch("pageindex.page_index.check_if_toc_transformation_is_complete")
62+
@patch("pageindex.page_index.llm_completion")
63+
def test_completes_on_first_try(self, mock_llm, mock_check):
64+
mock_llm.return_value = ("full toc content", "finished")
65+
mock_check.return_value = "yes"
66+
result = extract_toc_content("raw content", model="test")
67+
assert result == "full toc content"
68+
assert mock_llm.call_count == 1
69+
70+
@patch("pageindex.page_index.check_if_toc_transformation_is_complete")
71+
@patch("pageindex.page_index.llm_completion")
72+
def test_continues_on_incomplete(self, mock_llm, mock_check):
73+
mock_llm.side_effect = [
74+
("partial toc", "max_output_reached"),
75+
(" continued toc", "finished"),
76+
]
77+
mock_check.side_effect = ["no", "yes"]
78+
result = extract_toc_content("raw content", model="test")
79+
assert result == "partial toc continued toc"
80+
assert mock_llm.call_count == 2
81+
82+
@patch("pageindex.page_index.check_if_toc_transformation_is_complete")
83+
@patch("pageindex.page_index.llm_completion")
84+
def test_max_retries_returns_best_effort(self, mock_llm, mock_check):
85+
mock_llm.return_value = ("chunk", "max_output_reached")
86+
mock_check.return_value = "no"
87+
result = extract_toc_content("raw content", model="test")
88+
assert "chunk" in result
89+
assert mock_llm.call_count == 6
90+
91+
@patch("pageindex.page_index.check_if_toc_transformation_is_complete")
92+
@patch("pageindex.page_index.llm_completion")
93+
def test_chat_history_grows_incrementally(self, mock_llm, mock_check):
94+
call_count = [0]
95+
96+
def side_effect(*args, **kwargs):
97+
call_count[0] += 1
98+
if call_count[0] == 1:
99+
return ("initial", "max_output_reached")
100+
if call_count[0] == 2:
101+
history = kwargs.get("chat_history", [])
102+
assert len(history) == 2
103+
return (" part2", "max_output_reached")
104+
if call_count[0] == 3:
105+
history = kwargs.get("chat_history", [])
106+
assert len(history) == 4
107+
return (" part3", "finished")
108+
return ("", "finished")
109+
110+
mock_llm.side_effect = side_effect
111+
mock_check.side_effect = ["no", "no", "yes"]
112+
result = extract_toc_content("raw content", model="test")
113+
assert result == "initial part2 part3"
114+
115+
116+
class TestTocTransformerRetryLoop:
117+
@patch("pageindex.page_index.check_if_toc_transformation_is_complete")
118+
@patch("pageindex.page_index.llm_completion")
119+
def test_completes_on_first_try(self, mock_llm, mock_check):
120+
mock_llm.return_value = (
121+
'{"table_of_contents": [{"structure": "1", "title": "Intro", "page": 1}]}',
122+
"finished",
123+
)
124+
mock_check.return_value = "yes"
125+
result = toc_transformer("raw toc", model="test")
126+
assert len(result) == 1
127+
assert result[0]["title"] == "Intro"
128+
129+
@patch("pageindex.page_index.check_if_toc_transformation_is_complete")
130+
@patch("pageindex.page_index.llm_completion")
131+
def test_handles_missing_table_of_contents_key(self, mock_llm, mock_check):
132+
mock_llm.return_value = ('{"other_key": "value"}', "finished")
133+
mock_check.return_value = "yes"
134+
result = toc_transformer("raw toc", model="test")
135+
assert result == []

0 commit comments

Comments (0)