@@ -117,9 +117,8 @@ def toc_detector_single_page(content, model=None):
117117 Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents."""
118118
119119 response = llm_completion (model = model , prompt = prompt )
120- # print('response', response)
121120 json_content = extract_json (response )
122- return json_content [ 'toc_detected' ]
121+ return json_content . get ( 'toc_detected' , 'no' )
123122
124123
125124def check_if_toc_extraction_is_complete (content , toc , model = None ):
@@ -137,7 +136,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
137136 prompt = prompt + '\n Document:\n ' + content + '\n Table of contents:\n ' + toc
138137 response = llm_completion (model = model , prompt = prompt )
139138 json_content = extract_json (response )
140- return json_content [ 'completed' ]
139+ return json_content . get ( 'completed' , 'no' )
141140
142141
143142def check_if_toc_transformation_is_complete (content , toc , model = None ):
@@ -155,7 +154,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
155154 prompt = prompt + '\n Raw Table of contents:\n ' + content + '\n Cleaned Table of contents:\n ' + toc
156155 response = llm_completion (model = model , prompt = prompt )
157156 json_content = extract_json (response )
158- return json_content [ 'completed' ]
157+ return json_content . get ( 'completed' , 'no' )
159158
160159def extract_toc_content (content , model = None ):
161160 prompt = f"""
@@ -175,27 +174,19 @@ def extract_toc_content(content, model=None):
175174 {"role" : "user" , "content" : prompt },
176175 {"role" : "assistant" , "content" : response },
177176 ]
178- prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
179- new_response , finish_reason = llm_completion (model = model , prompt = prompt , chat_history = chat_history , return_finish_reason = True )
180- response = response + new_response
181- if_complete = check_if_toc_transformation_is_complete (content , response , model )
177+ continue_prompt = "please continue the generation of table of contents, directly output the remaining part of the structure"
182178
183- attempt = 0
184179 max_attempts = 5
185-
186- while not (if_complete == "yes" and finish_reason == "finished" ):
187- attempt += 1
188- if attempt > max_attempts :
189- raise Exception ('Failed to complete table of contents after maximum retries' )
190-
191- chat_history = [
192- {"role" : "user" , "content" : prompt },
193- {"role" : "assistant" , "content" : response },
194- ]
195- prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
196- new_response , finish_reason = llm_completion (model = model , prompt = prompt , chat_history = chat_history , return_finish_reason = True )
180+ for attempt in range (max_attempts ):
181+ new_response , finish_reason = llm_completion (model = model , prompt = continue_prompt , chat_history = chat_history , return_finish_reason = True )
197182 response = response + new_response
183+ chat_history .append ({"role" : "user" , "content" : continue_prompt })
184+ chat_history .append ({"role" : "assistant" , "content" : new_response })
198185 if_complete = check_if_toc_transformation_is_complete (content , response , model )
186+ if if_complete == "yes" and finish_reason == "finished" :
187+ break
188+ else :
189+ logging .warning ('extract_toc_content: max retries reached, returning best effort result' )
199190
200191 return response
201192
@@ -217,7 +208,7 @@ def detect_page_index(toc_content, model=None):
217208
218209 response = llm_completion (model = model , prompt = prompt )
219210 json_content = extract_json (response )
220- return json_content [ 'page_index_given_in_toc' ]
211+ return json_content . get ( 'page_index_given_in_toc' , 'no' )
221212
222213def toc_extractor (page_list , toc_page_list , model ):
223214 def transform_dots_to_colon (text ):
@@ -296,43 +287,40 @@ def toc_transformer(toc_content, model=None):
296287 if_complete = check_if_toc_transformation_is_complete (toc_content , last_complete , model )
297288 if if_complete == "yes" and finish_reason == "finished" :
298289 last_complete = extract_json (last_complete )
299- cleaned_response = convert_page_to_int (last_complete [ 'table_of_contents' ] )
290+ cleaned_response = convert_page_to_int (last_complete . get ( 'table_of_contents' , []) )
300291 return cleaned_response
301292
302293 last_complete = get_json_content (last_complete )
303- attempt = 0
294+ chat_history = [
295+ {"role" : "user" , "content" : prompt },
296+ {"role" : "assistant" , "content" : last_complete },
297+ ]
298+ continue_prompt = "Please continue the table of contents JSON structure from where you left off. Directly output only the remaining part."
299+
304300 max_attempts = 5
305- while not (if_complete == "yes" and finish_reason == "finished" ):
306- attempt += 1
307- if attempt > max_attempts :
308- raise Exception ('Failed to complete toc transformation after maximum retries' )
301+ for attempt in range (max_attempts ):
309302 position = last_complete .rfind ('}' )
310303 if position != - 1 :
311304 last_complete = last_complete [:position + 2 ]
312- prompt = f"""
313- Your task is to continue the table of contents json structure, directly output the remaining part of the json structure.
314- The response should be in the following JSON format:
315-
316- The raw table of contents json structure is:
317- { toc_content }
318-
319- The incomplete transformed table of contents json structure is:
320- { last_complete }
321-
322- Please continue the json structure, directly output the remaining part of the json structure."""
323305
324- new_complete , finish_reason = llm_completion (model = model , prompt = prompt , return_finish_reason = True )
306+ new_complete , finish_reason = llm_completion (model = model , prompt = continue_prompt , chat_history = chat_history , return_finish_reason = True )
325307
326308 if new_complete .startswith ('```json' ):
327- new_complete = get_json_content (new_complete )
328- last_complete = last_complete + new_complete
309+ new_complete = get_json_content (new_complete )
310+ last_complete = last_complete + new_complete
311+
312+ chat_history .append ({"role" : "user" , "content" : continue_prompt })
313+ chat_history .append ({"role" : "assistant" , "content" : new_complete })
329314
330315 if_complete = check_if_toc_transformation_is_complete (toc_content , last_complete , model )
331-
316+ if if_complete == "yes" and finish_reason == "finished" :
317+ break
318+ else :
319+ logging .warning ('toc_transformer: max retries reached, returning best effort result' )
332320
333321 last_complete = extract_json (last_complete )
334322
335- cleaned_response = convert_page_to_int (last_complete [ 'table_of_contents' ] )
323+ cleaned_response = convert_page_to_int (last_complete . get ( 'table_of_contents' , []) )
336324 return cleaned_response
337325
338326
@@ -753,7 +741,7 @@ async def single_toc_item_index_fixer(section_title, content, model=None):
753741 prompt = toc_extractor_prompt + '\n Section Title:\n ' + str (section_title ) + '\n Document pages:\n ' + content
754742 response = await llm_acompletion (model = model , prompt = prompt )
755743 json_content = extract_json (response )
756- return convert_physical_index_to_int (json_content [ 'physical_index' ] )
744+ return convert_physical_index_to_int (json_content . get ( 'physical_index' ) )
757745
758746
759747
0 commit comments