@@ -123,55 +123,25 @@ endif;
123123 $ media = $ resource ->media ();
124124
// Show a short "hit in context" snippet taken from the OCR text of the
// resource's second media, for the first positive search filter that matches.

// Always start from an empty string: without this, a resource that has no
// $media[1] would fall through to `if ($text)` with an undefined variable,
// or with whatever $text was left over from code that ran earlier.
$text = '';
if (isset($media[1])) {
    // NOTE(review): the previous version of this template ran strip_tags() on
    // the rendered output. normalize_ocr_text() only strips punctuation, so if
    // render() returns markup, tag and attribute names would survive as
    // searchable words — confirm that render() yields plain text here.
    $text = normalize_ocr_text($media[1]->render());
}

// $_GET is untrusted input: the "filter" key may be missing or not an array.
$filters = (isset($_GET["filter"]) && is_array($_GET["filter"])) ? $_GET["filter"] : array();

if ($text) {
    foreach ($filters as $value) {
        // Skip malformed entries and empty search values.
        if (!is_array($value) || !isset($value["value"]) || !is_string($value["value"]) || !$value["value"]) {
            continue;
        }

        // Negated filters ("not") describe text that must NOT occur, so there
        // is nothing to highlight for them. A missing "join" counts as positive.
        if (isset($value["join"]) && $value["join"] == "not") {
            continue;
        }

        $search_text = mb_strtolower(strip_tags($value["value"]), 'UTF-8');
        $snippet = make_snippet_flexible($text, $search_text);

        if ($snippet) {
            // make_snippet_flexible() returns ready-to-print HTML.
            echo "<div class='snippet'>{$snippet}</div>";
            break; // only one snippet per resource
        }
    }
}
175145 endif ;
176146 ?>
177147
@@ -244,3 +214,60 @@ endif;
244214
245215</div></div>
246216
217+ <?php
/**
 * Normalise OCR output into lower-case words separated by single spaces,
 * so that search terms can be matched against it reliably.
 *
 * Processing order matters:
 *   1. lower-case everything;
 *   2. re-join words that were hyphenated across a line break;
 *   3. collapse all whitespace (including line breaks) into single spaces;
 *   4. turn every remaining dash into a space;
 *   5. drop all other punctuation / symbols;
 *   6. collapse the spaces created by steps 4 and 5.
 *
 * @param string $text Raw OCR text, expected to be UTF-8.
 * @return string Text containing only letters, digits and single spaces.
 */
function normalize_ocr_text($text) {
    $text = (string) $text;

    // The /u patterns below return null on malformed UTF-8, which would
    // silently wipe the whole text. Replace invalid byte sequences first.
    if (!mb_check_encoding($text, 'UTF-8')) {
        $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
    }

    // unify case
    $text = mb_strtolower($text, 'UTF-8');

    // fix word splits: "to-\nmaat" → "tomaat".
    // \R covers \n, \r\n and bare \r; \p{L} restricts the join to letters so a
    // numeric range broken over two lines ("10-\n12") is not glued into "1012".
    $text = preg_replace('/(\p{L})-\h*\R\s*(\p{L})/u', '$1$2', $text);

    // remove remaining line breaks (the u modifier also catches Unicode
    // whitespace such as no-break spaces)
    $text = preg_replace('/\s+/u', ' ', $text);

    // all dash types → space ("-" plus U+2010..U+2015). This has to happen
    // BEFORE the punctuation strip: that strip deletes dashes outright, which
    // would fuse "tomaat-soep" into "tomaatsoep".
    $text = preg_replace('/[\-\x{2010}-\x{2015}]/u', ' ', $text);

    // remove punctuation noise commonly introduced by OCR
    $text = preg_replace('/[^\p{L}\p{N}\s]/u', '', $text);

    // normalize double spaces
    $text = preg_replace('/\s+/u', ' ', $text);

    return trim($text);
}
238+
/**
 * Find the first occurrence of a search phrase in $text and return an HTML
 * snippet showing it in context, with every hit inside the snippet wrapped in
 * <span class="hit">…</span>.
 *
 * Matching is case-insensitive and "flexible" about word separators: the words
 * of the phrase may be separated by whitespace or by a dash in either the
 * search term or the text ("tomaat soep" finds "tomaat-soep" and vice versa).
 * A hit must not be directly preceded or followed by a letter.
 *
 * @param string $text   UTF-8 text to search in.
 * @param string $search Search phrase entered by the user.
 * @param int    $radius Number of characters of context on each side of the hit.
 * @return string|null HTML-escaped snippet wrapped in "…", or null when the
 *                     phrase is empty or does not occur in $text.
 */
function make_snippet_flexible($text, $search, $radius = 45) {
    $text = (string) $text;
    $radius = max(0, (int) $radius);

    // normalize search term for consistent behavior
    $search = mb_strtolower(trim((string) $search), 'UTF-8');

    // "-" plus U+2010..U+2015 (hyphen, figure dash, en/em dash, horizontal bar)
    $dashes = '\-\x{2010}-\x{2015}';

    // Split the phrase into words on whitespace and dashes. An empty phrase
    // must not be searched for: the resulting pattern would match the empty
    // string at offset 0 and produce a bogus snippet.
    $words = preg_split('/[\s' . $dashes . ']+/u', $search, -1, PREG_SPLIT_NO_EMPTY);
    if (empty($words)) {
        return null;
    }

    $quoted = array();
    foreach ($words as $word) {
        $quoted[] = preg_quote($word, '/');
    }

    // Between two words accept either whitespace or a (possibly spaced) dash.
    $separator = '(?:\s+|\s*[' . $dashes . ']\s*)';
    $pattern = '/(?<!\p{L})(' . implode($separator, $quoted) . ')(?!\p{L})/iu';

    if (preg_match($pattern, $text, $m, PREG_OFFSET_CAPTURE) !== 1) {
        return null;
    }

    // PREG_OFFSET_CAPTURE reports a BYTE offset, while mb_substr() works in
    // characters; convert before computing the window, otherwise the snippet
    // drifts to the right for every multibyte character in front of the hit.
    $pos = mb_strlen(substr($text, 0, $m[0][1]), 'UTF-8');
    $len = mb_strlen($m[0][0], 'UTF-8');

    // build snippet
    $start = max(0, $pos - $radius);
    $snippet = mb_substr($text, $start, ($pos - $start) + $len + $radius, 'UTF-8');

    // highlight matches: with PREG_SPLIT_DELIM_CAPTURE the odd-numbered parts
    // are the hits. Every part is escaped separately so text can never inject
    // markup and the <span> tags themselves stay intact.
    $parts = preg_split($pattern, $snippet, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === false) {
        return null;
    }

    $html = '';
    foreach ($parts as $i => $part) {
        $escaped = htmlspecialchars($part, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $html .= ($i % 2 === 1) ? '<span class="hit">' . $escaped . '</span>' : $escaped;
    }

    return "…" . $html . "…";
}
273+ ?>
0 commit comments