@@ -127,21 +127,21 @@ endif;
127127 endif ;
128128
// Show a highlighted snippet for the first positive filter (join != "not")
// whose value can be found in $text.
if ($text) {
    // $_GET is untrusted: "filter" may be missing, a plain string, or contain
    // entries that are not arrays. Treat anything unexpected as "no filters".
    $filters = (isset($_GET["filter"]) && is_array($_GET["filter"])) ? $_GET["filter"] : [];

    foreach ($filters as $value) {
        if (!is_array($value) || !isset($value["value"]) || !is_string($value["value"])) {
            continue;
        }

        $join = isset($value["join"]) ? $value["join"] : "";
        if ($join === "not" || trim($value["value"]) === "") {
            continue;
        }

        // Raw user input; make_flexible_snippet() only uses it to build a
        // quoted regex, it is never echoed directly.
        $search_text = $value["value"];

        $snippet = make_flexible_snippet($text, $search_text);

        if ($snippet) {
            // $snippet already contains the <span class="hit"> markup, so it
            // must not be escaped again here.
            echo "<div class='snippet'>{$snippet}</div>";
            break; // only the first matching filter is shown
        }
    }
}
145145 endif ;
146146 ?>
147147
@@ -215,59 +215,126 @@ endif;
215215</div></div>
216216
217217<?php
// ---------- Normalizer (Option 2) ----------
/**
 * Reduce OCR text to a lowercase, single-spaced, punctuation-free form that is
 * suitable for flexible word matching.
 *
 * Steps, in order: strip HTML tags, lowercase, re-join words that were
 * hyphenated across a line break or a gap ("cre-\natief", "cre- atief" →
 * "creatief"), turn every remaining dash into a space, turn word-breaking
 * punctuation into spaces, collapse all whitespace.
 *
 * @param string $text Raw OCR text (may contain HTML).
 * @return string Normalized text, trimmed; '' when nothing is left.
 */
function normalize_ocr_text($text) {
    // Strip HTML before anything else: once '<', '>' and '/' have been turned
    // into spaces by the punctuation step, strip_tags() can no longer see any
    // tags and their names would leak into the text ("<b>x</b>" → "b x b").
    $text = strip_tags((string)$text);

    $text = mb_strtolower($text, 'UTF-8');

    $steps = [
        // Join hyphenated splits. Look-arounds (instead of consuming the
        // neighbouring letters) let chained splits "a- b- c" join in one pass.
        ['/(?<=\p{L})-\s+(?=\p{L})/u', ''],
        // Hyphen-minus plus U+2010..U+2015 (all dash types) → space.
        ['/[-\x{2010}-\x{2015}]/u', ' '],
        // Punctuation that breaks word boundaries → space.
        ['/[.,;:!?(){}\[\]<>\/\\\\"“”\']+/u', ' '],
        // Tabs, newlines and runs of spaces → single space.
        ['/\s+/u', ' '],
    ];

    foreach ($steps as $step) {
        $result = preg_replace($step[0], $step[1], $text);
        // preg_replace() returns null on a PCRE error; keep the previous value
        // rather than silently turning the whole text into null.
        if ($result !== null) {
            $text = $result;
        }
    }

    return trim($text);
}
238246
239- function make_snippet_flexible ($ text , $ search , $ radius = 45 ) {
// ---------- Make the two flexible variants (space/hyphen) ----------
/**
 * Build the spellings of a search term that should be treated as equal:
 * every run of spaces and/or dashes collapsed to a single hyphen
 * ("tomaat-garnaal") and to a single space ("tomaat garnaal").
 *
 * @param mixed $search User-supplied search term.
 * @return array List of distinct, non-empty, lowercase variants (hyphen form
 *               first). Empty when the term contains nothing but separators
 *               or is not a scalar.
 */
function flexible_search_variants($search) {
    if (!is_scalar($search)) {
        return [];
    }

    $s = mb_strtolower(trim((string)$search), 'UTF-8');

    // Hyphen-minus, U+2010..U+2015 (all dash types) and whitespace.
    $separators = '/[-\x{2010}-\x{2015}\s]+/u';

    $variants = [];
    foreach (['-', ' '] as $glue) {
        $variant = preg_replace($separators, $glue, $s);
        if ($variant === null) {
            continue; // PCRE error (e.g. invalid UTF-8): skip this variant
        }

        // A separator left at either edge ("tomaat-" → "tomaat ") can never
        // match at a word boundary, so drop it.
        $variant = trim($variant, '- ');

        if ($variant !== '' && !in_array($variant, $variants, true)) {
            $variants[] = $variant;
        }
    }

    // Both variants always have the same length (each separator run becomes
    // exactly one character), so no length-based ordering is needed.
    return $variants;
}
243263
244- // create variants
245- $ search_space = preg_replace ( ' /[-‐-‒–—―]/u ' , ' ' , $ search ); // hyphens → spaces
246- $ search_dash = preg_replace ( ' /\s+/ ' , ' - ' , $ search_space ); // spaces → hyphens
// ---------- Core snippet builder (fixed) ----------
/**
 * Find the first whole-word occurrence of $search in $raw_text and return an
 * HTML snippet around it with every occurrence highlighted.
 *
 * Matching is done on the normalized text (see normalize_ocr_text()), and the
 * snippet is cut from that normalized text, not from the raw input.
 *
 * @param string $raw_text Raw OCR text.
 * @param mixed  $search   User-supplied search term.
 * @param int    $radius   Characters of context on each side of the match,
 *                         before expanding to whole words.
 * @return string|null HTML-escaped snippet with <span class="hit"> around the
 *                     hits, wrapped in ellipses; null when there is no match.
 */
function make_flexible_snippet($raw_text, $search, $radius = 30) {
    if (!is_scalar($search) || trim((string)$search) === '') return null;

    // normalize OCR text (the snippet is extracted from this normalized text)
    $text = normalize_ocr_text($raw_text);
    if ($text === '') return null;

    // The text has had its punctuation and dashes removed, so the query has
    // to go through the same normalization or "st. nicolaas" could never match.
    $needle = normalize_ocr_text((string)$search);
    if ($needle === '') return null;

    // variants: "tomaat garnaal" and "tomaat-garnaal"
    $variants = array_values(array_filter(flexible_search_variants($needle), 'strlen'));
    if (!$variants) return null;

    // alternation of escaped variants, anchored on non-letter/non-digit
    // neighbours (a Unicode-aware replacement for \b)
    $escaped = array_map(function ($v) { return preg_quote($v, '/'); }, $variants);
    $pattern = '/(?<![\p{L}\p{N}])(' . implode('|', $escaped) . ')(?![\p{L}\p{N}])/iu';

    if (preg_match($pattern, $text, $m, PREG_OFFSET_CAPTURE) !== 1) {
        return null;
    }

    $match_text = $m[0][0];

    // PREG_OFFSET_CAPTURE reports a BYTE offset. Convert it by counting the
    // characters in front of the match; searching for the matched string
    // again would find the first substring occurrence ("tomaatsoep") instead
    // of the word-boundary match the regex actually selected.
    $char_offset = mb_strlen(substr($text, 0, $m[0][1]), 'UTF-8');

    // snippet bounds, widened so that no word is cut in half
    $start_char = max(0, $char_offset - $radius);
    $end_char   = min(
        mb_strlen($text, 'UTF-8'),
        $char_offset + mb_strlen($match_text, 'UTF-8') + $radius
    );
    list($start_char, $end_char) = expand_to_full_words($text, $start_char, $end_char);

    $snippet = trim(mb_substr($text, $start_char, $end_char - $start_char, 'UTF-8'));

    // Split on the hits (odd indices are the captured matches), escape every
    // piece for HTML, and wrap only the hits. Escaping first and highlighting
    // afterwards would let a query match inside entities such as "&amp;".
    $pieces = preg_split($pattern, $snippet, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($pieces === false) {
        $pieces = [$snippet];
    }

    $html = '';
    foreach ($pieces as $i => $piece) {
        $safe = htmlspecialchars($piece, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $html .= ($i % 2 === 1) ? '<span class="hit">' . $safe . '</span>' : $safe;
    }

    return '…' . $html . '…';
}
262319
263- // build snippet
264- $ start = max (0 , $ pos - $ radius );
265- $ end = $ pos + $ len + $ radius ;
266- $ snippet = mb_substr ($ text , $ start , $ end - $ start );
/**
 * Widen a [start, end) character range so that it does not cut a word in half.
 *
 * The range is first clamped to the text. The start then moves left until it
 * sits on a whitespace character (or reaches 0), and the end moves right until
 * the character at that index is whitespace (or the end of the text).
 *
 * @param string $text  UTF-8 text the offsets refer to.
 * @param int    $start Character offset where the range begins.
 * @param int    $end   Character offset just past the range.
 * @return array Two-element list: [start, end].
 */
function expand_to_full_words($text, $start, $end) {
    $total = mb_strlen($text, 'UTF-8');

    // clamp to the text
    $start = max(0, $start);
    $end   = min($total, $end);

    // true when the character at $index is whitespace
    $is_space = function ($index) use ($text) {
        return preg_match('/\s/u', mb_substr($text, $index, 1, 'UTF-8')) === 1;
    };

    // walk left to the nearest whitespace (or the beginning)
    for (; $start > 0 && !$is_space($start); $start--) {
    }

    // walk right to the nearest whitespace (or the end)
    for (; $end < $total && !$is_space($end); $end++) {
    }

    return [$start, $end];
}
273340?>
0 commit comments