3232 * @var \Omeka\Api\Representation\AbstractResourceEntityRepresentation $resource
3333 * @var string $tag
3434 */
35-
35+ $ helper = $ this -> plugin ( ' Custom ' );
3636$ hyperlink = $ this ->plugin ('hyperlink ' );
3737$ type = $ resource ->resourceTemplate ();
3838if ($ type ):
@@ -123,7 +123,7 @@ endif;
123123 $ media = $ resource ->media ();
124124
125125 if (isset ($ media [1 ])):
126- $ text = normalize_ocr_text ($ media [1 ]->render ());
126+ $ text = $ helper -> normalize_ocr_text ($ media [1 ]->render ());
127127 endif ;
128128
129129 if ($ text ) {
@@ -133,7 +133,7 @@ endif;
133133
134134 $ search_text = $ value ["value " ]; // raw user input
135135
136- $ snippet = make_flexible_snippet ($ text , $ search_text );
136+ $ snippet = $ helper -> make_flexible_snippet ($ text , $ search_text );
137137
138138 if ($ snippet ) {
139139 echo "<div class='snippet'> {$ snippet }</div> " ;
@@ -212,129 +212,4 @@ endif;
212212 </a>
213213<?php echo sprintf ('</%s> ' , $ tag ); ?>
214214
215- </div></div>
216-
217- <?php
218- // ---------- Normalizer (Option 2) ----------
/**
 * Normalize OCR text (possibly rendered as HTML) into a lowercase,
 * single-spaced string suitable for flexible phrase matching.
 *
 * Steps, in order: strip markup, decode HTML entities, lowercase, re-join
 * words hyphenated across a line break or a gap, turn every dash into a
 * space, turn word-breaking punctuation into spaces, collapse whitespace.
 *
 * @param mixed $text Raw text or HTML; cast to string.
 * @return string Normalized text, trimmed; '' when nothing is left.
 */
function normalize_ocr_text($text) {
    $text = (string) $text;

    // Strip markup FIRST. The punctuation pass below replaces "<", ">" and
    // "/" with spaces, so calling strip_tags() after it can never remove a
    // tag and leaves tag names ("p", "div", ...) behind as fake words.
    // Padding each "<" with a space keeps words in adjacent elements
    // ("<p>a</p><p>b</p>") from fusing into one word.
    $text = strip_tags(str_replace('<', ' <', $text));

    // Decode entities so "&eacute;" / "&amp;" are matched as real characters
    // instead of being split into junk tokens by the punctuation pass.
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    $text = mb_strtolower($text, 'UTF-8');

    // Join hyphenated line-break splits (cre-\natief -> creatief).
    $text = preg_replace('/(\p{L}+)-\s*\n\s*(\p{L}+)/u', '$1$2', $text);

    // Join hyphen + whitespace inside a line (cre- atief -> creatief). Kept
    // as a second pass so chains of three or more pieces are joined too.
    $text = preg_replace('/(\p{L}+)-\s+(\p{L}+)/u', '$1$2', $text);

    // Convert ASCII hyphen and the Unicode dashes U+2010..U+2015 to spaces.
    $text = preg_replace('/[-\x{2010}-\x{2015}]/u', ' ', $text);

    // Replace punctuation that breaks word boundaries with a space.
    $text = preg_replace('/[.,;:!?(){}\[\]<>\/\\\\"“”\']+/u', ' ', $text);

    // Collapse all whitespace runs (tabs and newlines included) to one space.
    $text = preg_replace('/\s+/u', ' ', $text);

    return trim((string) $text);
}
246-
247- // ---------- Make the two flexible variants (space/hyphen) ----------
/**
 * Build the hyphenated and the spaced spelling of a search phrase.
 *
 * The phrase is lowercased, leading/trailing dashes and whitespace are
 * dropped, and every inner run of dashes/whitespace becomes a single "-"
 * (first variant) or a single " " (second variant). Both variants always
 * have the same length, so no length-based ordering is needed.
 *
 * @param mixed $search Raw user input; cast to string.
 * @return string[] Zero, one or two distinct variants. Empty when the input
 *                  contains nothing but dashes/whitespace.
 */
function flexible_search_variants($search) {
    $needle = mb_strtolower(trim((string) $search), 'UTF-8');

    // ASCII hyphen, Unicode dashes U+2010..U+2015, and any whitespace.
    $separators = '[-\x{2010}-\x{2015}\s]+';

    // Drop separators at either end: a variant such as "foo-" or " foo"
    // would otherwise require a literal dash/space around the word and
    // fail to match it in the normalized text.
    $needle = preg_replace('/^' . $separators . '|' . $separators . '$/u', '', $needle);
    if ($needle === null || $needle === '') {
        return [];
    }

    $hyphenated = preg_replace('/' . $separators . '/u', '-', $needle);
    $spaced = preg_replace('/' . $separators . '/u', ' ', $needle);

    // array_unique() keeps original keys; reindex so callers get a list.
    return array_values(array_unique([$hyphenated, $spaced]));
}
263-
264- // ---------- Core snippet builder (fixed) ----------
/**
 * Build a short highlighted snippet around the first occurrence of a search
 * phrase in OCR text.
 *
 * The raw text is normalized with normalize_ocr_text(); the snippet is cut
 * from that normalized text, widened to whole words, and every occurrence
 * of a search variant inside it is wrapped in <span class="hit">.
 *
 * @param mixed $raw_text OCR text or rendered HTML.
 * @param mixed $search   Raw user search phrase.
 * @param int   $radius   Characters of context kept on each side of the hit.
 * @return string|null Snippet wrapped in ellipses, or null when the search
 *                     is empty or has no whole-word match.
 */
function make_flexible_snippet($raw_text, $search, $radius = 30) {
    if (trim((string) $search) === '') {
        return null;
    }

    $text = normalize_ocr_text($raw_text);
    if ($text === '') {
        return null;
    }

    // Variants such as "tomaat garnaal" and "tomaat-garnaal".
    $variants = array_filter(
        flexible_search_variants($search),
        function ($variant) {
            return is_string($variant) && $variant !== '';
        }
    );
    if (!$variants) {
        return null;
    }

    $escaped = array_map(
        function ($variant) {
            return preg_quote($variant, '/');
        },
        $variants
    );

    // Unicode-aware "whole word" anchors around the alternation of variants.
    $pattern = '/(?<![\p{L}\p{N}])(?:' . implode('|', $escaped) . ')(?![\p{L}\p{N}])/iu';

    if (!preg_match($pattern, $text, $m, PREG_OFFSET_CAPTURE)) {
        return null;
    }

    $match_text = $m[0][0];

    // PREG_OFFSET_CAPTURE reports a BYTE offset. Convert it by counting the
    // characters in the byte prefix. Looking the matched text up again with
    // mb_strpos() would return its first occurrence as a plain substring
    // (e.g. inside a longer word), not the anchored match found above.
    $char_offset = mb_strlen(substr($text, 0, $m[0][1]), 'UTF-8');

    // Snippet bounds in characters, widened to whole words.
    $radius = max(0, (int) $radius);
    $text_len = mb_strlen($text, 'UTF-8');
    $start_char = max(0, $char_offset - $radius);
    $end_char = min($text_len, $char_offset + mb_strlen($match_text, 'UTF-8') + $radius);
    list($start_char, $end_char) = expand_to_full_words($text, $start_char, $end_char);

    $snippet = trim(mb_substr($text, $start_char, $end_char - $start_char, 'UTF-8'));

    // Highlight every hit in one pass with the same pattern used to find it,
    // so a later pass can never re-match inside markup inserted earlier.
    $highlighted = preg_replace($pattern, '<span class="hit">$0</span>', $snippet);
    if ($highlighted === null) {
        $highlighted = $snippet;
    }

    return '…' . $highlighted . '…';
}
319-
/**
 * Widen a character range so it does not cut through a word.
 *
 * $start moves left until it sits on a whitespace character or reaches 0;
 * $end moves right until it sits on a whitespace character or reaches the
 * end of the text. Offsets are character (not byte) positions.
 *
 * @param string $text  UTF-8 text.
 * @param int    $start Start offset (inclusive).
 * @param int    $end   End offset (exclusive).
 * @return int[] [$start, $end] with 0 <= $start <= $end <= length of $text.
 */
function expand_to_full_words($text, $start, $end) {
    $len = mb_strlen($text, 'UTF-8');

    // Clamp both bounds into the text and keep the range non-inverted, so
    // callers never compute a negative length from the result.
    $start = max(0, min((int) $start, $len));
    $end = max($start, min((int) $end, $len));

    // Step left until hitting whitespace or the start of the text.
    while ($start > 0 && !preg_match('/\s/u', mb_substr($text, $start, 1, 'UTF-8'))) {
        $start--;
    }

    // Step right until hitting whitespace or the end of the text.
    while ($end < $len && !preg_match('/\s/u', mb_substr($text, $end, 1, 'UTF-8'))) {
        $end++;
    }

    return [$start, $end];
}
340- ?>
215+ </div></div>
0 commit comments