snippet v3

JorisLambrechts · JorisLambrechts · commit eb0f23e828cd · 2025-12-01T14:03:49.000+01:00
diff --git a/themes/default/view/search/resource.phtml b/themes/default/view/search/resource.phtml
@@ -127,21 +127,21 @@ endif;
                 endif;  
 
                 if ($text) {
-                  foreach ($_GET["filter"] as $key => $value) {
+                    foreach ($_GET["filter"] as $key => $value) {
 
-                      if ($value["join"] != "not" && $value["value"]) {
+                        if ($value["join"] != "not" && $value["value"]) {
 
-                          $search_text = mb_strtolower(strip_tags($value["value"]));
+                            $search_text = $value["value"]; // raw user input
 
-                          $snippet = make_snippet_flexible($text, $search_text);
+                            $snippet = make_flexible_snippet($text, $search_text);
 
-                          if ($snippet) {
-                              echo "<div class='snippet'>{$snippet}</div>";
-                              break;
-                          }
-                      }
-                  }
-              }
+                            if ($snippet) {
+                                echo "<div class='snippet'>{$snippet}</div>";
+                                break;
+                            }
+                        }
+                    }
+                }
               endif;  
             ?>
 
@@ -215,59 +215,126 @@ endif;
 </div></div>
 
 <?php 
+// ---------- Normalizer (Option 2) ----------
 function normalize_ocr_text($text) {
-    // unify case
+    // lower
     $text = mb_strtolower($text, 'UTF-8');
 
-    // fix word splits: "to-\nmaat" → "tomaat"
-    $text = preg_replace('/(\w)-\s*\n\s*(\w)/u', '$1$2', $text);
+    // FIX 1: join hyphenated line-break splits (cre-\natief → creatief)
+    $text = preg_replace('/(\p{L}+)-\s*\n\s*(\p{L}+)/u', '$1$2',$text);
 
-    // remove remaining line-breaks
-    $text = preg_replace('/\s+/', ' ', $text);
+    // FIX 2: join soft hyphens inside line (cre- atief → creatief)
+    $text = preg_replace('/(\p{L}+)-\s+(\p{L}+)/u', '$1$2',$text);
 
-    // remove punctuation noise commonly introduced by OCR
-    $text = preg_replace('/[^\p{L}\p{N}\s]/u', '', $text);
+    // convert all dash types to spaces (for flexible matching)
+    $text = preg_replace('/[-‐-‒–—―]/u', ' ', $text);
 
-    $text = preg_replace('/[-‐-‒–—―]/u', ' ', $text); // all dash types → space
+    // remove punctuation that breaks word boundaries
+    $text = preg_replace('/[.,;:!?(){}\[\]<>\/\\\\"“”\']+/u', ' ', $text);
 
-    // normalize double spaces
-    $text = preg_replace('/\s+/', ' ', $text);
+    // replace tabs & newlines with spaces
+    $text = str_replace(["\t", "\r", "\n"], ' ', $text);
+
+    // strip HTML
+    $text = strip_tags($text);
+
+    // collapse whitespace
+    $text = preg_replace('/\s+/u', ' ', $text);
 
     return trim($text);
 }
 
-function make_snippet_flexible($text, $search, $radius = 45) {
+// ---------- Make the two flexible variants (space/hyphen) ----------
+function flexible_search_variants($search) {
+    $s = mb_strtolower(trim((string)$search), 'UTF-8');
+
+    // normalize dashes to hyphen for canonical form
+    $s_dash = preg_replace('/[-‐-‒–—―\s]+/u', '-', $s); // all spaces/dashes -> single hyphen
+    $s_space = preg_replace('/[-‐-‒–—―\s]+/u', ' ', $s); // all spaces/dashes -> single space
 
-    // normalize search term for consistent behavior
-    $search = mb_strtolower(trim($search));
+    // prefer longer first to avoid partial matches (e.g. multiword vs single word)
+    $variants = array_unique([$s_dash, $s_space]);
+
+    // sort by length desc so longer variants are tested first
+    usort($variants, function($a, $b){ return mb_strlen($b) - mb_strlen($a); });
+
+    return $variants;
+}
 
-    // create variants
-    $search_space = preg_replace('/[-‐-‒–—―]/u', ' ', $search); // hyphens → spaces
-    $search_dash  = preg_replace('/\s+/', '-', $search_space);   // spaces → hyphens
+// ---------- Core snippet builder (fixed) ----------
+function make_flexible_snippet($raw_text, $search, $radius = 30) {
+    if (!strlen(trim($search))) return null;
 
-    // build regex (matches either version)
-    $pattern = 
-        '/(?<!\p{L})('
-        . preg_quote($search_space, '/') 
-        . '|'
-        . preg_quote($search_dash, '/')
-        . ')(?!\p{L})/iu';
+    // normalize OCR text (we will extract snippet from this normalized text)
+    $text = normalize_ocr_text($raw_text);
+    if ($text === '') return null;
 
+    // variants: "tomaat garnaal" and "tomaat-garnaal"
+    $variants = flexible_search_variants($search);
+
+    // build an alternation group of escaped variants for regex
+    $escaped = array_map(function($v){ return preg_quote($v, '/'); }, $variants);
+    $group = implode('|', $escaped);
+
+    // word-boundary like anchors that work with unicode letters & numbers
+    $pattern = '/(?<![\p{L}\p{N}])(' . $group . ')(?![\p{L}\p{N}])/iu';
+
+    // find first match and get offset
     if (!preg_match($pattern, $text, $m, PREG_OFFSET_CAPTURE)) {
         return null;
     }
 
-    $pos = $m[0][1];
-    $len = mb_strlen($m[0][0]);
+    // $m[0][0] is matched text, $m[0][1] is byte offset (works with PREG_OFFSET_CAPTURE)
+    $match_text = $m[0][0];
+    $byte_offset = $m[0][1];
+
+    // convert byte offset to character offset for mb_substr
+    // mb_substr accepts character index, but mb_strpos can be used to find char offset:
+    // simpler: use mb_substr with mb_strlen on left part
+    $left = mb_substr($text, 0, mb_strlen(mb_substr($text, 0, mb_stripos($text, $match_text, 0, 'UTF-8'), 'UTF-8'), 'UTF-8'), 'UTF-8');
+    // however above is convoluted. We'll find character offset properly:
+    $char_offset = mb_strpos($text, $match_text, 0, 'UTF-8');
+    if ($char_offset === false) {
+        // fallback: compute from byte offset
+        $char_offset = mb_strlen(substr($text, 0, $byte_offset), '8bit');
+    }
+
+    // snippet bounds
+    $start_char = max(0, $char_offset - $radius);
+    $end_char   = min(mb_strlen($text, 'UTF-8'), $char_offset + mb_strlen($match_text, 'UTF-8') + $radius);
+    list($start_char, $end_char) = expand_to_full_words($text, $start_char, $end_char);
+
+// Extract final snippet
+$snippet = mb_substr($text, $start_char, $end_char - $start_char, 'UTF-8');
+
+    // highlight the matched part(s) in the snippet (highlight all variants to be safe)
+    foreach ($variants as $v) {
+        if (mb_strlen($v) < 1) continue;
+        $snippet = preg_replace('/(?<![\p{L}\p{N}])' . preg_quote($v, '/') . '(?![\p{L}\p{N}])/iu',
+                                '<span class="hit">$0</span>', $snippet);
+    }
+
+    return '…' . trim($snippet) . '…';
+}
 
-    // build snippet
-    $start = max(0, $pos - $radius);
-    $end   = $pos + $len + $radius;
-    $snippet = mb_substr($text, $start, $end - $start);
+function expand_to_full_words($text, $start, $end) {
 
-    // highlight match
-    $snippet = preg_replace($pattern, '<span class="hit">$0</span>', $snippet);
+    $len = mb_strlen($text, 'UTF-8');
+
+    // correct bounds
+    if ($start < 0) $start = 0;
+    if ($end > $len) $end = $len;
+
+    // step left until hitting whitespace or start
+    while ($start > 0 && !preg_match('/\s/u', mb_substr($text, $start, 1, 'UTF-8'))) {
+        $start--;
+    }
+
+    // step right until hitting whitespace or end
+    while ($end < $len && !preg_match('/\s/u', mb_substr($text, $end, 1, 'UTF-8'))) {
+        $end++;
+    }
 
-    return "…" . $snippet . "…";
+    return [$start, $end];
 }
 ?>