Skip to content

Commit eb0f23e

Browse files
snippet v3
1 parent ad7b2ee commit eb0f23e

1 file changed

Lines changed: 110 additions & 43 deletions

File tree

themes/default/view/search/resource.phtml

Lines changed: 110 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -127,21 +127,21 @@ endif;
127127
endif;
128128

129129
if ($text) {
    // Render at most one snippet: the first positive ("join" is not "not")
    // filter term that can be located in the text wins.
    // $_GET is user-controlled, so the filter list and each entry are
    // shape-checked before use; a missing or malformed entry is skipped.
    $filters = (isset($_GET["filter"]) && is_array($_GET["filter"])) ? $_GET["filter"] : [];

    foreach ($filters as $key => $value) {
        if (!is_array($value) || !isset($value["value"]) || !is_string($value["value"])) {
            continue;
        }

        $join = isset($value["join"]) ? $value["join"] : "";
        if ($join === "not" || !$value["value"]) {
            continue;
        }

        // Raw user input: it is handed to the snippet builder for matching only.
        $search_text = $value["value"];
        $snippet = make_flexible_snippet($text, $search_text);

        if ($snippet) {
            echo "<div class='snippet'>{$snippet}</div>";
            break;
        }
    }
}
145145
endif;
146146
?>
147147

@@ -215,59 +215,126 @@ endif;
215215
</div></div>
216216

217217
<?php
218+
// ---------- Normalizer (Option 2) ----------
218219
/**
 * Normalize OCR text into a lowercase, single-spaced string of words so that
 * search terms can be matched regardless of line breaks, hyphenation and
 * punctuation noise.
 *
 * @param string $text Raw text, possibly containing HTML tags.
 * @return string Normalized text; empty string when nothing remains.
 */
function normalize_ocr_text($text) {
    // Strip HTML first: once '<' and '>' have been replaced by spaces (see the
    // punctuation step below) strip_tags() can no longer recognise any tag and
    // tag names would leak into the text as words.
    // NOTE(review): strip_tags() drops everything after an unclosed '<' -
    // confirm the input never contains stray '<' characters.
    $text = strip_tags((string)$text);

    // lower
    $text = mb_strtolower($text, 'UTF-8');

    // Join words split by a hyphen followed by whitespace or a line break
    // ("cre-\natief" / "cre- atief" -> "creatief"). Lookarounds keep the
    // neighbouring letters out of the match, so chains such as
    // "a-\nb-\nc-\nd" are joined completely in a single pass.
    $text = preg_replace('/(?<=\p{L})-\s+(?=\p{L})/u', '', $text);

    // convert all dash types to spaces (for flexible matching)
    $text = preg_replace('/[-‐-‒–—―]/u', ' ', $text);

    // remove punctuation that breaks word boundaries
    $text = preg_replace('/[.,;:!?(){}\[\]<>\/\\\\"“”\']+/u', ' ', $text);

    // collapse whitespace (this also turns tabs and newlines into single spaces)
    $text = preg_replace('/\s+/u', ' ', $text);

    return trim($text);
}
238246

239-
function make_snippet_flexible($text, $search, $radius = 45) {
247+
// ---------- Make the two flexible variants (space/hyphen) ----------
248+
/**
 * Build the hyphenated and the spaced spelling of a search term, e.g.
 * "Tomaat garnaal" -> ["tomaat-garnaal", "tomaat garnaal"].
 *
 * Runs of whitespace and of any dash type between words collapse to a single
 * separator. Separators at the start or end of the term are dropped, because
 * a variant beginning or ending with "-" or " " cannot match at a word edge.
 *
 * @param mixed $search Search term; cast to string.
 * @return array List of unique lowercase variants, longest first; empty when
 *               the term contains nothing but separators.
 */
function flexible_search_variants($search) {
    $s = mb_strtolower(trim((string)$search), 'UTF-8');

    // whitespace and every dash type count as a word separator
    $sep = '[-‐-‒–—―\s]+';

    // drop separators at either end of the term
    $s = preg_replace('/^' . $sep . '|' . $sep . '$/u', '', $s);
    if ($s === null || $s === '') {
        return [];
    }

    $s_dash = preg_replace('/' . $sep . '/u', '-', $s);  // separators -> single hyphen
    $s_space = preg_replace('/' . $sep . '/u', ' ', $s); // separators -> single space

    $variants = array_unique([$s_dash, $s_space]);

    // sort by length desc so longer variants are tested first
    usort($variants, function($a, $b) {
        return mb_strlen($b, 'UTF-8') - mb_strlen($a, 'UTF-8');
    });

    return $variants;
}
243263

244-
// create variants
245-
$search_space = preg_replace('/[-‐-‒–—―]/u', ' ', $search); // hyphens → spaces
246-
$search_dash = preg_replace('/\s+/', '-', $search_space); // spaces → hyphens
264+
// ---------- Core snippet builder (fixed) ----------
265+
/**
 * Build an HTML snippet around the first occurrence of a search term in OCR
 * text, with every occurrence inside the snippet wrapped in
 * <span class="hit">...</span>.
 *
 * The snippet is cut from the normalized text (see normalize_ocr_text()), not
 * from the raw text. All text in the result is HTML-escaped; the only markup
 * is the highlight span.
 *
 * @param string $raw_text Raw OCR text.
 * @param mixed  $search   Search term (typically raw user input).
 * @param int    $radius   Characters of context on each side of the match,
 *                         before the bounds are widened to whole words.
 * @return string|null HTML snippet, or null when there is nothing to show.
 */
function make_flexible_snippet($raw_text, $search, $radius = 30) {
    // Array-valued query parameters would make trim() throw; treat as "no term".
    if (!is_scalar($search) || trim((string)$search) === '') return null;

    // normalize OCR text (the snippet is extracted from this normalized text)
    $text = normalize_ocr_text($raw_text);
    if ($text === '') return null;

    // Normalize the term exactly like the text. Punctuation has been removed
    // from the text, so a term that still carries it could never match.
    $needle = normalize_ocr_text((string)$search);
    if ($needle === '') return null;

    // variants: "tomaat garnaal" and "tomaat-garnaal"
    $variants = array_filter(flexible_search_variants($needle), function($v) {
        return $v !== '';
    });
    if (!$variants) return null;

    // alternation of escaped variants, anchored so a match cannot start or end
    // in the middle of a run of unicode letters/digits
    $escaped = array_map(function($v) { return preg_quote($v, '/'); }, $variants);
    $pattern = '/(?<![\p{L}\p{N}])(' . implode('|', $escaped) . ')(?![\p{L}\p{N}])/iu';

    // find first match and get offset
    if (!preg_match($pattern, $text, $m, PREG_OFFSET_CAPTURE)) {
        return null;
    }

    // PREG_OFFSET_CAPTURE reports a byte offset. Measure the prefix in
    // characters so the mb_* calls below address the matched occurrence.
    // (Searching for the matched text again could land on an earlier
    // occurrence inside another word.)
    $match_text = $m[0][0];
    $char_offset = mb_strlen(substr($text, 0, $m[0][1]), 'UTF-8');

    // snippet bounds, widened so no word is cut in half
    $start_char = max(0, $char_offset - $radius);
    $end_char = min(mb_strlen($text, 'UTF-8'), $char_offset + mb_strlen($match_text, 'UTF-8') + $radius);
    list($start_char, $end_char) = expand_to_full_words($text, $start_char, $end_char);

    $snippet = trim(mb_substr($text, $start_char, $end_char - $start_char, 'UTF-8'));

    // Split on the matches (kept as odd-indexed pieces because the pattern has
    // exactly one capture group), escape every piece and wrap only the hits.
    $parts = preg_split($pattern, $snippet, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === false) {
        return htmlspecialchars($snippet, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }

    $html = '';
    foreach ($parts as $i => $part) {
        $safe = htmlspecialchars($part, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $html .= ($i % 2 === 1) ? '<span class="hit">' . $safe . '</span>' : $safe;
    }

    return $html;
}
262319

263-
// build snippet
264-
$start = max(0, $pos - $radius);
265-
$end = $pos + $len + $radius;
266-
$snippet = mb_substr($text, $start, $end - $start);
320+
/**
 * Widen a [start, end) character range so that it does not cut a word in half.
 *
 * The start moves left until it sits on a whitespace character or reaches 0;
 * the end moves right until it sits on a whitespace character or reaches the
 * end of the text. Offsets are character offsets in UTF-8 text.
 *
 * @param string $text  Text the offsets refer to.
 * @param int    $start Start offset (clamped to 0).
 * @param int    $end   End offset, exclusive (clamped to the text length).
 * @return array Two-element list: [start, end].
 */
function expand_to_full_words($text, $start, $end) {
    $total = mb_strlen($text, 'UTF-8');

    // clamp both offsets into the text
    $start = ($start < 0) ? 0 : $start;
    $end = ($end > $total) ? $total : $end;

    // true when the character at $index is whitespace
    $isBreak = function ($index) use ($text) {
        return preg_match('/\s/u', mb_substr($text, $index, 1, 'UTF-8')) === 1;
    };

    // walk left to the nearest whitespace (or the beginning)
    for (; $start > 0; $start--) {
        if ($isBreak($start)) {
            break;
        }
    }

    // walk right to the nearest whitespace (or the end)
    for (; $end < $total; $end++) {
        if ($isBreak($end)) {
            break;
        }
    }

    return [$start, $end];
}
273340
?>

0 commit comments

Comments
 (0)