Skip to content

Commit 35b0ce2

Browse files
move to helper
1 parent eb0f23e commit 35b0ce2

3 files changed

Lines changed: 148 additions & 129 deletions

File tree

themes/default/config/theme.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ author = "Statik, Libis"
66
;theme_link = ""
77
author_link = ""
88
omeka_version_constraint = "^1.3.0 || ^3.0.0"
9+
helpers[] = Custom
910

1011
[config]
1112
elements.nav_depth.name = "nav_depth"

themes/default/helper/Custom.php

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
<?php
2+
3+
namespace OmekaTheme\Helper;
4+
5+
use Laminas\View\Helper\AbstractHelper;
6+
7+
class Custom extends AbstractHelper
8+
{
9+
// ---------- Normalizer (Option 2) ----------
10+
/**
 * Reduce OCR output (plain text or rendered HTML) to a flat, lower-case
 * string of space-separated words suitable for flexible matching.
 *
 * Steps, in order: strip markup, decode HTML entities, lower-case, drop
 * soft hyphens, re-join words split by a hyphen + whitespace, turn the
 * remaining dashes and word-breaking punctuation into spaces, and
 * collapse all whitespace runs to a single space.
 *
 * @param string|null $text Raw text or HTML fragment.
 * @return string Normalized text; empty string when nothing is left.
 */
public function normalize_ocr_text($text)
{
    $text = (string) $text;

    // Markup has to go BEFORE the punctuation pass below: that pass turns
    // '<', '>' and '/' into spaces, after which strip_tags() can no longer
    // recognise any tag and names such as "p" or "div" leak into the text
    // as words. A space is inserted in front of every '<' so that words in
    // adjacent elements ("</p><p>") are not glued together.
    $text = strip_tags(str_replace('<', ' <', $text));

    // Entities are case-sensitive, so decode them before lower-casing.
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    // lower
    $text = mb_strtolower($text, 'UTF-8');

    // drop real soft hyphens (U+00AD, bytes C2 AD): invisible hyphenation
    // hints that would otherwise split a word for the matcher
    $text = str_replace("\xC2\xAD", '', $text);

    // FIX 1: join hyphenated line-break splits (cre-\natief → creatief)
    $text = preg_replace('/(\p{L}+)-\s*\n\s*(\p{L}+)/u', '$1$2', $text);

    // FIX 2: join hyphen + whitespace splits inside a line (cre- atief → creatief)
    $text = preg_replace('/(\p{L}+)-\s+(\p{L}+)/u', '$1$2', $text);

    // convert all dash types to spaces (for flexible matching)
    $text = preg_replace('/[-‐-‒–—―]/u', ' ', $text);

    // remove punctuation that breaks word boundaries
    $text = preg_replace('/[.,;:!?(){}\[\]<>\/\\\\"“”\']+/u', ' ', $text);

    // collapse whitespace (tabs, newlines and decoded &nbsp; included)
    $text = preg_replace('/\s+/u', ' ', $text);

    return trim((string) $text);
}
38+
39+
// ---------- Make the two flexible variants (space/hyphen) ----------
40+
/**
 * Build the spellings of a search term to try against the text: one in
 * which every run of dashes/whitespace is a single hyphen, and one in
 * which it is a single space.
 *
 * @param mixed $search Raw search term; cast to string, trimmed and lower-cased.
 * @return array Distinct variants, re-indexed from 0, longest first.
 */
public function flexible_search_variants($search)
{
    $needle = mb_strtolower(trim((string)$search), 'UTF-8');

    // one run of any dash type and/or whitespace
    $separatorRun = '/[-‐-‒–—―\s]+/u';

    // hyphenated form first, spaced form second
    $forms = [];
    foreach (['-', ' '] as $joiner) {
        $forms[] = preg_replace($separatorRun, $joiner, $needle);
    }

    // a term without separators yields the same string twice
    $forms = array_unique($forms);

    // longest variant first so it is tried before any shorter one
    usort($forms, function ($left, $right) {
        return mb_strlen($right) - mb_strlen($left);
    });

    return $forms;
}
58+
59+
// ---------- Core snippet builder (fixed) ----------
60+
/**
 * Build a short excerpt around the first whole-word occurrence of the
 * search term in the text and wrap every occurrence inside that excerpt
 * in <span class="hit">…</span>.
 *
 * Both the text and the search term go through normalize_ocr_text(), so
 * the excerpt is taken from the normalized text and a term typed with
 * quotes, punctuation or dashes can still match.
 *
 * @param string|null $raw_text Text (or HTML) to search in.
 * @param string|null $search   Raw user search term.
 * @param int         $radius   Characters of context on each side of the
 *                              hit, before widening to whole words.
 * @return string|null Excerpt with highlight markup, or null when the
 *                     term is empty or not found.
 */
public function make_flexible_snippet($raw_text, $search, $radius = 30)
{
    // normalize OCR text (the snippet is extracted from this normalized text)
    $text = $this->normalize_ocr_text($raw_text);
    if ($text === '') {
        return null;
    }

    // The haystack has lost its punctuation and dashes, so the needle must
    // lose them as well or a term such as "tomaat," could never match.
    $needle = $this->normalize_ocr_text($search);
    if ($needle === '') {
        return null;
    }

    // variants: "tomaat garnaal" and "tomaat-garnaal"
    $variants = array_filter(
        $this->flexible_search_variants($needle),
        function ($v) {
            return is_string($v) && $v !== '';
        }
    );
    if (!$variants) {
        return null;
    }

    // alternation of escaped variants, anchored with word-boundary-like
    // lookarounds that work with unicode letters & numbers
    $escaped = array_map(function ($v) {
        return preg_quote($v, '/');
    }, $variants);
    $pattern = '/(?<![\p{L}\p{N}])(' . implode('|', $escaped) . ')(?![\p{L}\p{N}])/iu';

    // find first match; with PREG_OFFSET_CAPTURE $m[0] is [text, byte offset]
    if (!preg_match($pattern, $text, $m, PREG_OFFSET_CAPTURE)) {
        return null;
    }
    $match_text = $m[0][0];
    $byte_offset = $m[0][1];

    // Convert the byte offset of THIS match to a character offset. Looking
    // the matched text up again with mb_strpos() would return the first
    // substring occurrence (e.g. inside "tomaatsoep") rather than the
    // word-bounded hit the regex found.
    $char_offset = mb_strlen(substr($text, 0, $byte_offset), 'UTF-8');

    // snippet bounds, widened so no word is cut in half
    $radius = max(0, (int) $radius);
    $text_len = mb_strlen($text, 'UTF-8');
    $start_char = max(0, $char_offset - $radius);
    $end_char = min($text_len, $char_offset + mb_strlen($match_text, 'UTF-8') + $radius);
    list($start_char, $end_char) = $this->expand_to_full_words($text, $start_char, $end_char);

    // Extract final snippet
    $snippet = mb_substr($text, $start_char, $end_char - $start_char, 'UTF-8');

    // Highlight every hit in one pass with the same pattern used to find it.
    // The normalized text contains no '<', '>' or quotes, so the span is
    // the only markup in the result.
    $highlighted = preg_replace($pattern, '<span class="hit">$0</span>', $snippet);
    if ($highlighted !== null) {
        $snippet = $highlighted;
    }

    return trim($snippet);
}
121+
122+
/**
 * Widen a character range so that it does not cut through a word.
 *
 * The start index moves left and the end index moves right until each
 * sits on a whitespace character or reaches the edge of the text.
 *
 * @param string $text  UTF-8 text the offsets refer to.
 * @param int    $start Character offset of the range start.
 * @param int    $end   Character offset just past the range end.
 * @return array Two-element list: [start, end] after widening.
 */
public function expand_to_full_words($text, $start, $end)
{
    $length = mb_strlen($text, 'UTF-8');

    // clamp the requested range to the text
    $left = ($start < 0) ? 0 : $start;
    $right = ($end > $length) ? $length : $end;

    // true when the character at $index is whitespace
    $isBoundary = function ($index) use ($text) {
        return preg_match('/\s/u', mb_substr($text, $index, 1, 'UTF-8')) === 1;
    };

    // walk left to the nearest whitespace (or the start of the text)
    for (; $left > 0 && !$isBoundary($left); $left--) {
    }

    // walk right to the nearest whitespace (or the end of the text)
    for (; $right < $length && !$isBoundary($right); $right++) {
    }

    return [$left, $right];
}
143+
}

themes/default/view/search/resource.phtml

Lines changed: 4 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
* @var \Omeka\Api\Representation\AbstractResourceEntityRepresentation $resource
3333
* @var string $tag
3434
*/
35-
35+
$helper = $this->plugin('Custom');
3636
$hyperlink = $this->plugin('hyperlink');
3737
$type = $resource->resourceTemplate();
3838
if($type):
@@ -123,7 +123,7 @@ endif;
123123
$media = $resource->media();
124124

125125
if(isset($media[1])):
126-
$text = normalize_ocr_text($media[1]->render());
126+
$text = $helper->normalize_ocr_text($media[1]->render());
127127
endif;
128128

129129
if ($text) {
@@ -133,7 +133,7 @@ endif;
133133

134134
$search_text = $value["value"]; // raw user input
135135

136-
$snippet = make_flexible_snippet($text, $search_text);
136+
$snippet = $helper->make_flexible_snippet($text, $search_text);
137137

138138
if ($snippet) {
139139
echo "<div class='snippet'>{$snippet}</div>";
@@ -212,129 +212,4 @@ endif;
212212
</a>
213213
<?php echo sprintf('</%s>', $tag); ?>
214214

215-
</div></div>
216-
217-
<?php
218-
// ---------- Normalizer (Option 2) ----------
219-
function normalize_ocr_text($text) {
220-
// lower
221-
$text = mb_strtolower($text, 'UTF-8');
222-
223-
// FIX 1: join hyphenated line-break splits (cre-\natief → creatief)
224-
$text = preg_replace('/(\p{L}+)-\s*\n\s*(\p{L}+)/u', '$1$2',$text);
225-
226-
// FIX 2: join soft hyphens inside line (cre- atief → creatief)
227-
$text = preg_replace('/(\p{L}+)-\s+(\p{L}+)/u', '$1$2',$text);
228-
229-
// convert all dash types to spaces (for flexible matching)
230-
$text = preg_replace('/[-‐-‒–—―]/u', ' ', $text);
231-
232-
// remove punctuation that breaks word boundaries
233-
$text = preg_replace('/[.,;:!?(){}\[\]<>\/\\\\"“”\']+/u', ' ', $text);
234-
235-
// replace tabs & newlines with spaces
236-
$text = str_replace(["\t", "\r", "\n"], ' ', $text);
237-
238-
// strip HTML
239-
$text = strip_tags($text);
240-
241-
// collapse whitespace
242-
$text = preg_replace('/\s+/u', ' ', $text);
243-
244-
return trim($text);
245-
}
246-
247-
// ---------- Make the two flexible variants (space/hyphen) ----------
248-
function flexible_search_variants($search) {
249-
$s = mb_strtolower(trim((string)$search), 'UTF-8');
250-
251-
// normalize dashes to hyphen for canonical form
252-
$s_dash = preg_replace('/[-‐-‒–—―\s]+/u', '-', $s); // all spaces/dashes -> single hyphen
253-
$s_space = preg_replace('/[-‐-‒–—―\s]+/u', ' ', $s); // all spaces/dashes -> single space
254-
255-
// prefer longer first to avoid partial matches (e.g. multiword vs single word)
256-
$variants = array_unique([$s_dash, $s_space]);
257-
258-
// sort by length desc so longer variants are tested first
259-
usort($variants, function($a, $b){ return mb_strlen($b) - mb_strlen($a); });
260-
261-
return $variants;
262-
}
263-
264-
// ---------- Core snippet builder (fixed) ----------
265-
function make_flexible_snippet($raw_text, $search, $radius = 30) {
266-
if (!strlen(trim($search))) return null;
267-
268-
// normalize OCR text (we will extract snippet from this normalized text)
269-
$text = normalize_ocr_text($raw_text);
270-
if ($text === '') return null;
271-
272-
// variants: "tomaat garnaal" and "tomaat-garnaal"
273-
$variants = flexible_search_variants($search);
274-
275-
// build an alternation group of escaped variants for regex
276-
$escaped = array_map(function($v){ return preg_quote($v, '/'); }, $variants);
277-
$group = implode('|', $escaped);
278-
279-
// word-boundary like anchors that work with unicode letters & numbers
280-
$pattern = '/(?<![\p{L}\p{N}])(' . $group . ')(?![\p{L}\p{N}])/iu';
281-
282-
// find first match and get offset
283-
if (!preg_match($pattern, $text, $m, PREG_OFFSET_CAPTURE)) {
284-
return null;
285-
}
286-
287-
// $m[0][0] is matched text, $m[0][1] is byte offset (works with PREG_OFFSET_CAPTURE)
288-
$match_text = $m[0][0];
289-
$byte_offset = $m[0][1];
290-
291-
// convert byte offset to character offset for mb_substr
292-
// mb_substr accepts character index, but mb_strpos can be used to find char offset:
293-
// simpler: use mb_substr with mb_strlen on left part
294-
$left = mb_substr($text, 0, mb_strlen(mb_substr($text, 0, mb_stripos($text, $match_text, 0, 'UTF-8'), 'UTF-8'), 'UTF-8'), 'UTF-8');
295-
// however above is convoluted. We'll find character offset properly:
296-
$char_offset = mb_strpos($text, $match_text, 0, 'UTF-8');
297-
if ($char_offset === false) {
298-
// fallback: compute from byte offset
299-
$char_offset = mb_strlen(substr($text, 0, $byte_offset), '8bit');
300-
}
301-
302-
// snippet bounds
303-
$start_char = max(0, $char_offset - $radius);
304-
$end_char = min(mb_strlen($text, 'UTF-8'), $char_offset + mb_strlen($match_text, 'UTF-8') + $radius);
305-
list($start_char, $end_char) = expand_to_full_words($text, $start_char, $end_char);
306-
307-
// Extract final snippet
308-
$snippet = mb_substr($text, $start_char, $end_char - $start_char, 'UTF-8');
309-
310-
// highlight the matched part(s) in the snippet (highlight all variants to be safe)
311-
foreach ($variants as $v) {
312-
if (mb_strlen($v) < 1) continue;
313-
$snippet = preg_replace('/(?<![\p{L}\p{N}])' . preg_quote($v, '/') . '(?![\p{L}\p{N}])/iu',
314-
'<span class="hit">$0</span>', $snippet);
315-
}
316-
317-
return '' . trim($snippet) . '';
318-
}
319-
320-
function expand_to_full_words($text, $start, $end) {
321-
322-
$len = mb_strlen($text, 'UTF-8');
323-
324-
// correct bounds
325-
if ($start < 0) $start = 0;
326-
if ($end > $len) $end = $len;
327-
328-
// step left until hitting whitespace or start
329-
while ($start > 0 && !preg_match('/\s/u', mb_substr($text, $start, 1, 'UTF-8'))) {
330-
$start--;
331-
}
332-
333-
// step right until hitting whitespace or end
334-
while ($end < $len && !preg_match('/\s/u', mb_substr($text, $end, 1, 'UTF-8'))) {
335-
$end++;
336-
}
337-
338-
return [$start, $end];
339-
}
340-
?>
215+
</div></div>

0 commit comments

Comments
 (0)