Skip to content

Commit ad7b2ee

Browse files
redo snippet creation
1 parent ce153f7 commit ad7b2ee

1 file changed

Lines changed: 74 additions & 47 deletions

File tree

themes/default/view/search/resource.phtml

Lines changed: 74 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -123,55 +123,25 @@ endif;
123123
$media = $resource->media();
124124

125125
if(isset($media[1])):
126-
echo $media[1]->mediaType();
127-
$text = $media[1]->render();
128-
//echo $text;
129-
130-
$text = str_replace(array("-\r", "-\n"), '', $text);
131-
$text = str_replace(array("\r", "\n"), ' ', $text);
132-
$text = str_replace(" ", ' ', $text);
133-
$text = str_replace(" : ", ' ', $text);
134-
$text = str_replace("", '', $text);
135-
$text = str_replace("|", '', $text);
136-
$text = strip_tags($text);
137-
126+
$text = normalize_ocr_text($media[1]->render());
138127
endif;
139128

140-
if($text):
141-
foreach($_GET["filter"] as $key => $value):
142-
if($value["join"] != "not" && $value["value"]):
143-
$search_text = htmlspecialchars(strip_tags($value["value"]));
144-
//$texts = explode($search_text,str_ireplace($search_text,$search_text,$text));
145-
$search_text = str_replace("-"," ",$search_text);
146-
$search_text_escaped = preg_quote($search_text, '/');
147-
$texts = preg_split("/\b".$search_text_escaped."\b/i", $text);
148-
149-
if(sizeof($texts)>1):
150-
//$first = explode(" ",$texts[0]);
151-
$first = substr($texts[0],-45);
152-
//$last = explode(" ",$texts[1]);
153-
$last = substr($texts[1],0,45);
154-
echo "<div class='snippet'>";
155-
echo "...".$first."<span class='hit'>".$search_text."</span>".$last.'...';
156-
echo "</div>";
157-
break;
158-
endif;
159-
$search_text = str_replace(" ","-",$search_text);
160-
$search_text_escaped = preg_quote($search_text, '/');
161-
$texts = preg_split("/\b".$search_text_escaped."\b/i", $text);
162-
if(sizeof($texts)>1):
163-
//$first = explode(" ",$texts[0]);
164-
$first = substr($texts[0], -45);
165-
//$last = explode(" ",$texts[1]);
166-
$last = substr($texts[1],0,45);
167-
echo "<div class='snippet'>";
168-
echo "...".$first."<span class='hit'>".$search_text."</span>".$last.'...';
169-
echo "</div>";
170-
break;
171-
endif;
172-
endif;
173-
endforeach;
174-
endif;
129+
// Show a highlighted snippet for the first positive search filter whose
// term occurs in the normalized media text.
// $text is only assigned when $media[1] exists, so it may be undefined here;
// the filter list comes straight from the query string and may be missing
// or malformed, so both are checked before use.
$snippet_filters = (isset($_GET["filter"]) && is_array($_GET["filter"]))
    ? $_GET["filter"]
    : array();

if (!empty($text)) {
    foreach ($snippet_filters as $value) {
        if (!is_array($value)) {
            continue;
        }

        $join = isset($value["join"]) ? $value["join"] : '';
        $term = isset($value["value"]) ? $value["value"] : '';

        // Skip negated filters and anything that is not a usable string term
        // (strip_tags() would raise a TypeError on a nested array in PHP 8).
        if ($join === "not" || !is_string($term) || $term === '') {
            continue;
        }

        $search_text = mb_strtolower(strip_tags($term), 'UTF-8');

        $snippet = make_snippet_flexible($text, $search_text);

        // Only the first matching filter produces a snippet.
        if ($snippet) {
            echo "<div class='snippet'>{$snippet}</div>";
            break;
        }
    }
}
175145
endif;
176146
?>
177147

@@ -244,3 +214,60 @@ endif;
244214

245215
</div></div>
246216

217+
<?php
218+
/**
 * Reduce rendered OCR text to a lower-case, single-spaced string of words.
 *
 * Steps, in order:
 *  1. strip HTML tags and decode entities, so markup names and entity names
 *     ("amp", "nbsp") do not end up as words;
 *  2. lower-case;
 *  3. re-join words hyphenated across a line break ("to-\nmaat" -> "tomaat");
 *  4. turn every remaining dash into a space ("well-known" -> "well known");
 *  5. drop everything that is not a letter, digit or whitespace;
 *  6. collapse whitespace runs (including line breaks) to one space and trim.
 *
 * Dashes must become spaces BEFORE the punctuation strip; otherwise the strip
 * deletes them first and fuses the two halves of a hyphenated word.
 *
 * @param string $text Raw text or HTML.
 * @return string Normalized text; an empty string for empty input.
 */
function normalize_ocr_text($text) {
    $text = strip_tags((string) $text);
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $text = mb_strtolower($text, 'UTF-8');

    // pattern => replacement, applied in this order
    $steps = array(
        // word split across a line break (\R also covers "\r\n" and a lone "\r")
        '/(\w)-\h*\R\s*(\w)/u'      => '$1$2',
        // ASCII hyphen plus U+2010..U+2015 (all dash types) -> space
        '/[-\x{2010}-\x{2015}]/u'   => ' ',
        // punctuation noise commonly introduced by OCR
        '/[^\p{L}\p{N}\s]/u'        => '',
        // line breaks, non-breaking spaces and repeated spaces -> one space
        '/\s+/u'                    => ' ',
    );

    foreach ($steps as $pattern => $replacement) {
        $result = preg_replace($pattern, $replacement, $text);
        // preg_replace() returns null on failure (e.g. invalid UTF-8 with /u);
        // keep the previous value instead of propagating null.
        if ($result !== null) {
            $text = $result;
        }
    }

    return trim($text);
}
238+
239+
/**
 * Build an HTML snippet around the first occurrence of a search term.
 *
 * The term is matched case-insensitively in two spellings: with its dashes
 * replaced by spaces, and with its spaces replaced by hyphens. A match must
 * not be directly preceded or followed by a letter. Every occurrence inside
 * the returned window is wrapped in <span class="hit">…</span>; all text is
 * HTML-escaped.
 *
 * @param string $text   Text to search (plain text, not HTML).
 * @param string $search Search term.
 * @param int    $radius Number of characters of context kept on each side.
 * @return string|null   Snippet HTML, or null when the term is empty or not found.
 */
function make_snippet_flexible($text, $search, $radius = 45) {
    $text = (string) $text;

    // normalize search term for consistent behavior
    $search = mb_strtolower(trim((string) $search), 'UTF-8');
    if ($text === '' || $search === '') {
        return null;
    }

    // create variants: dashes -> spaces (single-spaced), then spaces -> hyphens
    $search_space = preg_replace('/[-\x{2010}-\x{2015}]/u', ' ', $search);
    if ($search_space === null) {
        return null; // term is not valid UTF-8
    }
    $search_space = trim((string) preg_replace('/\s+/u', ' ', $search_space));
    if ($search_space === '') {
        // term consisted of dashes only; an empty alternative would match everywhere
        return null;
    }
    $search_dash = str_replace(' ', '-', $search_space);

    // build regex (matches either version, not embedded in a longer word)
    $pattern =
        '/(?<!\p{L})('
        . preg_quote($search_space, '/')
        . '|'
        . preg_quote($search_dash, '/')
        . ')(?!\p{L})/iu';

    if (!preg_match($pattern, $text, $m, PREG_OFFSET_CAPTURE)) {
        return null;
    }

    // PREG_OFFSET_CAPTURE reports a BYTE offset; mb_substr() below counts
    // characters, so convert by measuring the prefix up to the match.
    $pos = mb_strlen(substr($text, 0, $m[0][1]), 'UTF-8');
    $len = mb_strlen($m[0][0], 'UTF-8');

    // cut the window: $radius characters on each side of the match
    $radius  = max(0, (int) $radius);
    $start   = max(0, $pos - $radius);
    $snippet = mb_substr($text, $start, ($pos - $start) + $len + $radius, 'UTF-8');

    // highlight matches: with DELIM_CAPTURE the pieces alternate
    // plain text (even index) / matched term (odd index)
    $parts = preg_split($pattern, $snippet, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === false) {
        return htmlspecialchars($snippet, ENT_QUOTES, 'UTF-8');
    }

    $html = '';
    foreach ($parts as $i => $part) {
        $escaped = htmlspecialchars($part, ENT_QUOTES, 'UTF-8');
        $html .= ($i % 2 === 1)
            ? '<span class="hit">' . $escaped . '</span>'
            : $escaped;
    }

    return $html;
}
273+
?>

0 commit comments

Comments
 (0)