|
20 | 20 | * --m=<n> HNSW M parameter (default: 16) |
21 | 21 | * --seed=<n> Random seed for reproducibility (default: 42) |
22 | 22 | * --output=<file> Write Markdown report to file (default: stdout) |
23 | | - * --no-persist Skip persistence benchmarks |
| 23 | + * --no-save Skip persistence benchmarks (save / open) |
24 | 24 | * --no-recall Skip recall computation |
25 | 25 | * --help, -h Show this help |
26 | 26 | * |
|
70 | 70 | $opts = getopt('h', [ |
71 | 71 | 'scenarios:', 'k:', 'queries:', 'recall-samples:', |
72 | 72 | 'ef-search:', 'ef-construction:', 'm:', 'seed:', |
73 | | - 'output:', 'no-persist', 'no-recall', 'help', 'h', |
| 73 | + 'output:', 'no-save', 'no-persist', 'no-recall', 'help', 'h', |
74 | 74 | ]); |
75 | 75 |
|
76 | 76 | if (isset($opts['help']) || isset($opts['h'])) { |
|
100 | 100 | $m = max(2, (int) ($opts['m'] ?? 16)); |
101 | 101 | $seed = (int) ($opts['seed'] ?? 42); |
102 | 102 | $outputFile = $opts['output'] ?? null; |
103 | | -$noPersist = isset($opts['no-persist']); |
| 103 | +$noPersist = isset($opts['no-save']) || isset($opts['no-persist']); |
104 | 104 | $noRecall = isset($opts['no-recall']); |
105 | 105 |
|
106 | 106 | $hnswConfig = new HNSWConfig( |
@@ -181,6 +181,33 @@ function computeRecall( |
181 | 181 | return array_map(static fn(float $t): float => $t / $nRecall, $totals); |
182 | 182 | } |
183 | 183 |
|
/**
 * Sum the sizes of every entry found beneath a directory and report the
 * total in megabytes (1 MB = 1024 * 1024 bytes).
 *
 * The tree is walked recursively; the "." and ".." entries are skipped.
 *
 * @param string $dir Path of the directory to measure.
 *
 * @return float Combined size of all visited entries, in megabytes.
 */
function folderSizeMb(string $dir): float
{
    $bytesPerMb = 1024 * 1024;

    $walker = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($dir, FilesystemIterator::SKIP_DOTS)
    );

    $totalBytes = 0;
    for ($walker->rewind(); $walker->valid(); $walker->next()) {
        $totalBytes += $walker->current()->getSize();
    }

    return $totalBytes / $bytesPerMb;
}
| 196 | + |
/**
 * Recursively delete a directory and all its contents.
 *
 * Entries are enumerated with FilesystemIterator rather than glob(), so:
 *  - hidden entries (names starting with ".") are removed too, which keeps
 *    the final rmdir() from failing on a non-empty directory;
 *  - characters such as "[" or "*" in $dir are treated literally instead of
 *    as glob metacharacters.
 *
 * Symbolic links are never followed: a link (even one pointing at a
 * directory) is unlinked and its target is left untouched.
 *
 * @param string $dir Directory to remove. A path that is not a directory is
 *                    silently ignored.
 */
function rrmdir(string $dir): void
{
    if (!is_dir($dir)) {
        return;
    }

    // A symlink to a directory: drop the link only, never the target's contents.
    if (is_link($dir)) {
        unlink($dir);
        return;
    }

    $entries = new FilesystemIterator(
        $dir,
        FilesystemIterator::SKIP_DOTS | FilesystemIterator::CURRENT_AS_PATHNAME
    );

    // Snapshot the listing first so deletions cannot disturb the open directory handle.
    foreach (iterator_to_array($entries, false) as $path) {
        if (is_dir($path) && !is_link($path)) {
            rrmdir($path);
        } else {
            unlink($path);
        }
    }

    rmdir($dir);
}
| 210 | + |
184 | 211 | /** |
185 | 212 | * Run a complete benchmark for one scenario and return the result array. |
186 | 213 | */ |
@@ -258,30 +285,41 @@ function runScenario( |
258 | 285 | } |
259 | 286 |
|
260 | 287 | // 6. Persistence ─────────────────────────────────────────────────────── |
| 288 | + // |
| 289 | + // Build a fresh VectorDatabase with a temp folder path so document files |
| 290 | + // are written async during insert. save() flushes the HNSW graph and |
| 291 | + // BM25 index (waiting for any outstanding async doc writes first). |
| 292 | + // open() reads only hnsw.bin + bm25.bin — document files are lazy. |
261 | 293 | $persist = null; |
262 | 294 | if (!$noPersist) { |
263 | | - progress(" Benchmarking persist / load …\n"); |
| 295 | + progress(" Benchmarking save / open …\n"); |
| 296 | + |
| 297 | + $tmpDir = sys_get_temp_dir() . '/phpvbench_' . uniqid('', true); |
| 298 | + mkdir($tmpDir, 0755, true); |
264 | 299 |
|
265 | | - $tmpFile = tempnam(sys_get_temp_dir(), 'phpvbench_') . '.phpv'; |
| 300 | + $dbSave = new VectorDatabase($hnswConfig, new BM25Config(), new SimpleTokenizer([]), $tmpDir); |
| 301 | + for ($i = 0; $i < $n; $i++) { |
| 302 | + $dbSave->addDocument(new Document(id: $i, vector: $dataVectors[$i])); |
| 303 | + } |
266 | 304 |
|
267 | 305 | $t0 = hrtime(true); |
268 | | - $db->persist($tmpFile); |
269 | | - $persistTime = (hrtime(true) - $t0) / 1e9; |
| 306 | + $dbSave->save(); |
| 307 | + $saveTime = (hrtime(true) - $t0) / 1e9; |
270 | 308 |
|
271 | | - $fileSizeMb = filesize($tmpFile) / (1024 * 1024); |
| 309 | + $folderSizeMb = folderSizeMb($tmpDir); |
272 | 310 |
|
273 | 311 | $t0 = hrtime(true); |
274 | | - VectorDatabase::load($tmpFile, $hnswConfig); |
275 | | - $loadTime = (hrtime(true) - $t0) / 1e9; |
| 312 | + VectorDatabase::open($tmpDir, $hnswConfig); |
| 313 | + $openTime = (hrtime(true) - $t0) / 1e9; |
276 | 314 |
|
277 | | - unlink($tmpFile); |
| 315 | + rrmdir($tmpDir); |
278 | 316 |
|
279 | 317 | $persist = [ |
280 | | - 'file_size_mb' => $fileSizeMb, |
281 | | - 'persist_s' => $persistTime, |
282 | | - 'persist_mb_s' => $persistTime > 0.0 ? $fileSizeMb / $persistTime : 0.0, |
283 | | - 'load_s' => $loadTime, |
284 | | - 'load_mb_s' => $loadTime > 0.0 ? $fileSizeMb / $loadTime : 0.0, |
| 318 | + 'folder_size_mb' => $folderSizeMb, |
| 319 | + 'save_s' => $saveTime, |
| 320 | + 'save_mb_s' => $saveTime > 0.0 ? $folderSizeMb / $saveTime : 0.0, |
| 321 | + 'open_s' => $openTime, |
| 322 | + 'open_mb_s' => $openTime > 0.0 ? $folderSizeMb / $openTime : 0.0, |
285 | 323 | ]; |
286 | 324 | } |
287 | 325 |
|
|
0 commit comments