Skip to content

Commit 8dd13ae

Browse files
committed
Updated the benchmark
1 parent 1d682fc commit 8dd13ae

File tree

4 files changed

+78
-37
lines changed

4 files changed

+78
-37
lines changed

BENCHMARK.md

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ PHPVector, measuring the four metrics that matter in production:
1010
| **P99 latency** | Worst-case latency for 99% of queries |
1111
| **Recall@k** | Fraction of true nearest neighbours returned |
1212

13-
A **persistence section** is also included, measuring `persist()` and `load()` speed and file throughput.
13+
A **persistence section** is also included, measuring `save()` and `open()` speed and folder throughput.
1414

1515
---
1616

@@ -101,7 +101,7 @@ php benchmark/benchmark.php [options]
101101
| Option | Default | Description |
102102
|--------|---------|-------------|
103103
| `--output=<file>` | stdout | Write Markdown report to this path |
104-
| `--no-persist` | off | Skip the persistence (save/load) phase |
104+
| `--no-save` | off | Skip the persistence (`save` / `open`) phase |
105105
| `--no-recall` | off | Skip recall computation (faster for large datasets) |
106106
| `--help`, `-h` || Print usage and exit |
107107

@@ -194,11 +194,13 @@ A Recall@10 of 95% means HNSW returns 9–10 of the true 10 nearest neighbours
194194
on average. Values below ~80% suggest `efSearch` is too low for the dataset, or
195195
`M`/`efConstruction` need increasing.
196196

197-
### Persistence (persist / load)
197+
### Persistence (save / open)
198198

199-
Wall-clock time and throughput (MB/s) for `VectorDatabase::persist()` and
200-
`VectorDatabase::load()`. The `.phpv` binary format uses `pack/unpack` for
201-
float arrays, so throughput is typically limited by filesystem speed.
199+
Wall-clock time and throughput (MB/s) for `VectorDatabase::save()` and
200+
`VectorDatabase::open()`. `save()` waits for any outstanding async document
201+
writes, then flushes `hnsw.bin` and `bm25.bin`. `open()` reads only those two
202+
index files; individual document files (`docs/{n}.bin`) are loaded lazily after
203+
search, so `open()` is typically fast regardless of document count.
202204

203205
---
204206

@@ -327,9 +329,10 @@ nodes for some queries.
327329
**Recall** — breakdown at k=1, k=5, and k=K. Recall@1 is always ≥ Recall@k
328330
because finding the single nearest neighbour is easier than finding the top-K.
329331

330-
**Persistence** — file size on disk, time and throughput for `persist()` and
331-
`load()`. Useful for planning deployment workflows where the index is built once
332-
and served from disk on each restart.
332+
**Persistence** — total folder size on disk, time and throughput for `save()` and
333+
`open()`. Useful for planning deployment workflows where the index is built once
334+
and served from disk on each restart. `open()` is typically much faster than
335+
`save()` because document files are not read eagerly.
333336

334337
---
335338

@@ -342,7 +345,7 @@ and served from disk on each restart.
342345
| Recall@10 | ≥ 95% | < 85% — increase `efSearch` or `M` |
343346
| P99 / P50 ratio | < 3× | > 5× — graph may have poor connectivity |
344347
| Build throughput | Consistent with N·log(N) growth | Sudden drops may indicate GC pressure |
345-
| persist() throughput | > 50 MB/s | Lower suggests filesystem bottleneck |
348+
| save() throughput | > 50 MB/s | Lower suggests filesystem bottleneck |
346349

347350
### The recall/speed tradeoff
348351

benchmark/Report.php

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -139,17 +139,17 @@ public static function generate(
139139
$p = $result['persist'];
140140
$lines[] = '### Persistence';
141141
$lines[] = '';
142-
$lines[] = '| Operation | File size | Time | Throughput |';
143-
$lines[] = '|-----------|-----------|------|------------|';
144-
$lines[] = sprintf('| `persist()` | %s | %s | %.1f MB/s |',
145-
self::fmtMb($p['file_size_mb']),
146-
self::fmtTime($p['persist_s']),
147-
$p['persist_mb_s'],
142+
$lines[] = '| Operation | Folder size | Time | Throughput |';
143+
$lines[] = '|-----------|-------------|------|------------|';
144+
$lines[] = sprintf('| `save()` | %s | %s | %.1f MB/s |',
145+
self::fmtMb($p['folder_size_mb']),
146+
self::fmtTime($p['save_s']),
147+
$p['save_mb_s'],
148148
);
149-
$lines[] = sprintf('| `load()` | %s | %s | %.1f MB/s |',
150-
self::fmtMb($p['file_size_mb']),
151-
self::fmtTime($p['load_s']),
152-
$p['load_mb_s'],
149+
$lines[] = sprintf('| `open()` | %s | %s | %.1f MB/s |',
150+
self::fmtMb($p['folder_size_mb']),
151+
self::fmtTime($p['open_s']),
152+
$p['open_mb_s'],
153153
);
154154
$lines[] = '';
155155
}

benchmark/benchmark.php

Lines changed: 54 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
* --m=<n> HNSW M parameter (default: 16)
2121
* --seed=<n> Random seed for reproducibility (default: 42)
2222
* --output=<file> Write Markdown report to file (default: stdout)
23-
* --no-persist Skip persistence benchmarks
23+
* --no-save Skip persistence benchmarks (save / open)
2424
* --no-recall Skip recall computation
2525
* --help, -h Show this help
2626
*
@@ -70,7 +70,7 @@
7070
$opts = getopt('h', [
7171
'scenarios:', 'k:', 'queries:', 'recall-samples:',
7272
'ef-search:', 'ef-construction:', 'm:', 'seed:',
73-
'output:', 'no-persist', 'no-recall', 'help', 'h',
73+
'output:', 'no-save', 'no-persist', 'no-recall', 'help', 'h',
7474
]);
7575

7676
if (isset($opts['help']) || isset($opts['h'])) {
@@ -100,7 +100,7 @@
100100
$m = max(2, (int) ($opts['m'] ?? 16));
101101
$seed = (int) ($opts['seed'] ?? 42);
102102
$outputFile = $opts['output'] ?? null;
103-
$noPersist = isset($opts['no-persist']);
103+
$noPersist = isset($opts['no-save']) || isset($opts['no-persist']);
104104
$noRecall = isset($opts['no-recall']);
105105

106106
$hnswConfig = new HNSWConfig(
@@ -181,6 +181,33 @@ function computeRecall(
181181
return array_map(static fn(float $t): float => $t / $nRecall, $totals);
182182
}
183183

184+
/**
 * Total size, in megabytes, of every file found beneath a directory.
 *
 * Subdirectories are descended into; the "." and ".." entries are skipped.
 * One megabyte is taken as 1024 * 1024 bytes.
 *
 * @param string $dir Directory whose contents are measured.
 *
 * @return float Sum of all file sizes, expressed in MB.
 */
function folderSizeMb(string $dir): float
{
    $walker = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($dir, FilesystemIterator::SKIP_DOTS)
    );

    $totalBytes = 0;
    foreach ($walker as $entry) {
        $totalBytes += $entry->getSize();
    }

    return $totalBytes / (1024 * 1024);
}
196+
197+
/**
 * Recursively delete a directory and all its contents.
 *
 * Does nothing when $dir is not an existing directory. Entries are listed
 * with scandir() rather than glob('*'): the glob pattern does not match
 * dot-files and interprets metacharacters (such as "[") in $dir, which can
 * leave entries behind and make the final rmdir() fail. Symbolic links are
 * unlinked and never followed, so nothing outside $dir is removed.
 *
 * @param string $dir Directory to remove.
 */
function rrmdir(string $dir): void
{
    if (!is_dir($dir)) {
        return;
    }

    $names = scandir($dir);
    if ($names === false) {
        $names = [];
    }

    foreach ($names as $name) {
        if ($name === '.' || $name === '..') {
            continue;
        }

        $path = $dir . '/' . $name;
        if (is_dir($path) && !is_link($path)) {
            rrmdir($path);
        } else {
            unlink($path);
        }
    }

    rmdir($dir);
}
210+
184211
/**
185212
* Run a complete benchmark for one scenario and return the result array.
186213
*/
@@ -258,30 +285,41 @@ function runScenario(
258285
}
259286

260287
// 6. Persistence ───────────────────────────────────────────────────────
288+
//
289+
// Build a fresh VectorDatabase with a temp folder path so document files
290+
// are written async during insert. save() flushes the HNSW graph and
291+
// BM25 index (waiting for any outstanding async doc writes first).
292+
// open() reads only hnsw.bin + bm25.bin — document files are lazy.
261293
$persist = null;
262294
if (!$noPersist) {
263-
progress(" Benchmarking persist / load …\n");
295+
progress(" Benchmarking save / open …\n");
296+
297+
$tmpDir = sys_get_temp_dir() . '/phpvbench_' . uniqid('', true);
298+
mkdir($tmpDir, 0755, true);
264299

265-
$tmpFile = tempnam(sys_get_temp_dir(), 'phpvbench_') . '.phpv';
300+
$dbSave = new VectorDatabase($hnswConfig, new BM25Config(), new SimpleTokenizer([]), $tmpDir);
301+
for ($i = 0; $i < $n; $i++) {
302+
$dbSave->addDocument(new Document(id: $i, vector: $dataVectors[$i]));
303+
}
266304

267305
$t0 = hrtime(true);
268-
$db->persist($tmpFile);
269-
$persistTime = (hrtime(true) - $t0) / 1e9;
306+
$dbSave->save();
307+
$saveTime = (hrtime(true) - $t0) / 1e9;
270308

271-
$fileSizeMb = filesize($tmpFile) / (1024 * 1024);
309+
$folderSizeMb = folderSizeMb($tmpDir);
272310

273311
$t0 = hrtime(true);
274-
VectorDatabase::load($tmpFile, $hnswConfig);
275-
$loadTime = (hrtime(true) - $t0) / 1e9;
312+
VectorDatabase::open($tmpDir, $hnswConfig);
313+
$openTime = (hrtime(true) - $t0) / 1e9;
276314

277-
unlink($tmpFile);
315+
rrmdir($tmpDir);
278316

279317
$persist = [
280-
'file_size_mb' => $fileSizeMb,
281-
'persist_s' => $persistTime,
282-
'persist_mb_s' => $persistTime > 0.0 ? $fileSizeMb / $persistTime : 0.0,
283-
'load_s' => $loadTime,
284-
'load_mb_s' => $loadTime > 0.0 ? $fileSizeMb / $loadTime : 0.0,
318+
'folder_size_mb' => $folderSizeMb,
319+
'save_s' => $saveTime,
320+
'save_mb_s' => $saveTime > 0.0 ? $folderSizeMb / $saveTime : 0.0,
321+
'open_s' => $openTime,
322+
'open_mb_s' => $openTime > 0.0 ? $folderSizeMb / $openTime : 0.0,
285323
];
286324
}
287325

src/Persistence/IndexSerializer.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ public function writeHnsw(string $path, array $state): void
6666

6767
$buf = self::HNSW_MAGIC;
6868
$buf .= pack('C', self::VERSION);
69-
$buf .= pack('NNNN', $dim, $nodeCount, $ep ?? self::NULL_ENTRY_POINT, (int) $state['maxLayer']);
69+
$buf .= pack('NNNN', $dim, $nodeCount, $ep, (int) $state['maxLayer']);
7070

7171
foreach ($nodes as $nodeId => $node) {
7272
$buf .= pack('NN', $nodeId, $node['maxLayer']);

0 commit comments

Comments
 (0)