Skip to content

Commit 538e224

Browse files
authored
Merge pull request #6 from danielebarbaro/refactor/save
Enhancing Performance through Adaptive efConstruction and Streaming Serialization
2 parents 5ae150a + 8eb624f commit 538e224

File tree

3 files changed

+84
-42
lines changed

3 files changed

+84
-42
lines changed

src/HNSW/Config.php

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,14 @@ public function __construct(
8282
if ($M < 2) {
8383
throw new \InvalidArgumentException('M must be at least 2.');
8484
}
85-
if ($efConstruction < $M) {
86-
throw new \InvalidArgumentException('efConstruction must be ≥ M.');
85+
$resolvedM0 = $M0 ?? ($M * 2);
86+
87+
if ($efConstruction < $resolvedM0) {
88+
throw new \InvalidArgumentException("efConstruction must be ≥ M0 ({$resolvedM0}).");
8789
}
8890

8991
$this->M = $M;
90-
$this->M0 = $M0 ?? ($M * 2);
92+
$this->M0 = $resolvedM0;
9193
$this->mL = $mL ?? (1.0 / log($M));
9294
$this->efConstruction = $efConstruction;
9395
$this->efSearch = $efSearch;

src/HNSW/Index.php

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,11 +198,25 @@ public function insert(Document $document): void
198198
}
199199

200200
// Phase 2: from min(L, l) down to layer 0 — build connections.
201+
//
202+
// Adaptive efConstruction: scale with graph size so early inserts
203+
// (where the graph is small and fewer candidates exist) use a lower
204+
// beam width, ramping up to the configured maximum as the index grows.
205+
// This avoids wasting cycles searching a sparse graph while preserving
206+
// full recall quality at scale.
201207
for ($lc = min($this->maxLayer, $maxLayer); $lc >= 0; $lc--) {
202208
$mMax = $lc === 0 ? $this->config->M0 : $this->config->M;
203209

204-
// Find ef_construction nearest neighbours at this layer.
205-
$W = $this->searchLayer($dv, [[$epDist, $ep]], $this->config->efConstruction, $lc);
210+
// Compute adaptive efConstruction per-layer, ensuring it is at least
211+
// the maximum desired degree for this layer (mMax). This guarantees
212+
// that searchLayer() can return enough candidates to fill M0 on the
213+
// base layer and M on upper layers, even during early inserts.
214+
$efC = max(
215+
$mMax,
216+
min($this->config->efConstruction, (int) ($nodeId / 10))
217+
);
218+
// Find the $efC nearest neighbours at this layer (adaptive efConstruction).
219+
$W = $this->searchLayer($dv, [[$epDist, $ep]], $efC, $lc);
206220

207221
// Select the best M neighbours using simple or heuristic strategy.
208222
$neighbours = $this->config->useHeuristic

src/Persistence/IndexSerializer.php

Lines changed: 63 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ final class IndexSerializer
5050
/**
5151
* Write the HNSW graph state to $path.
5252
*
53+
* Uses streaming fwrite() instead of concatenating one large buffer, so
54+
* the serialized index is never held in memory in full. Each pack() result
55+
* is written straight to the file handle, producing the same binary format.
56+
*
5357
* @param array{
5458
* entryPoint: int|null,
5559
* maxLayer: int,
@@ -64,27 +68,32 @@ public function writeHnsw(string $path, array $state): void
6468
$nodeCount = count($nodes);
6569
$ep = $state['entryPoint'] ?? self::NULL_ENTRY_POINT;
6670

67-
$buf = self::HNSW_MAGIC;
68-
$buf .= pack('C', self::VERSION);
69-
$buf .= pack('NNNN', $dim, $nodeCount, $ep, (int) $state['maxLayer']);
71+
$fh = fopen($path, 'wb');
72+
if ($fh === false) {
73+
throw new \RuntimeException("Failed to open file for writing: {$path}");
74+
}
7075

71-
foreach ($nodes as $nodeId => $node) {
72-
$buf .= pack('NN', $nodeId, $node['maxLayer']);
73-
if ($dim > 0) {
74-
$buf .= pack('d*', ...$node['vector']);
75-
}
76-
for ($l = 0; $l <= $node['maxLayer']; $l++) {
77-
$conns = $node['connections'][$l] ?? [];
78-
$cnt = count($conns);
79-
$buf .= pack('N', $cnt);
80-
if ($cnt > 0) {
81-
$buf .= pack('N*', ...$conns);
76+
try {
77+
$this->checkedWrite($fh, self::HNSW_MAGIC);
78+
$this->checkedWrite($fh, pack('C', self::VERSION));
79+
$this->checkedWrite($fh, pack('NNNN', $dim, $nodeCount, $ep, (int) $state['maxLayer']));
80+
81+
foreach ($nodes as $nodeId => $node) {
82+
$this->checkedWrite($fh, pack('NN', $nodeId, $node['maxLayer']));
83+
if ($dim > 0) {
84+
$this->checkedWrite($fh, pack('d*', ...$node['vector']));
85+
}
86+
for ($l = 0; $l <= $node['maxLayer']; $l++) {
87+
$conns = $node['connections'][$l] ?? [];
88+
$cnt = count($conns);
89+
$this->checkedWrite($fh, pack('N', $cnt));
90+
if ($cnt > 0) {
91+
$this->checkedWrite($fh, pack('N*', ...$conns));
92+
}
8293
}
8394
}
84-
}
85-
86-
if (file_put_contents($path, $buf) === false) {
87-
throw new \RuntimeException("Failed to write hnsw.bin: {$path}");
95+
} finally {
96+
fclose($fh);
8897
}
8998
}
9099

@@ -173,6 +182,8 @@ public function readHnsw(string $path): array
173182
/**
174183
* Write the BM25 inverted index to $path.
175184
*
185+
* Streams each packed field to the file with fwrite() instead of building the whole payload in memory first.
186+
*
176187
* @param array{
177188
* totalTokens: int,
178189
* docLengths: array<int, int>,
@@ -181,29 +192,34 @@ public function readHnsw(string $path): array
181192
*/
182193
public function writeBm25(string $path, array $state): void
183194
{
184-
$buf = self::BM25_MAGIC;
185-
$buf .= pack('C', self::VERSION);
186-
$buf .= pack('N', $state['totalTokens']);
187-
188-
$docLengths = $state['docLengths'];
189-
$buf .= pack('N', count($docLengths));
190-
foreach ($docLengths as $nodeId => $length) {
191-
$buf .= pack('NN', $nodeId, $length);
195+
$fh = fopen($path, 'wb');
196+
if ($fh === false) {
197+
throw new \RuntimeException("Failed to open file for writing: {$path}");
192198
}
193199

194-
$invertedIndex = $state['invertedIndex'];
195-
$buf .= pack('N', count($invertedIndex));
196-
foreach ($invertedIndex as $term => $postings) {
197-
$termBytes = (string) $term;
198-
$buf .= pack('n', strlen($termBytes)) . $termBytes;
199-
$buf .= pack('N', count($postings));
200-
foreach ($postings as $postNodeId => $tf) {
201-
$buf .= pack('NN', $postNodeId, $tf);
200+
try {
201+
$this->checkedWrite($fh, self::BM25_MAGIC);
202+
$this->checkedWrite($fh, pack('C', self::VERSION));
203+
$this->checkedWrite($fh, pack('N', $state['totalTokens']));
204+
205+
$docLengths = $state['docLengths'];
206+
$this->checkedWrite($fh, pack('N', count($docLengths)));
207+
foreach ($docLengths as $nodeId => $length) {
208+
$this->checkedWrite($fh, pack('NN', $nodeId, $length));
202209
}
203-
}
204210

205-
if (file_put_contents($path, $buf) === false) {
206-
throw new \RuntimeException("Failed to write bm25.bin: {$path}");
211+
$invertedIndex = $state['invertedIndex'];
212+
$this->checkedWrite($fh, pack('N', count($invertedIndex)));
213+
foreach ($invertedIndex as $term => $postings) {
214+
$termBytes = (string) $term;
215+
$this->checkedWrite($fh, pack('n', strlen($termBytes)) . $termBytes);
216+
$this->checkedWrite($fh, pack('N', count($postings)));
217+
foreach ($postings as $postNodeId => $tf) {
218+
$this->checkedWrite($fh, pack('NN', $postNodeId, $tf));
219+
}
220+
}
221+
} finally {
222+
fclose($fh);
207223
}
208224
}
209225

@@ -278,4 +294,14 @@ public function readBm25(string $path): array
278294
'invertedIndex' => $invertedIndex,
279295
];
280296
}
297+
298+
/**
 * Write $data to $fh in full, or throw.
 *
 * fwrite() may accept fewer bytes than requested without reporting an
 * error, so the write is repeated on the unwritten remainder until the
 * whole string has been written.
 *
 * @param resource $fh   Writable stream handle.
 * @param string   $data Bytes to write; an empty string is a no-op.
 *
 * @throws \RuntimeException If fwrite() fails or makes no progress.
 */
private function checkedWrite($fh, string $data): void
{
    $remaining = strlen($data);

    while ($remaining > 0) {
        // The @ keeps fwrite()'s own diagnostic from being emitted; failure
        // is reported through the exception below instead.
        $written = @fwrite($fh, $data);

        // false is an outright failure; 0 means no progress was made and
        // retrying would loop forever.
        if ($written === false || $written === 0) {
            throw new \RuntimeException("Failed to write data");
        }

        $remaining -= $written;
        if ($remaining > 0) {
            // Short write: keep only the bytes that are still pending.
            $data = substr($data, $written);
        }
    }
}
281307
}

0 commit comments

Comments
 (0)