@@ -50,6 +50,10 @@ final class IndexSerializer
5050 /**
5151 * Write the HNSW graph state to $path.
5252 *
53+ * Uses streaming fwrite() instead of buffer concatenation to avoid
54+ * O(n²) memory copies. Each pack() result is written directly to
55+ * disk, producing the same binary format with O(n) cost.
56+ *
5357 * @param array{
5458 * entryPoint: int|null,
5559 * maxLayer: int,
@@ -64,27 +68,32 @@ public function writeHnsw(string $path, array $state): void
6468 $ nodeCount = count ($ nodes );
6569 $ ep = $ state ['entryPoint ' ] ?? self ::NULL_ENTRY_POINT ;
6670
67- $ buf = self ::HNSW_MAGIC ;
68- $ buf .= pack ('C ' , self ::VERSION );
69- $ buf .= pack ('NNNN ' , $ dim , $ nodeCount , $ ep , (int ) $ state ['maxLayer ' ]);
71+ $ fh = fopen ($ path , 'wb ' );
72+ if ($ fh === false ) {
73+ throw new \RuntimeException ("Failed to open file for writing: {$ path }" );
74+ }
7075
71- foreach ($ nodes as $ nodeId => $ node ) {
72- $ buf .= pack ('NN ' , $ nodeId , $ node ['maxLayer ' ]);
73- if ($ dim > 0 ) {
74- $ buf .= pack ('d* ' , ...$ node ['vector ' ]);
75- }
76- for ($ l = 0 ; $ l <= $ node ['maxLayer ' ]; $ l ++) {
77- $ conns = $ node ['connections ' ][$ l ] ?? [];
78- $ cnt = count ($ conns );
79- $ buf .= pack ('N ' , $ cnt );
80- if ($ cnt > 0 ) {
81- $ buf .= pack ('N* ' , ...$ conns );
76+ try {
77+ $ this ->checkedWrite ($ fh , self ::HNSW_MAGIC );
78+ $ this ->checkedWrite ($ fh , pack ('C ' , self ::VERSION ));
79+ $ this ->checkedWrite ($ fh , pack ('NNNN ' , $ dim , $ nodeCount , $ ep , (int ) $ state ['maxLayer ' ]));
80+
81+ foreach ($ nodes as $ nodeId => $ node ) {
82+ $ this ->checkedWrite ($ fh , pack ('NN ' , $ nodeId , $ node ['maxLayer ' ]));
83+ if ($ dim > 0 ) {
84+ $ this ->checkedWrite ($ fh , pack ('d* ' , ...$ node ['vector ' ]));
85+ }
86+ for ($ l = 0 ; $ l <= $ node ['maxLayer ' ]; $ l ++) {
87+ $ conns = $ node ['connections ' ][$ l ] ?? [];
88+ $ cnt = count ($ conns );
89+ $ this ->checkedWrite ($ fh , pack ('N ' , $ cnt ));
90+ if ($ cnt > 0 ) {
91+ $ this ->checkedWrite ($ fh , pack ('N* ' , ...$ conns ));
92+ }
8293 }
8394 }
84- }
85-
86- if (file_put_contents ($ path , $ buf ) === false ) {
87- throw new \RuntimeException ("Failed to write hnsw.bin: {$ path }" );
95+ } finally {
96+ fclose ($ fh );
8897 }
8998 }
9099
@@ -173,6 +182,8 @@ public function readHnsw(string $path): array
173182 /**
174183 * Write the BM25 inverted index to $path.
175184 *
185+ * Streams each packed record directly to disk with fwrite() instead of building the whole file in memory first.
186+ *
176187 * @param array{
177188 * totalTokens: int,
178189 * docLengths: array<int, int>,
@@ -181,29 +192,34 @@ public function readHnsw(string $path): array
181192 */
public function writeBm25(string $path, array $state): void
{
    // File layout produced below (integers are big-endian, per the pack() codes):
    //   magic | version (uint8) | totalTokens (uint32)
    //   docCount (uint32), then docCount x [nodeId (uint32), length (uint32)]
    //   termCount (uint32), then for every term:
    //     termLength (uint16) | term bytes | postingCount (uint32)
    //     postingCount x [nodeId (uint32), tf (uint32)]
    $docLengths = $state['docLengths'];
    $invertedIndex = $state['invertedIndex'];

    // The term length field is only 16 bits wide. pack('n', ...) would silently
    // wrap a larger length while the full term bytes are still written, which
    // desynchronises the stream. Validate BEFORE fopen() so that a bad input
    // never truncates an existing index file.
    foreach ($invertedIndex as $term => $_postings) {
        $termLength = strlen((string) $term);
        if ($termLength > 0xFFFF) {
            throw new \RuntimeException(
                "Term is too long to serialize ({$termLength} bytes, maximum is 65535)"
            );
        }
    }

    // 'wb' truncates the destination; a failure after this point leaves a
    // partial file behind, so callers must treat an exception as "no valid index".
    $fh = fopen($path, 'wb');
    if ($fh === false) {
        throw new \RuntimeException("Failed to open file for writing: {$path}");
    }

    try {
        // Header.
        $this->checkedWrite($fh, self::BM25_MAGIC);
        $this->checkedWrite($fh, pack('C', self::VERSION));
        $this->checkedWrite($fh, pack('N', $state['totalTokens']));

        // Per-document lengths, prefixed by their count.
        $this->checkedWrite($fh, pack('N', count($docLengths)));
        foreach ($docLengths as $nodeId => $length) {
            $this->checkedWrite($fh, pack('NN', $nodeId, $length));
        }

        // Inverted index, prefixed by the number of terms.
        $this->checkedWrite($fh, pack('N', count($invertedIndex)));
        foreach ($invertedIndex as $term => $postings) {
            // PHP converts numeric-looking string keys to int, so cast back
            // to obtain the term's bytes.
            $termBytes = (string) $term;
            $this->checkedWrite($fh, pack('n', strlen($termBytes)) . $termBytes);
            $this->checkedWrite($fh, pack('N', count($postings)));
            foreach ($postings as $postNodeId => $tf) {
                $this->checkedWrite($fh, pack('NN', $postNodeId, $tf));
            }
        }
    } finally {
        // Always release the handle, including when a write throws.
        fclose($fh);
    }
}
209225
@@ -278,4 +294,14 @@ public function readBm25(string $path): array
278294 'invertedIndex ' => $ invertedIndex ,
279295 ];
280296 }
297+
/**
 * Write all of $data to the stream, or throw.
 *
 * fwrite() reports the number of bytes it actually wrote, which can be
 * smaller than the length of $data without the call returning false.
 * A bare "=== false" check would let such a short write pass and leave a
 * truncated file, so this keeps writing the remainder until every byte
 * is out and fails as soon as a call makes no progress.
 *
 * @param resource $fh   Open, writable stream handle.
 * @param string   $data Raw bytes to append; an empty string is a no-op.
 *
 * @throws \RuntimeException When fwrite() fails or writes zero bytes.
 */
private function checkedWrite($fh, string $data): void
{
    $total = strlen($data);
    $offset = 0;

    while ($offset < $total) {
        // Avoid a substr() copy on the common path where the first call writes everything.
        $chunk = $offset === 0 ? $data : substr($data, $offset);

        // Warnings are suppressed because failure is reported via the exception below.
        $written = @fwrite($fh, $chunk);

        // false = hard error; 0 = no progress (also how older PHP versions signal failure).
        if ($written === false || $written === 0) {
            throw new \RuntimeException(
                "Failed to write data ({$offset} of {$total} bytes written)"
            );
        }

        $offset += $written;
    }
}
281307}
0 commit comments