Skip to content

Commit 3eab525

Browse files
felixtrz authored and meta-codesync[bot] committed
fix(reference): replace non-deterministic archive verification with per-file hashes
Summary: The reference warmup model verification was re-compressing extracted model files into a .tgz and comparing the archive's SHA-256 and size against values baked into embeddings.json. This is fundamentally non-deterministic because gzip output varies across Node versions, tar package versions, OS zlib implementations, and CPU architectures. Users on different machines than the build server would get checksum mismatches and be blocked from using reference tools.

Replaced the archive-based verification with per-file SHA-256 hashing:
- Added optional `fileHashes: Record<string, string>` to the model metadata type, mapping each model file's relative path to its SHA-256.
- Removed `createDeterministicModelArchive` and `readInstalledModelMetadata` from assets.ts (the runtime verification path).
- Added `validateInstalledModelFiles` that checks each file individually.
- Updated `getReferenceCacheStatus` and `installPinnedModelFiles` to use per-file verification instead of re-archiving.
- Updated the build-time tools (model.ts, pinned-model.mjs, build-model.mjs) to compute and include fileHashes in the model metadata.
- Old corpus formats without fileHashes gracefully degrade to file-existence checks only, which is strictly better than the broken archive comparison.

Reviewed By: cabanier
Differential Revision: D105088189
fbshipit-source-id: 83a245f3938108ec56f6083884e54c359c3312a5
1 parent 8f3817b commit 3eab525

9 files changed

Lines changed: 645 additions & 132 deletions

File tree

packages/reference-assets/pinned-model.mjs

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,11 +54,13 @@ export const REQUIRED_MODEL_FILES = Object.freeze(
5454
export function buildReferenceEmbeddingModelMetadata(
5555
archiveSha256,
5656
archiveSize,
57+
fileHashes,
5758
) {
5859
return {
5960
...DEFAULT_REFERENCE_MODEL_SETTINGS,
6061
archiveSha256,
6162
archiveSize,
63+
...(fileHashes ? { fileHashes } : {}),
6264
};
6365
}
6466

@@ -86,6 +88,16 @@ export async function sha256File(filePath) {
8688
});
8789
}
8890

91+
export async function computeModelFileHashes(modelDir) {
92+
const hashes = {};
93+
for (const file of REFERENCE_MODEL_FILE_SOURCES) {
94+
hashes[file.relativePath] = await sha256File(
95+
path.join(modelDir, file.relativePath),
96+
);
97+
}
98+
return hashes;
99+
}
100+
89101
async function writeResponseToFile(response, destination, sourceUrl) {
90102
if (!response.body) {
91103
throw new Error(`No response body received from ${sourceUrl}`);

packages/reference-assets/scripts/build-model.mjs

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ import fsp from 'node:fs/promises';
1010
import path from 'node:path';
1111
import { fileURLToPath } from 'node:url';
1212
import {
13+
computeModelFileHashes,
1314
createDeterministicModelArchive,
1415
downloadPinnedModelFile,
1516
REFERENCE_MODEL_FILE_SOURCES,
@@ -70,6 +71,7 @@ async function main() {
7071

7172
const archiveStat = await fsp.stat(archivePath);
7273
const archiveSha256 = await sha256File(archivePath);
74+
const fileHashes = await computeModelFileHashes(sourceDir);
7375
const manifest = {
7476
schemaVersion: 1,
7577
referenceVersion: packageJson.version,
@@ -80,6 +82,7 @@ async function main() {
8082
size: archiveStat.size,
8183
format: 'transformers-js',
8284
requiredFiles: REQUIRED_MODEL_FILES,
85+
fileHashes,
8386
},
8487
};
8588

packages/reference-assets/test/model.test.ts

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,10 +21,15 @@ import {
2121
let tempDir: string;
2222
let sharedRoot: string;
2323

24-
async function createModelArchive(): Promise<{ sha256: string; size: number }> {
24+
async function createModelArchive(): Promise<{
25+
sha256: string;
26+
size: number;
27+
fileHashes: Record<string, string>;
28+
}> {
2529
const sourceRoot = path.join(tempDir, 'model-source');
2630
const archivePath = path.join(tempDir, 'model.tgz');
2731
const archiveRoot = path.join(sourceRoot, 'model');
32+
const fileHashes: Record<string, string> = {};
2833

2934
await rm(sourceRoot, { recursive: true, force: true });
3035
await mkdir(archiveRoot, { recursive: true });
@@ -40,6 +45,7 @@ async function createModelArchive(): Promise<{ sha256: string; size: number }> {
4045
const destination = path.join(archiveRoot, relativePath);
4146
await mkdir(path.dirname(destination), { recursive: true });
4247
await writeFile(destination, body);
48+
fileHashes[relativePath] = createHash('sha256').update(body).digest('hex');
4349
}
4450

4551
await tar.c(
@@ -58,6 +64,7 @@ async function createModelArchive(): Promise<{ sha256: string; size: number }> {
5864
return {
5965
sha256: createHash('sha256').update(buffer).digest('hex'),
6066
size: buffer.length,
67+
fileHashes,
6168
};
6269
}
6370
const ONNX_BUFFER = Buffer.from('fake-onnx');
@@ -124,6 +131,7 @@ describe('reference model installer', () => {
124131
pooling: 'mean',
125132
normalize: true,
126133
});
134+
expect(installed.metadata.fileHashes).toEqual(expectedArchive.fileHashes);
127135
expect(formatReferenceEmbeddingModel(installed.metadata)).toBe(
128136
`sha256:${expectedArchive.sha256}`,
129137
);

packages/reference-assets/tools/model.ts

Lines changed: 45 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ export interface ReferenceEmbeddingModelMetadata {
3535
format: 'transformers-js';
3636
archiveSha256: string;
3737
archiveSize: number;
38+
fileHashes?: Record<string, string>;
3839
dtype: ReferenceEmbeddingModelDType;
3940
pooling: 'mean';
4041
normalize: true;
@@ -88,11 +89,13 @@ const REQUIRED_MODEL_FILES = Object.freeze(
8889
function buildReferenceEmbeddingModelMetadata(
8990
archiveSha256: string,
9091
archiveSize: number,
92+
fileHashes?: Record<string, string>,
9193
): ReferenceEmbeddingModelMetadata {
9294
return {
9395
...DEFAULT_REFERENCE_MODEL_SETTINGS,
9496
archiveSha256,
9597
archiveSize,
98+
...(fileHashes ? { fileHashes } : {}),
9699
};
97100
}
98101

@@ -246,27 +249,44 @@ async function createDeterministicModelArchive(
246249
);
247250
}
248251

249-
async function readInstalledModelMetadata(
252+
async function computeModelFileHashes(
250253
modelDir: string,
251-
stagingRoot = getReferenceModelStagingRoot(),
252-
): Promise<ReferenceEmbeddingModelMetadata> {
253-
const archivePath = path.join(
254-
stagingRoot,
255-
'verification',
256-
`model-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}.tgz`,
257-
);
258-
await mkdir(path.dirname(archivePath), { recursive: true });
259-
260-
try {
261-
await createDeterministicModelArchive(modelDir, archivePath);
262-
const archiveStat = await stat(archivePath);
263-
return buildReferenceEmbeddingModelMetadata(
264-
await sha256File(archivePath),
265-
archiveStat.size,
254+
): Promise<Record<string, string>> {
255+
const hashes: Record<string, string> = {};
256+
for (const file of REFERENCE_MODEL_FILE_SOURCES) {
257+
hashes[file.relativePath] = await sha256File(
258+
path.join(modelDir, file.relativePath),
266259
);
267-
} finally {
268-
await rm(archivePath, { force: true }).catch(() => {});
269260
}
261+
return hashes;
262+
}
263+
264+
async function validateInstalledModelFiles(
265+
modelDir: string,
266+
expectedFileHashes: Record<string, string> | undefined,
267+
): Promise<boolean> {
268+
if (!hasReferenceEmbeddingModelFiles(modelDir)) {
269+
return false;
270+
}
271+
272+
if (!expectedFileHashes) {
273+
return true;
274+
}
275+
276+
for (const [relativePath, expectedSha] of Object.entries(
277+
expectedFileHashes,
278+
)) {
279+
try {
280+
const actualSha = await sha256File(path.join(modelDir, relativePath));
281+
if (actualSha !== expectedSha) {
282+
return false;
283+
}
284+
} catch {
285+
return false;
286+
}
287+
}
288+
289+
return true;
270290
}
271291

272292
export interface InstalledReferenceEmbeddingModel {
@@ -323,16 +343,10 @@ async function installReferenceModelFiles(
323343
metadata.archiveSha256,
324344
);
325345
const finalDir = path.join(finalRoot, 'model');
326-
if (hasReferenceEmbeddingModelFiles(finalDir)) {
327-
const installedMetadata = await readInstalledModelMetadata(finalDir);
328-
if (
329-
installedMetadata.archiveSha256 === metadata.archiveSha256 &&
330-
installedMetadata.archiveSize === metadata.archiveSize
331-
) {
332-
return finalDir;
333-
}
334-
await rm(finalRoot, { recursive: true, force: true });
346+
if (await validateInstalledModelFiles(finalDir, metadata.fileHashes)) {
347+
return finalDir;
335348
}
349+
await rm(finalRoot, { recursive: true, force: true });
336350

337351
if (!hasReferenceEmbeddingModelFiles(extractedDir)) {
338352
throw new Error(
@@ -349,13 +363,8 @@ async function installReferenceModelFiles(
349363
await rename(tempFinalRoot, finalRoot);
350364
} catch (error) {
351365
await rm(tempFinalRoot, { recursive: true, force: true });
352-
if (!hasReferenceEmbeddingModelFiles(finalDir)) {
353-
throw error;
354-
}
355-
const installedMetadata = await readInstalledModelMetadata(finalDir);
356366
if (
357-
installedMetadata.archiveSha256 !== metadata.archiveSha256 ||
358-
installedMetadata.archiveSize !== metadata.archiveSize
367+
!(await validateInstalledModelFiles(finalDir, metadata.fileHashes))
359368
) {
360369
throw error;
361370
}
@@ -382,11 +391,14 @@ export async function installReferenceEmbeddingModel(): Promise<InstalledReferen
382391
await downloadPinnedModelFile(file.sourceUrl, destination);
383392
}
384393

394+
const fileHashes = await computeModelFileHashes(extractedDir);
395+
385396
await createDeterministicModelArchive(extractedDir, archivePath);
386397
const archiveStat = await stat(archivePath);
387398
const metadata = buildReferenceEmbeddingModelMetadata(
388399
await sha256File(archivePath),
389400
archiveStat.size,
401+
fileHashes,
390402
);
391403
const modelDir = await installReferenceModelFiles(
392404
metadata,

packages/reference/src/assets.ts

Lines changed: 40 additions & 75 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ import os from 'os';
1818
import path from 'path';
1919
import * as tar from 'tar';
2020
import {
21-
buildReferenceEmbeddingModelMetadata,
2221
hasReferenceEmbeddingModelFiles,
2322
REFERENCE_MODEL_FILE_SOURCES,
2423
REFERENCE_MODEL_ONNX_URL,
@@ -459,43 +458,30 @@ async function validateModelDir(modelDir: string): Promise<boolean> {
459458
return hasReferenceEmbeddingModelFiles(modelDir);
460459
}
461460

462-
async function createDeterministicModelArchive(
463-
sourceDir: string,
464-
archivePath: string,
465-
): Promise<void> {
466-
await tar.c(
467-
{
468-
cwd: path.dirname(sourceDir),
469-
file: archivePath,
470-
gzip: true,
471-
portable: true,
472-
noPax: true,
473-
mtime: new Date(0),
474-
},
475-
[path.basename(sourceDir)],
476-
);
477-
}
478-
479-
async function readInstalledModelMetadata(
461+
async function validateInstalledModelFiles(
480462
modelDir: string,
481-
): Promise<ReferenceEmbeddingModelMetadata> {
482-
const archivePath = path.join(
483-
getStagingRoot(),
484-
'verification',
485-
`model-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}.tgz`,
486-
);
487-
await mkdir(path.dirname(archivePath), { recursive: true });
463+
expectedFileHashes: Record<string, string> | undefined,
464+
): Promise<boolean> {
465+
if (!(await validateModelDir(modelDir))) {
466+
return false;
467+
}
488468

489-
try {
490-
await createDeterministicModelArchive(modelDir, archivePath);
491-
const archiveStat = await stat(archivePath);
492-
return buildReferenceEmbeddingModelMetadata(
493-
await sha256File(archivePath),
494-
archiveStat.size,
495-
);
496-
} finally {
497-
await rm(archivePath, { force: true }).catch(() => {});
469+
if (!expectedFileHashes) {
470+
return true;
498471
}
472+
473+
for (const [relativePath, expectedSha] of Object.entries(expectedFileHashes)) {
474+
try {
475+
const actualSha = await sha256File(path.join(modelDir, relativePath));
476+
if (actualSha !== expectedSha) {
477+
return false;
478+
}
479+
} catch {
480+
return false;
481+
}
482+
}
483+
484+
return true;
499485
}
500486

501487
async function readCurrentModelMetadata(
@@ -622,23 +608,13 @@ export async function getReferenceCacheStatus(): Promise<ReferenceCacheStatus> {
622608
);
623609
}
624610

625-
if (!(await validateModelDir(state.modelDir))) {
626-
return buildFailureStatus(
627-
packageVersion,
628-
state,
629-
'Reference model cache is incomplete or corrupted. Run "iwsdk reference warmup" again.',
630-
);
631-
}
632-
633-
const installedModel = await readInstalledModelMetadata(state.modelDir);
634611
if (
635-
installedModel.archiveSha256 !== model.archiveSha256 ||
636-
installedModel.archiveSize !== model.archiveSize
612+
!(await validateInstalledModelFiles(state.modelDir, model.fileHashes))
637613
) {
638614
return buildFailureStatus(
639615
packageVersion,
640616
state,
641-
`Reference model cache metadata ${installedModel.archiveSha256}/${installedModel.archiveSize} does not match the warmed corpus metadata ${model.archiveSha256}/${model.archiveSize}. Run "iwsdk reference warmup" again to refresh the pinned model files.`,
617+
'Reference model cache is incomplete or corrupted. Run "iwsdk reference warmup" again.',
642618
);
643619
}
644620

@@ -1087,19 +1063,12 @@ async function installPinnedModelFiles(
10871063
metadata.archiveSha256,
10881064
);
10891065
const finalDir = path.join(finalRoot, 'model');
1090-
if (await validateModelDir(finalDir)) {
1091-
const installedModel = await readInstalledModelMetadata(finalDir);
1092-
if (
1093-
installedModel.archiveSha256 === metadata.archiveSha256 &&
1094-
installedModel.archiveSize === metadata.archiveSize
1095-
) {
1096-
return finalDir;
1097-
}
1098-
await rm(finalRoot, { recursive: true, force: true });
1066+
if (await validateInstalledModelFiles(finalDir, metadata.fileHashes)) {
1067+
return finalDir;
10991068
}
1069+
await rm(finalRoot, { recursive: true, force: true });
11001070

11011071
const extractedDir = path.join(stagingRoot, 'model-extract', 'model');
1102-
const archivePath = path.join(stagingRoot, 'model.tgz');
11031072
await rm(path.dirname(extractedDir), { recursive: true, force: true });
11041073
await mkdir(path.join(extractedDir, 'onnx'), { recursive: true });
11051074

@@ -1115,18 +1084,19 @@ async function installPinnedModelFiles(
11151084
);
11161085
}
11171086

1118-
await createDeterministicModelArchive(extractedDir, archivePath);
1119-
const archiveStat = await stat(archivePath);
1120-
const archiveSha256 = await sha256File(archivePath);
1121-
if (archiveStat.size !== metadata.archiveSize) {
1122-
throw new Error(
1123-
`Pinned reference model archive size ${archiveStat.size} does not match the warmed corpus metadata ${metadata.archiveSize}. Run "iwsdk reference warmup" again to refresh the pinned model files.`,
1124-
);
1125-
}
1126-
if (archiveSha256 !== metadata.archiveSha256) {
1127-
throw new Error(
1128-
`Pinned reference model archive sha ${archiveSha256} does not match the warmed corpus metadata ${metadata.archiveSha256}. Run "iwsdk reference warmup" again to refresh the pinned model files.`,
1129-
);
1087+
if (metadata.fileHashes) {
1088+
for (const [relativePath, expectedSha] of Object.entries(
1089+
metadata.fileHashes,
1090+
)) {
1091+
const actualSha = await sha256File(
1092+
path.join(extractedDir, relativePath),
1093+
);
1094+
if (actualSha !== expectedSha) {
1095+
throw new Error(
1096+
`Pinned reference model file ${relativePath} has unexpected content after download. Run "iwsdk reference warmup" again.`,
1097+
);
1098+
}
1099+
}
11301100
}
11311101

11321102
await mkdir(path.dirname(finalRoot), { recursive: true });
@@ -1138,13 +1108,8 @@ async function installPinnedModelFiles(
11381108
await rename(tempFinalRoot, finalRoot);
11391109
} catch (error) {
11401110
await rm(tempFinalRoot, { recursive: true, force: true });
1141-
if (!(await validateModelDir(finalDir))) {
1142-
throw error;
1143-
}
1144-
const installedModel = await readInstalledModelMetadata(finalDir);
11451111
if (
1146-
installedModel.archiveSha256 !== metadata.archiveSha256 ||
1147-
installedModel.archiveSize !== metadata.archiveSize
1112+
!(await validateInstalledModelFiles(finalDir, metadata.fileHashes))
11481113
) {
11491114
throw error;
11501115
}

0 commit comments

Comments
 (0)