Skip to content

Commit dbf2672

Browse files
committed
feat(audit): swap premium model to gemma-4-26b-a4b + dual-shape adapter
The previous commit registered gemma-3-12b as the premium option because my stale local types didn't list gemma-4-26b-a4b-it. The model is in fact GA on Workers AI (docs at developers.cloudflare.com/workers-ai/models/gemma-4-26b-a4b-it) — the wrangler type generator just hasn't picked it up yet. Runtime calls work; the existing TypeScript cast in the consumer handles the gap. - AUDIT_MODELS swapped: gemma-3-12b → gemma-4-26b-a4b (MoE 26B/4B active, 256k context, vision + reasoning + function calling). - AuditModelKey union updated to match. - Consumer now normalises across two response shapes: * Standard Workers AI: { response: string, usage: {...} } * OpenAI chat completion (gemma-4): { choices: [{message: {content}}], usage: {...} } Fallback chain reads response first then choices[0].message.content so existing tests (which mock the standard shape) keep passing. - Request body sends both max_tokens and max_completion_tokens — Workers AI ignores the unrecognised one for each model so a single call works for both shapes. - Stored rawResponse now uses the normalised text instead of the envelope-specific field.
1 parent c1e4bf7 commit dbf2672

File tree

3 files changed

+47
-19
lines changed

3 files changed

+47
-19
lines changed

src/lib/audit/consumer.ts

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -324,21 +324,40 @@ export async function processAuditJob(
324324
const modelDef = resolveAuditModel(job.modelOverride);
325325
const modelId = modelDef.workersAiId;
326326

327-
// 7. Call Workers AI
328-
let result: { response?: string; usage?: { prompt_tokens: number; completion_tokens: number; total_tokens: number } };
327+
// 7. Call Workers AI.
328+
// Two response shapes are observed across Workers AI text-gen models:
329+
// a) Standard: { response: string, usage: {...} }
330+
// b) OpenAI-compat (e.g. gemma-4-26b-a4b-it):
331+
// { choices: [{ message: { content: string } }], usage: {...} }
332+
// Below we send both `max_tokens` and `max_completion_tokens` so each
333+
// model receives the parameter it expects (Workers AI ignores the
334+
// unrecognised one). The shape is normalised after the call by
335+
// extractAiResponseText() so the rest of the pipeline only sees a
336+
// single { response, usage } envelope regardless of the model.
337+
type AiResponseEnvelope = {
338+
response?: string;
339+
choices?: Array<{ message?: { content?: string } }>;
340+
usage?: {
341+
prompt_tokens: number;
342+
completion_tokens: number;
343+
total_tokens: number;
344+
};
345+
};
346+
let raw: AiResponseEnvelope;
329347
try {
330348
// No response_format — many Workers AI models reject json_schema with
331349
// "5025: This model doesn't support JSON Schema". The prompt mandates
332350
// JSON-only output and extractJsonFromResponse() recovers it even if
333351
// the model wraps it in code fences or adds prose.
334-
result = await (bindings.ai as Ai).run(modelId as Parameters<Ai["run"]>[0], {
352+
raw = await (bindings.ai as Ai).run(modelId as Parameters<Ai["run"]>[0], {
335353
messages: [
336354
{ role: "system", content: SYSTEM_PROMPT },
337355
{ role: "user", content: promptContent },
338356
],
339357
max_tokens: 1024,
358+
max_completion_tokens: 1024,
340359
temperature: 0.1,
341-
}) as unknown as { response?: string; usage?: { prompt_tokens: number; completion_tokens: number; total_tokens: number } };
360+
}) as unknown as AiResponseEnvelope;
342361
} catch (err) {
343362
if (isTransientAiError(err)) {
344363
throw new TransientError(
@@ -351,11 +370,19 @@ export async function processAuditJob(
351370
return { verdict: null, status: "error", neuronsUsed: 0 };
352371
}
353372

354-
// 7. Parse and validate response. The model may add prose, markdown
373+
// 7b. Normalise the response shape. Standard Workers AI models put the
374+
// text on `result.response`; OpenAI-compatible models (gemma-4-26b-a4b)
375+
// put it on `result.choices[0].message.content`. Either way the
376+
// downstream parser only sees a single string.
377+
const responseText =
378+
raw.response ??
379+
raw.choices?.[0]?.message?.content ??
380+
"";
381+
382+
// 8. Parse and validate response. The model may add prose, markdown
355383
// fences, or trailing chatter — extractJsonFromResponse handles all of
356384
// those and returns null only if no parseable JSON object exists.
357385
let parsed: { verdict: "pass" | "warn" | "fail"; riskScore: number; findings: unknown[] };
358-
const responseText = result.response ?? "";
359386

360387
if (!responseText.trim()) {
361388
throw new TransientError("AI returned empty response");
@@ -383,15 +410,16 @@ export async function processAuditJob(
383410
}
384411
parsed = extracted;
385412

386-
// 8. Calculate neurons
387-
const promptTokens = result.usage?.prompt_tokens ?? 0;
388-
const completionTokens = result.usage?.completion_tokens ?? 0;
389-
if (!result.usage) {
413+
// 9. Calculate neurons (usage shape is identical across both response
414+
// envelopes — same prompt_tokens / completion_tokens field names).
415+
const promptTokens = raw.usage?.prompt_tokens ?? 0;
416+
const completionTokens = raw.usage?.completion_tokens ?? 0;
417+
if (!raw.usage) {
390418
console.warn(`[audit] WARNING: No usage data from AI response`);
391419
}
392420
const neuronsUsed = tokensToNeurons(promptTokens, completionTokens);
393421

394-
// 9. Store audit record (atomically updates version status).
422+
// 10. Store audit record (atomically updates version status).
395423
// Merge static-scan findings with AI findings — the static signals stay
396424
// useful even when AI passes the verdict.
397425
const aiFindings = parsed.findings as MarketplaceAuditFinding[];
@@ -406,7 +434,7 @@ export async function processAuditJob(
406434
promptTokens,
407435
completionTokens,
408436
neuronsUsed,
409-
rawResponse: result.response ?? "",
437+
rawResponse: responseText,
410438
verdict: parsed.verdict,
411439
riskScore: parsed.riskScore,
412440
findings: mergedFindings,

src/lib/audit/prompt.ts

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,13 @@ export const AUDIT_MODELS: Record<AuditModelKey, AuditModelDef> = {
3838
"Default. Small (3B params) but capable enough for the lightweight audit we run. ~17 neurons/audit, ~588 audits/day on the free tier.",
3939
estimatedNeurons: "~17",
4040
},
41-
"gemma-3-12b": {
42-
key: "gemma-3-12b",
43-
workersAiId: "@cf/google/gemma-3-12b-it",
44-
label: "Gemma 3 12B",
41+
"gemma-4-26b-a4b": {
42+
key: "gemma-4-26b-a4b",
43+
workersAiId: "@cf/google/gemma-4-26b-a4b-it",
44+
label: "Gemma 4 26B-A4B",
4545
description:
46-
"Premium. 4x the parameters of llama, sharper findings and better reasoning for borderline plugins. Higher neuron cost — reserve for cases the cheap pass flagged or for spot checks.",
47-
estimatedNeurons: "~60-100",
46+
"Premium. Mixture-of-experts (26B total / ~4B active params) with 256k context, vision, function calling, and reasoning. Sharper findings on borderline plugins. Higher neuron cost — reserve for spot checks or when the cheap pass flagged.",
47+
estimatedNeurons: "~80-150",
4848
},
4949
};
5050

src/types/marketplace.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ export interface PluginBundle {
183183
* from the AUDIT_MODELS registry in src/lib/audit/prompt.ts. Keys live in
184184
* the type layer so AuditJob can carry them without importing audit code.
185185
*/
186-
export type AuditModelKey = "llama-3.2-3b" | "gemma-3-12b";
186+
export type AuditModelKey = "llama-3.2-3b" | "gemma-4-26b-a4b";
187187

188188
export interface AuditJob {
189189
pluginId: string;

0 commit comments

Comments (0)