exercism · iHiD · Sep 10, 2025 · Sep 4, 2025 · Sep 4, 2025 · Sep 4, 2025
@@ -2,39 +2,63 @@ export function buildPrompt(batchContent: string): string {
   return `
 You are an assistant that extracts i18n metadata from TSX React component files.
 
-Input format:
-- Each file appears twice in the batch:
-  - OLD: before i18n extraction, with literal user-facing text (or "[not found]").
-  - NEW: after i18n extraction, with calls like t("...") or t('...').
+Scope:
+- ONLY process files/sections that contain a \`t("...")\` or \`t('...')\` call in the NEW code.
+- If a batch has no \`t(...)\` calls in any NEW section, respond with an empty JSON object: {}.
+- Ignore <Trans i18nKey="..."> entirely for this task.
 
-Task:
-- For every t("...") / t('...') key found in a NEW section, output exactly one object with:
-  - "key": the exact key string as written in the code (copy verbatim).
-  - "desc": a concise description (1-3 sentences) of what the string represents in the UI, grounded by the OLD text if available.
+Goal:
+- Output a single JSON OBJECT whose properties follow this exact format:
+  "<EXACT i18n key from the code>": "<multi-line description>"
 
-Style rules for "desc":
-- Each sentence must begin with "This is ...".
-- Prefer precise UI nouns: "heading", "button label", "menu item", "tooltip", "helper text".
-- Avoid filler like "the text for a button"; be specific and succinct.
+Key rules (CRITICAL):
+- Use the EXACT key string as written inside \`t('...')\` or \`t("...")\`.
+- Do NOT transform or infer namespaces.
+- Do NOT add, remove, or modify leading dots, prefixes, or suffixes (e.g., keep ".heading", keep "_html").
+- Do NOT deduce or prepend any namespace — the property name must match the code verbatim.
+
+What to extract:
+- Scan only the NEW sections to find all \`t("...")\` / \`t('...')\` usages.
+- For every discovered key, create exactly one entry in the output JSON:
+  - Property name: the exact key string from the code.
+  - Property value: a single multi-line string with EXACTLY these fields, in this order,
+    each on its own line starting with a bold label:
+    **Functional Purpose**: <short, specific purpose in the UI>
+    **UI Location**: <precise place in the UI hierarchy (e.g., "Settings → General → Header")>
+    **When Users See This**: <concise trigger/context>
+    **Technical Context**: <only relevant technical notes; list variables exactly and state they must remain unchanged>
+    **Current English**: "<English text from OLD if available; else empty quotes>"
+
+Grounding & variables:
+- Use OLD text and nearby JSX to keep descriptions specific.
+- If placeholders/variables appear (e.g., \`%{name}\`, \`{{count}}\`, \`{value}\`), list them under **Technical Context** EXACTLY as written and say "must remain unchanged".
+- Be brief; do not over-explain obvious UI strings.
+- Do not invent content not supported by OLD/NEW.
+
+Deduplication:
+- If the same exact key appears multiple times, include it once; the last occurrence wins.
 
 Output rules:
-- Output MUST be a single JSON array of objects. Do not return NDJSON, prose, or code fences.
-- Preserve suffixes like "_html" in keys.
-- Include ONLY keys that appear in NEW sections.
-- Do not duplicate keys.
+- Output MUST be a single JSON object (not an array). No prose, comments, or code fences.
+- Include ONLY keys found in NEW sections via \`t(...)\`.
+- If no \`t(...)\` keys are found, output \`{}\`.
 
-Example:
+Example (conceptual):
 OLD:
   <h1>General settings</h1>
 NEW:
   <h1>{t('.heading')}</h1>
+
 Output:
-  [
-    {"key":".heading","desc":"This is the main heading for the general settings page."}
-  ]
+{
+  ".heading": "**Functional Purpose**: Page heading for General settings\\n**UI Location**: Settings → General (page header)\\n**When Users See This**: On opening the General settings page\\n**Technical Context**: Standard text; no special formatting\\n**Current English**: \\"General settings\\""
+}
+
+Respond with a single JSON object only. Do not include code fences, comments, or extra text.
 
 File batch content:
 ---
 ${batchContent}
----`.trim()
+---
+`.trim()
 }
@@ -5,49 +5,85 @@ import { promisify } from 'node:util'
 import { buildPrompt } from './buildPrompt'
 import { runLLM } from '../extract-jsx-copy/runLLM'
 import { createBatches } from './createBatches'
+import { parseLLMOutput } from './parseLLMOutput'
 
 export const execFileAsync = promisify(execFile)
 
 const OUTPUT_DIR = process.env.OUTPUT_DIR || './i18n-descriptions'
+const DEBUG_DIR = process.env.DEBUG_DIR || './i18n-debug'
 
-const parseLLMOutput = (output: string) => {
-  if (output.trim().startsWith('[')) {
-    return JSON.parse(output)
-  } else {
-    return output
-      .split('\n')
-      .map((l) => l.trim())
-      .filter(Boolean)
-      .map((l) => JSON.parse(l))
-  }
-}
+const DEFAULT_COMMIT_SHA = 'ccaebe4d435f235be6e624b72e9a4e1c841c7520'
 
+/**
+ * Writes one batch's parsed result as pretty-printed JSON into OUTPUT_DIR.
+ *
+ * OUTPUT_DIR is created first if it does not exist. The file is named
+ * `batch-NNN.json`, where NNN is the 1-based batch number zero-padded to
+ * three digits.
+ *
+ * @param batchIndex - Zero-based index of the batch.
+ * @param data - Value serialised with `JSON.stringify` using a 2-space indent.
+ * @returns The path of the file that was written.
+ */
-async function writeBatchJson(batchIndex: number, data: string) {
+async function writeBatchJson(batchIndex: number, data: unknown) {
   await fs.mkdir(OUTPUT_DIR, { recursive: true })
   const fileName = `batch-${String(batchIndex + 1).padStart(3, '0')}.json`
   const outPath = path.join(OUTPUT_DIR, fileName)
   await fs.writeFile(outPath, JSON.stringify(data, null, 2), 'utf8')
   return outPath
 }
 
+/**
+ * Saves one debugging artifact for a batch under DEBUG_DIR.
+ *
+ * DEBUG_DIR is created if it is missing. The file is named
+ * `batch-NNN.<kind>`, where NNN is the 1-based batch number zero-padded to
+ * three digits.
+ *
+ * @param batchIndex - Zero-based index of the batch.
+ * @param kind - File-name suffix placed after the dot (e.g. `prompt.txt`).
+ * @param content - Text written to the file as UTF-8.
+ * @returns The path of the file that was written.
+ */
+async function writeDebugFile(
+  batchIndex: number,
+  kind: string,
+  content: string
+) {
+  const batchLabel = String(batchIndex + 1).padStart(3, '0')
+  const outPath = path.join(DEBUG_DIR, `batch-${batchLabel}.${kind}`)
+
+  await fs.mkdir(DEBUG_DIR, { recursive: true })
+  await fs.writeFile(outPath, content, 'utf8')
+  return outPath
+}
+
 ;(async () => {
   const inputDir = process.argv[2] || './input'
+
+  const startFromRaw = process.argv[3]
+  const startFrom =
+    startFromRaw && /^\d+$/.test(startFromRaw) ? Number(startFromRaw) : 1
+
   const commitSha =
-    process.argv[3] || 'ccaebe4d435f235be6e624b72e9a4e1c841c7520'
+    process.argv[4] || process.env.COMMIT_SHA || DEFAULT_COMMIT_SHA
+
   const batches = await createBatches(inputDir, commitSha)
 
-  for (let i = 0; i < batches.length; i++) {
+  const startIndex = Math.min(batches.length, Math.max(1, startFrom)) - 1
+
+  console.log(
+    `Total batches: ${batches.length}. Starting from batch ${startFrom} (index ${startIndex}).`
+  )
+
+  for (let i = startIndex; i < batches.length; i++) {
     console.log('started batch', i + 1, 'of', batches.length)
 
     const batch = batches[i]
+
+    await writeDebugFile(i, 'batch.txt', batch.content ?? '(no batch content)')
+
     const prompt = buildPrompt(batch.content)
+
+    if (prompt.includes('${batchContent}')) {
+      throw new Error(
+        'Prompt still contains a literal ${batchContent}. Check buildPrompt interpolation.'
+      )
+    }
+
+    await writeDebugFile(i, 'prompt.txt', prompt)
+
     const llmOutput = await runLLM(prompt)
+    await writeDebugFile(i, 'output.raw.txt', llmOutput ?? '(undefined)')
 
     const parsedOutput = llmOutput ? parseLLMOutput(llmOutput) : null
 
     if (parsedOutput) {
-      const outPath = await writeBatchJson(i, parsedOutput)
-      console.log(`Wrote ${parsedOutput.length} entries → ${outPath}`)
+      const outPath = await writeBatchJson(i, parsedOutput as any)
+
+      const count = Array.isArray(parsedOutput)
+        ? parsedOutput.length
+        : Object.keys(parsedOutput as Record<string, unknown>).length
+
+      console.log(
+        `Wrote ${count} entr${count === 1 ? 'y' : 'ies'} → ${outPath}`
+      )
     } else {
       console.log(`No results from batch ${i + 1}`)
     }

@@ -0,0 +1,75 @@
+import { jsonrepair } from 'jsonrepair'
+
+type LLMJson = Record<string, string> | unknown[] // object (your new format) or array (old)
+const CODE_FENCE_RE = /^```(?:json)?\s*([\s\S]*?)\s*```$/i
+
+/**
+ * Parses raw LLM text into JSON, tolerating common formatting mistakes.
+ *
+ * Strategies are tried in order and the first success wins:
+ *  1. strict `JSON.parse` of the text, after removing one wrapping code fence;
+ *  2. `jsonrepair` applied to the whole text;
+ *  3. salvage of a `{...}` or `[...]` span embedded in surrounding text,
+ *     outermost container first;
+ *  4. NDJSON: every non-empty line parsed (or repaired) on its own.
+ *
+ * @param raw - Text returned by the model.
+ * @returns The parsed value. A strictly valid document is returned as-is;
+ *   repaired or salvaged results are only accepted when they are an object or
+ *   an array. Input with no non-empty lines yields an empty array.
+ * @throws Error when a line is still not valid JSON after repair in step 4.
+ */
+export function parseLLMOutput(raw: string): LLMJson {
+  const output = raw.trim()
+
+  // Strip code fences if the model adds them
+  const fencedMatch = output.match(CODE_FENCE_RE)
+  const unwrapped = fencedMatch ? fencedMatch[1].trim() : output
+
+  // 1) Try direct JSON (object or array)
+  try {
+    return JSON.parse(unwrapped)
+  } catch {
+    // not strictly valid JSON; continue with the recovery strategies below
+  }
+
+  // Runs one recovery attempt. A result only counts when it is an object or
+  // an array: a primitive cannot be the key -> description map or the legacy
+  // array, so it is treated as a failed attempt.
+  const attempt = (produce: () => string): LLMJson | undefined => {
+    try {
+      const value: unknown = JSON.parse(produce())
+      return typeof value === 'object' && value !== null
+        ? (value as LLMJson) // narrowed to object/array by the check above
+        : undefined
+    } catch {
+      return undefined
+    }
+  }
+
+  // 1a) Try to repair the whole thing
+  const repairedWhole = attempt(() => jsonrepair(unwrapped))
+  if (repairedWhole !== undefined) return repairedWhole
+
+  // 2) Try to salvage a container embedded in surrounding text. Each span runs
+  //    from the first opening delimiter to the last closing delimiter.
+  const spanOf = (open: string, close: string) => {
+    const start = unwrapped.indexOf(open)
+    const end = unwrapped.lastIndexOf(close)
+    return start !== -1 && end > start
+      ? [{ start, text: unwrapped.slice(start, end + 1) }]
+      : []
+  }
+
+  // Whichever container opens first is the outermost one, so it is tried
+  // first; e.g. for `[{...},{...}]` the array span must win over the brace
+  // span, which would cut the array's brackets off.
+  const candidates = [...spanOf('{', '}'), ...spanOf('[', ']')].sort(
+    (a, b) => a.start - b.start
+  )
+
+  for (const { text } of candidates) {
+    // 2a) Parse candidate directly, 2b) repair it if still broken
+    const salvaged = attempt(() => text) ?? attempt(() => jsonrepair(text))
+    if (salvaged !== undefined) return salvaged
+  }
+
+  // 3) As a last resort, attempt NDJSON (one JSON per line)
+  const lines = unwrapped
+    .split('\n')
+    .map((l) => l.trim())
+    .filter(Boolean)
+
+  // If it's NDJSON, all lines must be valid JSON (possibly after repair)
+  return lines.map((line, index) => {
+    try {
+      return JSON.parse(line)
+    } catch {
+      // not strictly valid; try a repaired version of this line
+    }
+
+    try {
+      return JSON.parse(jsonrepair(line))
+    } catch (err: unknown) {
+      const reason = err instanceof Error ? err.message : String(err)
+      throw new Error(
+        `parseLLMOutput: line ${index + 1} of the LLM output is not valid JSON, even after repair: ${reason}`
+      )
+    }
+  })
+}
@@ -8,7 +8,6 @@ export async function runLLM(prompt: string): Promise<string | undefined> {
     model: 'gemini-2.5-flash',
     contents: prompt,
     config: {
-      responseMimeType: 'application/json',
       thinkingConfig: {
         thinkingBudget: 0,
       },

@@ -131,4 +131,4 @@ def build_nested_hash(keys, value)
 #   I18n.Backend::Exercism.new,
 #   I18n.backend
 # )
-# nd
+# end
-Original file line number
+Diff line change
@@ Expand Up / @@ -131,4 +131,4 @@ def build_nested_hash(keys, value) @@
     #   I18n.Backend::Exercism.new,
     #   I18n.backend
     # )
-    # nd
+    # end