Skip to content

Commit cdcc204

Browse files
committed
feat(cli): add micasa eval fts subcommand
Wires the chat-quality eval described in the plan:

- internal/ftseval/ package with typed Config, Question, ArmResult, RunResult; Run() drives each question through both FTS arms against a pre-built store, grades with a deterministic regex rubric plus an optional LLM judge, and returns the per-question results.
- Fixture seed (SeedFixture) populating projects, vendors, appliances, maintenance items, incidents, one service log, and one quote that ties kitchen to Pacific Plumbing (with the "permit delays" vendor note the long-tail-note question relies on).
- Default question set covering disambiguation, cross-entity joins, service-log lookup, aggregate (FTS-neutral), basement incidents, nonexistent entity, long-tail note, and brand filter.
- Judge-score sentinel -1 when the judge didn't run (--skip-judge, no summary, parse failure, or judge error); 0-5 when it did.
- Markdown report (default) and JSON; JSON writes a redactedConfig that excludes APIKey so the key never leaks to stdout, --output, or CI artifacts. Judge-score aggregates exclude sentinel rows.
- --strict exits 1 on per-question FTS-on rubric regression over questions completed on both arms (sql_error still counts as completed per production behavior; stage-1/stage-2 provider errors do not).
- Empty ExpectedEntityIDs are skipped in entity-hit scoring so --db runs (which have a zero-valued SeededFixture) don't false-positive.

CLI: `micasa eval fts` with --db, --provider, --model, --judge-model, --questions, --skip-judge, --no-ab, --format, --output, --strict. Default fixture is built in a tempdir that cleans up on exit; --db points at an existing store. Privacy warning printed on stderr when running against a non-fixture DB on a non-local provider.

Nix: `nix run '.#fts-eval'` wraps the subcommand.
Refactor: moves buildFTSContext and buildTableInfoFrom out of internal/app/chat.go into internal/llm as exported BuildFTSContextFromStore and BuildTableInfo so the eval harness reproduces exactly the prompt-building logic chat uses. Refs #707.
1 parent 1a143bd commit cdcc204

13 files changed

Lines changed: 1539 additions & 68 deletions

File tree

cmd/micasa/eval.go

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
// Copyright 2026 Phillip Cloud
2+
// Licensed under the Apache License, Version 2.0
3+
4+
package main
5+
6+
import (
7+
"context"
8+
"fmt"
9+
"os"
10+
"os/signal"
11+
"strings"
12+
"time"
13+
14+
"github.com/micasa-dev/micasa/internal/config"
15+
"github.com/micasa-dev/micasa/internal/data"
16+
"github.com/micasa-dev/micasa/internal/ftseval"
17+
"github.com/micasa-dev/micasa/internal/llm"
18+
"github.com/spf13/cobra"
19+
)
20+
21+
// evalOpts holds the values bound to the `micasa eval fts` flags. It
// mirrors ftseval.Config plus CLI-only knobs; Cobra fills it in during
// flag parsing and runEvalFTS consumes it.
type evalOpts struct {
	// Store selection.
	dbPath string // --db: SQLite file to evaluate; empty means the built-in fixture

	// LLM selection; an empty value falls back to the chat config.
	provider   string // --provider
	model      string // --model
	judgeModel string // --judge-model; empty means "same as the chat model"

	// Run shape.
	questions []string // --questions: names of questions to run; empty means all
	skipJudge bool     // --skip-judge: deterministic rubric only
	noAB      bool     // --no-ab: run each question once (FTS on)

	// Reporting.
	format string // --format: "markdown" or "json"
	output string // --output: report file path; empty means stdout
	strict bool   // --strict: non-zero exit on per-question rubric regression
}
35+
36+
// newEvalCmd returns the `micasa eval` parent command. Sub-evals slot in
37+
// as children (`eval fts`, future `eval extraction`, etc.).
38+
func newEvalCmd() *cobra.Command {
39+
cmd := &cobra.Command{
40+
Use: "eval",
41+
Short: "Run chat-quality benchmarks against a fixture or user DB",
42+
Long: `Parent command for chat-quality evaluations. See subcommands.`,
43+
SilenceErrors: true,
44+
SilenceUsage: true,
45+
}
46+
cmd.AddCommand(newEvalFTSCmd())
47+
return cmd
48+
}
49+
50+
func newEvalFTSCmd() *cobra.Command {
51+
opts := &evalOpts{}
52+
cmd := &cobra.Command{
53+
Use: "fts",
54+
Short: "Run the FTS context-enrichment chat benchmark",
55+
Long: `Run the FTS chat benchmark against the default fixture DB or a
56+
user-supplied SQLite file. Each question runs twice (FTS on and FTS off) and
57+
is graded by a deterministic regex rubric, with an optional LLM judge pass.
58+
59+
The eval uses the chat config from the user's config file; --provider and
60+
--model override specific fields. Pointing --db at a real micasa DB sends
61+
prompts derived from household data to the configured provider -- if that
62+
provider is a cloud service, the data leaves the machine.`,
63+
SilenceErrors: true,
64+
SilenceUsage: true,
65+
RunE: func(cmd *cobra.Command, args []string) error {
66+
return runEvalFTS(cmd.OutOrStdout(), opts)
67+
},
68+
}
69+
70+
cmd.Flags().StringVar(&opts.dbPath, "db", "",
71+
"path to a micasa SQLite DB (default: fixture)")
72+
cmd.Flags().StringVar(&opts.provider, "provider", "",
73+
"override chat provider from config")
74+
cmd.Flags().StringVar(&opts.model, "model", "",
75+
"override chat model from config")
76+
cmd.Flags().StringVar(&opts.judgeModel, "judge-model", "",
77+
"model for the LLM judge (default: same as --model)")
78+
cmd.Flags().StringSliceVar(&opts.questions, "questions", nil,
79+
"comma-separated names of questions to run (default: all)")
80+
cmd.Flags().BoolVar(&opts.skipJudge, "skip-judge", false,
81+
"deterministic rubric only; skip the LLM judge")
82+
cmd.Flags().BoolVar(&opts.noAB, "no-ab", false,
83+
"run each question once (FTS on) instead of twice")
84+
cmd.Flags().StringVar(&opts.format, "format", "markdown",
85+
"report format: markdown or json")
86+
cmd.Flags().StringVar(&opts.output, "output", "",
87+
"write report to this file instead of stdout")
88+
cmd.Flags().BoolVar(&opts.strict, "strict", false,
89+
"exit non-zero on per-question rubric regression (completed on both arms)")
90+
91+
return cmd
92+
}
93+
94+
func runEvalFTS(defaultOut interface {
95+
Write([]byte) (int, error)
96+
}, opts *evalOpts,
97+
) error {
98+
cfg, err := config.Load()
99+
if err != nil {
100+
return fmt.Errorf("load config: %w", err)
101+
}
102+
103+
chatLLM := cfg.Chat.LLM
104+
provider := opts.provider
105+
if provider == "" {
106+
provider = chatLLM.Provider
107+
}
108+
model := opts.model
109+
if model == "" {
110+
model = chatLLM.Model
111+
}
112+
judgeModel := opts.judgeModel
113+
if judgeModel == "" {
114+
judgeModel = model
115+
}
116+
timeout := chatLLM.TimeoutDuration()
117+
if timeout <= 0 {
118+
timeout = 60 * time.Second
119+
}
120+
121+
// Privacy warning when running against a real DB on a non-local
122+
// provider.
123+
if opts.dbPath != "" && !isLocalLLMProvider(provider) {
124+
fmt.Fprintf(os.Stderr,
125+
"warning: eval will send prompts derived from %s to %s.\n"+
126+
"Press Ctrl-C within 5s to abort.\n",
127+
opts.dbPath, provider,
128+
)
129+
time.Sleep(5 * time.Second)
130+
}
131+
132+
// Open (or build) the store.
133+
store, fixture, cleanup, err := openEvalStore(opts.dbPath)
134+
if err != nil {
135+
return err
136+
}
137+
defer cleanup()
138+
139+
// Build LLM clients.
140+
client, err := llm.NewClient(provider, chatLLM.BaseURL, model, chatLLM.APIKey, timeout)
141+
if err != nil {
142+
return fmt.Errorf("build chat client: %w", err)
143+
}
144+
judge := client
145+
if judgeModel != model {
146+
judge, err = llm.NewClient(provider, chatLLM.BaseURL, judgeModel, chatLLM.APIKey, timeout)
147+
if err != nil {
148+
return fmt.Errorf("build judge client: %w", err)
149+
}
150+
}
151+
152+
harnessCfg := ftseval.Config{
153+
DBPath: opts.dbPath,
154+
Provider: provider,
155+
Model: model,
156+
JudgeModel: judgeModel,
157+
APIKey: chatLLM.APIKey,
158+
Timeout: timeout,
159+
Questions: opts.questions,
160+
SkipJudge: opts.skipJudge,
161+
NoAB: opts.noAB,
162+
Format: opts.format,
163+
Strict: opts.strict,
164+
}
165+
166+
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
167+
defer cancel()
168+
169+
results, err := ftseval.Run(ctx, harnessCfg, store, fixture, client, judge)
170+
if err != nil {
171+
return fmt.Errorf("run eval: %w", err)
172+
}
173+
174+
// Write report.
175+
out := defaultOut
176+
if opts.output != "" {
177+
f, err := os.Create(opts.output)
178+
if err != nil {
179+
return fmt.Errorf("open report file: %w", err)
180+
}
181+
defer func() { _ = f.Close() }()
182+
out = f
183+
}
184+
if err := ftseval.WriteReport(out, harnessCfg, results); err != nil {
185+
return fmt.Errorf("write report: %w", err)
186+
}
187+
188+
if code := ftseval.ExitCode(harnessCfg, results); code != 0 {
189+
os.Exit(code)
190+
}
191+
return nil
192+
}
193+
194+
// openEvalStore returns either the user-supplied SQLite store or a
195+
// freshly-seeded fixture. The returned cleanup closes the store and, for
196+
// the fixture path, removes the tempdir the fixture lives in.
197+
func openEvalStore(
198+
dbPath string,
199+
) (*data.Store, ftseval.SeededFixture, func(), error) {
200+
if dbPath != "" {
201+
s, err := data.Open(dbPath)
202+
if err != nil {
203+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("open %s: %w", dbPath, err)
204+
}
205+
cleanup := func() { _ = s.Close() }
206+
return s, ftseval.SeededFixture{}, cleanup, nil
207+
}
208+
209+
tmp, err := os.MkdirTemp("", "micasa-eval-*")
210+
if err != nil {
211+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("create fixture tempdir: %w", err)
212+
}
213+
removeTmp := func() { _ = os.RemoveAll(tmp) }
214+
215+
path := tmp + "/fixture.db"
216+
s, err := data.Open(path)
217+
if err != nil {
218+
removeTmp()
219+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("open fixture: %w", err)
220+
}
221+
closeStore := func() { _ = s.Close() }
222+
if err := s.AutoMigrate(); err != nil {
223+
closeStore()
224+
removeTmp()
225+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("migrate fixture: %w", err)
226+
}
227+
if err := s.SeedDefaults(); err != nil {
228+
closeStore()
229+
removeTmp()
230+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("seed fixture defaults: %w", err)
231+
}
232+
fx, err := ftseval.SeedFixture(s)
233+
if err != nil {
234+
closeStore()
235+
removeTmp()
236+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("seed fixture entities: %w", err)
237+
}
238+
239+
cleanup := func() {
240+
closeStore()
241+
removeTmp()
242+
}
243+
return s, fx, cleanup, nil
244+
}
245+
246+
// isLocalLLMProvider reports whether the named provider is one of the
// known local runtimes ("ollama", "llamacpp", "llamafile"), compared
// case-insensitively. Callers use it to decide whether household data
// would leave the machine.
// NOTE(review): the decision is based on the provider name alone; where
// the provider's endpoint actually points is not inspected here.
func isLocalLLMProvider(provider string) bool {
	name := strings.ToLower(provider)
	return name == "ollama" || name == "llamacpp" || name == "llamafile"
}

cmd/micasa/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ func newRootCmd() *cobra.Command {
7979
newGenCLIRefCmd(),
8080
newDBCmd(),
8181
newStatusCmd(),
82+
newEvalCmd(),
8283
)
8384

8485
return root

docs/content/docs/reference/cli.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ micasa [database-path] [flags]
3535
- [`micasa config`](#micasa-config) -- Manage application configuration
3636
- [`micasa db`](#micasa-db) -- Read and write entity data
3737
- [`micasa demo`](#micasa-demo) -- Launch with sample data in an in-memory database
38+
- [`micasa eval`](#micasa-eval) -- Run chat-quality benchmarks against a fixture or user DB
3839
- [`micasa mcp`](#micasa-mcp) -- Run MCP server for LLM client access
3940
- [`micasa pro`](#micasa-pro) -- Manage micasa Pro sync
4041
- [`micasa query`](#micasa-query) -- Run a read-only SQL query
@@ -1643,6 +1644,61 @@ micasa demo [database-path] [flags]
16431644

16441645
- [`micasa`](#micasa) -- A terminal UI for tracking everything about your home
16451646

1647+
## micasa eval
1648+
1649+
Parent command for chat-quality evaluations. See subcommands.
1650+
1651+
### Flags
1652+
1653+
| Flag | Default | Description |
1654+
|------|---------|-------------|
1655+
| `-h`, `--help` | - | help for eval |
1656+
1657+
### Subcommands
1658+
1659+
- [`micasa eval fts`](#micasa-eval-fts) -- Run the FTS context-enrichment chat benchmark
1660+
1661+
### See also
1662+
1663+
- [`micasa`](#micasa) -- A terminal UI for tracking everything about your home
1664+
1665+
## micasa eval fts
1666+
1667+
Run the FTS chat benchmark against the default fixture DB or a
1668+
user-supplied SQLite file. Each question runs twice (FTS on and FTS off) and
1669+
is graded by a deterministic regex rubric, with an optional LLM judge pass.
1670+
1671+
The eval uses the chat config from the user's config file; --provider and
1672+
--model override specific fields. Pointing --db at a real micasa DB sends
1673+
prompts derived from household data to the configured provider -- if that
1674+
provider is a cloud service, the data leaves the machine.
1675+
1676+
### Usage
1677+
1678+
```
1679+
micasa eval fts [flags]
1680+
```
1681+
1682+
### Flags
1683+
1684+
| Flag | Default | Description |
1685+
|------|---------|-------------|
1686+
| `--db` | - | path to a micasa SQLite DB (default: fixture) |
1687+
| `--format` | `markdown` | report format: markdown or json |
1688+
| `-h`, `--help` | - | help for fts |
1689+
| `--judge-model` | - | model for the LLM judge (default: same as --model) |
1690+
| `--model` | - | override chat model from config |
1691+
| `--no-ab` | - | run each question once (FTS on) instead of twice |
1692+
| `--output` | - | write report to this file instead of stdout |
1693+
| `--provider` | - | override chat provider from config |
1694+
| `--questions` | `[]` | comma-separated names of questions to run (default: all) |
1695+
| `--skip-judge` | - | deterministic rubric only; skip the LLM judge |
1696+
| `--strict` | - | exit non-zero on per-question rubric regression (completed on both arms) |
1697+
1698+
### See also
1699+
1700+
- [`micasa eval`](#micasa-eval) -- Run chat-quality benchmarks against a fixture or user DB
1701+
16461702
## micasa mcp
16471703

16481704
Start a Model Context Protocol server over stdio, exposing micasa data to LLM clients like Claude Desktop and Claude Code.

flake.nix

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,13 @@
484484
| column -t
485485
'';
486486
};
487+
# Shell wrapper named "fts-eval" that forwards all of its arguments to
# `micasa eval fts`, with the micasa package on its runtime PATH.
fts-eval = pkgs.writeShellApplication {
488+
name = "fts-eval";
489+
runtimeInputs = [ self.packages.micasa ];
490+
text = ''
491+
exec micasa eval fts "$@"
492+
'';
493+
};
487494
run-pre-commit = pkgs.writeShellApplication {
488495
name = "run-pre-commit";
489496
runtimeInputs = [

0 commit comments

Comments
 (0)