Skip to content

Commit 3965ce6

Browse files
committed
feat(cli): add micasa eval fts subcommand
Wires a chat-quality evaluation harness for the FTS-enrichment feature. No behavior change to the TUI chat pipeline -- this PR only adds the eval surface and exports the prompt-building helpers it needs.

- `internal/ftseval/` package: typed Config, Question, ArmResult, RunResult, GradeResult. Run() drives each question through both FTS arms against a pre-built store, grades with a deterministic regex rubric plus an optional LLM judge, and returns per-question results.
- `SeedFixture` populates projects, vendors, appliances, maintenance items, incidents, one service log, and one quote with the "permit delays" long-tail vendor note.
- Default question set covering disambiguation, cross-entity joins, service-log lookup, FTS-neutral aggregate, basement incidents, nonexistent entity, long-tail note, and brand filter.
- Judge-score sentinel -1 when the judge didn't run; 0-5 when it did. Judge parser tolerates real-world model output: markdown decoration, `:`/`=` separators, mixed case, leading <think>/<thinking>/<reasoning> blocks, and "Rationale" as an alias for "Reason". judge_reason surfaces in Notes when the score is the sentinel.
- Table report (default on TTYs, via lipgloss), markdown (default when piping or writing to a file), and JSON. JSON redacts APIKey. Judge-score aggregates exclude sentinel rows. detectDarkBG guards lipgloss.HasDarkBackground behind a stdin-is-a-TTY check plus a recover() fallback so the reporter stays safe in CI (including Windows, where lipgloss's terminal query can panic on non-TTY stdin).
- `--strict` exits 1 on per-question FTS-on rubric regression over questions completed on both arms (sql_error counts as completed; provider errors don't). runEvalFTS splits into an inner doEvalFTS that returns (int, error) so deferred cleanup fires before os.Exit when strict mode triggers a non-zero exit.
Prompt-builder refactor (in `internal/llm/prompt.go`):

- `BuildTableInfo(store)` exports the former `app.buildTableInfoFrom` so the eval reproduces the schema section of chat prompts exactly.
- `BuildFTSContext(entries)` and `BuildFTSContextFromStore(store, q)` are the FTS-context formatters. They're unused on the chat path (chat passes `""` for ftsContext everywhere); the follow-up chat wiring PR routes real FTS results through them.
- `BuildSQLPrompt` / `BuildSummaryPrompt` / `BuildSystemPrompt` take a new `ftsContext string` positional arg. Chat passes `""` -- identical prompt text to pre-FTS behavior. The arg is load-bearing only when a caller populates it; the eval does, chat does not.

CLI: `micasa eval fts` with --db, --provider, --model, --judge-model, --questions, --skip-judge, --no-ab, --format, --output, --strict. Default fixture is built in a tempdir that cleans up on exit; --db points at an existing store. Privacy warning on stderr when running against a non-fixture DB on a non-local provider.

Nix: `nix run '.#fts-eval'` wraps the subcommand.

Refs #707.
1 parent 6a1fbcc commit 3965ce6

18 files changed

Lines changed: 2335 additions & 86 deletions

File tree

cmd/micasa/eval.go

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
// Copyright 2026 Phillip Cloud
2+
// Licensed under the Apache License, Version 2.0
3+
4+
package main
5+
6+
import (
	"context"
	"fmt"
	"io"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"time"

	"github.com/micasa-dev/micasa/internal/config"
	"github.com/micasa-dev/micasa/internal/data"
	"github.com/micasa-dev/micasa/internal/ftseval"
	"github.com/micasa-dev/micasa/internal/llm"
	"github.com/micasa-dev/micasa/internal/termio"
	"github.com/spf13/cobra"
)
22+
23+
// evalOpts holds the values of the `micasa eval fts` flags: the fields
// that feed ftseval.Config plus CLI-only knobs. Cobra flag parsing fills
// it in and the command's RunE hands it to runEvalFTS.
type evalOpts struct {
	// Data source and model selection (--db, --provider, --model,
	// --judge-model). Empty strings mean "use the default".
	dbPath, provider, model, judgeModel string

	// Names of the questions to run (--questions); empty means all.
	questions []string

	// Run-shape toggles (--skip-judge, --no-ab).
	skipJudge, noAB bool

	// Report rendering and destination (--format, --output).
	format, output string

	// Exit non-zero on per-question rubric regression (--strict).
	strict bool
}
37+
38+
// newEvalCmd returns the `micasa eval` parent command. Sub-evals slot in
39+
// as children (`eval fts`, future `eval extraction`, etc.).
40+
func newEvalCmd() *cobra.Command {
41+
cmd := &cobra.Command{
42+
Use: "eval",
43+
Short: "Run chat-quality benchmarks against a fixture or user DB",
44+
Long: `Parent command for chat-quality evaluations. See subcommands.`,
45+
SilenceErrors: true,
46+
SilenceUsage: true,
47+
}
48+
cmd.AddCommand(newEvalFTSCmd())
49+
return cmd
50+
}
51+
52+
func newEvalFTSCmd() *cobra.Command {
53+
opts := &evalOpts{}
54+
cmd := &cobra.Command{
55+
Use: "fts",
56+
Short: "Run the FTS context-enrichment chat benchmark",
57+
Long: `Run the FTS chat benchmark against the default fixture DB or a
58+
user-supplied SQLite file. Each question runs twice (FTS on and FTS off) and
59+
is graded by a deterministic regex rubric, with an optional LLM judge pass.
60+
61+
The eval uses the chat config from the user's config file; --provider and
62+
--model override specific fields. Pointing --db at a real micasa DB sends
63+
prompts derived from household data to the configured provider -- if that
64+
provider is a cloud service, the data leaves the machine.`,
65+
SilenceErrors: true,
66+
SilenceUsage: true,
67+
RunE: func(cmd *cobra.Command, _ []string) error {
68+
return runEvalFTS(cmd.OutOrStdout(), opts)
69+
},
70+
}
71+
72+
cmd.Flags().StringVar(&opts.dbPath, "db", "",
73+
"path to a micasa SQLite DB (default: fixture)")
74+
cmd.Flags().StringVar(&opts.provider, "provider", "",
75+
"override chat provider from config")
76+
cmd.Flags().StringVar(&opts.model, "model", "",
77+
"override chat model from config")
78+
cmd.Flags().StringVar(&opts.judgeModel, "judge-model", "",
79+
"model for the LLM judge (default: same as --model)")
80+
cmd.Flags().StringSliceVar(&opts.questions, "questions", nil,
81+
"comma-separated names of questions to run (default: all)")
82+
cmd.Flags().BoolVar(&opts.skipJudge, "skip-judge", false,
83+
"deterministic rubric only; skip the LLM judge")
84+
cmd.Flags().BoolVar(&opts.noAB, "no-ab", false,
85+
"run each question once (FTS on) instead of twice")
86+
cmd.Flags().StringVar(&opts.format, "format", "",
87+
"report format: table (default when TTY), markdown, or json")
88+
cmd.Flags().StringVar(&opts.output, "output", "",
89+
"write report to this file instead of stdout")
90+
cmd.Flags().BoolVar(&opts.strict, "strict", false,
91+
"exit non-zero on per-question rubric regression (completed on both arms)")
92+
93+
return cmd
94+
}
95+
96+
func runEvalFTS(defaultOut io.Writer, opts *evalOpts) error {
97+
code, err := doEvalFTS(defaultOut, opts)
98+
if err != nil {
99+
return err
100+
}
101+
if code != 0 {
102+
os.Exit(code)
103+
}
104+
return nil
105+
}
106+
107+
func doEvalFTS(defaultOut io.Writer, opts *evalOpts) (int, error) {
108+
cfg, err := config.Load()
109+
if err != nil {
110+
return 0, fmt.Errorf("load config: %w", err)
111+
}
112+
113+
chatLLM := cfg.Chat.LLM
114+
provider := opts.provider
115+
if provider == "" {
116+
provider = chatLLM.Provider
117+
}
118+
model := opts.model
119+
if model == "" {
120+
model = chatLLM.Model
121+
}
122+
judgeModel := opts.judgeModel
123+
if judgeModel == "" {
124+
judgeModel = model
125+
}
126+
timeout := chatLLM.TimeoutDuration()
127+
if timeout <= 0 {
128+
timeout = 60 * time.Second
129+
}
130+
131+
// Privacy warning when running against a real DB on a non-local
132+
// provider.
133+
if opts.dbPath != "" && !isLocalLLMProvider(provider) {
134+
fmt.Fprintf(os.Stderr,
135+
"warning: eval will send prompts derived from %s to %s.\n"+
136+
"Press Ctrl-C within 5s to abort.\n",
137+
opts.dbPath, provider,
138+
)
139+
time.Sleep(5 * time.Second)
140+
}
141+
142+
// Open (or build) the store.
143+
store, fixture, cleanup, err := openEvalStore(opts.dbPath)
144+
if err != nil {
145+
return 0, err
146+
}
147+
defer cleanup()
148+
149+
// Build LLM clients.
150+
client, err := llm.NewClient(provider, chatLLM.BaseURL, model, chatLLM.APIKey, timeout)
151+
if err != nil {
152+
return 0, fmt.Errorf("build chat client: %w", err)
153+
}
154+
judge := client
155+
if judgeModel != model {
156+
judge, err = llm.NewClient(provider, chatLLM.BaseURL, judgeModel, chatLLM.APIKey, timeout)
157+
if err != nil {
158+
return 0, fmt.Errorf("build judge client: %w", err)
159+
}
160+
}
161+
162+
harnessCfg := ftseval.Config{
163+
DBPath: opts.dbPath,
164+
Provider: provider,
165+
Model: model,
166+
JudgeModel: judgeModel,
167+
APIKey: chatLLM.APIKey,
168+
Timeout: timeout,
169+
Questions: opts.questions,
170+
SkipJudge: opts.skipJudge,
171+
NoAB: opts.noAB,
172+
Format: opts.format,
173+
Strict: opts.strict,
174+
}
175+
176+
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
177+
defer cancel()
178+
179+
results, err := ftseval.Run(ctx, harnessCfg, store, fixture, client, judge)
180+
if err != nil {
181+
return 0, fmt.Errorf("run eval: %w", err)
182+
}
183+
184+
// Write report. Default format: "table" when writing to a TTY,
185+
// "markdown" otherwise (pipes, files, CI). --format overrides.
186+
out := defaultOut
187+
if opts.output != "" {
188+
f, err := os.Create(opts.output)
189+
if err != nil {
190+
return 0, fmt.Errorf("open report file: %w", err)
191+
}
192+
defer func() { _ = f.Close() }()
193+
out = f
194+
}
195+
if harnessCfg.Format == "" {
196+
if termio.IsTerminal(out) {
197+
harnessCfg.Format = "table"
198+
} else {
199+
harnessCfg.Format = "markdown"
200+
}
201+
}
202+
if err := ftseval.WriteReport(out, harnessCfg, results); err != nil {
203+
return 0, fmt.Errorf("write report: %w", err)
204+
}
205+
206+
return ftseval.ExitCode(harnessCfg, results), nil
207+
}
208+
209+
// openEvalStore returns either the user-supplied SQLite store or a
210+
// freshly-seeded fixture. The returned cleanup closes the store and, for
211+
// the fixture path, removes the tempdir the fixture lives in.
212+
func openEvalStore(
213+
dbPath string,
214+
) (*data.Store, ftseval.SeededFixture, func(), error) {
215+
if dbPath != "" {
216+
s, err := data.Open(dbPath)
217+
if err != nil {
218+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("open %s: %w", dbPath, err)
219+
}
220+
cleanup := func() { _ = s.Close() }
221+
return s, ftseval.SeededFixture{}, cleanup, nil
222+
}
223+
224+
tmp, err := os.MkdirTemp("", "micasa-eval-*")
225+
if err != nil {
226+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("create fixture tempdir: %w", err)
227+
}
228+
removeTmp := func() { _ = os.RemoveAll(tmp) }
229+
230+
path := tmp + "/fixture.db"
231+
s, err := data.Open(path)
232+
if err != nil {
233+
removeTmp()
234+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("open fixture: %w", err)
235+
}
236+
closeStore := func() { _ = s.Close() }
237+
if err := s.AutoMigrate(); err != nil {
238+
closeStore()
239+
removeTmp()
240+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("migrate fixture: %w", err)
241+
}
242+
if err := s.SeedDefaults(); err != nil {
243+
closeStore()
244+
removeTmp()
245+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("seed fixture defaults: %w", err)
246+
}
247+
fx, err := ftseval.SeedFixture(s)
248+
if err != nil {
249+
closeStore()
250+
removeTmp()
251+
return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("seed fixture entities: %w", err)
252+
}
253+
254+
cleanup := func() {
255+
closeStore()
256+
removeTmp()
257+
}
258+
return s, fx, cleanup, nil
259+
}
260+
261+
// isLocalLLMProvider reports whether the named provider runs on the same
262+
// machine (so no household data leaves the machine).
263+
func isLocalLLMProvider(provider string) bool {
264+
switch strings.ToLower(provider) {
265+
case "ollama", "llamacpp", "llamafile":
266+
return true
267+
}
268+
return false
269+
}

cmd/micasa/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ func newRootCmd() *cobra.Command {
7979
newGenCLIRefCmd(),
8080
newDBCmd(),
8181
newStatusCmd(),
82+
newEvalCmd(),
8283
)
8384

8485
return root

cmd/micasa/status.go

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,11 @@ import (
1414

1515
"charm.land/lipgloss/v2"
1616
"charm.land/lipgloss/v2/table"
17-
"github.com/charmbracelet/x/term"
1817
"github.com/micasa-dev/micasa/internal/data"
18+
"github.com/micasa-dev/micasa/internal/termio"
1919
"github.com/spf13/cobra"
2020
)
2121

22-
// writerIsTerminal reports whether w is an *os.File backed by a terminal.
23-
// Any non-file writer (bytes.Buffer, io.Pipe, custom writers) is treated
24-
// as non-terminal so styled output never leaks to non-TTY destinations.
25-
func writerIsTerminal(w io.Writer) bool {
26-
f, ok := w.(*os.File)
27-
return ok && term.IsTerminal(f.Fd())
28-
}
29-
3022
type statusOpts struct {
3123
asJSON bool
3224
days int
@@ -67,7 +59,7 @@ shell prompts, and status bar widgets.`,
6759
}
6860
opts.isDark = lipgloss.HasDarkBackground(os.Stdin, os.Stderr)
6961
out := cmd.OutOrStdout()
70-
opts.noStyle = !writerIsTerminal(out)
62+
opts.noStyle = !termio.IsTerminal(out)
7163
store, err := openExisting(dbPathFromEnvOrArg(args))
7264
if err != nil {
7365
return err

cmd/micasa/status_test.go

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ import (
77
"bytes"
88
"encoding/json"
99
"errors"
10-
"io"
11-
"os"
1210
"strings"
1311
"testing"
1412
"time"
@@ -46,24 +44,6 @@ func TestExtractExitCodeRegularError(t *testing.T) {
4644
assert.Equal(t, 1, extractExitCode(errors.New("boom")))
4745
}
4846

49-
// --- writerIsTerminal ---
50-
51-
func TestWriterIsTerminal(t *testing.T) {
52-
t.Parallel()
53-
// bytes.Buffer is not a file
54-
var buf bytes.Buffer
55-
assert.False(t, writerIsTerminal(&buf))
56-
57-
// io.Discard is not a file
58-
assert.False(t, writerIsTerminal(io.Discard))
59-
60-
// Temp file is an *os.File but not a terminal
61-
f, err := os.CreateTemp(t.TempDir(), "tty-test")
62-
require.NoError(t, err)
63-
defer func() { _ = f.Close() }()
64-
assert.False(t, writerIsTerminal(f))
65-
}
66-
6747
// --- text output ---
6848

6949
func TestStatusTextEmpty(t *testing.T) {

0 commit comments

Comments
 (0)