Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 269 additions & 0 deletions cmd/micasa/eval.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
// Copyright 2026 Phillip Cloud
// Licensed under the Apache License, Version 2.0

package main

import (
	"context"
	"fmt"
	"io"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"time"

	"github.com/micasa-dev/micasa/internal/config"
	"github.com/micasa-dev/micasa/internal/data"
	"github.com/micasa-dev/micasa/internal/ftseval"
	"github.com/micasa-dev/micasa/internal/llm"
	"github.com/micasa-dev/micasa/internal/termio"
	"github.com/spf13/cobra"
)

// evalOpts carries the flag values for `micasa eval fts`. Cobra fills it in
// during flag parsing; doEvalFTS copies most fields into ftseval.Config and
// consumes the CLI-only ones (such as output) itself.
type evalOpts struct {
	// dbPath is the --db SQLite file; empty selects the seeded fixture.
	dbPath string

	// provider, model, and judgeModel hold the --provider, --model, and
	// --judge-model overrides. Empty provider/model fall back to the chat
	// config; empty judgeModel falls back to the resolved model.
	provider, model, judgeModel string

	// questions is the --questions name filter (unset means all questions,
	// per the flag help).
	questions []string

	// skipJudge is --skip-judge; noAB is --no-ab.
	skipJudge, noAB bool

	// format is the --format report format; output is the --output report
	// file path (empty writes to the command's stdout).
	format, output string

	// strict is --strict.
	strict bool
}

// newEvalCmd builds the `micasa eval` parent command. It defines no action
// of its own; individual evaluations are attached as subcommands (currently
// only `eval fts`).
func newEvalCmd() *cobra.Command {
	parent := new(cobra.Command)
	parent.Use = "eval"
	parent.Short = "Run chat-quality benchmarks against a fixture or user DB"
	parent.Long = "Parent command for chat-quality evaluations. See subcommands."
	parent.SilenceErrors = true
	parent.SilenceUsage = true

	parent.AddCommand(newEvalFTSCmd())
	return parent
}

// newEvalFTSCmd builds the `micasa eval fts` subcommand. All flags are bound
// to a single evalOpts value that is handed to runEvalFTS when the command
// runs.
func newEvalFTSCmd() *cobra.Command {
	const longHelp = `Run the FTS chat benchmark against the default fixture DB or a
user-supplied SQLite file. Each question runs twice (FTS on and FTS off) and
is graded by a deterministic regex rubric, with an optional LLM judge pass.

The eval uses the chat config from the user's config file; --provider and
--model override specific fields. Pointing --db at a real micasa DB sends
prompts derived from household data to the configured provider -- if that
provider is a cloud service, the data leaves the machine.`

	var opts evalOpts

	cmd := &cobra.Command{
		Use:           "fts",
		Short:         "Run the FTS context-enrichment chat benchmark",
		Long:          longHelp,
		SilenceErrors: true,
		SilenceUsage:  true,
		RunE: func(c *cobra.Command, _ []string) error {
			return runEvalFTS(c.OutOrStdout(), &opts)
		},
	}

	// Bind every flag through the command's flag set.
	fs := cmd.Flags()

	// Data source and model selection.
	fs.StringVar(&opts.dbPath, "db", "",
		"path to a micasa SQLite DB (default: fixture)")
	fs.StringVar(&opts.provider, "provider", "",
		"override chat provider from config")
	fs.StringVar(&opts.model, "model", "",
		"override chat model from config")
	fs.StringVar(&opts.judgeModel, "judge-model", "",
		"model for the LLM judge (default: same as --model)")

	// Which questions run, and how.
	fs.StringSliceVar(&opts.questions, "questions", nil,
		"comma-separated names of questions to run (default: all)")
	fs.BoolVar(&opts.skipJudge, "skip-judge", false,
		"deterministic rubric only; skip the LLM judge")
	fs.BoolVar(&opts.noAB, "no-ab", false,
		"run each question once (FTS on) instead of twice")

	// Reporting and exit status.
	fs.StringVar(&opts.format, "format", "",
		"report format: table (default when TTY), markdown, or json")
	fs.StringVar(&opts.output, "output", "",
		"write report to this file instead of stdout")
	fs.BoolVar(&opts.strict, "strict", false,
		"exit non-zero on per-question rubric regression (completed on both arms)")

	return cmd
}

// runEvalFTS is the RunE entry point for `eval fts`. It delegates to
// doEvalFTS, returns any error to Cobra, and otherwise terminates the
// process with the eval's exit code when that code is non-zero.
func runEvalFTS(defaultOut io.Writer, opts *evalOpts) error {
	switch status, err := doEvalFTS(defaultOut, opts); {
	case err != nil:
		return err
	case status != 0:
		// doEvalFTS has already returned, so its deferred cleanup has run
		// before the process exits here.
		os.Exit(status)
	}
	return nil
}

// doEvalFTS runs the FTS chat benchmark end to end and writes the report.
//
// It loads the user's config, applies the --provider/--model/--judge-model
// overrides, opens the target store (user DB or seeded fixture), builds the
// chat and judge LLM clients, runs ftseval.Run, and writes the report to
// defaultOut or to the --output file.
//
// It returns the exit code computed by ftseval.ExitCode and a nil error on
// success. On any failure it returns code 0 together with a non-nil error;
// callers must check the error first.
func doEvalFTS(defaultOut io.Writer, opts *evalOpts) (int, error) {
	cfg, err := config.Load()
	if err != nil {
		return 0, fmt.Errorf("load config: %w", err)
	}

	// Resolve effective settings: CLI flags win over the chat config, and
	// the judge defaults to the chat model.
	chatLLM := cfg.Chat.LLM
	provider := opts.provider
	if provider == "" {
		provider = chatLLM.Provider
	}
	model := opts.model
	if model == "" {
		model = chatLLM.Model
	}
	judgeModel := opts.judgeModel
	if judgeModel == "" {
		judgeModel = model
	}
	timeout := chatLLM.TimeoutDuration()
	if timeout <= 0 {
		timeout = 60 * time.Second
	}

	// Privacy warning when running against a real DB on a non-local
	// provider. The pause gives the user a window to abort before any
	// prompt is sent.
	if opts.dbPath != "" && !isLocalLLMProvider(provider) {
		fmt.Fprintf(os.Stderr,
			"warning: eval will send prompts derived from %s to %s.\n"+
				"Press Ctrl-C within 5s to abort.\n",
			opts.dbPath, provider,
		)
		time.Sleep(5 * time.Second)
	}

	// Open (or build) the store.
	store, fixture, cleanup, err := openEvalStore(opts.dbPath)
	if err != nil {
		return 0, err
	}
	defer cleanup()

	// Build LLM clients. The judge shares the chat client unless a
	// different judge model was requested.
	client, err := llm.NewClient(provider, chatLLM.BaseURL, model, chatLLM.APIKey, timeout)
	if err != nil {
		return 0, fmt.Errorf("build chat client: %w", err)
	}
	judge := client
	if judgeModel != model {
		judge, err = llm.NewClient(provider, chatLLM.BaseURL, judgeModel, chatLLM.APIKey, timeout)
		if err != nil {
			return 0, fmt.Errorf("build judge client: %w", err)
		}
	}

	harnessCfg := ftseval.Config{
		DBPath:     opts.dbPath,
		Provider:   provider,
		Model:      model,
		JudgeModel: judgeModel,
		APIKey:     chatLLM.APIKey,
		Timeout:    timeout,
		Questions:  opts.questions,
		SkipJudge:  opts.skipJudge,
		NoAB:       opts.noAB,
		Format:     opts.format,
		Strict:     opts.strict,
	}

	// From here on, Ctrl-C cancels the run's context instead of killing
	// the process outright.
	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
	defer cancel()

	results, err := ftseval.Run(ctx, harnessCfg, store, fixture, client, judge)
	if err != nil {
		return 0, fmt.Errorf("run eval: %w", err)
	}

	// Write report. Default format: "table" when writing to a TTY,
	// "markdown" otherwise (pipes, files, CI). --format overrides.
	out := defaultOut
	closeReport := func() error { return nil }
	if opts.output != "" {
		f, err := os.Create(opts.output)
		if err != nil {
			return 0, fmt.Errorf("open report file: %w", err)
		}
		out = f
		closeReport = f.Close
	}
	if harnessCfg.Format == "" {
		if termio.IsTerminal(out) {
			harnessCfg.Format = "table"
		} else {
			harnessCfg.Format = "markdown"
		}
	}
	if err := ftseval.WriteReport(out, harnessCfg, results); err != nil {
		// The write error is the root cause; a close error on top of it
		// adds nothing, so it is intentionally dropped.
		_ = closeReport()
		return 0, fmt.Errorf("write report: %w", err)
	}
	// A failed close on a freshly written file can mean the report never
	// fully reached disk, so surface it instead of reporting success.
	if err := closeReport(); err != nil {
		return 0, fmt.Errorf("close report file: %w", err)
	}

	return ftseval.ExitCode(harnessCfg, results), nil
}

// openEvalStore returns the store the eval should run against.
//
// When dbPath is non-empty it opens that SQLite file and returns a zero
// ftseval.SeededFixture; cleanup only closes the store. When dbPath is empty
// it creates a temp directory, opens a new fixture.db inside it, migrates
// and seeds it, and returns the seeded fixture; cleanup closes the store and
// removes the temp directory.
//
// On error the returned store and cleanup are nil and anything created along
// the way has already been released.
func openEvalStore(
	dbPath string,
) (*data.Store, ftseval.SeededFixture, func(), error) {
	if dbPath != "" {
		s, err := data.Open(dbPath)
		if err != nil {
			return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("open %s: %w", dbPath, err)
		}
		// Close is best-effort: nothing useful can be done with its error
		// at teardown.
		cleanup := func() { _ = s.Close() }
		return s, ftseval.SeededFixture{}, cleanup, nil
	}

	tmp, err := os.MkdirTemp("", "micasa-eval-*")
	if err != nil {
		return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("create fixture tempdir: %w", err)
	}
	// Removal is best-effort; a leftover temp dir is harmless.
	removeTmp := func() { _ = os.RemoveAll(tmp) }

	// filepath.Join builds the path with the OS-native separator.
	s, err := data.Open(filepath.Join(tmp, "fixture.db"))
	if err != nil {
		removeTmp()
		return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("open fixture: %w", err)
	}

	// Close the store before deleting the directory that holds its file.
	cleanup := func() {
		_ = s.Close()
		removeTmp()
	}

	if err := s.AutoMigrate(); err != nil {
		cleanup()
		return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("migrate fixture: %w", err)
	}
	if err := s.SeedDefaults(); err != nil {
		cleanup()
		return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("seed fixture defaults: %w", err)
	}
	fx, err := ftseval.SeedFixture(s)
	if err != nil {
		cleanup()
		return nil, ftseval.SeededFixture{}, nil, fmt.Errorf("seed fixture entities: %w", err)
	}

	return s, fx, cleanup, nil
}

// isLocalLLMProvider reports whether provider names one of the backends
// this command treats as running on the local machine ("ollama",
// "llamacpp", "llamafile"). The comparison ignores letter case; the name is
// the only input, so nothing else about the provider is checked.
func isLocalLLMProvider(provider string) bool {
	name := strings.ToLower(provider)
	for _, local := range [...]string{"ollama", "llamacpp", "llamafile"} {
		if name == local {
			return true
		}
	}
	return false
}
1 change: 1 addition & 0 deletions cmd/micasa/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ func newRootCmd() *cobra.Command {
newGenCLIRefCmd(),
newDBCmd(),
newStatusCmd(),
newEvalCmd(),
)

return root
Expand Down
12 changes: 2 additions & 10 deletions cmd/micasa/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,11 @@ import (

"charm.land/lipgloss/v2"
"charm.land/lipgloss/v2/table"
"github.com/charmbracelet/x/term"
"github.com/micasa-dev/micasa/internal/data"
"github.com/micasa-dev/micasa/internal/termio"
"github.com/spf13/cobra"
)

// writerIsTerminal reports whether w is an *os.File backed by a terminal.
// Any non-file writer (bytes.Buffer, io.Pipe, custom writers) is treated
// as non-terminal so styled output never leaks to non-TTY destinations.
func writerIsTerminal(w io.Writer) bool {
f, ok := w.(*os.File)
return ok && term.IsTerminal(f.Fd())
}

type statusOpts struct {
asJSON bool
days int
Expand Down Expand Up @@ -67,7 +59,7 @@ shell prompts, and status bar widgets.`,
}
opts.isDark = lipgloss.HasDarkBackground(os.Stdin, os.Stderr)
out := cmd.OutOrStdout()
opts.noStyle = !writerIsTerminal(out)
opts.noStyle = !termio.IsTerminal(out)
store, err := openExisting(dbPathFromEnvOrArg(args))
if err != nil {
return err
Expand Down
20 changes: 0 additions & 20 deletions cmd/micasa/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ import (
"bytes"
"encoding/json"
"errors"
"io"
"os"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -46,24 +44,6 @@ func TestExtractExitCodeRegularError(t *testing.T) {
assert.Equal(t, 1, extractExitCode(errors.New("boom")))
}

// --- writerIsTerminal ---

func TestWriterIsTerminal(t *testing.T) {
t.Parallel()
// bytes.Buffer is not a file
var buf bytes.Buffer
assert.False(t, writerIsTerminal(&buf))

// io.Discard is not a file
assert.False(t, writerIsTerminal(io.Discard))

// Temp file is an *os.File but not a terminal
f, err := os.CreateTemp(t.TempDir(), "tty-test")
require.NoError(t, err)
defer func() { _ = f.Close() }()
assert.False(t, writerIsTerminal(f))
}

// --- text output ---

func TestStatusTextEmpty(t *testing.T) {
Expand Down
Loading
Loading