Skip to content

Commit eb0e3af

Browse files
authored
Merge pull request #44 from agent-ecosystem/fix/claude-cli-isolation
fix: add preflight check and --bare for claude CLI client
2 parents 225a296 + 9bfbb79 commit eb0e3af

6 files changed

Lines changed: 89 additions & 5 deletions

File tree

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,9 @@ skill-validator score evaluate --provider claude-cli <path>
335335
|---|---|---|---|
336336
| `anthropic` (default) | `ANTHROPIC_API_KEY` | `claude-sonnet-4-5-20250929` | Anthropic |
337337
| `openai` | `OPENAI_API_KEY` | `gpt-5.2` | OpenAI, Ollama, Together, Groq, Azure, etc. |
338-
| `claude-cli` | _(none)_ | `sonnet` | Claude CLI (uses locally authenticated `claude` binary) |
338+
| `claude-cli` | _(none)_ | `sonnet` | Claude CLI (uses locally authenticated `claude` binary) \* |
339+
340+
\* **Accuracy note:** The `claude-cli` provider shells out to the `claude` CLI, which loads local context (CLAUDE.md files, project memory, rules) into each scoring call. This extra context may influence scores, making them less reproducible across environments compared to the API-based providers. For the most consistent results, use the `anthropic` or `openai` providers with an API key.
339341
340342
Use `--model` to override the default model and `--base-url` to point at any OpenAI-compatible endpoint (e.g. `http://localhost:11434/v1` for Ollama). If the endpoint requires a specific token limit parameter, use `--max-tokens-style` to override auto-detection:
341343

examples/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,9 @@ improve the skill content before requesting a human review.
4141
- OpenAI-compatible: `export OPENAI_API_KEY=...` (some endpoints accept a
4242
placeholder) and provide the `--base-url` when prompted.
4343
- Claude CLI: No API key needed — uses the locally authenticated `claude`
44-
binary (e.g. via a company or team subscription).
44+
binary (e.g. via a company or team subscription). Note: scores may be less
45+
consistent than with API-based providers because the CLI loads local context
46+
(CLAUDE.md, memory) into each call.
4547
4. Add `.score_cache/` to your `.gitignore`. LLM scoring caches results inside
4648
each skill directory, and these should not be committed.
4749
5. Ask your agent to review a skill. The skill stores configuration in

examples/review-skill/references/llm-scoring.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,12 @@ installation instructions.
6060
The default model is `sonnet`. The user can specify a different model with the
6161
`--model` flag (e.g. `--model opus`).
6262

63+
**Accuracy note:** The Claude CLI loads local context (CLAUDE.md files, project
64+
memory, rules) into each scoring call. This extra context may influence scores,
65+
making them less reproducible across environments compared to the API-based
66+
providers. For the most consistent results, use the `anthropic` or `openai`
67+
providers with an API key.
68+
6369
### OpenAI-compatible provider
6470

6571
This uses the OpenAI provider with a custom `--base-url`. It supports any

judge/client.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ import (
1616
// that a hanging upstream doesn't block the caller indefinitely.
1717
var defaultHTTPClient = &http.Client{Timeout: 30 * time.Second}
1818

19+
// lookPath is used to locate the claude binary. It is a variable so tests
20+
// can substitute a stub when the real binary is not installed.
21+
var lookPath = exec.LookPath
22+
1923
// LLMClient is the interface for making LLM API calls.
2024
type LLMClient interface {
2125
// Complete sends a system prompt and user content to the LLM and returns the text response.
@@ -52,6 +56,9 @@ func NewClient(opts ClientOptions) (LLMClient, error) {
5256

5357
switch strings.ToLower(opts.Provider) {
5458
case "claude-cli":
59+
if _, err := lookPath("claude"); err != nil {
60+
return nil, fmt.Errorf("claude-cli provider requires the \"claude\" binary: %w", err)
61+
}
5562
model := opts.Model
5663
if model == "" {
5764
model = "sonnet"
@@ -302,7 +309,8 @@ type claudeCLIClient struct {
302309
func (c *claudeCLIClient) Provider() string { return "claude-cli" }
303310
func (c *claudeCLIClient) ModelName() string { return c.model }
304311

305-
func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userContent string) (string, error) {
312+
// buildArgs returns the CLI arguments for a claude invocation.
313+
func (c *claudeCLIClient) buildArgs(systemPrompt, userContent string) []string {
306314
args := []string{
307315
"-p",
308316
"--output-format", "text",
@@ -312,6 +320,11 @@ func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userConten
312320
args = append(args, "--system-prompt", systemPrompt)
313321
}
314322
args = append(args, userContent)
323+
return args
324+
}
325+
326+
func (c *claudeCLIClient) Complete(ctx context.Context, systemPrompt, userContent string) (string, error) {
327+
args := c.buildArgs(systemPrompt, userContent)
315328

316329
cmd := exec.CommandContext(ctx, "claude", args...)
317330
var stdout, stderr bytes.Buffer

judge/client_test.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,28 @@ package judge
22

33
import (
44
"encoding/json"
5+
"fmt"
56
"net/http"
67
"net/http/httptest"
8+
"strings"
79
"testing"
810
)
911

12+
// stubLookPath replaces the package-level lookPath variable for the duration
13+
// of a test and registers a cleanup that restores the original value.
//
// When found is true the stub reports every file as present at
// "/usr/bin/<file>"; when found is false it returns a "not found: <file>"
// error for every lookup.
//
// NOTE(review): this reassigns shared package state, so tests that call it
// presumably must not run with t.Parallel — confirm before parallelising.
14+
func stubLookPath(t *testing.T, found bool) {
15+
t.Helper()
16+
// Capture the current value before overwriting it so the cleanup can put it back.
orig := lookPath
17+
t.Cleanup(func() { lookPath = orig })
18+
if found {
19+
lookPath = func(file string) (string, error) { return "/usr/bin/" + file, nil }
20+
} else {
21+
lookPath = func(file string) (string, error) { return "", fmt.Errorf("not found: %s", file) }
22+
}
23+
}
24+
1025
func TestClaudeCLIClientDefaults(t *testing.T) {
26+
stubLookPath(t, true)
1127
client, err := NewClient(ClientOptions{Provider: "claude-cli"})
1228
if err != nil {
1329
t.Fatalf("NewClient: %v", err)
@@ -21,6 +37,7 @@ func TestClaudeCLIClientDefaults(t *testing.T) {
2137
}
2238

2339
func TestClaudeCLIClientCustomModel(t *testing.T) {
40+
stubLookPath(t, true)
2441
client, err := NewClient(ClientOptions{Provider: "claude-cli", Model: "opus"})
2542
if err != nil {
2643
t.Fatalf("NewClient: %v", err)
@@ -31,6 +48,8 @@ func TestClaudeCLIClientCustomModel(t *testing.T) {
3148
}
3249

3350
func TestClaudeCLINoAPIKeyRequired(t *testing.T) {
51+
stubLookPath(t, true)
52+
3453
// claude-cli should not require an API key
3554
_, err := NewClient(ClientOptions{Provider: "claude-cli"})
3655
if err != nil {
@@ -44,6 +63,48 @@ func TestClaudeCLINoAPIKeyRequired(t *testing.T) {
4463
}
4564
}
4665

66+
// TestClaudeCLIMissingBinary verifies that NewClient rejects the claude-cli
// provider when the (stubbed) binary lookup fails, and that the returned error
// message contains "claude-cli provider requires".
func TestClaudeCLIMissingBinary(t *testing.T) {
67+
stubLookPath(t, false)
68+
69+
_, err := NewClient(ClientOptions{Provider: "claude-cli"})
70+
if err == nil {
71+
t.Fatal("expected error when claude binary is not found")
72+
}
73+
if got := err.Error(); !strings.Contains(got, "claude-cli provider requires") {
74+
t.Errorf("unexpected error message: %s", got)
75+
}
76+
}
77+
78+
// TestClaudeCLIBuildArgs checks the argument list that buildArgs produces for
// a client whose model is "sonnet": the exact argument order when a system
// prompt is supplied, and the omission of --system-prompt when it is empty.
func TestClaudeCLIBuildArgs(t *testing.T) {
79+
c := &claudeCLIClient{model: "sonnet"}
80+
81+
// With a system prompt, every argument and its position is pinned.
t.Run("with system prompt", func(t *testing.T) {
82+
args := c.buildArgs("you are a judge", "score this")
83+
want := []string{"-p", "--output-format", "text", "--model", "sonnet", "--system-prompt", "you are a judge", "score this"}
84+
if len(args) != len(want) {
85+
t.Fatalf("got %d args, want %d: %v", len(args), len(want), args)
86+
}
87+
for i := range want {
88+
if args[i] != want[i] {
89+
t.Errorf("args[%d] = %q, want %q", i, args[i], want[i])
90+
}
91+
}
92+
})
93+
94+
// With an empty system prompt, only the absence of the flag and the
// position of the user content are checked, not the full argument list.
t.Run("without system prompt", func(t *testing.T) {
95+
args := c.buildArgs("", "score this")
96+
for _, a := range args {
97+
if a == "--system-prompt" {
98+
t.Error("--system-prompt should not be present when system prompt is empty")
99+
}
100+
}
101+
// Last arg should be the user content
102+
if args[len(args)-1] != "score this" {
103+
t.Errorf("last arg = %q, want %q", args[len(args)-1], "score this")
104+
}
105+
})
106+
}
107+
47108
func TestUseMaxCompletionTokens(t *testing.T) {
48109
tests := []struct {
49110
model string

judge/example_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ func ExampleNewClient() {
2323
// Provider: anthropic, Model: claude-sonnet-4-5-20250929
2424
}
2525

26+
// ExampleNewClient_claudeCLI demonstrates creating a claude-cli client.
27+
// It intentionally has no Output comment, so "go test" compiles it but does not run it: NewClient returns an error unless the claude binary can be found.
2628
func ExampleNewClient_claudeCLI() {
2729
client, err := judge.NewClient(judge.ClientOptions{
2830
Provider: "claude-cli",
@@ -33,8 +35,6 @@ func ExampleNewClient_claudeCLI() {
3335
}
3436

3537
fmt.Printf("Provider: %s, Model: %s\n", client.Provider(), client.ModelName())
36-
// Output:
37-
// Provider: claude-cli, Model: sonnet
3838
}
3939

4040
func ExampleNewClient_openai() {

0 commit comments

Comments
 (0)