Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions evals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,13 @@ export OPENROUTER_API_KEY="your_key"
export OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"

npm ci
npm run evals:create-dataset # one-time: creates dataset from test-cases.json
npm run evals:create-dataset # one-time: creates dataset from test_cases.json
npm run evals:run # runs evaluation on default dataset (v1.4)
```

### Using a specific dataset version

By default, the evaluation uses the dataset version from `test-cases.json` (`v1.4`). To use a different dataset:
By default, the evaluation uses the dataset version from `test_cases.json` (`v1.4`). To use a different dataset:

```bash
# Create a new dataset with custom name
Expand Down Expand Up @@ -285,4 +285,3 @@ NOTES:
// System prompt - instructions mainly cursor (very similar instructions in copilot)
// https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools/blob/main/Cursor%20Prompts/Agent%20Prompt%20v1.2.txt
// https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools/blob/main/VSCode%20Agent/Prompt.txt

6 changes: 3 additions & 3 deletions evals/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ import log from '@apify/log';
// Re-export shared config
export { OPENROUTER_CONFIG, sanitizeHeaderValue, validateEnvVars, getRequiredEnvVars } from './shared/config.js';

// Read version from test-cases.json
// Read the version from test_cases.json
function getTestCasesVersion(): string {
const dir = dirname(fileURLToPath(import.meta.url));
const raw = readFileSync(join(dir, 'test-cases.json'), 'utf-8');
const raw = readFileSync(join(dir, 'test_cases.json'), 'utf-8');
return JSON.parse(raw).version;
}

Expand All @@ -28,7 +28,7 @@ export type EvaluatorName = typeof EVALUATOR_NAMES[keyof typeof EVALUATOR_NAMES]

// Models to evaluate
// 'openai/gpt-4.1-mini', // DO NOT USE - it has much worse performance than gpt-4o-mini and other models
// 'openai/gpt-4o-mini', // Neither used in cursor nor copilot
// 'openai/gpt-4o-mini', // Neither used in cursor nor copilot
// 'openai/gpt-4.1',
export const MODELS_TO_EVALUATE = [
'anthropic/claude-haiku-4.5',
Expand Down
6 changes: 3 additions & 3 deletions evals/workflows/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ npm run evals:workflow -- --category search
# Run specific test
npm run evals:workflow -- --id search-google-maps

# Filter by line range in test-cases.json
# Filter by line range in test_cases.json
npm run evals:workflow -- --lines 277-283

# Show detailed conversation logs
Expand Down Expand Up @@ -242,7 +242,7 @@ export OPENROUTER_API_KEY="your_openrouter_key" # Get from https://openrouter.ai
| `--id <id>` | | Run specific test by ID | All tests |
| `--lines <range>` | `-l` | Filter by line range in test_cases.json | All tests |
| `--verbose` | | Show detailed conversation logs | `false` |
| `--test-cases-path <path>` | | Custom test cases file path | `test-cases.json` |
| `--test-cases-path <path>` | | Custom test cases file path | `test_cases.json` |
| `--agent-model <model>` | | Override agent model | `anthropic/claude-haiku-4.5` |
| `--judge-model <model>` | | Override judge model | `x-ai/grok-4.1-fast` |
| `--tool-timeout <seconds>` | | Tool call timeout | `60` |
Expand All @@ -252,7 +252,7 @@ export OPENROUTER_API_KEY="your_openrouter_key" # Get from https://openrouter.ai

### Line Range Filtering

The `--lines` (or `-l`) option filters test cases by their line numbers in the `test-cases.json` file.
The `--lines` (or `-l`) option filters test cases by their line numbers in the `test_cases.json` file.

**Format options:**
- **Single line:** `--lines 100` (includes tests that contain line 100)
Expand Down
4 changes: 2 additions & 2 deletions evals/workflows/test_cases_loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ export type WorkflowTestCaseWithLineNumbers = WorkflowTestCase & TestCaseWithLin
* Load workflow test cases from JSON file with validation
*/
export function loadTestCases(filePath?: string): WorkflowTestCase[] {
const testCasesPath = filePath || path.join(process.cwd(), 'evals/workflows/test-cases.json');
const testCasesPath = filePath || path.join(process.cwd(), 'evals/workflows/test_cases.json');

if (!fs.existsSync(testCasesPath)) {
throw new Error(`Test cases file not found: ${testCasesPath}`);
Expand Down Expand Up @@ -89,7 +89,7 @@ export function loadTestCasesWithLineNumbers(filePath?: string): {
testCases: WorkflowTestCaseWithLineNumbers[];
totalLines: number;
} {
const testCasesPath = filePath || path.join(process.cwd(), 'evals/workflows/test-cases.json');
const testCasesPath = filePath || path.join(process.cwd(), 'evals/workflows/test_cases.json');

if (!fs.existsSync(testCasesPath)) {
throw new Error(`Test cases file not found: ${testCasesPath}`);
Expand Down
8 changes: 7 additions & 1 deletion src/mcp/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,14 @@ export function getProxyMCPServerToolName(url: string, toolName: string): string
* @param url The URL to process
* @param apifyClient The Apify client instance
* @param mode Server mode for tool variant resolution
* @param actorStore Optional Actor store, forwarded unchanged to loadToolsFromInput
*/
export async function processParamsGetTools(url: string, apifyClient: ApifyClient, mode: ServerMode, actorStore?: ActorStore) {
export async function processParamsGetTools(
url: string,
apifyClient: ApifyClient,
mode: ServerMode = 'default',
actorStore?: ActorStore,
) {
const input = parseInputParamsFromUrl(url);
return await loadToolsFromInput(input, apifyClient, mode, actorStore);
}
Expand Down
4 changes: 2 additions & 2 deletions src/resources/resource_service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ type ResourceService = {

type ResourceServiceOptions = {
skyfireMode?: boolean;
mode: ServerMode;
mode?: ServerMode;
getAvailableWidgets: () => Map<string, AvailableWidget>;
};

export function createResourceService(options: ResourceServiceOptions): ResourceService {
const { skyfireMode, mode, getAvailableWidgets } = options;
const { skyfireMode, mode = 'default', getAvailableWidgets } = options;

const listResources = async (): Promise<ListResourcesResult> => {
const resources: Resource[] = [];
Expand Down
2 changes: 1 addition & 1 deletion src/tools/categories.ts
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ function resolveCategoryEntries(entries: readonly CategoryToolEntry[], mode: Ser
* @param mode - Server mode: `'default'` or `'openai'`. Falls back to `'default'` when omitted.
* Pass it explicitly wherever serving the wrong-mode tools must be avoided.
*/
export function getCategoryTools(mode: ServerMode): ToolCategoryMap {
export function getCategoryTools(mode: ServerMode = 'default'): ToolCategoryMap {
return Object.fromEntries(
CATEGORY_NAMES.map((name) => [name, resolveCategoryEntries(toolCategories[name], mode)]),
) as ToolCategoryMap;
Expand Down
2 changes: 1 addition & 1 deletion src/tools/common/add_actor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ USAGE EXAMPLES:
const tools = await apifyMcpServer.loadActorsAsTools([parsed.actor], apifyClient);
/**
* If no tools were found, return a message that the Actor was not found
* instead of returning that non existent tool was added since the
* instead of returning that non-existent tool was added since the
* loadActorsAsTools method returns an empty array and does not throw an error.
*/
if (tools.length === 0) {
Expand Down
2 changes: 1 addition & 1 deletion src/tools/core/call_actor_common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ For MCP server Actors, use format "actorName:toolName" to call a specific tool (
.describe('The input JSON to pass to the Actor. Required.'),
async: z.boolean()
.optional()
.describe(`When true: starts the run and returns immediately with runId. When false or not provided: waits for completion and returns results immediately. Default: true when UI mode is enabled (enforced), false otherwise. IMPORTANT: Only set async to true if the user explicitly asks to run the Actor in the background or does not need immediate results. When the user asks for data or results, always use async: false (default) so the results are returned immediately.`),
.describe(`When true, starts the run and returns immediately with runId. When false or omitted, behavior depends on the active server mode/tool variant. IMPORTANT: use async=true only when the user explicitly asks to run in the background or does not need immediate results.`),
previewOutput: z.boolean()
.optional()
.describe('When true (default): includes preview items. When false: metadata only (reduces context). Use when fetching fields via get-actor-output.'),
Expand Down
2 changes: 1 addition & 1 deletion src/tools/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export { CATEGORY_NAME_SET, CATEGORY_NAMES, getCategoryTools, toolCategories, to
* Returns the tool entries for the default-enabled categories resolved for the given mode.
* Computed here (not in helper file) to avoid module initialization issues.
*/
export function getDefaultTools(mode: ServerMode): ToolEntry[] {
export function getDefaultTools(mode: ServerMode = 'default'): ToolEntry[] {
return getExpectedToolsByCategories(toolCategoriesEnabledByDefault, mode);
}

Expand Down
2 changes: 1 addition & 1 deletion src/tools/openai/call_actor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ Do NOT proactively poll using ${HelperTools.ACTOR_RUNS_GET}. Wait for the widget
return buildMCPResponse({
texts: [`Failed to call Actor '${baseActorName}': ${error instanceof Error ? error.message : String(error)}.
Please verify the Actor name, input parameters, and ensure the Actor exists.
You can search for available Actors using the tool: ${HelperTools.STORE_SEARCH}, or get Actor details using: ${HelperTools.ACTOR_GET_DETAILS}.`],
You can search for available Actors using the tool: ${HelperTools.STORE_SEARCH_INTERNAL}, or get Actor details using: ${HelperTools.ACTOR_GET_DETAILS_INTERNAL}.`],
isError: true,
});
}
Expand Down
2 changes: 1 addition & 1 deletion src/utils/server-instructions/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@ const instructionsByMode: Record<ServerMode, () => string> = {
/**
* Build server instructions for the given server mode.
*/
export function getServerInstructions(mode: ServerMode): string {
export function getServerInstructions(mode: ServerMode = 'default'): string {
return instructionsByMode[mode]();
}
2 changes: 1 addition & 1 deletion src/utils/tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,5 +110,5 @@ export function applySkyfireAugmentation(tool: ToolEntry): ToolEntry {
}
}

return cloned;
return Object.freeze(cloned);
}
8 changes: 5 additions & 3 deletions src/utils/tools_loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,13 @@ function getAllInternalToolNames(): Set<string> {
* @param input The processed Input object
* @param apifyClient The Apify client instance
* @param mode Server mode for tool variant resolution
* @param actorStore
* @returns An array of tool entries
*/
export async function loadToolsFromInput(
input: Input,
apifyClient: ApifyClient,
mode: ServerMode,
mode: ServerMode = 'default',
actorStore?: ActorStore,
): Promise<ToolEntry[]> {
// Build mode-resolved categories — tools are already the correct variant for this mode
Expand All @@ -70,6 +71,7 @@ export async function loadToolsFromInput(
const selectorsExplicitEmpty = selectorsProvided && (selectors as string[]).length === 0;
const addActorEnabled = input.enableAddingActors === true;
const actorsExplicitlyEmpty = (Array.isArray(input.actors) && input.actors.length === 0) || input.actors === '';
const explicitlyNoToolsRequested = selectorsExplicitEmpty || actorsExplicitlyEmpty;

// Build mode-specific tool-by-name map for individual tool selection
const modeToolByName = new Map<string, ToolEntry>();
Expand Down Expand Up @@ -157,7 +159,7 @@ export async function loadToolsFromInput(
}

// In openai mode, add UI-specific tools regardless of which selectors were given,
// except when tools or actors were explicitly requested as empty
if (mode === 'openai') {
if (mode === 'openai' && !explicitlyNoToolsRequested) {
result.push(...categories.ui);
}

Expand All @@ -181,7 +183,7 @@ export async function loadToolsFromInput(
const hasGetActorOutput = result.some((entry) => entry.name === HelperTools.ACTOR_OUTPUT_GET);

const toolsToInject: ToolEntry[] = [];
if (!hasGetActorRun && (hasCallActor || mode === 'openai')) {
if (!hasGetActorRun && (hasCallActor || (mode === 'openai' && !explicitlyNoToolsRequested))) {
// Use mode-resolved get-actor-run variant
const modeGetActorRun = modeToolByName.get(HelperTools.ACTOR_RUNS_GET);
if (modeGetActorRun) toolsToInject.push(modeGetActorRun);
Expand Down
10 changes: 10 additions & 0 deletions tests/unit/tools.mode_contract.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ describe('getCategoryTools mode contract (tool-mode separation)', () => {
}
});

describe('mode-specific call-actor behavior guidance', () => {
it('should document that openai call-actor always runs asynchronously', () => {
const openaiCallActor = openaiCategories.actors.find((t) => t.name === HelperTools.ACTOR_CALL);

expect(openaiCallActor).toBeDefined();
expect(openaiCallActor!.description).toContain('always runs asynchronously');
expect(openaiCallActor!.description).toContain('do NOT poll or call any other tool');
});
});

describe('tool definitions are frozen', () => {
for (const mode of SERVER_MODES) {
const categories = getCategoryTools(mode);
Expand Down
2 changes: 2 additions & 0 deletions tests/unit/tools.skyfire.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ describe('applySkyfireAugmentation', () => {
type: 'string',
description: SKYFIRE_PAY_ID_PROPERTY_DESCRIPTION,
});
expect(Object.isFrozen(result)).toBe(true);
});

// Test each SKYFIRE_ENABLED_TOOLS member
Expand All @@ -181,6 +182,7 @@ describe('applySkyfireAugmentation', () => {

const props = result.inputSchema.properties as Record<string, unknown>;
expect(props['skyfire-pay-id']).toBeDefined();
expect(Object.isFrozen(result)).toBe(true);
});
});

Expand Down
38 changes: 38 additions & 0 deletions tests/unit/utils.tools_loader.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import { ApifyClient } from 'apify';
import { describe, expect, it } from 'vitest';

import { HelperTools } from '../../src/const.js';
import { loadToolsFromInput } from '../../src/utils/tools_loader.js';

/**
 * Checks how loadToolsFromInput behaves in 'openai' mode when the caller
 * passes an explicitly empty `tools` or `actors` list, compared with a
 * non-empty selector.
 */
describe('loadToolsFromInput explicit-empty semantics', () => {
// Placeholder token; NOTE(review): assumes these cases never reach the real Apify API — confirm.
const apifyClient = new ApifyClient({ token: 'test-token' });

// An explicitly empty `tools` array must produce an empty result, with no mode-specific extras.
it('should not auto-add openai ui tools when tools are explicitly empty', async () => {
const tools = await loadToolsFromInput({
tools: [],
}, apifyClient, 'openai');

expect(tools).toHaveLength(0);
});

// Same expectation when `actors` is the explicitly empty input.
it('should not auto-add openai ui tools when actors are explicitly empty', async () => {
const tools = await loadToolsFromInput({
actors: [],
}, apifyClient, 'openai');

expect(tools).toHaveLength(0);
});

// With a non-empty selector, the requested docs tools are expected alongside the
// internal store-search / actor-details tools and get-actor-run.
// NOTE(review): presumably the extra tools come from the openai UI category and
// the get-actor-run injection — confirm against loadToolsFromInput.
it('should keep openai ui tools and get-actor-run for non-empty selectors', async () => {
const tools = await loadToolsFromInput({
tools: ['docs'],
}, apifyClient, 'openai');

const toolNames = tools.map((tool) => tool.name);
expect(toolNames).toContain(HelperTools.DOCS_SEARCH);
expect(toolNames).toContain(HelperTools.DOCS_FETCH);
expect(toolNames).toContain(HelperTools.STORE_SEARCH_INTERNAL);
expect(toolNames).toContain(HelperTools.ACTOR_GET_DETAILS_INTERNAL);
expect(toolNames).toContain(HelperTools.ACTOR_RUNS_GET);
});
});