Skip to content

Commit a88ff7b

Browse files
authored
Merge pull request #11 from verisoft-ai/feat/mcp-server
Add MCP server (novawindows-mcp). Exposes the NovaWindows driver as an MCP server, allowing AI agents to automate Windows desktop apps directly via the Model Context Protocol. Provides 39 tools across 8 domains: session lifecycle, app launch/close, element finding, interaction (click, type, scroll, drag), inspection, window management, clipboard, and advanced UIA patterns. Includes a full unit test suite and .mcp.json for zero-config local setup.
2 parents 18d4538 + 5b72f12 commit a88ff7b

29 files changed

+2690
-5
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Claude Code local settings
2+
.claude/
3+
14
# Logs
25
logs
36
*.log

.mcp.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"mcpServers": {
3+
"novawindows-mcp": {
4+
"command": "node",
5+
"args": ["./build/lib/mcp/index.js"],
6+
"env": {}
7+
}
8+
}
9+
}

lib/mcp/config.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/**
 * Infrastructure config — read from env vars at startup.
 * Holds the host and port that loadConfig() resolves for the Appium server.
 */
2+
export interface McpConfig {
3+
/** Appium host; loadConfig() takes it from APPIUM_HOST, falling back to '127.0.0.1'. */
appiumHost: string;
4+
/** Appium port; loadConfig() takes it from APPIUM_PORT (fallback '4723') and requires 1-65535. */
appiumPort: number;
5+
}
6+
7+
/**
 * Build the MCP infrastructure configuration from environment variables.
 *
 * - `APPIUM_HOST`: host of the Appium server (default `127.0.0.1`).
 * - `APPIUM_PORT`: TCP port of the Appium server (default `4723`).
 *
 * @returns The resolved host/port pair.
 * @throws Error if `APPIUM_PORT` is not a plain decimal integer in the range 1-65535.
 */
export function loadConfig(): McpConfig {
  const rawPort = (process.env.APPIUM_PORT ?? '4723').trim();

  // parseInt() on its own silently accepts trailing garbage ('4723abc' -> 4723,
  // '80.5' -> 80), so insist that the whole value is decimal digits first.
  const appiumPort = /^\d+$/.test(rawPort) ? Number(rawPort) : Number.NaN;
  if (Number.isNaN(appiumPort) || appiumPort < 1 || appiumPort > 65535) {
    throw new Error(`APPIUM_PORT must be a valid port number (1-65535), got: '${process.env.APPIUM_PORT}'`);
  }

  return {
    appiumHost: process.env.APPIUM_HOST ?? '127.0.0.1',
    appiumPort,
  };
}

lib/mcp/constants.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/**
 * W3C WebDriver element reference key.
 * NOTE(review): presumably the property name under which element ids appear in
 * WebDriver JSON payloads — confirm against the call sites that import it.
 */
2+
export const ELEMENT_KEY = 'element-6066-11e4-a52e-4f735466cecf';

lib/mcp/errors.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
/**
 * Render an arbitrary thrown value as one human-readable string.
 *
 * - `Error` instances become `"<ConstructorName>: <message>"`.
 * - Other objects are serialized as JSON, because `String()` on a plain object
 *   only yields the uninformative `"[object Object]"`.
 * - Everything else (and objects that cannot be serialized) goes through `String()`.
 *
 * @param err The caught value; may be anything, since JavaScript allows throwing non-Errors.
 * @returns A printable description of `err`.
 */
export function formatError(err: unknown): string {
  if (err instanceof Error) {
    return `${err.constructor.name}: ${err.message}`;
  }

  if (typeof err === 'object' && err !== null) {
    try {
      const json: unknown = JSON.stringify(err);
      // JSON.stringify returns undefined when a toJSON() hook yields nothing.
      if (typeof json === 'string') {
        return json;
      }
    } catch {
      // Circular references or BigInt members: fall through to String().
    }
  }

  return String(err);
}

lib/mcp/index.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/usr/bin/env node
2+
import * as http from 'node:http';
3+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
4+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
5+
import { loadConfig } from './config.js';
6+
import { AppiumSession } from './session.js';
7+
import { registerAllTools } from './tools/index.js';
8+
9+
/**
 * Probe the Appium server by requesting `GET /status`.
 *
 * The returned promise never rejects. It resolves to `true` only when the
 * response body is JSON whose `value.ready` is exactly `true`; connection
 * errors, timeouts, truncated responses and unparsable bodies all resolve to
 * `false`.
 *
 * @param host Host name or IP to connect to.
 * @param port TCP port to connect to.
 * @param timeoutMs Socket inactivity timeout in milliseconds (default 3000).
 * @returns Whether Appium reported itself ready.
 */
function checkAppiumReachable(host: string, port: number, timeoutMs = 3000): Promise<boolean> {
  return new Promise((resolve) => {
    // resolve() may be reached from several events; only the first call has
    // any effect, later ones are no-ops on an already-settled promise.
    const req = http.get(
      { hostname: host, port, path: '/status', timeout: timeoutMs },
      (res) => {
        let body = '';
        // Decode in the stream so multi-byte characters split across chunks survive.
        res.setEncoding('utf8');
        res.on('data', (chunk: string) => { body += chunk; });
        res.on('end', () => {
          try { resolve(JSON.parse(body)?.value?.ready === true); }
          catch { resolve(false); }
        });
        // If the connection drops mid-body, 'end' never fires; without these
        // listeners the promise would stay pending forever.
        res.on('error', () => resolve(false));
        res.on('close', () => resolve(false));
      }
    );
    req.on('error', () => resolve(false));
    req.on('timeout', () => { req.destroy(); resolve(false); });
  });
}
26+
27+
/**
 * Entry point for the novawindows-mcp stdio server.
 *
 * Loads the Appium host/port from the environment and exits with code 1 if the
 * configuration is invalid or Appium does not report ready. Otherwise it
 * registers the tools, installs shutdown handlers and connects the MCP server
 * over stdio. No Appium session is created here; the holder stays empty until
 * a session is created through the registered tools.
 */
async function main(): Promise<void> {
  // Step 1: Load infrastructure config (host, port — no app required)
  let config: ReturnType<typeof loadConfig>;
  try {
    config = loadConfig();
  } catch (err) {
    process.stderr.write(`[MCP] Configuration error: ${err instanceof Error ? err.message : String(err)}\n`);
    process.exit(1);
  }

  // Step 2: Verify Appium is reachable
  const { appiumHost: host, appiumPort: port } = config;
  if (!await checkAppiumReachable(host, port)) {
    process.stderr.write(
      `[MCP] Appium is not running on ${host}:${port}.\n` +
      `Start it first with: appium --port ${port}\n`
    );
    process.exit(1);
  }
  process.stderr.write(`[MCP] Appium detected on ${host}:${port}\n`);

  // Step 3: Create session holder (no app launched yet — agent calls create_session)
  const session = new AppiumSession(config);

  // Step 4: Create and configure MCP server
  const server = new McpServer({
    name: 'novawindows-mcp',
    version: '1.3.0',
  });

  // Step 5: Register all tools (including create_session / delete_session)
  registerAllTools(server, session);

  // Step 6: Shutdown handler. Several triggers can fire for one exit
  // (e.g. SIGINT followed by stdin closing), so only the first is honoured.
  let shuttingDown = false;
  async function shutdown(reason: string): Promise<void> {
    if (shuttingDown) {return;}
    shuttingDown = true;
    process.stderr.write(`[MCP] Shutting down (${reason})...\n`);

    if (session.isActive()) {
      // Give session teardown at most 10 seconds before exiting anyway.
      await Promise.race([
        session.delete(),
        new Promise<void>((resolve) => setTimeout(resolve, 10_000)),
      ]);
    }

    process.exit(0);
  }

  // `void` marks these as deliberate fire-and-forget calls: shutdown() ends
  // the process itself, so there is nothing to await.
  process.on('SIGINT', () => { void shutdown('SIGINT'); });
  process.on('SIGTERM', () => { void shutdown('SIGTERM'); });
  process.stdin.on('end', () => { void shutdown('stdin closed'); });

  // Step 7: Connect transport (stdout is owned by MCP protocol — all logs go to stderr)
  const transport = new StdioServerTransport();
  await server.connect(transport);
  process.stderr.write('[MCP] novawindows-mcp server ready. Call create_session to launch an app.\n');
}

// Anything main() does not handle itself (for example a failure while
// connecting the transport) is reported on stderr and turned into exit code 1
// instead of being left as an unhandled promise rejection.
main().catch((err: unknown) => {
  const detail = err instanceof Error ? (err.stack ?? err.message) : String(err);
  process.stderr.write(`[MCP] Fatal error: ${detail}\n`);
  process.exit(1);
});

lib/mcp/session.ts

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import { remote } from 'webdriverio';
2+
import type { Browser } from 'webdriverio';
3+
import type { McpConfig } from './config.js';
4+
5+
/**
 * Session parameters provided by the agent via the create_session tool.
 * AppiumSession.create() forwards each field except `implicitTimeout` as the
 * `appium:`-prefixed capability of the same name.
 */
6+
export interface SessionParams {
7+
/** Required; sent as the `appium:app` capability and echoed in the creation log line. */
app: string;
8+
/** Optional; sent as `appium:appArguments` when provided. */
appArguments?: string;
9+
/** Optional; sent as `appium:appWorkingDir` when provided. */
appWorkingDir?: string;
10+
/** Optional; sent as `appium:waitForAppLaunch` when provided. */
waitForAppLaunch?: number;
11+
/** Optional; sent as `appium:shouldCloseApp` when provided. */
shouldCloseApp?: boolean;
12+
/** Not a capability: passed to the driver's setTimeout() as the `implicit` value after the session is created. */
implicitTimeout?: number;
13+
/** Optional; sent as `appium:delayAfterClick` when provided. */
delayAfterClick?: number;
14+
/** Optional; sent as `appium:delayBeforeClick` when provided. */
delayBeforeClick?: number;
15+
/** Optional; sent as `appium:smoothPointerMove` when provided. */
smoothPointerMove?: string;
16+
}
17+
18+
/**
 * Holder for at most one WebdriverIO session against the configured Appium
 * server. Starts empty; `create()` fills it and `delete()` empties it again.
 */
export class AppiumSession {
  private driver: Browser | null = null;

  constructor(private readonly appiumConfig: McpConfig) {}

  /**
   * Start a new session for `params.app`.
   *
   * Optional parameters are only sent as capabilities when the caller supplied
   * them. The implicit timeout is applied only when `params.implicitTimeout`
   * is defined, and if applying it fails the freshly created session is
   * deleted again so the holder is not left active after a failed create.
   *
   * @param params Application and optional capability values.
   * @throws Error if a session is already active, or if session creation or
   *   timeout configuration fails.
   */
  async create(params: SessionParams): Promise<void> {
    if (this.driver) {
      throw new Error('A session is already active. Call delete_session first.');
    }

    process.stderr.write(`[MCP] Creating Appium session for app: ${params.app}\n`);

    const caps: Record<string, unknown> = {
      platformName: 'Windows',
      'appium:automationName': 'DesktopDriver',
      'appium:app': params.app,
    };

    // Each entry becomes `appium:<name>`; undefined values are left out so the
    // server-side defaults apply.
    const optionalCaps: Record<string, unknown> = {
      appArguments: params.appArguments,
      appWorkingDir: params.appWorkingDir,
      waitForAppLaunch: params.waitForAppLaunch,
      shouldCloseApp: params.shouldCloseApp,
      delayAfterClick: params.delayAfterClick,
      delayBeforeClick: params.delayBeforeClick,
      smoothPointerMove: params.smoothPointerMove,
    };
    for (const [name, value] of Object.entries(optionalCaps)) {
      if (value !== undefined) {caps[`appium:${name}`] = value;}
    }

    this.driver = await remote({
      hostname: this.appiumConfig.appiumHost,
      port: this.appiumConfig.appiumPort,
      path: '/',
      capabilities: caps as WebdriverIO.Capabilities,
    });

    // setTimeout({ implicit: undefined }) is rejected by WebdriverIO, so the
    // call is skipped entirely when no implicit timeout was requested.
    if (params.implicitTimeout !== undefined) {
      try {
        await this.driver.setTimeout({ implicit: params.implicitTimeout });
      } catch (err) {
        // Tear the session down so a retry does not hit "already active".
        await this.delete();
        throw err;
      }
    }
    process.stderr.write('[MCP] Session created successfully\n');
  }

  /**
   * End the active session, if any. Never throws: a failed delete is logged
   * to stderr as a warning and the holder is cleared regardless.
   */
  async delete(): Promise<void> {
    if (!this.driver) {return;}
    try {
      await this.driver.deleteSession();
      process.stderr.write('[MCP] Session deleted\n');
    } catch (err) {
      process.stderr.write(`[MCP] Warning: session delete failed: ${err}\n`);
    } finally {
      this.driver = null;
    }
  }

  /** @returns `true` while a driver is held. */
  isActive(): boolean {
    return this.driver !== null;
  }

  /**
   * @returns The active WebdriverIO driver.
   * @throws Error if no session has been created.
   */
  getDriver(): Browser {
    if (!this.driver) {throw new Error('No active session. Call create_session first.');}
    return this.driver;
  }
}

lib/mcp/tools/advanced.ts

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2+
import { z } from 'zod';
3+
import type { AppiumSession } from '../session.js';
4+
import { formatError } from '../errors.js';
5+
6+
/** Shared zod schema for the `modifierKeys` tool input: an array of 'shift' | 'ctrl' | 'alt' | 'win' that defaults to an empty array. */
const modifierKeys = z.array(z.enum(['shift', 'ctrl', 'alt', 'win'])).default([]);
7+
8+
/**
 * Run one `windows: …` script command on the active session and convert the
 * outcome into an MCP tool result.
 *
 * Never throws: any failure (including "no active session" from
 * `session.getDriver()`) is returned as an `isError` result whose text is
 * produced by `formatError`.
 *
 * @param session Session holder that supplies the driver.
 * @param command Script name passed to `executeScript`, e.g. `'windows: click'`.
 * @param args Validated tool arguments, forwarded unchanged as the single script argument.
 * @param successText Text returned to the client when the command succeeds.
 */
async function runWindowsCommand(session: AppiumSession, command: string, args: object, successText: string) {
  try {
    const driver = session.getDriver();
    await driver.executeScript(command, [args]);
    return { content: [{ type: 'text' as const, text: successText }] };
  } catch (err) {
    return { isError: true, content: [{ type: 'text' as const, text: formatError(err) }] };
  }
}

/**
 * Register the advanced input tools on the MCP server: `advanced_click`,
 * `send_keys`, `hover`, `scroll` and `click_and_drag`.
 *
 * Each tool forwards its validated arguments unchanged to the matching
 * `windows: …` script command of the active session.
 *
 * @param server MCP server to register the tools on.
 * @param session Session holder used to obtain the driver at call time.
 */
export function registerAdvancedTools(server: McpServer, session: AppiumSession): void {
  server.registerTool(
    'advanced_click',
    {
      description: 'Perform an advanced click with modifier keys, multiple clicks, or custom duration. Use this for right-click, Ctrl+click, double-click, etc.',
      annotations: { destructiveHint: false },
      inputSchema: {
        elementId: z.string().optional().describe('Element to click (its center). Provide either elementId or x+y.'),
        x: z.number().int().optional().describe('Absolute screen x coordinate'),
        y: z.number().int().optional().describe('Absolute screen y coordinate'),
        button: z.enum(['left', 'right', 'middle', 'back', 'forward']).default('left'),
        modifierKeys,
        durationMs: z.number().int().min(0).default(0).describe('Hold duration in ms (for long-press)'),
        times: z.number().int().min(1).default(1).describe('Number of clicks (2 = double-click)'),
        interClickDelayMs: z.number().int().min(0).default(100),
      },
    },
    (args) => runWindowsCommand(session, 'windows: click', args, 'clicked')
  );

  server.registerTool(
    'send_keys',
    {
      description: 'Send keyboard input. Each action can be a pause (ms delay), text to type, or a virtual key code press/release.',
      annotations: { destructiveHint: false },
      inputSchema: {
        actions: z.array(z.object({
          pause: z.number().int().optional().describe('Pause in milliseconds'),
          text: z.string().optional().describe('Text to type (unicode supported)'),
          virtualKeyCode: z.number().int().optional().describe('Windows Virtual Key code (e.g. 13 = Enter, 27 = Escape)'),
          down: z.boolean().optional().describe('true = key down only, false = key up only, omit = press and release'),
        })).describe('Sequence of keyboard actions to perform'),
        forceUnicode: z.boolean().default(false).describe('Use Unicode input method for special characters'),
      },
    },
    (args) => runWindowsCommand(session, 'windows: keys', args, 'keys sent')
  );

  server.registerTool(
    'hover',
    {
      description: 'Move the mouse pointer from one position to another, optionally with modifier keys held. Useful for hover effects and drag-without-click.',
      inputSchema: {
        startElementId: z.string().optional().describe('Element to start hover from (uses element center)'),
        startX: z.number().int().optional(),
        startY: z.number().int().optional(),
        endElementId: z.string().optional().describe('Element to hover to'),
        endX: z.number().int().optional(),
        endY: z.number().int().optional(),
        modifierKeys,
        durationMs: z.number().int().min(0).default(500).describe('Duration of the hover movement in ms'),
      },
    },
    (args) => runWindowsCommand(session, 'windows: hover', args, 'hovered')
  );

  server.registerTool(
    'scroll',
    {
      description: 'Scroll the mouse wheel at an element or screen coordinate.',
      inputSchema: {
        elementId: z.string().optional().describe('Element to scroll over (uses element center)'),
        x: z.number().int().optional().describe('Absolute screen x coordinate'),
        y: z.number().int().optional().describe('Absolute screen y coordinate'),
        deltaX: z.number().int().default(0).describe('Horizontal scroll amount (positive = right)'),
        deltaY: z.number().int().default(0).describe('Vertical scroll amount (positive = down)'),
        modifierKeys,
      },
    },
    (args) => runWindowsCommand(session, 'windows: scroll', args, 'scrolled')
  );

  server.registerTool(
    'click_and_drag',
    {
      description: 'Click and drag from one position to another. Useful for resizing, reordering, or moving elements.',
      inputSchema: {
        startElementId: z.string().optional().describe('Element to start drag from'),
        startX: z.number().int().optional(),
        startY: z.number().int().optional(),
        endElementId: z.string().optional().describe('Element to drag to'),
        endX: z.number().int().optional(),
        endY: z.number().int().optional(),
        modifierKeys,
        durationMs: z.number().int().min(0).default(500),
        button: z.enum(['left', 'right', 'middle']).default('left'),
      },
    },
    (args) => runWindowsCommand(session, 'windows: clickAndDrag', args, 'drag completed')
  );
}

0 commit comments

Comments
 (0)