diff --git a/.gitignore b/.gitignore index 6b3d49e..0c4e131 100644 --- a/.gitignore +++ b/.gitignore @@ -62,6 +62,7 @@ BenchmarkDotNet.Artifacts/ project.lock.json project.fragment.lock.json artifacts/ +.env # ASP.NET Scaffolding ScaffoldingReadMe.txt @@ -344,3 +345,9 @@ MigrationBackup/ # Fody - auto-generated XML schema FodyWeavers.xsd Output/ + +# Repo-local tool caches and workspace metadata +.claude/ +.dotnet-cli/ +.playwright-cli/ +output/playwright/ diff --git a/README.md b/README.md index 27290b2..2718eaf 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,7 @@ Modern Windows 11-style system tray companion that connects to your local OpenCl - 🚀 **Auto-start** - Launch with Windows - ⚙️ **Settings** - Full configuration dialog - 🎯 **First-run experience** - Welcome dialog guides new users +- Voice Mode **Voice Mode (new)** - Talk to your Claw via your Windows node #### Quick Send scope requirement @@ -122,13 +123,14 @@ If Quick Send fails with `pairing required` / `NOT_PAIRED`, that is a **device a ### Menu Sections - **Status** - Gateway connection status with click-to-view details +- **Voice** - Access to Voice controls - **Sessions** - Active agent sessions with preview and per-session controls - **Usage** - Provider/cost summary with quick jump to activity details - **Channels** - Telegram/WhatsApp status with toggle control - **Nodes** - Online/offline node inventory and copyable summary - **Recent Activity** - Timestamped event stream for sessions, usage, nodes, and notifications - **Actions** - Dashboard, Web Chat, Quick Send, Activity Stream, History -- **Settings** - Configuration, auto-start, logs +- **Settings** - Configuration, auto-start, logs, voice ### Mac Parity Status @@ -148,6 +150,7 @@ Comparing against [openclaw-menubar](https://github.com/magimetal/openclaw-menub | Refresh | ✅ | ✅ | Auto-refresh on menu open | | Launch at Login | ✅ | ✅ | | | Notifications toggle | ✅ | ✅ | | +| Voice Mode | ✅ | 🟡 | Talk Mode implemented (half-duplex), 
WakeWord, Interrupt, etc. in progress ### Windows-Only Features @@ -281,6 +284,14 @@ OpenClaw registers the `openclaw://` URL scheme for automation and integration: Deep links work even when Molty is already running - they're forwarded via IPC. +### Voice Mode +*contributed by NichUK and his colleagues @codex and @copilot* + +Currently supports Talk Mode - Always on talk to your Claw! Wakeword and PTT modes coming soon +- Uses internal Windows STT (cloud providers coming soon) +- Windows/Minimax/Eleven Labs TTS voices + - Give your Claw a voice! + ## 📦 OpenClaw.CommandPalette PowerToys Command Palette extension for quick OpenClaw access. diff --git a/docs/VOICE-MODE.md b/docs/VOICE-MODE.md new file mode 100644 index 0000000..87d6011 --- /dev/null +++ b/docs/VOICE-MODE.md @@ -0,0 +1,988 @@ +# Voice Mode Architecture +*Author: Nich Overend (NichUK@GitHub) - with @codex and @copilot* +https://github.com/openclaw/openclaw-windows-node + + +This document defines the voice subsystem for the Windows node only. It introduces the command surface, persisted settings schema, and minimum runtime boundaries needed to add Windows voice support without reshaping the existing node architecture. 
+
+## Goals
+
+- Add a node-local voice mode with two activation modes: `VoiceWake` and `TalkMode`
+- Utilise minimal touch points to the existing app to reduce the potential for screw-ups
+- Use NanoWakeWord for wakeword detection on-device
+- Present the user-facing mode names as `Voice Wake` and `Talk Mode`
+- Keep STT/TTS provider selection configurable, with Windows implementations as the default built-in baseline
+- Implement `MiniMax` TTS and `ElevenLabs` TTS as required non-Windows providers after the Windows baseline
+- Make adding new voice providers an update to a JSON catalog, rather than requiring code changes where possible
+- Reuse the existing node capability pattern instead of introducing a parallel control path
+- Ensure that the voice sub-system is extensible
+- Ensure that the voice sub-system is controllable from other applications
+
+## Non-Goals
+
+- True full-duplex or chunk-streaming audio transport between node and gateway
+- Substantial changes to the existing project
+
+## Design Position
+
+The Windows node should own device-local audio concerns:
+
+- microphone capture
+- wakeword detection
+- silence detection / utterance segmentation
+- speaker playback
+- device enumeration and persisted local settings
+
+OpenClaw remains responsible for conversation/session routing and upstream voice orchestration.
+
+This keeps the Windows node lean for the first implementation and avoids introducing provider-routing settings before they are needed.
+
+## Visible Mode Names
+
+The tray app now uses user-facing names (borrowed from the macOS app) rather than exposing the internal enum names directly:
+
+| Internal Mode | Visible Name | Availability |
+|---|---|---|
+| `Off` | Off | available |
+| `VoiceWake` | Voice Wake | visible but disabled for now |
+| `TalkMode` | Talk Mode | available |
+
+The contracts and persisted settings now use `VoiceWake` and `TalkMode` as well. 
+ +## Transport Boundary + +`TalkMode` follows the current talk-mode style control flow: + +- the node captures audio locally +- local speech recognition turns that audio into transcript text on the active STT route +- interim hypotheses are surfaced live, but only final `Medium` or `High` confidence recognizer results are submitted +- if speech activity ends without any usable final transcript surviving, Talk Mode now clears the draft and gives a short local repeat prompt instead of silently doing nothing +- the compact voice repeater window, when open, shows the live transcript draft plus local sent/received turns in a single scrolling surface +- the tray chat window, when open, mirrors the live transcript draft into the compose box only +- the finalized transcript is always sent to OpenClaw via direct `chat.send` on the voice mode target session, which is currently hardcoded in the tray app to `agent:main:main` +- OpenClaw returns the assistant reply as normal chat output +- the node performs local or remote TTS playback of that reply +- assistant replies are queued locally and spoken sequentially, with a short (500 ms currently) pause between queued replies so overlapping responses are not lost +- if a reply arrives after the normal 45-second wait timeout, the tray still accepts and speaks that late reply for a short bounded grace window (currently 120s) so slow upstream responses are not silently lost +- assistant replies are currently accepted from either `agent:main:main` or the `main` alias so the tray can tolerate upstream session-key normalisation differences + +To avoid obvious duplicate sends from the Windows recognizer, exact duplicate final transcripts are suppressed within a short 750 ms window. + +The current Windows implementation uses a voice-local operator connection inside the tray app while node mode is active. 
That connection carries assistant chat events for `TalkMode`, while the recognized transcript is always sent through the tray app's direct `chat.send` path. + +## Voice APIs + +The Windows tray implementation now has two API layers: + +- shared node-capability commands in `OpenClaw.Shared` +- in-process tray interfaces used by the windows/forms + +### Shared Capability Commands + +The node capability command surface is: + +- `voice.devices.list` +- `voice.settings.get` +- `voice.settings.set` +- `voice.status.get` +- `voice.start` +- `voice.stop` +- `voice.pause` +- `voice.resume` +- `voice.response.skip` + +These commands are defined in [VoiceModeSchema.cs](../src/OpenClaw.Shared/VoiceModeSchema.cs) and handled by [VoiceCapability.cs](../src/OpenClaw.Shared/Capabilities/VoiceCapability.cs). + +`voice.settings.get` / `voice.settings.set` are the configuration API. + +`voice.start` / `voice.stop` / `voice.pause` / `voice.resume` / `voice.response.skip` are the runtime control API. + +### Status Surface + +`VoiceStatusInfo` now carries the basic state needed by control surfaces: + +- mode +- runtime state +- session key +- input/output device ids +- last wake / last utterance timestamps +- pending reply count +- whether a reply can currently be skipped +- current reply preview +- last error + +### In-Process Tray Interfaces + +The tray app also exposes in-process interfaces so its own windows do not need to bind directly to the concrete `VoiceService` implementation: + +- `IVoiceConfigurationApi` + - get voice settings + - update voice settings + - list devices + - get provider catalog + - get/set provider configuration +- `IVoiceRuntimeControlApi` + - get runtime status + - start / stop + - pause / resume + - skip current reply +- `IVoiceRuntime` + - transcript draft and conversation events for chat integration + +This now powers multiple tray-local voice surfaces, including the compact voice repeater window. + +### Can the Settings Form Use This API? + +Yes. 
The Settings form can use the configuration API cleanly. + +The current tray implementation now uses the voice configuration interface for: + +- provider catalog loading +- device enumeration +- applying updated voice settings / provider configuration on save + +That means the settings UI is no longer hard-wired only to concrete `VoiceService` internals for its voice-specific behavior. + +## Speech Output Implementation + +In order to reduce output latency as much as possible, the current Windows implementation has made the following implementation decisions: + +- the Windows `SpeechSynthesizer` is created once per `TalkMode` runtime and reused for subsequent replies + - Frankly, no one will probably use it, but everyone has it, so... +- cloud TTS uses a shared static `HttpClient`, so HTTP/TLS connections can be reused across replies +- cloud requests use `ResponseHeadersRead`, which lets the client observe response-header arrival without waiting for full buffering first +- the tray app now logs per-reply synthesis timings for both Windows and cloud TTS paths so latency can be measured directly during testing + +The main remaining gap is streaming playback from the first audio chunk. Best practice recommends chunked playback as soon as the first audio arrives, but the current implementation still waits for a complete playable stream before starting output (but not for long...): + +- Windows `SpeechSynthesizer` is used through `SynthesizeTextToStreamAsync`, which returns a complete stream for playback +- MiniMax now uses the provider catalog's WebSocket TTS contract, but the current player still waits for a complete playable stream before output starts +- ElevenLabs now uses the provider catalog's `stream-input` WebSocket contract, but the current player still waits for a complete playable stream before output starts + +So the current design minimizes avoidable setup and connection latency, but does not yet implement first-chunk playback streaming. 
This is, however, planned for an early release (I'm working on it next).
+
+## Tray Chat Integration Decision
+
+Ideally Voice mode and typed chat should remain part of the same user-visible conversation in the web chat UI; however, this proved difficult to achieve, as the gateway treated a message stream from the tray app separately to that from the WebUI, even with the same session key.
+
+The only way of achieving this vaguely reliably seemed to be to locally insert messages into the DOM, but as this was a brittle, hacky solution, it was discarded.
+
+### Chosen Approach
+
+It was therefore decided to create a separate *voice repeater form* to serve as a message window for voice, as well as making the messages available via toasts.
+
+The tray app keeps a tray-local interim transcript buffer for the current utterance, independent of whether any chat window or voice repeater form is open.
+
+## Provider Selection
+
+Voice settings now carry explicit provider ids for both STT and TTS:
+
+- `Voice.SpeechToTextProviderId`
+- `Voice.TextToSpeechProviderId`
+
+The built-in default for both is `windows`. 
+ +Runtime behavior in the current phase: + +- `windows` is implemented for both STT and TTS +- the `windows` STT route is a pure `Windows.Media.SpeechRecognition.SpeechRecognizer` path with no `AudioGraph` dependency +- `windows` STT is currently treated as `half-duplex, non-streamed` +- `http/ws` is now catalogued as a visible "coming soon" STT slot for generic streaming HTTP/WebSocket adapters +- built-in catalog entries exist for both `minimax` and `elevenlabs` TTS +- `minimax` defaults to `speech-2.8-turbo` and `English_MatureBoss` at present +- `minimax` now uses a catalog-driven WebSocket contract for synchronous TTS +- `elevenlabs` defaults to `eleven_multilingual_v2` and voice id `6aDn1KB0hjpdcocrUkmq (Tiffany)` for now +- only currently usable providers are selectable in Settings +- `sherpa-onnx` is visible but greyed out as a coming-soon local embedded route +- unsupported providers fall back to Windows at runtime with a status warning + +### Settings Surface Notes + +The Settings panel now shows short inline descriptions for: + +- the selected voice mode +- the selected speech-to-text provider +- the selected text-to-speech provider + +Those provider descriptions are drawn directly from the provider catalog. + +When `Windows Speech Recognition` is selected for STT, the Settings panel now forces both audio device pickers back to the system defaults and greys them out. That matches the current Windows route limitation and avoids advertising per-device microphone routing that does not exist on this route yet. + +### Provider Catalog + +The provider catalog now ships with the tray app as a bundled asset: + +- `Assets\\voice-providers.json` + +Example: + +```json +{ + "speechToTextProviders": [ + { + "id": "windows", + "name": "Windows Speech Recognition", + "runtime": "windows", + "enabled": true, + "description": "Built-in Windows.Media speech recognition, half-duplex, non-streamed." 
+ }, + { + "id": "http-ws", + "name": "http/ws", + "runtime": "streaming", + "enabled": false, + "visibleInSettings": true, + "selectable": false, + "description": "Will support most cloud and local stand-alone models full or half-duplex, streaming." + }, + ], + "textToSpeechProviders": [ + { + "id": "windows", + "name": "Windows Speech Synthesis", + "runtime": "windows", + "enabled": true, + "description": "Built-in Windows text-to-speech playback." + }, + { + "id": "minimax", + "name": "MiniMax", + "runtime": "cloud", + "enabled": true, + "description": "Cloud TTS using the MiniMax WebSocket text-to-speech API.", + "settings": [ + { "key": "apiKey", "label": "API key", "secret": true }, + { + "key": "model", + "label": "Model", + "defaultValue": "speech-2.8-turbo", + "options": [ + "speech-2.5-turbo-preview", + "speech-02-turbo", + "speech-02-hd", + "speech-2.6-turbo", + "speech-2.6-hd", + "speech-2.8-turbo", + "speech-2.8-hd" + ] + }, + { "key": "voiceId", "label": "Voice ID", "defaultValue": "English_MatureBoss" }, + { + "key": "voiceSettingsJson", + "label": "Voice settings JSON", + "defaultValue": "\"voice_setting\": { \"voice_id\": {{voiceId}}, \"speed\": 1, \"vol\": 1, \"pitch\": 0 }", + "placeholder": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }" + } + ], + "textToSpeechWebSocket": { + "endpointTemplate": "wss://api.minimax.io/ws/v1/t2a_v2", + "authenticationHeaderName": "Authorization", + "authenticationScheme": "Bearer", + "apiKeySettingKey": "apiKey", + "connectSuccessEventName": "connected_success", + "startMessageTemplate": "{ \"event\": \"task_start\", \"model\": {{model}}, \"language_boost\": \"English\", {{voiceSettingsJson}}, \"audio_setting\": { \"sample_rate\": 32000, \"bitrate\": 128000, \"format\": \"mp3\", \"channel\": 1 } }", + "startSuccessEventName": "task_started", + "continueMessageTemplate": "{ \"event\": \"task_continue\", \"text\": {{text}} }", + "finishMessageTemplate": "{ 
\"event\": \"task_finish\" }", + "responseAudioMode": "hexJsonString", + "responseAudioJsonPath": "data.audio", + "responseStatusCodeJsonPath": "base_resp.status_code", + "responseStatusMessageJsonPath": "base_resp.status_msg", + "finalFlagJsonPath": "is_final", + "taskFailedEventName": "task_failed", + "successStatusValue": "0", + "outputContentType": "audio/mpeg" + } + }, + { + "id": "elevenlabs", + "name": "ElevenLabs", + "runtime": "cloud", + "enabled": true, + "description": "Cloud TTS using the ElevenLabs WebSocket stream-input API.", + "settings": [ + { "key": "apiKey", "label": "API key", "secret": true }, + { + "key": "model", + "label": "Model", + "defaultValue": "eleven_multilingual_v2", + "options": [ + "eleven_flash_v2_5", + "eleven_turbo_v2_5", + "eleven_multilingual_v2", + "eleven_monolingual_v1" + ] + }, + { "key": "voiceId", "label": "Voice ID", "defaultValue": "6aDn1KB0hjpdcocrUkmq", "placeholder": "Enter an ElevenLabs voice ID" }, + { + "key": "voiceSettingsJson", + "label": "Voice settings JSON", + "defaultValue": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }", + "placeholder": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }" + } + ], + "textToSpeechWebSocket": { + "endpointTemplate": "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true", + "authenticationHeaderName": "xi-api-key", + "authenticationScheme": "", + "apiKeySettingKey": "apiKey", + "connectSuccessEventName": "", + "startMessageTemplate": "{ \"text\": \" \", {{voiceSettingsJson}}, \"xi_api_key\": {{apiKey}} }", + "startSuccessEventName": "", + "continueMessageTemplate": "{ \"text\": {{textWithTrailingSpace}}, \"try_trigger_generation\": true }", + "finishMessageTemplate": "{ \"text\": \"\" }", + "responseAudioMode": "base64JsonString", + "responseAudioJsonPath": "audio", + "finalFlagJsonPath": "isFinal", + 
"taskFailedEventName": "error", + "outputContentType": "audio/mpeg" + } + } + ] +} +``` + +For cloud-backed TTS providers, the catalog carries either an HTTP or WebSocket request/response contract. That allows a new provider to be added by shipping an updated catalog file with the app, as long as it follows the same general templated transport approach. + +This file defines provider metadata and transport contracts. It does not carry API keys, these are stored with the standard config. + +### Local Provider Configuration + +That means the current design is: + +- local tray settings choose the preferred STT/TTS provider ids +- provider API keys and editable values are stored in `%APPDATA%\\OpenClawTray\\settings.json` under `VoiceProviderConfiguration` +- OpenClaw remains the conversation endpoint for `chat.send` +- the shipped provider catalog remains metadata-only and must not contain secrets + +This is an intentional short-term design choice so the Windows tray app can use cloud TTS providers without inventing a second catalog file for secrets. It can be revisited later if provider ownership is split differently. + +Current configuration values are keyed by provider id. The built-in providers use: + +- `apiKey` +- `model` +- `voiceId` +- `voiceSettingsJson` + +When the selected TTS provider in Settings is not `windows`, the tray app shows provider-specific fields in the configuration form so the user can enter or edit: + +- API key +- model +- voice id +- voice settings JSON + +If a provider setting definition includes an `options` list, the settings UI renders that setting as a drop-down instead of a free-text field. That is how built-in cloud providers expose a provider-level choice plus a separate model choice without recompilation. + +If a provider setting definition is marked as JSON, the value is inserted into the provider request template as a raw JSON fragment rather than a quoted string. 
That allows the provider catalog to define whether the user is entering: + +- a bare object +- or a full keyed fragment such as `"voice_setting": { ... }` + +without hard-coding provider-specific wrapper keys into the runtime. + +The current cloud TTS transports are: + +- `MiniMax`: catalog-driven WebSocket synthesis +- `ElevenLabs`: catalog-driven WebSocket synthesis (`stream-input`) + +For `VoiceWake`, trigger words are gateway-owned global state. The Windows node should eventually consume the same shared trigger list and keep only a local enabled/disabled toggle plus device/runtime settings. + +In-flight voice controls are supported, if supported by the chosen provider and provided in their format, although an abstraction/translation layer is being considered, to accompany support for OpenClaw voice directives in replies records. +Pronunciation dictionaries are also only currently supported directly on the voice provider, however a centralised dictionary is possible, and a proposal is being considered. + +## Command Surface + +The voice subsystem is introduced as a new node capability category: `voice`. 
+ +### Commands + +| Command | Purpose | Request Payload | Response Payload | +|---|---|---|---| +| `voice.devices.list` | Enumerate input/output audio devices | none | `VoiceAudioDeviceInfo[]` | +| `voice.settings.get` | Return the effective voice configuration | none | `VoiceSettings` | +| `voice.settings.set` | Update the voice configuration | `VoiceSettingsUpdateArgs` | `VoiceSettings` | +| `voice.status.get` | Return runtime voice status | none | `VoiceStatusInfo` | +| `voice.start` | Start the voice runtime with the supplied or persisted mode | `VoiceStartArgs` | `VoiceStatusInfo` | +| `voice.stop` | Stop the voice runtime | `VoiceStopArgs` | `VoiceStatusInfo` | +| `voice.pause` | Pause the active voice runtime | `VoicePauseArgs` | `VoiceStatusInfo` | +| `voice.resume` | Resume a paused voice runtime | `VoiceResumeArgs` | `VoiceStatusInfo` | +| `voice.response.skip` | Skip the currently spoken reply and advance the queue if another reply is pending | `VoiceSkipArgs` | `VoiceStatusInfo` | + +### Payload Types + +- `VoiceSettings` +- `VoiceWakeSettings` +- `TalkModeSettings` +- `VoiceAudioDeviceInfo` +- `VoiceStatusInfo` +- `VoiceStartArgs` +- `VoiceStopArgs` +- `VoicePauseArgs` +- `VoiceResumeArgs` +- `VoiceSkipArgs` +- `VoiceSettingsUpdateArgs` + +These contracts are defined in [VoiceModeSchema.cs](../src/OpenClaw.Shared/VoiceModeSchema.cs). + +## Settings Schema + +Voice settings are persisted as `SettingsData.Voice` in [SettingsData.cs](../src/OpenClaw.Shared/SettingsData.cs). +Provider configuration is persisted as `SettingsData.VoiceProviderConfiguration` in the same local settings file. +The compact repeater window state is persisted as `SettingsData.VoiceRepeaterWindow` in the same settings file. + +The editable voice configuration now lives in the main Settings window. +The tray `Voice Mode` window is a read-only runtime status/detail surface with a shortcut back into Settings. 
+ +### Voice Repeater Window Settings + +The compact repeater persists its own local UI state in `SettingsData.VoiceRepeaterWindow`: + +| Setting | Type | Default | Meaning | +|---|---|---|---| +| `VoiceRepeaterWindow.AutoScroll` | bool | `true` | Automatically scroll the transcript surface to the latest draft/reply | +| `VoiceRepeaterWindow.FloatingEnabled` | bool | `true` | Keep the repeater floating above other windows | +| `VoiceRepeaterWindow.TextSize` | double | `13` | Repeater transcript font size | +| `VoiceRepeaterWindow.HasSavedPlacement` | bool | `false` | Whether a user placement has been persisted yet | +| `VoiceRepeaterWindow.Width` | int? | `null` | Saved repeater width | +| `VoiceRepeaterWindow.Height` | int? | `null` | Saved repeater height | +| `VoiceRepeaterWindow.X` | int? | `null` | Saved repeater screen X coordinate | +| `VoiceRepeaterWindow.Y` | int? | `null` | Saved repeater screen Y coordinate | + +### Effective Schema + +```json +{ + "Voice": { + "Mode": "VoiceWake", + "Enabled": true, + "ShowRepeaterAtStartup": true, + "SpeechToTextProviderId": "windows", + "TextToSpeechProviderId": "windows", + "InputDeviceId": "default-mic", + "OutputDeviceId": "default-speaker", + "SampleRateHz": 16000, + "CaptureChunkMs": 80, + "BargeInEnabled": true, + "VoiceWake": { + "Engine": "NanoWakeWord", + "ModelId": "hey_openclaw", + "TriggerThreshold": 0.65, + "TriggerCooldownMs": 2000, + "PreRollMs": 1200, + "EndSilenceMs": 900 + }, + "TalkMode": { + "MinSpeechMs": 250, + "EndSilenceMs": 900, + "MaxUtteranceMs": 15000 + } + }, + "VoiceProviderConfiguration": { + "Providers": [ + { + "ProviderId": "minimax", + "Values": { + "apiKey": "", + "model": "speech-2.8-turbo", + "voiceId": "English_MatureBoss", + "voiceSettingsJson": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }" + } + }, + { + "ProviderId": "elevenlabs", + "Values": { + "apiKey": "", + "model": "eleven_multilingual_v2", + "voiceId": "voice-id", 
+ "voiceSettingsJson": "\"voice_settings\": { \"stability\": 0.5, \"similarity_boost\": 0.8 }" + } + } + ] + } +} +``` + +### Field Rationale + +| Field | Purpose | +|---|---| +| `Mode` | Top-level activation mode: `Off`, `VoiceWake`, `TalkMode` | +| `Enabled` | Global feature kill-switch independent of mode | +| `ShowRepeaterAtStartup` | Opens the compact Voice Mode repeater automatically when the app starts with voice mode active | +| `SpeechToTextProviderId` | Selected STT provider id from the local provider catalog | +| `TextToSpeechProviderId` | Selected TTS provider id from the local provider catalog | +| `InputDeviceId` / `OutputDeviceId` | Preferred audio device binding, with selected-speaker support implemented first | +| `SampleRateHz` | Shared capture sample rate, fixed to a speech-friendly default | +| `CaptureChunkMs` | Frame size for capture, VAD, and wakeword processing | +| `BargeInEnabled` | Allows microphone capture while audio playback is active | +| `VoiceWake.*` | NanoWakeWord and post-trigger utterance capture tuning | +| `TalkMode.*` | Continuous-listening segmentation tuning | + +### Complete Settings Definition + +| Setting | Type | Default | Applies To | Meaning | +|---|---|---|---|---| +| `Voice.Mode` | enum | `Off` | all | Activation mode: `Off`, `VoiceWake`, `TalkMode` | +| `Voice.Enabled` | bool | `false` | all | Master enable/disable flag for voice mode | +| `Voice.ShowRepeaterAtStartup` | bool | `true` | all | If `true`, the compact Voice Mode repeater opens automatically when the app starts with voice mode active | +| `Voice.SpeechToTextProviderId` | string | `windows` | all | Preferred speech-to-text provider id | +| `Voice.TextToSpeechProviderId` | string | `windows` | all | Preferred text-to-speech provider id | +| `Voice.InputDeviceId` | string? | `null` | all | Preferred microphone device id; `null` means system default | +| `Voice.OutputDeviceId` | string? 
| `null` | all | Preferred speaker device id; `null` means system default | +| `Voice.SampleRateHz` | int | `16000` | all | Internal capture rate used for wakeword, VAD, and utterance assembly | +| `Voice.CaptureChunkMs` | int | `80` | all | Audio frame duration used by the capture loop | +| `Voice.BargeInEnabled` | bool | `true` | all | If `true`, microphone capture may continue while response audio is playing | +| `Voice.VoiceWake.Engine` | string | `NanoWakeWord` | voice wake | Voice Wake engine identifier | +| `Voice.VoiceWake.ModelId` | string | `hey_openclaw` | voice wake | Voice Wake model/profile identifier | +| `Voice.VoiceWake.TriggerThreshold` | float | `0.65` | voice wake | Minimum score required to trigger Voice Wake activation | +| `Voice.VoiceWake.TriggerCooldownMs` | int | `2000` | voice wake | Minimum delay before another Voice Wake trigger is accepted | +| `Voice.VoiceWake.PreRollMs` | int | `1200` | voice wake | Buffered audio retained before the trigger point | +| `Voice.VoiceWake.EndSilenceMs` | int | `900` | voice wake | Silence timeout used to finalize the post-trigger utterance | +| `Voice.TalkMode.MinSpeechMs` | int | `250` | talk mode | Minimum detected speech duration before an utterance is treated as real input | +| `Voice.TalkMode.EndSilenceMs` | int | `900` | talk mode | Silence timeout used to finalize an utterance | +| `Voice.TalkMode.MaxUtteranceMs` | int | `15000` | talk mode | Hard cap on utterance length before forced submission/finalization | +| `VoiceProviderConfiguration.Providers[].ProviderId` | string | none | cloud providers | Provider id matching an `Assets\\voice-providers.json` entry | +| `VoiceProviderConfiguration.Providers[].Values["apiKey"]` | string? | `null` | cloud providers | API key sent using the provider contract's configured auth header | +| `VoiceProviderConfiguration.Providers[].Values["model"]` | string? 
| provider default | cloud providers | Model identifier inserted into the configured request template | +| `VoiceProviderConfiguration.Providers[].Values["voiceId"]` | string? | provider default | cloud providers | Voice id inserted into the configured request template or URL | +| `VoiceProviderConfiguration.Providers[].Values["voiceSettingsJson"]` | string? | provider default | cloud providers | Raw JSON fragment inserted into the configured request template; may be a keyed fragment like `"voice_setting": { ... }` | + +At runtime today: + +- `Voice.OutputDeviceId` is applied to Talk Mode playback through `MediaPlayer.AudioDevice` +- `VoiceCaptureService` now runs an `AudioGraph` capture pipeline in parallel with Talk Mode and binds it to the selected or default microphone device +- `Voice.InputDeviceId` is now used by that `AudioGraph` capture path, but transcript generation still uses the Windows default speech input path until the STT adapter migration is complete +- Talk Mode only advertises `ListeningContinuously` after the capture graph has produced live frames and the recognizer warm-up window has elapsed, so the status acts as a real “you can start talking now” signal instead of a timer-only guess +- recognizer recovery is now speech-triggered rather than silence-triggered: the Windows recognizer is only recycled when sustained capture speech is present but no recognition activity follows +- when a recognizer session ends after real hypothesis activity but before a final result arrives, Talk Mode now promotes the last recent hypothesis and submits it instead of dropping the utterance +- the speech-mismatch recovery watchdog is single-owner and only armed from capture speech, so a new recognition session does not spawn overlapping recovery loops +- when the system default capture device changes and Talk Mode is using the default mic, the recognizer is rebuilt so device switches such as AirPods are picked up without a full app restart +- explicit non-default 
microphone transcript generation is still pending the planned STT adapter migration + +## Current Runtime Architecture + +The current Windows implementation is still centred on `VoiceService`, with a few supporting seams around it: + +- `VoiceCapability` + exposes shared `voice.*` commands to the node/gateway surface +- `VoiceCaptureService` + owns the new `AudioGraph` capture backbone, selected/default microphone binding, and live signal detection +- `VoiceService` + owns Talk Mode runtime state, recognizer/TTS integration, reply queuing, timeouts, gateway reply handling, and the transition layer between `AudioGraph` capture and the current recognizer-owned STT path +- `VoiceChatCoordinator` + mirrors interim transcript drafts and conversation turns into attached tray windows without making any window part of the transport path +- `OpenClawGatewayClient` + carries direct `chat.send`, final chat events, and the `sessions.preview` fallback path for bare final markers +- `WebChatWindow` + mirrors live transcript drafts into the WebChat compose box +- `VoiceRepeaterWindow` + is the compact local transcript/reply/control surface for Talk Mode + +### Current End-to-End Talk Mode + +```mermaid +flowchart LR + A["User speech"] --> B["VoiceCaptureService
AudioGraph on selected/default mic"] + A --> C["Windows SpeechRecognizer
continuous dictation on current default mic"] + + B --> D["FrameCaptured / SignalDetected"] + D --> E["VoiceService
capture-backed health + device state"] + + C --> F["HypothesisGenerated
interim text"] + F --> G["VoiceService
draft event"] + G --> H["VoiceChatCoordinator"] + H --> I["WebChatWindow
compose-box mirror only"] + H --> I2["VoiceRepeaterWindow
compact local draft surface"] + + C --> J["ResultGenerated
final Medium/High text"] + J --> K["VoiceService
duplicate guard + late hypothesis promotion"] + K --> L["Stop recognition session"] + L --> M["OpenClawGatewayClient.SendChatMessageAsync
direct chat.send(agent:main:main, transcript)"] + M --> N["OpenClaw / session pipeline"] + K --> H2["VoiceChatCoordinator
outgoing turn event"] + H2 --> I2 + N --> O["Chat final event"] + O --> P{"assistant text present?"} + P -- "yes" --> Q["assistant text"] + P -- "no" --> R["sessions.preview fallback
with stale-preview retry guard"] + R --> Q + Q --> H3["VoiceChatCoordinator
incoming turn event"] + H3 --> I2 + + Q --> S["VoiceService reply queue"] + S --> T{"TTS provider"} + T -- "windows" --> U["SpeechSynthesizer"] + T -- "cloud" --> V["VoiceCloudTextToSpeechClient
MiniMax websocket or other provider"] + U --> W["Complete playable stream"] + V --> W + W --> X["MediaPlayer
selected OutputDeviceId if set"] + X --> Y["Speaker / headset output"] + Y --> Z["Resume recognition when queue drains"] +``` + +### Current Processing Stages + +| Stage | Component | Input | Output | +|---|---|---|---| +| 1 | `VoiceCaptureService` | selected/default microphone device | continuous frame and signal events from `AudioGraph` | +| 2 | `SpeechRecognizer` | Windows default speech-input path | interim/final transcript text | +| 3 | `VoiceService` | capture signal + final transcript text | health/restart decisions, de-duplicated transcript, runtime state changes | +| 4 | `VoiceChatCoordinator` | draft and conversation-turn events | mirrored draft for WebChat plus compact local transcript/reply updates | +| 5 | `OpenClawGatewayClient` | transcript text + session key | `chat.send` request + assistant reply events | +| 6 | `OpenClawGatewayClient` preview fallback | bare final chat marker | assistant preview text, guarded against stale replay | +| 7 | `VoiceService` reply queue | assistant reply text | ordered reply playback work | +| 8 | `VoiceCloudTextToSpeechClient` / `SpeechSynthesizer` | assistant reply text | complete playable audio stream | +| 9 | `MediaPlayer` | complete playable audio stream | rendered audio on default or selected speaker | + +## Planned AudioGraph Input Architecture + +The next input-phase refactor will move microphone ownership away from `SpeechRecognizer` and into an explicit capture pipeline built around `AudioGraph`. + +The purpose of that change is to unlock: + +- true selected non-default microphone support +- streaming rather than utterance-owned capture +- a proper ring buffer and VAD pipeline +- future non-Windows and streaming STT providers +- future barge-in / full-duplex work + +### Target Input Stack + +```mermaid +flowchart TD + A["Selected microphone device id
or system default mic"] --> B["VoiceCaptureService
AudioGraph input node"] + B --> C["PCM frame stream
fixed chunk duration"] + C --> D["Ring buffer
bounded pre-roll"] + C --> E["VoiceActivityDetector"] + C --> F["VoiceWake engine
later"] + C --> G["SpeechToText adapter"] + E --> H["UtteranceAssembler
for non-streaming STT adapters"] + D --> H + H --> G + G --> I["Transcript events
interim + final"] + I --> J["VoiceService / runtime controller"] + J --> K["OpenClawGatewayClient
chat.send + reply events"] +``` + +### Proposed Seams + +The target split should look like this: + +- `VoiceCaptureService` + - owns `AudioGraph` + - binds to an explicit input device id when one is selected + - emits continuous PCM frames +- `IVoiceActivityDetector` + - emits speech / silence transitions from frame data +- `IUtteranceAssembler` + - builds bounded utterances from frames for non-streaming STT backends +- `ISpeechToTextAdapter` + - consumes either live frames or completed utterances + - emits interim and final transcript events +- `VoiceService` + - remains the runtime orchestrator rather than the owner of low-level capture + +## Selected-Device Roadmap + +The current selected-device position is now: + +- selected non-default speaker: implemented +- selected/default microphone binding for `SpeechRecognizer` capture: implemented +- selected non-default microphone for actual transcript generation: not implemented yet (requires `AudioGraph` support) + +## Control Flow + +```mermaid +sequenceDiagram + participant Gateway as Gateway / Operator + participant VoiceCap as VoiceCapability + participant Coord as VoiceService + participant Store as SettingsData.Voice + + Gateway->>VoiceCap: voice.settings.get + VoiceCap-->>Gateway: VoiceSettings + + Gateway->>VoiceCap: voice.settings.set(settings, persist=true) + VoiceCap->>Store: save VoiceSettings + VoiceCap-->>Gateway: VoiceSettings + +Gateway->>VoiceCap: voice.start(mode=TalkMode, sessionKey=...) + VoiceCap->>Coord: Start(VoiceStartArgs) +Coord-->>VoiceCap: VoiceStatusInfo(state=ListeningContinuously) + VoiceCap-->>Gateway: VoiceStatusInfo + + Gateway->>VoiceCap: voice.status.get + VoiceCap-->>Gateway: VoiceStatusInfo + + Gateway->>VoiceCap: voice.pause(reason=...) + VoiceCap->>Coord: Pause() + Coord-->>VoiceCap: VoiceStatusInfo(state=Paused) + VoiceCap-->>Gateway: VoiceStatusInfo + + Gateway->>VoiceCap: voice.resume(reason=...) 
+ VoiceCap->>Coord: Resume() + Coord-->>VoiceCap: VoiceStatusInfo(state=ListeningContinuously) + VoiceCap-->>Gateway: VoiceStatusInfo + + Gateway->>VoiceCap: voice.response.skip(reason=...) + VoiceCap->>Coord: SkipCurrentReply() + Coord-->>VoiceCap: VoiceStatusInfo + VoiceCap-->>Gateway: VoiceStatusInfo + + Gateway->>VoiceCap: voice.stop(reason=...) + VoiceCap->>Coord: Stop() + Coord-->>VoiceCap: VoiceStatusInfo(state=Stopped) + VoiceCap-->>Gateway: VoiceStatusInfo +``` + +## Integration Boundaries + +### Existing Components Reused + +- `NodeService` remains the capability registration and lifecycle owner +- `SettingsData` remains the persisted JSON settings model +- `WindowsNodeClient` remains the gateway/node transport +- existing node capability registration remains the integration pattern +- current request/response transport remains the v1 control plane + +### Supporting Components In Current Use + +- `VoiceCapability` in `OpenClaw.Shared.Capabilities` +- `VoiceCaptureService` in `OpenClaw.Tray.WinUI.Services` +- `VoiceChatCoordinator` in `OpenClaw.Tray.WinUI.Services` +- `VoiceRepeaterWindow` in `OpenClaw.Tray.WinUI.Windows` +- `WebChatWindow` in `OpenClaw.Tray.WinUI.Windows` + +### Components Still Expected Later + +- `VoiceWakeService` in `OpenClaw.Tray.WinUI.Services` +- a dedicated `VoicePlaybackService` seam when playback is split out of `VoiceService` + +## Parity with macOS Node + +Status values used below: + +- `Supported` +- `Partial` +- `NotSupported (planned)` +- `Exceeded*` + +| macOS feature | Current Windows state | Notes | +|---|---|---| +| Talk Mode continuous loop (`listen -> chat.send(main) -> wait -> speak`) | `Supported` | Windows Talk Mode uses direct `chat.send` on the tray voice target session (`agent:main:main` today, while still accepting the `main` alias on replies) and loops back to listening after reply playback. 
| +| Talk Mode sends after a short silence window | `Supported` | The current runtime finalizes on recognition pause and uses configurable Talk Mode silence settings. | +| Talk Mode visible phase transitions (`Listening -> Thinking -> Speaking`) | `Partial` | Runtime states, tray icon changes, and the compact voice repeater window exist, but there is no always-visible overlay yet. | +| Talk Mode always-on overlay with click-to-stop / click-X controls | `NotSupported (planned)` | Windows currently has a tray icon, a manually-opened compact repeater window, and WebChat draft mirroring, but no always-on overlay surface. | +| Talk Mode writes replies into WebChat the same way typed chat does | `Partial` | Replies appear in WebChat through normal session updates, but Talk Mode uses direct send rather than a same-as-typing transport path. | +| Talk Mode interrupt-on-speech / barge-in | `NotSupported (planned)` | Windows is still half-duplex during reply playback. | +| Talk Mode voice directives in replies | `NotSupported (planned)` | Windows does not yet parse or apply the JSON voice directive line described in the Talk Mode docs. | +| Talk Mode true streaming TTS playback | `NotSupported (planned)` | MiniMax uses WebSocket transport, but playback still waits for a complete playable stream. | +| Talk Mode cloud TTS provider flexibility | `Exceeded` | Windows already supports Windows built-in TTS plus catalog-driven cloud providers rather than being limited to a single provider path. This exceeds the documented macOS baseline on provider flexibility, but not yet on true streaming playback latency because incremental playback is still pending. | +| Voice Wake wake-word runtime | `NotSupported (planned)` | `VoiceWake` remains a documented target mode, but there is no active wake-word runtime yet. | +| Voice Wake push-to-talk capture | `NotSupported (planned)` | There is no Windows push-to-talk path yet. 
| +| Voice Wake overlay with committed / volatile transcript states | `NotSupported (planned)` | No Voice Wake overlay exists on Windows yet. | +| Voice Wake restart invariants when UI is dismissed | `NotSupported (planned)` | The macOS overlay-dismiss resilience behavior has no Windows equivalent yet because the overlay/runtime does not exist. | +| Voice Wake forwarding to the active gateway / agent | `NotSupported (planned)` | Forwarding semantics are only implemented for Talk Mode today. | +| Voice Wake machine-hint transcript prefixing | `NotSupported (planned)` | Windows does not currently prepend a machine hint on forwarded wake transcripts. | +| Voice Wake mic picker, live level meter, trigger-word table, and tester | `NotSupported (planned)` | Windows has general voice settings and device lists, but not the Voice Wake-specific settings surface from macOS. | +| Voice mic device selection | `Partial` | When `Windows Speech Recognition` is selected, Settings now locks both audio device pickers to the system defaults. Explicit per-device transcription routing remains a future AudioGraph/streaming-route feature. | +| Voice Wake send / trigger chimes | `NotSupported (planned)` | Windows currently has no configurable trigger/send sounds. | + +## Feature List - Backlog - Not in Order, except maybe the first two ;) + +### Story: Streaming STT Capture Pipeline + +Implement `AudioGraph` to create an extensible streaming speech input pipeline, rather than the current self-contained `Windows.Media.SpeechRecognizer` pipeline. + +This will allow us to mix/match components, and reduce latency. 
+ +- Will support Cloud or Local http/ws providers (including Microsoft Foundry Local/OpenAI Whisper/etc) +- Will support Embedded sherpa-onnx engine for user-defined/downloaded models +- This will enable selection of best of class model for required use/language + +### Story: True streaming TTS playback + +Start speaking assistant replies from the first usable audio chunk instead of waiting for a complete playable stream. + +Notes: + +- the current implementation uses WebSocket transport for MiniMax, but still buffers the entire audio response before playback begins +- `firstChunk=...ms` in the log is currently provider-chunk arrival time, not actual speech-start time +- implement a playback path that can consume incremental audio data as it arrives from the provider +- the provider catalog contract should remain transport-driven and provider-agnostic, so streaming behavior should be expressed through the existing TTS contract model rather than hard-coded for MiniMax +- preserve the existing queued reply behavior, skip support, and late-reply handling while switching playback to progressive output +- add timing logs that separate `firstChunk`, `playbackStart`, and `playbackEnd` so latency improvements are measurable + +### Story: True selected-microphone transcription support + +Make actual STT transcription follow the selected microphone device, not just the default device. + +- depends on `AudioGraph` support + + +### Story: Talk Mode overlay and visible phase parity + +Add a Talk Mode overlay that makes `Listening`, `Thinking`, and `Speaking` visible to the user in the same way the macOS experience does. Probably via the current voice mode form. I haven't actually seen the macOS version, so not sure how they do it. + + +### Story: Talk Mode overlay controls + +Add explicit Talk Mode overlay controls for stopping speech playback and exiting Talk Mode. 
+ +Notes: + +- macOS exposes click-to-stop and click-to-exit controls directly on the overlay +- Windows currently requires tray or settings interaction instead +- this should plug into the shared runtime control API rather than directly manipulating `VoiceService` + + +### Story: Voice directives in replies + +Support the Talk Mode reply-prefix JSON directive described in the OpenClaw docs. + +Notes: + +- parse only the first non-empty reply line +- strip the directive before playback +- support per-reply `once: true` and persistent default updates +- supported keys should at least include voice, model, and the documented voice-shaping parameters +- provider-specific validation should happen through the provider contract layer where possible + +### Story: Foundry Local STT provider + +Implement the AudioGraph-fed streaming STT adapter for Foundry Local. + +Notes: + +- provider metadata now lives in the provider catalog, but it should stay disabled in settings until the runtime adapter exists +- this route should use the shared streaming STT path rather than the Windows.Media recognizer path +- endpoint and model selection should come from the provider catalog settings contract + +### Story: OpenAI Whisper STT provider + +Implement the AudioGraph-fed streaming STT adapter for OpenAI Whisper transcription. + +Notes: + +- this should be catalog-driven and disabled in settings until the adapter is production-ready +- the initial implementation only needs the basic transcription path, not translation or diarization +- API key and model configuration should come from the provider catalog + +### Story: ElevenLabs Speech to Text provider + +Implement the AudioGraph-fed streaming STT adapter for ElevenLabs speech-to-text. 
+ +Notes: + +- keep it catalog-driven and disabled in settings until the runtime path is implemented +- match the same route abstraction used by the other non-Windows STT providers +- any provider-specific partial/final transcript semantics should be normalized in the adapter layer + +### Story: Azure AI Speech STT provider + +Implement the AudioGraph-fed streaming STT adapter for Azure AI Speech. + +Notes: + +- use the official Azure AI Speech naming in settings and docs rather than an internal "Foundry Azure STT" label +- keep the provider catalog entry disabled until the adapter is functional end to end +- endpoint and credential handling should come from the provider settings contract + +### Story: sherpa-onnx embedded STT provider + +Implement the local embedded sherpa-onnx STT route for user-supplied model bundles. + +Notes: + +- keep this visible but greyed out in settings until the embedded runtime is implemented +- the user should be able to choose their own downloaded model bundle and language-appropriate package +- model lifecycle, validation, and error reporting should be handled in the embedded adapter rather than in the Windows.Media route + + +### Story: Full-duplex / barge-in Talk Mode + +Allow the node to keep listening while it is speaking, so the user can interrupt or interleave speech without waiting for reply playback to finish. 
+ +Notes: + +- the current Windows implementation is half-duplex: recognition is stopped or ignored while a reply is being spoken +- practical requirements are likely to include: + - microphone capture that can remain active during playback + - acoustic echo cancellation / echo suppression + - barge-in detection and playback interruption rules + - a policy for whether interrupt speech cancels the current reply or queues behind it + - additional runtime control/status so the UI can show when barge-in is armed +- this should be treated as a separate engineering phase, not a small extension of the current Talk Mode runtime + +### Story: Voice Wake wake-word runtime + +Implement the actual Windows Voice Wake runtime. + +Notes: + +- this should cover wake-word listening, trigger detection, post-trigger capture, silence finalization, hard-stop protection, and debounce between sessions +- the runtime should restart cleanly after send and should remain armed whenever Voice Wake is enabled and permissions are available +- the implementation should be based on the planned `AudioGraph` capture pipeline rather than a second unrelated microphone stack + +### Story: Voice Wake push-to-talk + +Implement a Windows push-to-talk capture path alongside wake-word activation. + +Notes: + +- this should support press-to-capture, release-to-finalize semantics +- it should pause the wake runtime while push-to-talk capture is active, then resume it cleanly afterward +- Windows-specific hotkey and permissions behavior should be documented explicitly once chosen + +### Story: Voice Wake settings parity + +Add the user-facing Voice Wake settings surface that exists on macOS. 
+ +Notes: + +- include language and mic pickers +- include a live level meter +- include trigger-word editing or table management +- include a local-only tester that does not forward +- preserve the chosen mic if it disconnects, surface a disconnected hint, and fall back to the system default until it returns + +### Story: Voice Wake sounds and chimes + +Add configurable trigger and send sounds for Voice Wake. + +Notes: + +- trigger and send events should be independently configurable +- support `No Sound` +- keep the sound implementation distinct from assistant reply playback + +### Story: Voice Wake forwarding semantics + +Implement the documented Voice Wake forwarding behavior. + +Notes: + +- forwarded transcripts should go to the active gateway / agent path +- reply delivery and logging behavior should match the rest of the node session model +- the forwarding path should be resilient even when UI surfaces are closed + +### Story: Voice Wake machine-hint prefixing + +Implement the documented transcript prefixing / machine-hint behavior for forwarded Voice Wake utterances. + +Notes: + +- the prefixing rule should be explicit and testable +- both wake-word and push-to-talk paths should share the same forwarding helper + +### Story: Voice Wake trigger tuning and pause semantics + +Implement the documented Voice Wake trigger-gap, silence-window, hard-stop, and debounce semantics. 
+ +Notes: + +- include the wake-word gap behavior before command capture begins +- support distinct silence windows for trigger-only vs flowing speech cases +- include a hard maximum capture duration +- expose the tuning through voice settings rather than hard-coded constants alone + diff --git a/moltbot-windows-hub.slnx b/moltbot-windows-hub.slnx index 79eaf12..b83139f 100644 --- a/moltbot-windows-hub.slnx +++ b/moltbot-windows-hub.slnx @@ -5,6 +5,7 @@ + diff --git a/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs b/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs new file mode 100644 index 0000000..728b8fd --- /dev/null +++ b/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs @@ -0,0 +1,248 @@ +using System; +using System.Collections.Generic; +using System.Text.Json; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Capabilities; + +public class VoiceCapability : NodeCapabilityBase +{ + private const string LegacySkipCommand = "voice.skip"; + + private static readonly JsonSerializerOptions s_jsonOptions = new() + { + PropertyNameCaseInsensitive = true + }; + + public override string Category => "voice"; + + public override IReadOnlyList Commands => VoiceCommands.All; + + public event Func>? ListDevicesRequested; + public event Func>? SettingsRequested; + public event Func>? SettingsUpdateRequested; + public event Func>? StatusRequested; + public event Func>? StartRequested; + public event Func>? StopRequested; + public event Func>? PauseRequested; + public event Func>? ResumeRequested; + public event Func>? 
SkipRequested; + + public VoiceCapability(IOpenClawLogger logger) : base(logger) + { + } + + public override async Task ExecuteAsync(NodeInvokeRequest request) + { + return request.Command switch + { + VoiceCommands.ListDevices => await HandleListDevicesAsync(), + VoiceCommands.GetSettings => await HandleGetSettingsAsync(), + VoiceCommands.SetSettings => await HandleSetSettingsAsync(request), + VoiceCommands.GetStatus => await HandleGetStatusAsync(), + VoiceCommands.Start => await HandleStartAsync(request), + VoiceCommands.Stop => await HandleStopAsync(request), + VoiceCommands.Pause => await HandlePauseAsync(request), + VoiceCommands.Resume => await HandleResumeAsync(request), + VoiceCommands.Skip or LegacySkipCommand => await HandleSkipAsync(request), + _ => Error($"Unknown command: {request.Command}") + }; + } + + private async Task HandleListDevicesAsync() + { + Logger.Info(VoiceCommands.ListDevices); + + if (ListDevicesRequested == null) + return Error("Voice device enumeration not available"); + + try + { + return Success(await ListDevicesRequested()); + } + catch (Exception ex) + { + Logger.Error("Voice device enumeration failed", ex); + return Error($"Device enumeration failed: {ex.Message}"); + } + } + + private async Task HandleGetSettingsAsync() + { + Logger.Info(VoiceCommands.GetSettings); + + if (SettingsRequested == null) + return Error("Voice settings not available"); + + try + { + return Success(await SettingsRequested()); + } + catch (Exception ex) + { + Logger.Error("Voice settings get failed", ex); + return Error($"Get settings failed: {ex.Message}"); + } + } + + private async Task HandleSetSettingsAsync(NodeInvokeRequest request) + { + Logger.Info(VoiceCommands.SetSettings); + + if (SettingsUpdateRequested == null) + return Error("Voice settings update not available"); + + try + { + var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null + ? "{}" + : request.Args.GetRawText(); + VoiceSettingsUpdateArgs? 
update = null; + if (request.Args.ValueKind == JsonValueKind.Object && + request.Args.TryGetProperty("update", out var updateEl)) + { + update = JsonSerializer.Deserialize(updateEl.GetRawText(), s_jsonOptions); + } + + update ??= JsonSerializer.Deserialize(rawArgs, s_jsonOptions); + + if (update == null) + return Error("Missing update payload"); + + return Success(await SettingsUpdateRequested(update)); + } + catch (Exception ex) + { + Logger.Error("Voice settings update failed", ex); + return Error($"Set settings failed: {ex.Message}"); + } + } + + private async Task HandleGetStatusAsync() + { + Logger.Info(VoiceCommands.GetStatus); + + if (StatusRequested == null) + return Error("Voice status not available"); + + try + { + return Success(await StatusRequested()); + } + catch (Exception ex) + { + Logger.Error("Voice status get failed", ex); + return Error($"Get status failed: {ex.Message}"); + } + } + + private async Task HandleStartAsync(NodeInvokeRequest request) + { + Logger.Info(VoiceCommands.Start); + + if (StartRequested == null) + return Error("Voice start not available"); + + try + { + var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null + ? "{}" + : request.Args.GetRawText(); + var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? new VoiceStartArgs(); + return Success(await StartRequested(args)); + } + catch (Exception ex) + { + Logger.Error("Voice start failed", ex); + return Error($"Start failed: {ex.Message}"); + } + } + + private async Task HandleStopAsync(NodeInvokeRequest request) + { + Logger.Info(VoiceCommands.Stop); + + if (StopRequested == null) + return Error("Voice stop not available"); + + try + { + var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null + ? "{}" + : request.Args.GetRawText(); + var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? 
new VoiceStopArgs(); + return Success(await StopRequested(args)); + } + catch (Exception ex) + { + Logger.Error("Voice stop failed", ex); + return Error($"Stop failed: {ex.Message}"); + } + } + + private async Task HandlePauseAsync(NodeInvokeRequest request) + { + Logger.Info(VoiceCommands.Pause); + + if (PauseRequested == null) + return Error("Voice pause not available"); + + try + { + var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null + ? "{}" + : request.Args.GetRawText(); + var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? new VoicePauseArgs(); + return Success(await PauseRequested(args)); + } + catch (Exception ex) + { + Logger.Error("Voice pause failed", ex); + return Error($"Pause failed: {ex.Message}"); + } + } + + private async Task HandleResumeAsync(NodeInvokeRequest request) + { + Logger.Info(VoiceCommands.Resume); + + if (ResumeRequested == null) + return Error("Voice resume not available"); + + try + { + var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null + ? "{}" + : request.Args.GetRawText(); + var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? new VoiceResumeArgs(); + return Success(await ResumeRequested(args)); + } + catch (Exception ex) + { + Logger.Error("Voice resume failed", ex); + return Error($"Resume failed: {ex.Message}"); + } + } + + private async Task HandleSkipAsync(NodeInvokeRequest request) + { + Logger.Info(VoiceCommands.Skip); + + if (SkipRequested == null) + return Error("Voice skip not available"); + + try + { + var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null + ? "{}" + : request.Args.GetRawText(); + var args = JsonSerializer.Deserialize(rawArgs, s_jsonOptions) ?? 
new VoiceSkipArgs(); + return Success(await SkipRequested(args)); + } + catch (Exception ex) + { + Logger.Error("Voice skip failed", ex); + return Error($"Skip failed: {ex.Message}"); + } + } +} diff --git a/src/OpenClaw.Shared/Models.cs b/src/OpenClaw.Shared/Models.cs index 15b1afb..a6feecc 100644 --- a/src/OpenClaw.Shared/Models.cs +++ b/src/OpenClaw.Shared/Models.cs @@ -88,6 +88,14 @@ public class OpenClawNotification public string[]? Tags { get; set; } // free-form routing tags } +public class ChatMessageEventArgs : EventArgs +{ + public string SessionKey { get; set; } = "main"; + public string Role { get; set; } = ""; + public string Message { get; set; } = ""; + public bool IsFinal { get; set; } +} + /// /// A user-defined notification categorization rule. /// diff --git a/src/OpenClaw.Shared/OpenClawGatewayClient.cs b/src/OpenClaw.Shared/OpenClawGatewayClient.cs index 05189f5..2fec9e8 100644 --- a/src/OpenClaw.Shared/OpenClawGatewayClient.cs +++ b/src/OpenClaw.Shared/OpenClawGatewayClient.cs @@ -41,8 +41,11 @@ private enum SignatureTokenMode private GatewayUsageStatusInfo? _usageStatus; private GatewayCostUsageInfo? 
_usageCost; private readonly Dictionary _pendingRequestMethods = new(); + private readonly Dictionary _pendingChatPreviewSessionKeys = new(); + private readonly Dictionary _lastAssistantMessagesBySession = new(); private readonly Dictionary> _pendingChatSendRequests = new(); private readonly object _pendingRequestLock = new(); + private readonly object _pendingChatPreviewLock = new(); private readonly object _pendingChatSendLock = new(); private readonly object _sessionsLock = new(); private readonly object _nodesLock = new(); @@ -58,11 +61,19 @@ private enum SignatureTokenMode private bool _usageCostUnsupported; private bool _sessionPreviewUnsupported; private bool _nodeListUnsupported; + private string _defaultChatSessionKey = DefaultChatSessionKey; private bool _operatorReadScopeUnavailable; private bool _pairingRequiredAwaitingApproval; private IReadOnlyList? _userRules; private bool _preferStructuredCategories = true; + private const string DefaultChatSessionKey = "main"; + private sealed class PendingChatPreviewState + { + public string? LastKnownAssistantText { get; init; } + public int AttemptCount { get; set; } + } + /// /// Controls whether structured notification metadata (Intent, Channel) takes priority /// over keyword-based classification. Call after construction and whenever settings change. @@ -111,15 +122,18 @@ protected override bool ShouldAutoReconnect() protected override void OnDisconnected() { ClearPendingRequests(); + ClearPendingChatPreviewSessions(); } protected override void OnDisposing() { ClearPendingRequests(); + ClearPendingChatPreviewSessions(); } // Events public event EventHandler? NotificationReceived; + public event EventHandler? ChatMessageReceived; public event EventHandler? ActivityChanged; public event EventHandler? ChannelHealthUpdated; public event EventHandler? SessionsUpdated; @@ -191,35 +205,32 @@ public async Task CheckHealthAsync() } } - public async Task SendChatMessageAsync(string message, string? 
sessionKey = null) + public async Task SendChatMessageAsync(string message, string? sessionKey = null, string? idempotencyKey = null) { if (!IsConnected) throw new InvalidOperationException("Gateway connection is not open"); if (string.IsNullOrWhiteSpace(message)) throw new ArgumentException("Message is required", nameof(message)); - var effectiveSessionKey = string.IsNullOrWhiteSpace(sessionKey) - ? _mainSessionKey - : sessionKey.Trim(); - var requestId = Guid.NewGuid().ToString(); var completion = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); TrackPendingChatSend(requestId, completion); + var resolvedSessionKey = ResolveChatSessionKey(sessionKey); + var resolvedIdempotencyKey = string.IsNullOrWhiteSpace(idempotencyKey) + ? Guid.NewGuid().ToString() + : idempotencyKey; + var parameters = BuildChatSendParameters(message, resolvedSessionKey, resolvedIdempotencyKey); - var req = new + TrackPendingRequest(requestId, "chat.send"); + try { - type = "req", - id = requestId, - method = "chat.send", - @params = new - { - sessionKey = effectiveSessionKey, - message, - idempotencyKey = Guid.NewGuid().ToString() - } - }; - - await SendRawAsync(JsonSerializer.Serialize(req)); + await SendRawAsync(SerializeRequest(requestId, "chat.send", parameters)); + } + catch + { + RemovePendingRequest(requestId); + throw; + } var completedTask = await Task.WhenAny(completion.Task, Task.Delay(5000, CancellationToken)); if (completedTask != completion.Task) @@ -459,6 +470,31 @@ private async Task SendConnectMessageAsync(string? 
nonce = null) } } + private object BuildConnectParameters() + { + return new + { + minProtocol = 3, + maxProtocol = 3, + client = new + { + id = "cli", + version = "1.0.0", + platform = "windows", + mode = "cli", + displayName = "OpenClaw Windows Tray" + }, + role = "operator", + scopes = new[] { "operator.read", "operator.write", "operator.admin", "operator.approvals", "operator.pairing" }, + caps = Array.Empty(), + commands = Array.Empty(), + permissions = new { }, + auth = new { token = _token }, + locale = "en-US", + userAgent = "openclaw-windows-tray/1.0.0" + }; + } + private async Task SendTrackedRequestAsync(string method, object? parameters = null) { if (!IsConnected) return; @@ -666,6 +702,7 @@ private void HandleResponse(JsonElement root) // Handle handshake acknowledgement payload. if (payload.TryGetProperty("type", out var t) && t.GetString() == "hello-ok") { + UpdateDefaultChatSessionKeyFromHello(payload); _pairingRequiredAwaitingApproval = false; _operatorDeviceId = TryGetHandshakeDeviceId(payload); _grantedOperatorScopes = TryGetHandshakeScopes(payload); @@ -677,7 +714,6 @@ private void HandleResponse(JsonElement root) _connectAuthToken = newDeviceToken; _logger.Info("Operator device token stored for reconnect"); } - _logger.Info("Handshake complete (hello-ok)"); if (!string.IsNullOrWhiteSpace(_operatorDeviceId)) { @@ -1257,13 +1293,17 @@ private void HandleChatEvent(JsonElement root) { var rawText = root.GetRawText(); _logger.Debug($"Chat event received: {rawText.Substring(0, Math.Min(200, rawText.Length))}"); - if (!root.TryGetProperty("payload", out var payload)) return; + var sessionKey = NormalizeChatSessionKey(TryGetSessionKey(root, payload)); + var isFinal = !payload.TryGetProperty("state", out var state) || + string.Equals(state.GetString(), "final", StringComparison.OrdinalIgnoreCase); + var emittedAssistantText = false; // Try new format: payload.message.role + payload.message.content[].text if (payload.TryGetProperty("message", out var 
message)) { - if (message.TryGetProperty("role", out var role) && role.GetString() == "assistant") + var roleName = GetString(message, "role"); + if (roleName == "assistant") { // Extract text from content array if (message.TryGetProperty("content", out var content) && content.ValueKind == JsonValueKind.Array) @@ -1274,11 +1314,11 @@ private void HandleChatEvent(JsonElement root) item.TryGetProperty("text", out var textProp)) { var text = textProp.GetString() ?? ""; - if (!string.IsNullOrEmpty(text) && - payload.TryGetProperty("state", out var state) && - state.GetString() == "final") + if (!string.IsNullOrEmpty(text) && isFinal) { + emittedAssistantText = true; _logger.Info($"Assistant response: {text.Substring(0, Math.Min(100, text.Length))}..."); + EmitChatMessage(sessionKey, roleName ?? "assistant", text, isFinal); EmitChatNotification(text); } } @@ -1291,14 +1331,40 @@ private void HandleChatEvent(JsonElement root) else if (payload.TryGetProperty("text", out var textProp)) { var text = textProp.GetString() ?? 
""; - if (payload.TryGetProperty("role", out var role) && - role.GetString() == "assistant" && + var roleName = GetString(payload, "role"); + if (roleName == "assistant" && !string.IsNullOrEmpty(text)) { + emittedAssistantText = true; _logger.Info($"Assistant response (legacy): {text.Substring(0, Math.Min(100, text.Length))}"); + EmitChatMessage(sessionKey, roleName, text, isFinal: true); EmitChatNotification(text); } } + + if (isFinal && !emittedAssistantText) + { + RequestChatPreviewForFinalState(sessionKey); + } + } + + private void EmitChatMessage(string sessionKey, string role, string text, bool isFinal) + { + if (isFinal && string.Equals(role, "assistant", StringComparison.OrdinalIgnoreCase)) + { + lock (_pendingChatPreviewLock) + { + _lastAssistantMessagesBySession[NormalizeChatSessionKey(sessionKey)] = text; + } + } + + ChatMessageReceived?.Invoke(this, new ChatMessageEventArgs + { + SessionKey = sessionKey, + Role = role, + Message = text, + IsFinal = isFinal + }); } private void EmitChatNotification(string text) @@ -1512,6 +1578,7 @@ private void ParseSessions(JsonElement sessions) } snapshot = GetSessionListInternal(); + UpdateDefaultChatSessionKeyFromSessions(); } SessionsUpdated?.Invoke(this, snapshot); @@ -1540,6 +1607,205 @@ private void ParseSessionItem(JsonElement item) PopulateSessionFromObject(session, item); _sessions[session.Key] = session; + if (session.IsMain) + { + UpdateDefaultChatSessionKey(session.Key); + } + } + + private object BuildChatSendParameters(string message, string sessionKey, string idempotencyKey) + { + return new + { + message, + sessionKey, + idempotencyKey + }; + } + + private string ResolveChatSessionKey(string? sessionKey) + { + if (!string.IsNullOrWhiteSpace(sessionKey)) + { + return NormalizeChatSessionKey(sessionKey); + } + + return string.IsNullOrWhiteSpace(_defaultChatSessionKey) + ? 
DefaultChatSessionKey + : _defaultChatSessionKey; + } + + private void UpdateDefaultChatSessionKeyFromHello(JsonElement payload) + { + if (!payload.TryGetProperty("snapshot", out var snapshot) || + snapshot.ValueKind != JsonValueKind.Object || + !snapshot.TryGetProperty("sessionDefaults", out var sessionDefaults) || + sessionDefaults.ValueKind != JsonValueKind.Object) + { + return; + } + + var mainSessionKey = GetString(sessionDefaults, "mainKey") ?? + GetString(sessionDefaults, "mainSessionKey"); + if (!string.IsNullOrWhiteSpace(mainSessionKey)) + { + UpdateDefaultChatSessionKey(mainSessionKey); + } + } + + private void UpdateDefaultChatSessionKeyFromSessions() + { + var mainSession = _sessions.Values.FirstOrDefault(s => s.IsMain && !string.IsNullOrWhiteSpace(s.Key)); + if (!string.IsNullOrWhiteSpace(mainSession?.Key)) + { + UpdateDefaultChatSessionKey(mainSession.Key); + } + } + + private void UpdateDefaultChatSessionKey(string? sessionKey) + { + if (!string.IsNullOrWhiteSpace(sessionKey)) + { + _defaultChatSessionKey = NormalizeChatSessionKey(sessionKey); + } + } + + private void RequestChatPreviewForFinalState(string sessionKey) + { + if (string.IsNullOrWhiteSpace(sessionKey) || _sessionPreviewUnsupported) + { + return; + } + + var normalizedSessionKey = NormalizeChatSessionKey(sessionKey); + string? 
lastKnownAssistantText; + lock (_pendingChatPreviewLock) + { + if (_pendingChatPreviewSessionKeys.ContainsKey(normalizedSessionKey)) + { + return; + } + + _lastAssistantMessagesBySession.TryGetValue(normalizedSessionKey, out lastKnownAssistantText); + _pendingChatPreviewSessionKeys[normalizedSessionKey] = new PendingChatPreviewState + { + LastKnownAssistantText = lastKnownAssistantText, + AttemptCount = 0 + }; + } + + RequestChatPreviewForFinalStateAsync(normalizedSessionKey, delayMs: 0); + } + + private void RequestChatPreviewForFinalStateAsync(string normalizedSessionKey, int delayMs) + { + _ = Task.Run(async () => + { + try + { + if (delayMs > 0) + { + await Task.Delay(delayMs); + } + + await RequestSessionPreviewAsync([normalizedSessionKey], limit: 2, maxChars: 4000); + } + catch (Exception ex) + { + _logger.Warn($"sessions.preview request failed for {normalizedSessionKey}: {ex.Message}"); + lock (_pendingChatPreviewLock) + { + _pendingChatPreviewSessionKeys.Remove(normalizedSessionKey); + } + } + }); + } + + private void EmitPendingChatPreviewMessages(SessionsPreviewPayloadInfo payload) + { + foreach (var preview in payload.Previews) + { + var normalizedSessionKey = NormalizeChatSessionKey(preview.Key); + PendingChatPreviewState? pendingState = null; + + lock (_pendingChatPreviewLock) + { + _pendingChatPreviewSessionKeys.TryGetValue(normalizedSessionKey, out pendingState); + } + + if (pendingState == null) + { + continue; + } + + var assistantText = preview.Items + .LastOrDefault(item => string.Equals(item.Role, "assistant", StringComparison.OrdinalIgnoreCase))? + .Text? 
+ .Trim(); + + if (string.IsNullOrWhiteSpace(assistantText)) + { + continue; + } + + if (string.Equals(assistantText, pendingState.LastKnownAssistantText, StringComparison.Ordinal)) + { + if (pendingState.AttemptCount < 3) + { + pendingState.AttemptCount++; + _logger.Warn( + $"sessions.preview returned the previous assistant reply for {normalizedSessionKey}; retrying preview ({pendingState.AttemptCount}/3)"); + RequestChatPreviewForFinalStateAsync(normalizedSessionKey, delayMs: 400 * pendingState.AttemptCount); + continue; + } + } + + lock (_pendingChatPreviewLock) + { + _pendingChatPreviewSessionKeys.Remove(normalizedSessionKey); + } + + _logger.Info($"Assistant response (preview): {assistantText.Substring(0, Math.Min(100, assistantText.Length))}..."); + EmitChatMessage(normalizedSessionKey, "assistant", assistantText, isFinal: true); + EmitChatNotification(assistantText); + } + } + + private void ClearPendingChatPreviewSessions() + { + lock (_pendingChatPreviewLock) + { + _pendingChatPreviewSessionKeys.Clear(); + _lastAssistantMessagesBySession.Clear(); + } + } + + private static string NormalizeChatSessionKey(string? sessionKey) + { + if (string.IsNullOrWhiteSpace(sessionKey)) + { + return DefaultChatSessionKey; + } + + return sessionKey == "main" || sessionKey.Contains(":main:", StringComparison.Ordinal) + ? DefaultChatSessionKey + : sessionKey; + } + + private static string? 
TryGetSessionKey(JsonElement root, JsonElement payload) + { + if (root.TryGetProperty("sessionKey", out var rootSessionKey)) + { + return rootSessionKey.GetString(); + } + + if (payload.ValueKind == JsonValueKind.Object && + payload.TryGetProperty("sessionKey", out var payloadSessionKey)) + { + return payloadSessionKey.GetString(); + } + + return null; } private void PopulateSessionFromObject(SessionInfo session, JsonElement item) @@ -1853,6 +2119,7 @@ private void ParseSessionsPreview(JsonElement payload) } SessionPreviewUpdated?.Invoke(this, previewPayload); + EmitPendingChatPreviewMessages(previewPayload); } catch (Exception ex) { diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs index 7dee87f..60e2939 100644 --- a/src/OpenClaw.Shared/SettingsData.cs +++ b/src/OpenClaw.Shared/SettingsData.cs @@ -1,3 +1,5 @@ +using System; +using System.Text.Json.Serialization; using System.Text.Json; namespace OpenClaw.Shared; @@ -32,6 +34,11 @@ public class SettingsData public bool NotifyChatResponses { get; set; } = true; public bool PreferStructuredCategories { get; set; } = true; public List? UserRules { get; set; } + public VoiceSettings Voice { get; set; } = new(); + public VoiceRepeaterWindowSettings VoiceRepeaterWindow { get; set; } = new(); + public VoiceProviderConfigurationStore VoiceProviderConfiguration { get; set; } = new(); + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public VoiceProviderCredentials? 
VoiceProviderCredentials { get; set; } private static readonly JsonSerializerOptions s_options = new() { WriteIndented = true }; @@ -43,11 +50,39 @@ public class SettingsData return null; try { - return JsonSerializer.Deserialize(json); + return JsonSerializer.Deserialize(MigrateLegacyVoiceJson(json)); } catch (JsonException) { return null; } } + + private static string MigrateLegacyVoiceJson(string json) + { + return json + .Replace("\"WakeWord\":", "\"VoiceWake\":", StringComparison.Ordinal) + .Replace("\"AlwaysOn\":", "\"TalkMode\":", StringComparison.Ordinal) + .Replace("\"WakeWordModelId\":", "\"VoiceWakeModelId\":", StringComparison.Ordinal) + .Replace("\"WakeWordLoaded\":", "\"VoiceWakeLoaded\":", StringComparison.Ordinal) + .Replace("\"LastWakeWordUtc\":", "\"LastVoiceWakeUtc\":", StringComparison.Ordinal) + .Replace("\"Mode\":\"WakeWord\"", "\"Mode\":\"VoiceWake\"", StringComparison.Ordinal) + .Replace("\"Mode\": \"WakeWord\"", "\"Mode\": \"VoiceWake\"", StringComparison.Ordinal) + .Replace("\"Mode\":\"AlwaysOn\"", "\"Mode\":\"TalkMode\"", StringComparison.Ordinal) + .Replace("\"Mode\": \"AlwaysOn\"", "\"Mode\": \"TalkMode\"", StringComparison.Ordinal) + .Replace("\"State\":\"ListeningForWakeWord\"", "\"State\":\"ListeningForVoiceWake\"", StringComparison.Ordinal) + .Replace("\"State\": \"ListeningForWakeWord\"", "\"State\": \"ListeningForVoiceWake\"", StringComparison.Ordinal); + } +} + +public sealed class VoiceRepeaterWindowSettings +{ + public bool AutoScroll { get; set; } = true; + public bool FloatingEnabled { get; set; } = true; + public bool HasSavedPlacement { get; set; } + public double TextSize { get; set; } = 13; + public int? Width { get; set; } + public int? Height { get; set; } + public int? X { get; set; } + public int? 
Y { get; set; } } diff --git a/src/OpenClaw.Shared/VoiceModeSchema.cs b/src/OpenClaw.Shared/VoiceModeSchema.cs new file mode 100644 index 0000000..e47af8c --- /dev/null +++ b/src/OpenClaw.Shared/VoiceModeSchema.cs @@ -0,0 +1,354 @@ +using System; +using System.Collections.ObjectModel; +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace OpenClaw.Shared; + +public static class VoiceCommands +{ + public const string ListDevices = "voice.devices.list"; + public const string GetSettings = "voice.settings.get"; + public const string SetSettings = "voice.settings.set"; + public const string GetStatus = "voice.status.get"; + public const string Start = "voice.start"; + public const string Stop = "voice.stop"; + public const string Pause = "voice.pause"; + public const string Resume = "voice.resume"; + public const string Skip = "voice.response.skip"; + + private static readonly ReadOnlyCollection s_all = Array.AsReadOnly( + [ + ListDevices, + GetSettings, + SetSettings, + GetStatus, + Start, + Stop, + Pause, + Resume, + Skip + ]); + + public static IReadOnlyList All => s_all; +} + +[JsonConverter(typeof(VoiceActivationModeJsonConverter))] +public enum VoiceActivationMode +{ + Off, + VoiceWake, + TalkMode +} + +[JsonConverter(typeof(VoiceRuntimeStateJsonConverter))] +public enum VoiceRuntimeState +{ + Stopped, + Paused, + Idle, + Arming, + ListeningForVoiceWake, + ListeningContinuously, + RecordingUtterance, + SubmittingAudio, + AwaitingResponse, + PlayingResponse, + Error +} + +public sealed class VoiceSettings +{ + public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off; + public bool Enabled { get; set; } + public bool ShowRepeaterAtStartup { get; set; } = true; + public bool ShowConversationToasts { get; set; } + public string SpeechToTextProviderId { get; set; } = VoiceProviderIds.Windows; + public string TextToSpeechProviderId { get; set; } = VoiceProviderIds.Windows; + public string? 
InputDeviceId { get; set; } + public string? OutputDeviceId { get; set; } + public int SampleRateHz { get; set; } = 16000; + public int CaptureChunkMs { get; set; } = 80; + public bool BargeInEnabled { get; set; } = true; + public VoiceWakeSettings VoiceWake { get; set; } = new(); + public TalkModeSettings TalkMode { get; set; } = new(); +} + +public sealed class VoiceWakeSettings +{ + public string Engine { get; set; } = "NanoWakeWord"; + public string ModelId { get; set; } = "hey_openclaw"; + public float TriggerThreshold { get; set; } = 0.65f; + public int TriggerCooldownMs { get; set; } = 2000; + public int PreRollMs { get; set; } = 1200; + public int EndSilenceMs { get; set; } = 900; +} + +public sealed class TalkModeSettings +{ + public int MinSpeechMs { get; set; } = 250; + public int EndSilenceMs { get; set; } = 900; + public int MaxUtteranceMs { get; set; } = 15000; +} + +public sealed class VoiceAudioDeviceInfo +{ + public string DeviceId { get; set; } = ""; + public string Name { get; set; } = ""; + public bool IsDefault { get; set; } + public bool IsInput { get; set; } + public bool IsOutput { get; set; } +} + +public sealed class VoiceStatusInfo +{ + public bool Available { get; set; } + public bool Running { get; set; } + public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off; + public VoiceRuntimeState State { get; set; } = VoiceRuntimeState.Stopped; + public string? SessionKey { get; set; } + public string? InputDeviceId { get; set; } + public string? OutputDeviceId { get; set; } + public string? VoiceWakeModelId { get; set; } + public bool VoiceWakeLoaded { get; set; } + public DateTime? LastVoiceWakeUtc { get; set; } + public DateTime? LastUtteranceUtc { get; set; } + public int PendingReplyCount { get; set; } + public bool CanSkipReply { get; set; } + public string? CurrentReplyPreview { get; set; } + public string? LastError { get; set; } +} + +public sealed class VoiceStartArgs +{ + public VoiceActivationMode? 
Mode { get; set; } + public string? SessionKey { get; set; } +} + +public sealed class VoiceStopArgs +{ + public string? Reason { get; set; } +} + +public sealed class VoicePauseArgs +{ + public string? Reason { get; set; } +} + +public sealed class VoiceResumeArgs +{ + public string? Reason { get; set; } +} + +public sealed class VoiceSkipArgs +{ + public string? Reason { get; set; } +} + +public sealed class VoiceSettingsUpdateArgs +{ + public VoiceSettings Settings { get; set; } = new(); + public bool Persist { get; set; } = true; +} + +public static class VoiceProviderIds +{ + public const string Windows = "windows"; + public const string HttpWs = "http-ws"; + public const string FoundryLocal = "foundry-local"; + public const string OpenAiWhisper = "openai-whisper"; + public const string ElevenLabsSpeechToText = "elevenlabs-stt"; + public const string AzureAiSpeech = "azure-ai-speech"; + public const string SherpaOnnx = "sherpa-onnx"; + public const string MiniMax = "minimax"; + public const string ElevenLabs = "elevenlabs"; +} + +public static class VoiceProviderRuntimeIds +{ + public const string Windows = "windows"; + public const string Streaming = "streaming"; + public const string Embedded = "embedded"; + public const string Cloud = "cloud"; +} + +public static class VoiceProviderSettingKeys +{ + public const string ApiKey = "apiKey"; + public const string Endpoint = "endpoint"; + public const string Model = "model"; + public const string ModelPath = "modelPath"; + public const string VoiceId = "voiceId"; + public const string VoiceSettingsJson = "voiceSettingsJson"; +} + +public static class VoiceTextToSpeechResponseModes +{ + public const string Binary = "binary"; + public const string HexJsonString = "hexJsonString"; + public const string Base64JsonString = "base64JsonString"; +} + +public sealed class VoiceProviderCredentials +{ + public string? 
MiniMaxApiKey { get; set; } + public string MiniMaxModel { get; set; } = "speech-2.8-turbo"; + public string MiniMaxVoiceId { get; set; } = "English_MatureBoss"; + public string? ElevenLabsApiKey { get; set; } + public string? ElevenLabsModel { get; set; } + public string? ElevenLabsVoiceId { get; set; } +} + +public sealed class VoiceProviderConfigurationStore +{ + public List Providers { get; set; } = []; +} + +public sealed class VoiceProviderConfiguration +{ + public string ProviderId { get; set; } = ""; + public Dictionary Values { get; set; } = []; +} + +public sealed class VoiceProviderSettingDefinition +{ + public string Key { get; set; } = ""; + public string Label { get; set; } = ""; + public bool Secret { get; set; } + public bool Required { get; set; } = true; + public bool JsonValue { get; set; } + public string? DefaultValue { get; set; } + public string? Placeholder { get; set; } + public string? Description { get; set; } + public List Options { get; set; } = []; +} + +public sealed class VoiceTextToSpeechHttpContract +{ + public string EndpointTemplate { get; set; } = ""; + public string HttpMethod { get; set; } = "POST"; + public string AuthenticationHeaderName { get; set; } = "Authorization"; + public string? AuthenticationScheme { get; set; } = "Bearer"; + public string ApiKeySettingKey { get; set; } = VoiceProviderSettingKeys.ApiKey; + public string RequestContentType { get; set; } = "application/json"; + public string RequestBodyTemplate { get; set; } = ""; + public string ResponseAudioMode { get; set; } = VoiceTextToSpeechResponseModes.Binary; + public string? ResponseAudioJsonPath { get; set; } + public string? ResponseStatusCodeJsonPath { get; set; } + public string? ResponseStatusMessageJsonPath { get; set; } + public string? 
SuccessStatusValue { get; set; } + public string OutputContentType { get; set; } = "audio/mpeg"; +} + +public sealed class VoiceTextToSpeechWebSocketContract +{ + public string EndpointTemplate { get; set; } = ""; + public string AuthenticationHeaderName { get; set; } = "Authorization"; + public string? AuthenticationScheme { get; set; } = "Bearer"; + public string ApiKeySettingKey { get; set; } = VoiceProviderSettingKeys.ApiKey; + public string ConnectSuccessEventName { get; set; } = "connected_success"; + public string StartMessageTemplate { get; set; } = ""; + public string StartSuccessEventName { get; set; } = "task_started"; + public string ContinueMessageTemplate { get; set; } = ""; + public string FinishMessageTemplate { get; set; } = "{ \"event\": \"task_finish\" }"; + public string ResponseAudioMode { get; set; } = VoiceTextToSpeechResponseModes.Binary; + public string? ResponseAudioJsonPath { get; set; } = "data.audio"; + public string? ResponseStatusCodeJsonPath { get; set; } = "base_resp.status_code"; + public string? ResponseStatusMessageJsonPath { get; set; } = "base_resp.status_msg"; + public string? FinalFlagJsonPath { get; set; } = "is_final"; + public string TaskFailedEventName { get; set; } = "task_failed"; + public string? SuccessStatusValue { get; set; } = "0"; + public string OutputContentType { get; set; } = "audio/mpeg"; +} + +public sealed class VoiceProviderOption +{ + public string Id { get; set; } = ""; + public string Name { get; set; } = ""; + public string Runtime { get; set; } = VoiceProviderRuntimeIds.Windows; + public bool Enabled { get; set; } = true; + public bool VisibleInSettings { get; set; } = true; + public bool Selectable { get; set; } = true; + public string? Description { get; set; } + public List Settings { get; set; } = []; + public VoiceTextToSpeechHttpContract? TextToSpeechHttp { get; set; } + public VoiceTextToSpeechWebSocketContract? 
TextToSpeechWebSocket { get; set; } + + [JsonIgnore] + public string DisplayName => Selectable ? Name : $"{Name} (coming soon)"; + + [JsonIgnore] + public double DisplayOpacity => Selectable ? 1.0 : 0.55; +} + +public sealed class VoiceProviderCatalog +{ + public List SpeechToTextProviders { get; set; } = []; + public List TextToSpeechProviders { get; set; } = []; +} + +public sealed class VoiceActivationModeJsonConverter : JsonConverter +{ + public override VoiceActivationMode Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + { + var value = reader.GetString(); + return value switch + { + "VoiceWake" or "WakeWord" => VoiceActivationMode.VoiceWake, + "TalkMode" or "AlwaysOn" => VoiceActivationMode.TalkMode, + _ => VoiceActivationMode.Off + }; + } + + public override void Write(Utf8JsonWriter writer, VoiceActivationMode value, JsonSerializerOptions options) + { + writer.WriteStringValue(value switch + { + VoiceActivationMode.VoiceWake => "VoiceWake", + VoiceActivationMode.TalkMode => "TalkMode", + _ => "Off" + }); + } +} + +public sealed class VoiceRuntimeStateJsonConverter : JsonConverter +{ + public override VoiceRuntimeState Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + { + var value = reader.GetString(); + return value switch + { + "ListeningForVoiceWake" or "ListeningForWakeWord" => VoiceRuntimeState.ListeningForVoiceWake, + "Stopped" => VoiceRuntimeState.Stopped, + "Paused" => VoiceRuntimeState.Paused, + "Idle" => VoiceRuntimeState.Idle, + "Arming" => VoiceRuntimeState.Arming, + "ListeningContinuously" => VoiceRuntimeState.ListeningContinuously, + "RecordingUtterance" => VoiceRuntimeState.RecordingUtterance, + "SubmittingAudio" => VoiceRuntimeState.SubmittingAudio, + "AwaitingResponse" => VoiceRuntimeState.AwaitingResponse, + "PlayingResponse" => VoiceRuntimeState.PlayingResponse, + "Error" => VoiceRuntimeState.Error, + _ => VoiceRuntimeState.Stopped + }; + } + + public override void 
Write(Utf8JsonWriter writer, VoiceRuntimeState value, JsonSerializerOptions options) + { + writer.WriteStringValue(value switch + { + VoiceRuntimeState.ListeningForVoiceWake => "ListeningForVoiceWake", + VoiceRuntimeState.Stopped => "Stopped", + VoiceRuntimeState.Paused => "Paused", + VoiceRuntimeState.Idle => "Idle", + VoiceRuntimeState.Arming => "Arming", + VoiceRuntimeState.ListeningContinuously => "ListeningContinuously", + VoiceRuntimeState.RecordingUtterance => "RecordingUtterance", + VoiceRuntimeState.SubmittingAudio => "SubmittingAudio", + VoiceRuntimeState.AwaitingResponse => "AwaitingResponse", + VoiceRuntimeState.PlayingResponse => "PlayingResponse", + VoiceRuntimeState.Error => "Error", + _ => "Stopped" + }); + } +} diff --git a/src/OpenClaw.Shared/VoiceProviderConfigurationStoreExtensions.cs b/src/OpenClaw.Shared/VoiceProviderConfigurationStoreExtensions.cs new file mode 100644 index 0000000..b1dfa41 --- /dev/null +++ b/src/OpenClaw.Shared/VoiceProviderConfigurationStoreExtensions.cs @@ -0,0 +1,161 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace OpenClaw.Shared; + +public static class VoiceProviderConfigurationStoreExtensions +{ + public static VoiceProviderConfiguration GetOrAddProvider( + this VoiceProviderConfigurationStore store, + string providerId) + { + ArgumentNullException.ThrowIfNull(store); + + var existing = store.Providers.FirstOrDefault(p => + string.Equals(p.ProviderId, providerId, StringComparison.OrdinalIgnoreCase)); + if (existing != null) + { + return existing; + } + + var created = new VoiceProviderConfiguration + { + ProviderId = providerId + }; + store.Providers.Add(created); + return created; + } + + public static VoiceProviderConfiguration? FindProvider( + this VoiceProviderConfigurationStore store, + string? 
providerId) + { + ArgumentNullException.ThrowIfNull(store); + + if (string.IsNullOrWhiteSpace(providerId)) + { + return null; + } + + return store.Providers.FirstOrDefault(p => + string.Equals(p.ProviderId, providerId, StringComparison.OrdinalIgnoreCase)); + } + + public static string? GetValue( + this VoiceProviderConfigurationStore store, + string? providerId, + string settingKey) + { + return store.FindProvider(providerId)?.GetValue(settingKey); + } + + public static string? GetValue(this VoiceProviderConfiguration configuration, string settingKey) + { + ArgumentNullException.ThrowIfNull(configuration); + + if (string.IsNullOrWhiteSpace(settingKey)) + { + return null; + } + + return configuration.Values.FirstOrDefault(entry => + string.Equals(entry.Key, settingKey, StringComparison.OrdinalIgnoreCase)).Value; + } + + public static void SetValue( + this VoiceProviderConfigurationStore store, + string providerId, + string settingKey, + string? value) + { + ArgumentNullException.ThrowIfNull(store); + + var provider = store.GetOrAddProvider(providerId); + provider.SetValue(settingKey, value); + } + + public static void SetValue( + this VoiceProviderConfiguration configuration, + string settingKey, + string? value) + { + ArgumentNullException.ThrowIfNull(configuration); + + if (string.IsNullOrWhiteSpace(settingKey)) + { + return; + } + + var existingKey = configuration.Values.Keys.FirstOrDefault(key => + string.Equals(key, settingKey, StringComparison.OrdinalIgnoreCase)); + + if (string.IsNullOrWhiteSpace(value)) + { + if (existingKey != null) + { + configuration.Values.Remove(existingKey); + } + + return; + } + + if (existingKey != null) + { + configuration.Values[existingKey] = value.Trim(); + return; + } + + configuration.Values[settingKey] = value.Trim(); + } + + public static void MigrateLegacyCredentials( + this VoiceProviderConfigurationStore store, + VoiceProviderCredentials? 
legacy) + { + ArgumentNullException.ThrowIfNull(store); + + if (legacy == null) + { + return; + } + + var hasMiniMaxValues = + !string.IsNullOrWhiteSpace(legacy.MiniMaxApiKey) || + !string.IsNullOrWhiteSpace(legacy.MiniMaxModel) || + !string.IsNullOrWhiteSpace(legacy.MiniMaxVoiceId); + if (hasMiniMaxValues) + { + store.SetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.ApiKey, legacy.MiniMaxApiKey); + store.SetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.Model, legacy.MiniMaxModel); + store.SetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceId, legacy.MiniMaxVoiceId); + } + + var hasElevenLabsValues = + !string.IsNullOrWhiteSpace(legacy.ElevenLabsApiKey) || + !string.IsNullOrWhiteSpace(legacy.ElevenLabsModel) || + !string.IsNullOrWhiteSpace(legacy.ElevenLabsVoiceId); + if (hasElevenLabsValues) + { + store.SetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.ApiKey, legacy.ElevenLabsApiKey); + store.SetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.Model, legacy.ElevenLabsModel); + store.SetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.VoiceId, legacy.ElevenLabsVoiceId); + } + } + + public static VoiceProviderConfigurationStore Clone(this VoiceProviderConfigurationStore source) + { + ArgumentNullException.ThrowIfNull(source); + + return new VoiceProviderConfigurationStore + { + Providers = source.Providers + .Select(provider => new VoiceProviderConfiguration + { + ProviderId = provider.ProviderId, + Values = new Dictionary(provider.Values, StringComparer.OrdinalIgnoreCase) + }) + .ToList() + }; + } +} diff --git a/src/OpenClaw.Tray.Shared/Helpers/VoiceTrayIconHelper.cs b/src/OpenClaw.Tray.Shared/Helpers/VoiceTrayIconHelper.cs new file mode 100644 index 0000000..2ff9d3e --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Helpers/VoiceTrayIconHelper.cs @@ -0,0 +1,174 @@ +using System; +using System.Drawing; +using System.IO; +using System.Runtime.InteropServices; + +namespace OpenClawTray.Helpers; + 
+public enum VoiceTrayIconState +{ + Off, + Armed, + Listening, + Speaking +} + +public static class VoiceTrayIconHelper +{ + private static readonly string GeneratedIconsPath = Path.Combine( + Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), + "OpenClawTray", + "GeneratedIcons"); + + private static string? _voiceArmedIconPath; + private static string? _voiceListeningIconPath; + private static string? _voiceSpeakingIconPath; + + public static string GetBaseAppIconPath() + { + return Path.Combine(ResolveAssetsPath(), "openclaw.ico"); + } + + public static string GetVoiceTrayIconPath(VoiceTrayIconState state) + { + return state switch + { + VoiceTrayIconState.Armed => GetOrCreateVoiceIconPath(ref _voiceArmedIconPath, VoiceTrayIconState.Armed), + VoiceTrayIconState.Listening => GetOrCreateVoiceIconPath(ref _voiceListeningIconPath, VoiceTrayIconState.Listening), + VoiceTrayIconState.Speaking => GetOrCreateVoiceIconPath(ref _voiceSpeakingIconPath, VoiceTrayIconState.Speaking), + _ => GetBaseAppIconPath() + }; + } + + private static string GetOrCreateVoiceIconPath(ref string? 
cachedPath, VoiceTrayIconState state) + { + if (!string.IsNullOrWhiteSpace(cachedPath) && File.Exists(cachedPath)) + { + return cachedPath; + } + + Directory.CreateDirectory(GeneratedIconsPath); + var outputPath = Path.Combine(GeneratedIconsPath, $"voice-{state.ToString().ToLowerInvariant()}.ico"); + + using var bitmap = CreateVoiceTrayBitmap(state); + using var icon = CreateIcon(bitmap); + using var stream = File.Create(outputPath); + icon.Save(stream); + + cachedPath = outputPath; + return outputPath; + } + + private static Bitmap CreateVoiceTrayBitmap(VoiceTrayIconState state) + { + const int size = 32; + var bitmap = new Bitmap(size, size); + using var graphics = Graphics.FromImage(bitmap); + + graphics.Clear(Color.Transparent); + graphics.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.AntiAlias; + graphics.InterpolationMode = System.Drawing.Drawing2D.InterpolationMode.HighQualityBicubic; + + using (var baseIcon = new Icon(GetBaseAppIconPath(), size, size)) + using (var baseBitmap = baseIcon.ToBitmap()) + { + graphics.DrawImage(baseBitmap, 0, 0, size, size); + } + + switch (state) + { + case VoiceTrayIconState.Armed: + DrawHeadphones(graphics); + break; + case VoiceTrayIconState.Listening: + DrawHeadphones(graphics); + DrawHeadphoneWaves(graphics); + break; + case VoiceTrayIconState.Speaking: + DrawMicrophone(graphics); + break; + } + + return bitmap; + } + + private static void DrawHeadphones(Graphics graphics) + { + using var shadowPen = new Pen(Color.FromArgb(96, 255, 255, 255), 4f); + using var bandPen = new Pen(Color.FromArgb(42, 48, 58), 3f); + using var earBrush = new SolidBrush(Color.FromArgb(42, 48, 58)); + + graphics.DrawArc(shadowPen, 6, 3, 20, 16, 180, 180); + graphics.DrawArc(bandPen, 6, 3, 20, 16, 180, 180); + graphics.FillPath(earBrush, CreateRoundedRectanglePath(4, 12, 5, 10, 3)); + graphics.FillPath(earBrush, CreateRoundedRectanglePath(23, 12, 5, 10, 3)); + } + + private static void DrawMicrophone(Graphics graphics) + { + using var 
brush = new SolidBrush(Color.FromArgb(33, 150, 243)); + using var pen = new Pen(Color.FromArgb(33, 150, 243), 2f); + + graphics.FillPath(brush, CreateRoundedRectanglePath(22, 17, 6, 9, 3)); + graphics.FillRectangle(brush, 24, 25, 2, 4); + graphics.DrawArc(pen, 21, 27, 8, 5, 0, 180); + graphics.DrawLine(pen, 20, 21, 15, 19); + } + + private static void DrawHeadphoneWaves(Graphics graphics) + { + using var wavePen = new Pen(Color.FromArgb(76, 175, 80), 2f); + using var accentPen = new Pen(Color.FromArgb(76, 175, 80), 1.5f); + + graphics.DrawArc(wavePen, 0, 12, 8, 8, 270, 180); + graphics.DrawArc(accentPen, 2, 14, 4, 4, 270, 180); + graphics.DrawArc(wavePen, 24, 12, 8, 8, 90, 180); + graphics.DrawArc(accentPen, 26, 14, 4, 4, 90, 180); + } + + private static Icon CreateIcon(Bitmap bitmap) + { + var handle = bitmap.GetHicon(); + var icon = Icon.FromHandle(handle); + var result = (Icon)icon.Clone(); + DestroyIcon(handle); + return result; + } + + private static System.Drawing.Drawing2D.GraphicsPath CreateRoundedRectanglePath(int x, int y, int width, int height, int radius) + { + var path = new System.Drawing.Drawing2D.GraphicsPath(); + path.AddArc(x, y, radius, radius, 180, 90); + path.AddArc(x + width - radius, y, radius, radius, 270, 90); + path.AddArc(x + width - radius, y + height - radius, radius, radius, 0, 90); + path.AddArc(x, y + height - radius, radius, radius, 90, 90); + path.CloseFigure(); + return path; + } + + private static string ResolveAssetsPath() + { + var bundledPath = Path.Combine(AppContext.BaseDirectory, "Assets"); + if (File.Exists(Path.Combine(bundledPath, "openclaw.ico"))) + { + return bundledPath; + } + + var current = new DirectoryInfo(AppContext.BaseDirectory); + while (current != null) + { + var sourcePath = Path.Combine(current.FullName, "src", "OpenClaw.Tray.WinUI", "Assets"); + if (Directory.Exists(sourcePath)) + { + return sourcePath; + } + + current = current.Parent; + } + + return bundledPath; + } + + [DllImport("user32.dll", CharSet = 
CharSet.Auto)] + private static extern bool DestroyIcon(IntPtr handle); +} diff --git a/src/OpenClaw.Tray.Shared/OpenClaw.Tray.Shared.csproj b/src/OpenClaw.Tray.Shared/OpenClaw.Tray.Shared.csproj new file mode 100644 index 0000000..cfd3156 --- /dev/null +++ b/src/OpenClaw.Tray.Shared/OpenClaw.Tray.Shared.csproj @@ -0,0 +1,19 @@ + + + + net10.0-windows10.0.19041.0 + enable + enable + OpenClawTray + + + + + + + + + + + + diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCaptureMath.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCaptureMath.cs new file mode 100644 index 0000000..deac3d4 --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCaptureMath.cs @@ -0,0 +1,48 @@ +namespace OpenClawTray.Services.Voice; + +public static class VoiceCaptureMath +{ + private const float DefaultSignalThreshold = 0.015f; + + public static uint ResolveDesiredSamplesPerQuantum(int sampleRateHz, int chunkMs) + { + if (sampleRateHz <= 0) + { + sampleRateHz = 16000; + } + + if (chunkMs <= 0) + { + chunkMs = 80; + } + + var desired = (sampleRateHz * chunkMs) / 1000; + return (uint)Math.Max(desired, 128); + } + + public static bool HasAudibleSignal(float peakLevel, float threshold = DefaultSignalThreshold) + { + return peakLevel >= threshold; + } + + public static float ComputePeakLevel(byte[] data) + { + if (data.Length < sizeof(float)) + { + return 0f; + } + + float peak = 0f; + var alignedLength = data.Length - (data.Length % sizeof(float)); + for (var offset = 0; offset < alignedLength; offset += sizeof(float)) + { + var sample = Math.Abs(BitConverter.ToSingle(data, offset)); + if (sample > peak) + { + peak = sample; + } + } + + return float.IsFinite(peak) ? 
peak : 0f; + } +} diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatContracts.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatContracts.cs new file mode 100644 index 0000000..106e258 --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatContracts.cs @@ -0,0 +1,43 @@ +using OpenClaw.Shared; + +namespace OpenClawTray.Services.Voice; + +public interface IUiDispatcher +{ + bool TryEnqueue(Action callback); +} + +public interface IVoiceRuntime +{ + event EventHandler? ConversationTurnAvailable; + event EventHandler? TranscriptDraftUpdated; +} + +public interface IVoiceConfigurationApi +{ + Task GetSettingsAsync(); + Task UpdateSettingsAsync(VoiceSettingsUpdateArgs update); + Task ListDevicesAsync(); + VoiceProviderCatalog GetProviderCatalog(); + VoiceProviderConfigurationStore GetProviderConfiguration(); + void SetProviderConfiguration(VoiceProviderConfigurationStore configurationStore); +} + +public interface IVoiceRuntimeControlApi +{ + VoiceStatusInfo CurrentStatus { get; } + Task GetStatusAsync(); + Task StartAsync(VoiceStartArgs args); + Task StopAsync(VoiceStopArgs args); + Task PauseAsync(VoicePauseArgs? args = null); + Task ResumeAsync(VoiceResumeArgs? args = null); + Task SkipCurrentReplyAsync(VoiceSkipArgs? 
args = null); + Task ToggleQuickPauseAsync(); +} + +public interface IVoiceChatWindow +{ + bool IsClosed { get; } + Task UpdateVoiceTranscriptDraftAsync(string text, bool clear); + Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args); +} diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatCoordinator.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatCoordinator.cs new file mode 100644 index 0000000..959b0dc --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatCoordinator.cs @@ -0,0 +1,154 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; + +namespace OpenClawTray.Services.Voice; + +public sealed class VoiceChatCoordinator : IDisposable +{ + private const int MaxBufferedConversationTurns = 8; + private readonly IVoiceRuntime _voiceService; + private readonly IUiDispatcher _dispatcher; + private readonly object _gate = new(); + + private readonly List _windows = []; + private string _voiceTranscriptDraftText = string.Empty; + private readonly List _bufferedConversationTurns = []; + private bool _disposed; + + public event EventHandler? ConversationTurnAvailable; + + public VoiceChatCoordinator( + IVoiceRuntime voiceService, + IUiDispatcher dispatcher) + { + _voiceService = voiceService; + _dispatcher = dispatcher; + + _voiceService.ConversationTurnAvailable += OnVoiceConversationTurnAvailable; + _voiceService.TranscriptDraftUpdated += OnVoiceTranscriptDraftUpdated; + } + + public void AttachWindow(IVoiceChatWindow window) + { + ArgumentNullException.ThrowIfNull(window); + + lock (_gate) + { + if (_windows.Contains(window)) + { + return; + } + + _windows.Add(window); + } + + _ = window.UpdateVoiceTranscriptDraftAsync( + _voiceTranscriptDraftText, + clear: string.IsNullOrWhiteSpace(_voiceTranscriptDraftText)); + + List bufferedTurns; + lock (_gate) + { + bufferedTurns = [.. 
_bufferedConversationTurns]; + } + + foreach (var turn in bufferedTurns) + { + _ = window.AppendVoiceConversationTurnAsync(turn); + } + } + + public void DetachWindow(IVoiceChatWindow? window) + { + lock (_gate) + { + if (_windows.Count == 0) + { + return; + } + + if (window == null) + { + _windows.Clear(); + return; + } + + _windows.Remove(window); + } + } + + public void Dispose() + { + if (_disposed) + { + return; + } + + _disposed = true; + DetachWindow(null); + _voiceService.ConversationTurnAvailable -= OnVoiceConversationTurnAvailable; + _voiceService.TranscriptDraftUpdated -= OnVoiceTranscriptDraftUpdated; + } + + private void OnVoiceConversationTurnAvailable(object? sender, VoiceConversationTurnEventArgs args) + { + _dispatcher.TryEnqueue(() => + { + List windows; + lock (_gate) + { + _bufferedConversationTurns.Add(CloneTurn(args)); + if (_bufferedConversationTurns.Count > MaxBufferedConversationTurns) + { + _bufferedConversationTurns.RemoveAt(0); + } + + windows = [.. _windows]; + } + + foreach (var window in windows) + { + if (!window.IsClosed) + { + _ = window.AppendVoiceConversationTurnAsync(args); + } + } + + ConversationTurnAvailable?.Invoke(this, args); + }); + } + + private void OnVoiceTranscriptDraftUpdated(object? sender, VoiceTranscriptDraftEventArgs args) + { + _dispatcher.TryEnqueue(() => + { + _voiceTranscriptDraftText = args.Clear ? string.Empty : (args.Text ?? string.Empty); + + List windows; + lock (_gate) + { + windows = [.. 
_windows]; + } + + foreach (var window in windows) + { + if (!window.IsClosed) + { + _ = window.UpdateVoiceTranscriptDraftAsync(_voiceTranscriptDraftText, args.Clear); + } + } + }); + } + + private static VoiceConversationTurnEventArgs CloneTurn(VoiceConversationTurnEventArgs args) + { + return new VoiceConversationTurnEventArgs + { + Direction = args.Direction, + Message = args.Message, + SessionKey = args.SessionKey, + Mode = args.Mode + }; + } +} diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCloudTextToSpeechClient.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCloudTextToSpeechClient.cs new file mode 100644 index 0000000..0399559 --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCloudTextToSpeechClient.cs @@ -0,0 +1,592 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Net.Http; +using System.Net.Http.Headers; +using System.Net.WebSockets; +using System.Runtime.InteropServices.WindowsRuntime; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using OpenClaw.Shared; +using Windows.Storage.Streams; + +namespace OpenClawTray.Services.Voice; + +public sealed class VoiceCloudTextToSpeechClient +{ + private static readonly HttpClient s_httpClient = CreateHttpClient(); + + public async Task SynthesizeAsync( + string text, + VoiceProviderOption provider, + VoiceProviderConfigurationStore configurationStore, + IOpenClawLogger? logger = null, + CancellationToken cancellationToken = default) + { + ArgumentException.ThrowIfNullOrWhiteSpace(text); + ArgumentNullException.ThrowIfNull(provider); + ArgumentNullException.ThrowIfNull(configurationStore); + + if (provider.TextToSpeechWebSocket != null) + { + return await SynthesizeViaWebSocketAsync(text, provider, configurationStore, logger, cancellationToken); + } + + var contract = provider.TextToSpeechHttp + ?? 
throw new InvalidOperationException($"TTS provider '{provider.Name}' does not expose an HTTP contract."); + var providerConfiguration = configurationStore.FindProvider(provider.Id); + var templateValues = BuildTemplateValues(text, provider, providerConfiguration, contract); + var endpoint = ApplyUrlTemplate(contract.EndpointTemplate, templateValues); + using var request = new HttpRequestMessage(ParseHttpMethod(contract.HttpMethod), endpoint); + ApplyAuthenticationHeader(request, contract, templateValues); + + if (!string.IsNullOrWhiteSpace(contract.RequestBodyTemplate)) + { + var requestBody = ApplyJsonTemplate(contract.RequestBodyTemplate, templateValues); + request.Content = new StringContent( + requestBody, + Encoding.UTF8, + string.IsNullOrWhiteSpace(contract.RequestContentType) ? "application/json" : contract.RequestContentType); + } + + var stopwatch = Stopwatch.StartNew(); + using var response = await s_httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken); + var headersElapsedMs = stopwatch.ElapsedMilliseconds; + if (!response.IsSuccessStatusCode) + { + throw new InvalidOperationException( + $"{provider.Name} TTS request failed: {(int)response.StatusCode} {response.ReasonPhrase}"); + } + + if (string.Equals(contract.ResponseAudioMode, VoiceTextToSpeechResponseModes.Binary, StringComparison.OrdinalIgnoreCase)) + { + await using var responseStream = await response.Content.ReadAsStreamAsync(cancellationToken); + var result = await CreateResultAsync(responseStream, contract.OutputContentType); + logger?.Info($"{provider.Name} TTS latency: headers={headersElapsedMs}ms total={stopwatch.ElapsedMilliseconds}ms (binary)"); + return result; + } + + var responseText = await response.Content.ReadAsStringAsync(cancellationToken); + using var document = JsonDocument.Parse(responseText); + ValidateResponseStatus(provider, contract, document.RootElement); + + var audioString = GetRequiredJsonString(document.RootElement, 
contract.ResponseAudioJsonPath); + var audioBytesFromJson = DecodeAudioBytes(contract.ResponseAudioMode, audioString, provider.Name); + var jsonResult = await CreateResultAsync(audioBytesFromJson, contract.OutputContentType); + logger?.Info($"{provider.Name} TTS latency: headers={headersElapsedMs}ms total={stopwatch.ElapsedMilliseconds}ms ({contract.ResponseAudioMode})"); + return jsonResult; + } + + private static async Task SynthesizeViaWebSocketAsync( + string text, + VoiceProviderOption provider, + VoiceProviderConfigurationStore configurationStore, + IOpenClawLogger? logger, + CancellationToken cancellationToken) + { + var contract = provider.TextToSpeechWebSocket + ?? throw new InvalidOperationException($"TTS provider '{provider.Name}' does not expose a WebSocket contract."); + var providerConfiguration = configurationStore.FindProvider(provider.Id); + var templateValues = BuildTemplateValues(text, provider, providerConfiguration, contract.ApiKeySettingKey); + var endpoint = ApplyUrlTemplate(contract.EndpointTemplate, templateValues); + using var socket = new ClientWebSocket(); + ApplyAuthenticationHeader(socket.Options, contract, templateValues); + + using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + cts.CancelAfter(TimeSpan.FromSeconds(30)); + var ct = cts.Token; + + var stopwatch = Stopwatch.StartNew(); + await socket.ConnectAsync(new Uri(endpoint), ct); + + if (!string.IsNullOrWhiteSpace(contract.ConnectSuccessEventName)) + { + var connectedMessage = await ReceiveJsonMessageAsync(socket, ct); + ValidateWebSocketEvent(provider.Name, contract.ConnectSuccessEventName, connectedMessage, contract); + } + + var startMessage = ApplyJsonTemplate(contract.StartMessageTemplate, templateValues); + await SendTextMessageAsync(socket, startMessage, ct); + + if (!string.IsNullOrWhiteSpace(contract.StartSuccessEventName)) + { + var startedMessage = await ReceiveJsonMessageAsync(socket, ct); + ValidateWebSocketEvent(provider.Name, 
contract.StartSuccessEventName, startedMessage, contract); + } + + var continueMessage = ApplyJsonTemplate(contract.ContinueMessageTemplate, templateValues); + await SendTextMessageAsync(socket, continueMessage, ct); + + if (!string.IsNullOrWhiteSpace(contract.FinishMessageTemplate)) + { + await SendTextMessageAsync(socket, ApplyJsonTemplate(contract.FinishMessageTemplate, templateValues), ct); + } + + var audioBytes = new List(); + long? firstChunkMs = null; + + while (true) + { + var message = await ReceiveJsonMessageAsync(socket, ct); + EnsureWebSocketNotFailed(provider.Name, contract, message); + + if (TryGetJsonString(message, contract.ResponseAudioJsonPath, out var audioChunk) && + !string.IsNullOrWhiteSpace(audioChunk)) + { + if (!firstChunkMs.HasValue) + { + firstChunkMs = stopwatch.ElapsedMilliseconds; + } + + audioBytes.AddRange(DecodeAudioBytes(contract.ResponseAudioMode, audioChunk, provider.Name)); + } + + if (IsFinalWebSocketMessage(message, contract)) + { + break; + } + } + + try + { + await socket.CloseAsync(WebSocketCloseStatus.NormalClosure, "done", ct); + } + catch + { + } + + if (audioBytes.Count == 0) + { + throw new InvalidOperationException($"{provider.Name} TTS did not return any audio data."); + } + + var result = await CreateResultAsync(audioBytes.ToArray(), contract.OutputContentType); + logger?.Info($"{provider.Name} TTS latency: firstChunk={(firstChunkMs?.ToString() ?? "n/a")}ms total={stopwatch.ElapsedMilliseconds}ms (websocket)"); + return result; + } + + private static Dictionary BuildTemplateValues( + string text, + VoiceProviderOption provider, + VoiceProviderConfiguration? providerConfiguration, + VoiceTextToSpeechHttpContract contract) + { + return BuildTemplateValues(text, provider, providerConfiguration, contract.ApiKeySettingKey); + } + + private static Dictionary BuildTemplateValues( + string text, + VoiceProviderOption provider, + VoiceProviderConfiguration? 
providerConfiguration, + string apiKeySettingKey) + { + var values = new Dictionary(StringComparer.OrdinalIgnoreCase) + { + ["text"] = TemplateValue.FromString(text), + ["textWithTrailingSpace"] = TemplateValue.FromString( + text.EndsWith(' ') ? text : text + " ") + }; + + foreach (var setting in provider.Settings) + { + var configuredValue = providerConfiguration?.GetValue(setting.Key); + var effectiveValue = string.IsNullOrWhiteSpace(configuredValue) + ? setting.DefaultValue + : configuredValue.Trim(); + + if (string.IsNullOrWhiteSpace(effectiveValue)) + { + if (setting.Secret || string.Equals(setting.Key, apiKeySettingKey, StringComparison.OrdinalIgnoreCase)) + { + throw new InvalidOperationException( + $"{provider.Name} API key is not configured. Open Settings and complete the {provider.Name} voice provider fields."); + } + + if (setting.Required) + { + throw new InvalidOperationException( + $"{provider.Name} setting '{setting.Label}' is required. Open Settings and complete the {provider.Name} voice provider fields."); + } + + continue; + } + + values[setting.Key] = setting.JsonValue + ? TemplateValue.FromJson(effectiveValue, provider.Name, setting.Label, values) + : TemplateValue.FromString(effectiveValue); + } + + return values; + } + + private static string ApplyUrlTemplate(string template, IReadOnlyDictionary values) + { + var result = template; + foreach (var entry in values) + { + result = result.Replace( + "{{" + entry.Key + "}}", + Uri.EscapeDataString(entry.Value.Value), + StringComparison.Ordinal); + } + + return result; + } + + private static string ApplyJsonTemplate(string template, IReadOnlyDictionary values) + { + var result = template; + foreach (var entry in values) + { + result = result.Replace( + "{{" + entry.Key + "}}", + entry.Value.JsonFragment ? 
entry.Value.Value : JsonSerializer.Serialize(entry.Value.Value), + StringComparison.Ordinal); + } + + return result; + } + + private static void ApplyAuthenticationHeader( + HttpRequestMessage request, + VoiceTextToSpeechHttpContract contract, + IReadOnlyDictionary values) + { + if (!values.TryGetValue(contract.ApiKeySettingKey, out var apiKey) || string.IsNullOrWhiteSpace(apiKey.Value)) + { + throw new InvalidOperationException("Voice provider API key is not configured."); + } + + if (string.Equals(contract.AuthenticationHeaderName, "Authorization", StringComparison.OrdinalIgnoreCase) && + !string.IsNullOrWhiteSpace(contract.AuthenticationScheme)) + { + request.Headers.Authorization = new AuthenticationHeaderValue(contract.AuthenticationScheme, apiKey.Value); + return; + } + + var headerValue = string.IsNullOrWhiteSpace(contract.AuthenticationScheme) + ? apiKey.Value + : $"{contract.AuthenticationScheme} {apiKey.Value}"; + request.Headers.TryAddWithoutValidation(contract.AuthenticationHeaderName, headerValue); + } + + private static void ApplyAuthenticationHeader( + ClientWebSocketOptions options, + VoiceTextToSpeechWebSocketContract contract, + IReadOnlyDictionary values) + { + if (!values.TryGetValue(contract.ApiKeySettingKey, out var apiKey) || string.IsNullOrWhiteSpace(apiKey.Value)) + { + throw new InvalidOperationException("Voice provider API key is not configured."); + } + + var headerValue = string.Equals(contract.AuthenticationHeaderName, "Authorization", StringComparison.OrdinalIgnoreCase) && + !string.IsNullOrWhiteSpace(contract.AuthenticationScheme) + ? $"{contract.AuthenticationScheme} {apiKey.Value}" + : string.IsNullOrWhiteSpace(contract.AuthenticationScheme) + ? apiKey.Value + : $"{contract.AuthenticationScheme} {apiKey.Value}"; + + options.SetRequestHeader(contract.AuthenticationHeaderName, headerValue); + } + + private static HttpMethod ParseHttpMethod(string? 
method) + { + if (string.Equals(method, HttpMethod.Post.Method, StringComparison.OrdinalIgnoreCase)) + { + return HttpMethod.Post; + } + + return new HttpMethod(string.IsNullOrWhiteSpace(method) ? HttpMethod.Post.Method : method); + } + + private static void ValidateResponseStatus( + VoiceProviderOption provider, + VoiceTextToSpeechHttpContract contract, + JsonElement root) + { + if (string.IsNullOrWhiteSpace(contract.ResponseStatusCodeJsonPath)) + { + return; + } + + var statusValue = GetJsonValue(root, contract.ResponseStatusCodeJsonPath); + var statusText = statusValue.HasValue ? JsonElementToString(statusValue.Value) : null; + var successValue = contract.SuccessStatusValue ?? "0"; + if (string.Equals(statusText, successValue, StringComparison.OrdinalIgnoreCase)) + { + return; + } + + var statusMessage = string.IsNullOrWhiteSpace(contract.ResponseStatusMessageJsonPath) + ? null + : GetJsonValue(root, contract.ResponseStatusMessageJsonPath).HasValue + ? JsonElementToString(GetJsonValue(root, contract.ResponseStatusMessageJsonPath)!.Value) + : null; + throw new InvalidOperationException( + string.IsNullOrWhiteSpace(statusMessage) + ? $"{provider.Name} TTS returned an error." 
+ : $"{provider.Name} TTS returned an error: {statusMessage}"); + } + + private static void ValidateWebSocketEvent( + string providerName, + string expectedEvent, + JsonElement message, + VoiceTextToSpeechWebSocketContract contract) + { + EnsureWebSocketNotFailed(providerName, contract, message); + + if (!TryGetJsonString(message, "event", out var eventName) || + !string.Equals(eventName, expectedEvent, StringComparison.OrdinalIgnoreCase)) + { + throw new InvalidOperationException($"{providerName} TTS returned an unexpected WebSocket event."); + } + } + + private static void EnsureWebSocketNotFailed( + string providerName, + VoiceTextToSpeechWebSocketContract contract, + JsonElement message) + { + if (TryGetJsonString(message, "event", out var eventName) && + string.Equals(eventName, contract.TaskFailedEventName, StringComparison.OrdinalIgnoreCase)) + { + var statusMessage = string.IsNullOrWhiteSpace(contract.ResponseStatusMessageJsonPath) + ? null + : TryGetJsonString(message, contract.ResponseStatusMessageJsonPath, out var value) + ? value + : null; + + throw new InvalidOperationException( + string.IsNullOrWhiteSpace(statusMessage) + ? $"{providerName} TTS returned an error." + : $"{providerName} TTS returned an error: {statusMessage}"); + } + } + + private static JsonElement? GetJsonValue(JsonElement root, string? path) + { + if (string.IsNullOrWhiteSpace(path)) + { + return null; + } + + var current = root; + foreach (var segment in path.Split('.', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)) + { + if (current.ValueKind != JsonValueKind.Object || !current.TryGetProperty(segment, out current)) + { + return null; + } + } + + return current; + } + + private static string GetRequiredJsonString(JsonElement root, string? 
path) + { + var value = GetJsonValue(root, path); + if (!value.HasValue) + { + throw new InvalidOperationException("Voice provider response did not contain audio data."); + } + + var text = value.Value.GetString(); + if (string.IsNullOrWhiteSpace(text)) + { + throw new InvalidOperationException("Voice provider response did not contain audio data."); + } + + return text; + } + + private static bool TryGetJsonString(JsonElement root, string? path, out string value) + { + value = string.Empty; + var found = GetJsonValue(root, path); + if (!found.HasValue) + { + return false; + } + + var text = JsonElementToString(found.Value); + if (string.IsNullOrWhiteSpace(text)) + { + return false; + } + + value = text; + return true; + } + + private static bool IsFinalWebSocketMessage(JsonElement root, VoiceTextToSpeechWebSocketContract contract) + { + var finalFlag = GetJsonValue(root, contract.FinalFlagJsonPath); + return finalFlag.HasValue && finalFlag.Value.ValueKind == JsonValueKind.True; + } + + private static string? 
JsonElementToString(JsonElement element) + { + return element.ValueKind switch + { + JsonValueKind.String => element.GetString(), + JsonValueKind.Number => element.ToString(), + JsonValueKind.True => bool.TrueString, + JsonValueKind.False => bool.FalseString, + _ => element.ToString() + }; + } + + private static byte[] DecodeAudioBytes(string responseAudioMode, string audioValue, string providerName) + { + try + { + if (string.Equals(responseAudioMode, VoiceTextToSpeechResponseModes.HexJsonString, StringComparison.OrdinalIgnoreCase)) + { + return Convert.FromHexString(audioValue); + } + + if (string.Equals(responseAudioMode, VoiceTextToSpeechResponseModes.Base64JsonString, StringComparison.OrdinalIgnoreCase)) + { + return Convert.FromBase64String(audioValue); + } + + throw new InvalidOperationException($"Unsupported TTS response mode '{responseAudioMode}'."); + } + catch (FormatException ex) + { + throw new InvalidOperationException($"{providerName} TTS returned invalid audio data.", ex); + } + } + + private static async Task CreateResultAsync(byte[] audioBytes, string contentType) + { + var stream = new InMemoryRandomAccessStream(); + await stream.WriteAsync(audioBytes.AsBuffer()); + await stream.FlushAsync(); + stream.Seek(0); + + return new VoiceCloudTextToSpeechResult(stream, string.IsNullOrWhiteSpace(contentType) ? "audio/mpeg" : contentType); + } + + private static async Task CreateResultAsync(Stream sourceStream, string contentType, CancellationToken cancellationToken = default) + { + var stream = new InMemoryRandomAccessStream(); + await using (var output = stream.AsStreamForWrite()) + { + await sourceStream.CopyToAsync(output, cancellationToken); + await output.FlushAsync(cancellationToken); + } + + stream.Seek(0); + return new VoiceCloudTextToSpeechResult(stream, string.IsNullOrWhiteSpace(contentType) ? 
"audio/mpeg" : contentType); + } + + private static async Task SendTextMessageAsync(ClientWebSocket socket, string message, CancellationToken cancellationToken) + { + var bytes = Encoding.UTF8.GetBytes(message); + await socket.SendAsync(bytes, WebSocketMessageType.Text, true, cancellationToken); + } + + private static async Task ReceiveJsonMessageAsync(ClientWebSocket socket, CancellationToken cancellationToken) + { + using var buffer = new MemoryStream(); + var receiveBuffer = new byte[8192]; + + while (true) + { + var segment = new ArraySegment(receiveBuffer); + var result = await socket.ReceiveAsync(segment, cancellationToken); + + if (result.MessageType == WebSocketMessageType.Close) + { + var closeStatus = socket.CloseStatus?.ToString() ?? "Unknown"; + var closeDescription = string.IsNullOrWhiteSpace(socket.CloseStatusDescription) + ? null + : socket.CloseStatusDescription; + throw new InvalidOperationException( + string.IsNullOrWhiteSpace(closeDescription) + ? $"Voice provider closed the WebSocket unexpectedly ({closeStatus})." + : $"Voice provider closed the WebSocket unexpectedly ({closeStatus}: {closeDescription})."); + } + + buffer.Write(receiveBuffer, 0, result.Count); + if (result.EndOfMessage) + { + break; + } + } + + var text = Encoding.UTF8.GetString(buffer.ToArray()); + using var document = JsonDocument.Parse(text); + return document.RootElement.Clone(); + } + + private static HttpClient CreateHttpClient() + { + return new HttpClient + { + Timeout = TimeSpan.FromSeconds(30) + }; + } + + private readonly record struct TemplateValue(string Value, bool JsonFragment) + { + public static TemplateValue FromString(string value) => new(value, false); + + public static TemplateValue FromJson( + string json, + string providerName, + string label, + IReadOnlyDictionary? templateValues = null) + { + var substituted = templateValues == null + ? 
json + : ApplyJsonTemplate(json, templateValues); + + try + { + using var document = JsonDocument.Parse(substituted); + return new(document.RootElement.GetRawText(), true); + } + catch (JsonException ex) + { + try + { + using var wrapped = JsonDocument.Parse("{ " + substituted + " }"); + var wrappedJson = wrapped.RootElement.GetRawText(); + return new(wrappedJson[1..^1], true); + } + catch (JsonException) + { + throw new InvalidOperationException( + $"{providerName} setting '{label}' must be valid JSON.", + ex); + } + } + } + + public static implicit operator string(TemplateValue value) => value.Value; + } +} + +public sealed class VoiceCloudTextToSpeechResult : IDisposable +{ + public VoiceCloudTextToSpeechResult(IRandomAccessStream stream, string contentType) + { + Stream = stream; + ContentType = contentType; + } + + public IRandomAccessStream Stream { get; } + public string ContentType { get; } + + public void Dispose() + { + Stream.Dispose(); + } +} diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceConversationEvents.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceConversationEvents.cs new file mode 100644 index 0000000..f68c32b --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceConversationEvents.cs @@ -0,0 +1,25 @@ +using OpenClaw.Shared; + +namespace OpenClawTray.Services.Voice; + +public enum VoiceConversationDirection +{ + Outgoing, + Incoming +} + +public sealed class VoiceConversationTurnEventArgs : EventArgs +{ + public VoiceConversationDirection Direction { get; set; } + public string SessionKey { get; set; } = "main"; + public string Message { get; set; } = ""; + public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off; +} + +public sealed class VoiceTranscriptDraftEventArgs : EventArgs +{ + public string SessionKey { get; set; } = "main"; + public string Text { get; set; } = ""; + public bool Clear { get; set; } + public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off; +} diff --git 
a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceProviderCatalogService.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceProviderCatalogService.cs new file mode 100644 index 0000000..3af3f2c --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceProviderCatalogService.cs @@ -0,0 +1,256 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text.Json; +using OpenClaw.Shared; + +namespace OpenClawTray.Services.Voice; + +public static class VoiceProviderCatalogService +{ + private const long MaxCatalogBytes = 256 * 1024; + private const string CatalogRelativePath = "Assets\\voice-providers.json"; + + private static readonly JsonSerializerOptions s_jsonOptions = new() + { + PropertyNameCaseInsensitive = true, + WriteIndented = true + }; + + public static string CatalogFilePath => ResolveCatalogFilePath(); + + public static VoiceProviderCatalog LoadCatalog(IOpenClawLogger? logger = null) + { + var catalogFilePath = ResolveCatalogFilePath(); + + try + { + if (!File.Exists(catalogFilePath)) + { + throw new FileNotFoundException("Voice provider catalog asset not found.", catalogFilePath); + } + + var fileInfo = new FileInfo(catalogFilePath); + if (fileInfo.Length > MaxCatalogBytes) + { + throw new InvalidOperationException($"Voice provider catalog exceeds {MaxCatalogBytes} bytes."); + } + + var json = File.ReadAllText(catalogFilePath); + var catalog = JsonSerializer.Deserialize(json, s_jsonOptions); + if (catalog == null) + { + throw new InvalidOperationException("Voice provider catalog asset is empty or invalid."); + } + + return NormalizeCatalog(catalog); + } + catch (Exception ex) + { + throw new InvalidOperationException( + $"Failed to load voice provider catalog from '{catalogFilePath}': {ex.Message}", + ex); + } + } + + public static VoiceProviderOption ResolveSpeechToTextProvider(string? providerId, IOpenClawLogger? 
logger = null) + { + var catalog = LoadCatalog(logger); + return ResolveProvider(catalog.SpeechToTextProviders, providerId); + } + + public static VoiceProviderOption ResolveTextToSpeechProvider(string? providerId, IOpenClawLogger? logger = null) + { + var catalog = LoadCatalog(logger); + return ResolveProvider(catalog.TextToSpeechProviders, providerId); + } + + public static bool SupportsWindowsRuntime(string? providerId) + { + return string.Equals(providerId, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase); + } + + public static bool SupportsSpeechToTextRuntime(string? providerId) + { + try + { + var provider = ResolveSpeechToTextProvider(providerId); + return VoiceSpeechToTextRouteResolver.ResolveRouteKind(provider) == VoiceSpeechToTextRouteKind.WindowsMedia; + } + catch + { + return false; + } + } + + public static bool SupportsTextToSpeechRuntime(string? providerId) + { + if (SupportsWindowsRuntime(providerId)) + { + return true; + } + + try + { + var provider = ResolveTextToSpeechProvider(providerId); + return provider.TextToSpeechHttp != null || provider.TextToSpeechWebSocket != null; + } + catch + { + return false; + } + } + + private static VoiceProviderCatalog NormalizeCatalog(VoiceProviderCatalog catalog) + { + return new VoiceProviderCatalog + { + SpeechToTextProviders = NormalizeProviders(catalog.SpeechToTextProviders), + TextToSpeechProviders = NormalizeProviders(catalog.TextToSpeechProviders) + }; + } + + private static List NormalizeProviders(List? providers) + { + return (providers ?? []) + .Where(p => !string.IsNullOrWhiteSpace(p.Id)) + .Select(Clone) + .Where(p => p.Enabled || p.VisibleInSettings) + .OrderByDescending(p => string.Equals(p.Id, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase)) + .ThenBy(p => p.Name, StringComparer.OrdinalIgnoreCase) + .ToList(); + } + + private static VoiceProviderOption ResolveProvider(IEnumerable providers, string? 
providerId) + { + if (!string.IsNullOrWhiteSpace(providerId)) + { + var configured = providers.FirstOrDefault(p => string.Equals(p.Id, providerId, StringComparison.OrdinalIgnoreCase)); + if (configured != null) + { + return Clone(configured); + } + } + + return providers + .Select(Clone) + .FirstOrDefault(p => string.Equals(p.Id, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase)) + ?? new VoiceProviderOption + { + Id = VoiceProviderIds.Windows, + Name = "Windows Speech", + Runtime = "windows" + }; + } + + private static VoiceProviderOption Clone(VoiceProviderOption source) + { + return new VoiceProviderOption + { + Id = source.Id, + Name = source.Name, + Runtime = source.Runtime, + Enabled = source.Enabled, + VisibleInSettings = source.VisibleInSettings, + Selectable = source.Selectable, + Description = source.Description, + Settings = source.Settings.Select(Clone).ToList(), + TextToSpeechHttp = Clone(source.TextToSpeechHttp), + TextToSpeechWebSocket = Clone(source.TextToSpeechWebSocket) + }; + } + + private static VoiceProviderSettingDefinition Clone(VoiceProviderSettingDefinition source) + { + return new VoiceProviderSettingDefinition + { + Key = source.Key, + Label = source.Label, + Secret = source.Secret, + Required = source.Required, + JsonValue = source.JsonValue, + DefaultValue = source.DefaultValue, + Placeholder = source.Placeholder, + Description = source.Description, + Options = source.Options.ToList() + }; + } + + private static VoiceTextToSpeechHttpContract? Clone(VoiceTextToSpeechHttpContract? 
source) + { + if (source == null) + { + return null; + } + + return new VoiceTextToSpeechHttpContract + { + EndpointTemplate = source.EndpointTemplate, + HttpMethod = source.HttpMethod, + AuthenticationHeaderName = source.AuthenticationHeaderName, + AuthenticationScheme = source.AuthenticationScheme, + ApiKeySettingKey = source.ApiKeySettingKey, + RequestContentType = source.RequestContentType, + RequestBodyTemplate = source.RequestBodyTemplate, + ResponseAudioMode = source.ResponseAudioMode, + ResponseAudioJsonPath = source.ResponseAudioJsonPath, + ResponseStatusCodeJsonPath = source.ResponseStatusCodeJsonPath, + ResponseStatusMessageJsonPath = source.ResponseStatusMessageJsonPath, + SuccessStatusValue = source.SuccessStatusValue, + OutputContentType = source.OutputContentType + }; + } + + private static VoiceTextToSpeechWebSocketContract? Clone(VoiceTextToSpeechWebSocketContract? source) + { + if (source == null) + { + return null; + } + + return new VoiceTextToSpeechWebSocketContract + { + EndpointTemplate = source.EndpointTemplate, + AuthenticationHeaderName = source.AuthenticationHeaderName, + AuthenticationScheme = source.AuthenticationScheme, + ApiKeySettingKey = source.ApiKeySettingKey, + ConnectSuccessEventName = source.ConnectSuccessEventName, + StartMessageTemplate = source.StartMessageTemplate, + StartSuccessEventName = source.StartSuccessEventName, + ContinueMessageTemplate = source.ContinueMessageTemplate, + FinishMessageTemplate = source.FinishMessageTemplate, + ResponseAudioMode = source.ResponseAudioMode, + ResponseAudioJsonPath = source.ResponseAudioJsonPath, + ResponseStatusCodeJsonPath = source.ResponseStatusCodeJsonPath, + ResponseStatusMessageJsonPath = source.ResponseStatusMessageJsonPath, + FinalFlagJsonPath = source.FinalFlagJsonPath, + TaskFailedEventName = source.TaskFailedEventName, + SuccessStatusValue = source.SuccessStatusValue, + OutputContentType = source.OutputContentType + }; + } + + private static string ResolveCatalogFilePath() 
+ { + var bundledPath = Path.Combine(AppContext.BaseDirectory, CatalogRelativePath); + if (File.Exists(bundledPath)) + { + return bundledPath; + } + + var current = new DirectoryInfo(AppContext.BaseDirectory); + while (current != null) + { + var sourcePath = Path.Combine(current.FullName, "src", "OpenClaw.Tray.WinUI", CatalogRelativePath); + if (File.Exists(sourcePath)) + { + return sourcePath; + } + + current = current.Parent; + } + + return bundledPath; + } +} diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceServiceTransportLogic.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceServiceTransportLogic.cs new file mode 100644 index 0000000..9a3e57e --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceServiceTransportLogic.cs @@ -0,0 +1,255 @@ +using OpenClaw.Shared; +using Windows.Media.Devices; +using Windows.Media.SpeechRecognition; + +namespace OpenClawTray.Services.Voice; + +public static class VoiceServiceTransportLogic +{ + private static readonly TimeSpan HypothesisPromotionWindow = TimeSpan.FromSeconds(2); + + public static TaskCompletionSource GetOrCreateTransportReadySource( + ConnectionStatus transportStatus, + TaskCompletionSource? 
existingReadySource, + out bool shouldStartConnection) + { + if (transportStatus == ConnectionStatus.Connecting && existingReadySource != null) + { + shouldStartConnection = false; + return existingReadySource; + } + + shouldStartConnection = true; + return new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + } + + public static bool UsesCloudTextToSpeechRuntime(VoiceProviderOption provider) + { + return provider.TextToSpeechHttp != null || provider.TextToSpeechWebSocket != null; + } + + public static bool ShouldAcceptAssistantReply( + bool awaitingReply, + bool isSpeaking, + int queuedReplyCount, + bool acceptedViaLateReplyGrace = false) + { + return awaitingReply || isSpeaking || queuedReplyCount > 0 || acceptedViaLateReplyGrace; + } + + public static bool ShouldAcceptLateAssistantReply( + bool awaitingReply, + bool isSpeaking, + int queuedReplyCount, + string? lateReplySessionKey, + DateTime? lateReplyGraceUntilUtc, + string? incomingSessionKey, + DateTime utcNow) + { + return !awaitingReply && + !isSpeaking && + queuedReplyCount == 0 && + !string.IsNullOrWhiteSpace(lateReplySessionKey) && + !string.IsNullOrWhiteSpace(incomingSessionKey) && + IsMatchingSessionKey(incomingSessionKey, lateReplySessionKey) && + lateReplyGraceUntilUtc.HasValue && + utcNow <= lateReplyGraceUntilUtc.Value; + } + + public static bool ShouldRestartRecognitionAfterCompletion( + bool running, + VoiceActivationMode mode, + bool restartInProgress, + bool awaitingReply, + bool isSpeaking) + { + return running && + mode == VoiceActivationMode.TalkMode && + !restartInProgress && + !awaitingReply && + !isSpeaking; + } + + public static string DescribeRecognitionCompletionRestartDecision( + bool running, + VoiceActivationMode mode, + bool restartInProgress, + bool awaitingReply, + bool isSpeaking) + { + if (!running) + { + return "runtime-not-running"; + } + + if (mode != VoiceActivationMode.TalkMode) + { + return $"mode={mode}"; + } + + if (restartInProgress) + { + 
return "controlled-restart-in-progress"; + } + + if (awaitingReply) + { + return "awaiting-reply"; + } + + if (isSpeaking) + { + return "speaking"; + } + + return "eligible"; + } + + public static bool ShouldRebuildRecognitionAfterCompletion( + SpeechRecognitionResultStatus status, + bool sessionHadActivity, + bool sessionHadCaptureSignal, + bool restartInProgress, + bool awaitingReply, + bool isSpeaking) + { + if (restartInProgress || awaitingReply || isSpeaking || sessionHadActivity) + { + return false; + } + + return status == SpeechRecognitionResultStatus.UserCanceled; + } + + public static string DescribeRecognitionCompletionRebuildDecision( + SpeechRecognitionResultStatus status, + bool sessionHadActivity, + bool sessionHadCaptureSignal, + bool restartInProgress, + bool awaitingReply, + bool isSpeaking) + { + if (restartInProgress) + { + return "controlled-restart-in-progress"; + } + + if (awaitingReply) + { + return "awaiting-reply"; + } + + if (isSpeaking) + { + return "speaking"; + } + + if (sessionHadActivity) + { + return "session-had-activity"; + } + + if (sessionHadCaptureSignal) + { + return "capture-signal-without-recognition"; + } + + return status switch + { + SpeechRecognitionResultStatus.UserCanceled => "user-canceled-without-activity", + SpeechRecognitionResultStatus.TimeoutExceeded => "disabled-official-session-restart-only (status=TimeoutExceeded)", + _ => $"disabled-official-session-restart-only (status={status})" + }; + } + + public static string SelectRecognizedText( + string recognizedText, + string? 
latestHypothesisText, + DateTime latestHypothesisUtc, + DateTime utcNow, + out bool promotedHypothesis) + { + promotedHypothesis = false; + + if (string.IsNullOrWhiteSpace(recognizedText) || + string.IsNullOrWhiteSpace(latestHypothesisText) || + utcNow - latestHypothesisUtc > HypothesisPromotionWindow) + { + return recognizedText; + } + + var normalizedResult = recognizedText.Trim(); + var normalizedHypothesis = latestHypothesisText.Trim(); + + if (normalizedHypothesis.Length <= normalizedResult.Length + 3) + { + return normalizedResult; + } + + if (!normalizedHypothesis.EndsWith(normalizedResult, StringComparison.OrdinalIgnoreCase)) + { + return normalizedResult; + } + + promotedHypothesis = true; + return normalizedHypothesis; + } + + public static string? SelectCompletionFallbackText( + bool sessionHadActivity, + string? latestHypothesisText, + DateTime latestHypothesisUtc, + DateTime utcNow) + { + if (!sessionHadActivity || + string.IsNullOrWhiteSpace(latestHypothesisText) || + utcNow - latestHypothesisUtc > HypothesisPromotionWindow) + { + return null; + } + + return latestHypothesisText.Trim(); + } + + public static bool ShouldClearTranscriptDraftAfterCompletion( + bool awaitingReply, + bool isSpeaking, + bool usedFallbackTranscript) + { + return !awaitingReply && + !isSpeaking && + !usedFallbackTranscript; + } + + public static bool ShouldRepromptAfterIncompleteRecognition( + bool sessionHadActivity, + bool awaitingReply, + bool isSpeaking, + bool usedFallbackTranscript) + { + return sessionHadActivity && + !awaitingReply && + !isSpeaking && + !usedFallbackTranscript; + } + + public static bool ShouldRefreshRecognitionForDefaultCaptureDeviceChange( + bool running, + VoiceActivationMode mode, + string? 
configuredInputDeviceId, + AudioDeviceRole role) + { + return running && + mode == VoiceActivationMode.TalkMode && + string.IsNullOrWhiteSpace(configuredInputDeviceId) && + role == AudioDeviceRole.Default; + } + + private static bool IsMatchingSessionKey(string? first, string? second) + { + return string.Equals( + string.IsNullOrWhiteSpace(first) ? "main" : first, + string.IsNullOrWhiteSpace(second) ? "main" : second, + StringComparison.OrdinalIgnoreCase); + } +} diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteKind.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteKind.cs new file mode 100644 index 0000000..cfb5f95 --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteKind.cs @@ -0,0 +1,8 @@ +namespace OpenClawTray.Services.Voice; + +public enum VoiceSpeechToTextRouteKind +{ + WindowsMedia, + Streaming, + SherpaOnnx +} diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteResolver.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteResolver.cs new file mode 100644 index 0000000..61aa6a8 --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteResolver.cs @@ -0,0 +1,28 @@ +using OpenClaw.Shared; + +namespace OpenClawTray.Services.Voice; + +public static class VoiceSpeechToTextRouteResolver +{ + public static VoiceSpeechToTextRouteKind ResolveRouteKind(VoiceProviderOption provider) + { + ArgumentNullException.ThrowIfNull(provider); + + if (string.Equals(provider.Id, VoiceProviderIds.SherpaOnnx, StringComparison.OrdinalIgnoreCase)) + { + return VoiceSpeechToTextRouteKind.SherpaOnnx; + } + + if (string.Equals(provider.Runtime, VoiceProviderRuntimeIds.Streaming, StringComparison.OrdinalIgnoreCase)) + { + return VoiceSpeechToTextRouteKind.Streaming; + } + + if (string.Equals(provider.Runtime, VoiceProviderRuntimeIds.Embedded, StringComparison.OrdinalIgnoreCase)) + { + return VoiceSpeechToTextRouteKind.SherpaOnnx; + } + + return 
VoiceSpeechToTextRouteKind.WindowsMedia; + } +} diff --git a/src/OpenClaw.Tray.Shared/Windows/WebChatVoiceDomBridge.cs b/src/OpenClaw.Tray.Shared/Windows/WebChatVoiceDomBridge.cs new file mode 100644 index 0000000..e1edb3d --- /dev/null +++ b/src/OpenClaw.Tray.Shared/Windows/WebChatVoiceDomBridge.cs @@ -0,0 +1,95 @@ +using System.Text.Json; + +namespace OpenClawTray.Windows; + +public static class WebChatVoiceDomBridge +{ + public const string DocumentCreatedScript = """ +(() => { + const isVisible = (el) => !!el && !(el.disabled === true) && el.getClientRects().length > 0; + let desiredDraft = ''; + + const findComposer = () => { + const candidates = Array.from(document.querySelectorAll('textarea, input[type="text"], [contenteditable="true"], [contenteditable="plaintext-only"]')); + return candidates.find(isVisible) || null; + }; + + const setElementValue = (el, value) => { + const text = typeof value === 'string' ? value : ''; + if ('value' in el) { + const proto = el.tagName === 'TEXTAREA' ? 
HTMLTextAreaElement.prototype : HTMLInputElement.prototype; + const descriptor = Object.getOwnPropertyDescriptor(proto, 'value'); + if (descriptor && descriptor.set) { + descriptor.set.call(el, text); + } else { + el.value = text; + } + el.dispatchEvent(new InputEvent('input', { bubbles: true, data: text, inputType: 'insertText' })); + el.dispatchEvent(new Event('change', { bubbles: true })); + return; + } + + if (el.isContentEditable) { + el.textContent = text; + el.dispatchEvent(new InputEvent('input', { bubbles: true, data: text, inputType: 'insertText' })); + el.dispatchEvent(new Event('change', { bubbles: true })); + } + }; + + const applyDraftIfPossible = () => { + const composer = findComposer(); + if (!composer) return false; + setElementValue(composer, desiredDraft); + return true; + }; + + const clearLegacyTurnsHost = () => { + const host = document.getElementById('openclaw-tray-voice-turns'); + if (host) { + host.remove(); + } + }; + + const observer = new MutationObserver(() => applyDraftIfPossible()); + const start = () => { + if (!document.body) return; + observer.observe(document.body, { childList: true, subtree: true }); + applyDraftIfPossible(); + clearLegacyTurnsHost(); + }; + + if (document.readyState === 'loading') { + document.addEventListener('DOMContentLoaded', start, { once: true }); + } else { + start(); + } + + window.__openClawTrayVoice = { + setDraft(text) { + desiredDraft = text || ''; + return applyDraftIfPossible(); + }, + clearDraft() { + desiredDraft = ''; + return applyDraftIfPossible(); + }, + setTurns() { + clearLegacyTurnsHost(); + return true; + } + }; +})(); +"""; + + public static string BuildSetDraftScript(string? 
text) + { + if (string.IsNullOrWhiteSpace(text)) + { + return "window.__openClawTrayVoice?.clearDraft?.();"; + } + + return $"window.__openClawTrayVoice?.setDraft?.({JsonSerializer.Serialize(text)});"; + } + + public const string ClearLegacyTurnsScript = "window.__openClawTrayVoice?.setTurns?.([]);"; +} diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs index 04f053d..5b67e10 100644 --- a/src/OpenClaw.Tray.WinUI/App.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs @@ -6,6 +6,7 @@ using OpenClawTray.Dialogs; using OpenClawTray.Helpers; using OpenClawTray.Services; +using OpenClawTray.Services.Voice; using OpenClawTray.Windows; using System; using System.Collections.Frozen; @@ -39,6 +40,7 @@ public partial class App : Application private GlobalHotkeyService? _globalHotkey; private System.Timers.Timer? _healthCheckTimer; private System.Timers.Timer? _sessionPollTimer; + private Microsoft.UI.Dispatching.DispatcherQueueTimer? _voiceTrayIconTimer; private Mutex? _mutex; private Microsoft.UI.Dispatching.DispatcherQueue? _dispatcherQueue; private CancellationTokenSource? _deepLinkCts; @@ -57,6 +59,7 @@ public partial class App : Application private GatewayCostUsageInfo? _lastUsageCost; private DateTime _lastCheckTime = DateTime.Now; private DateTime _lastUsageActivityLogUtc = DateTime.MinValue; + private string? _lastTrayIconPath; // FrozenDictionary for O(1) case-insensitive notification type → setting lookup — no per-call allocation. private static readonly System.Collections.Frozen.FrozenDictionary> s_notifTypeMap = @@ -81,6 +84,8 @@ public partial class App : Application // Windows (created on demand) private SettingsWindow? _settingsWindow; + private VoiceRepeaterWindow? _voiceRepeaterWindow; + private VoiceModeWindow? _voiceModeWindow; private WebChatWindow? _webChatWindow; private StatusDetailWindow? _statusDetailWindow; private NotificationHistoryWindow? 
_notificationHistoryWindow; @@ -90,6 +95,8 @@ public partial class App : Application // Node service (optional, enabled in settings) private NodeService? _nodeService; + private VoiceService? _voiceService; + private VoiceChatCoordinator? _voiceChatCoordinator; // Keep-alive window to anchor WinUI runtime (prevents GC/threading issues) private Window? _keepAliveWindow; @@ -269,6 +276,11 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) // Register toast activation handler ToastNotificationManagerCompat.OnActivated += OnToastActivated; + _voiceService = new VoiceService(new AppLogger(), _settings); + _voiceChatCoordinator = new VoiceChatCoordinator( + _voiceService, + new DispatcherQueueAdapter(_dispatcherQueue!)); + _voiceChatCoordinator.ConversationTurnAvailable += OnVoiceConversationTurnAvailable; _sshTunnelService = new SshTunnelService(new AppLogger()); _sshTunnelService.TunnelExited += OnSshTunnelExited; @@ -297,6 +309,7 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) // Start health check timer StartHealthCheckTimer(); + StartVoiceTrayIconTimer(); // Start deep link server StartDeepLinkServer(); @@ -305,7 +318,8 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) if (_settings.GlobalHotkeyEnabled) { _globalHotkey = new GlobalHotkeyService(); - _globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed; + _globalHotkey.QuickSendHotkeyPressed += OnGlobalQuickSendHotkeyPressed; + _globalHotkey.VoiceToggleHotkeyPressed += OnGlobalVoiceToggleHotkeyPressed; _globalHotkey.Register(); } @@ -318,6 +332,11 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args) HandleDeepLink(startupDeepLink); } + if (ShouldShowVoiceRepeaterAtStartup()) + { + _dispatcherQueue?.TryEnqueue(ShowVoiceModeSettings); + } + Logger.Info("Application started (WinUI 3)"); } @@ -341,13 +360,28 @@ private void InitializeTrayIcon() // Pre-create tray menu window at startup to avoid creation crashes later 
InitializeTrayMenuWindow(); - var iconPath = IconHelper.GetStatusIconPath(ConnectionStatus.Disconnected); + var iconPath = AppIconHelper.GetStatusIconPath(ConnectionStatus.Disconnected); _trayIcon = new TrayIcon(1, iconPath, "OpenClaw Tray — Disconnected"); + _lastTrayIconPath = iconPath; _trayIcon.IsVisible = true; _trayIcon.Selected += OnTrayIconSelected; _trayIcon.ContextMenu += OnTrayContextMenu; } + private void StartVoiceTrayIconTimer() + { + if (_dispatcherQueue == null || _voiceTrayIconTimer != null) + { + return; + } + + _voiceTrayIconTimer = _dispatcherQueue.CreateTimer(); + _voiceTrayIconTimer.Interval = TimeSpan.FromMilliseconds(250); + _voiceTrayIconTimer.IsRepeating = true; + _voiceTrayIconTimer.Tick += (s, e) => UpdateTrayIcon(); + _voiceTrayIconTimer.Start(); + } + private void InitializeTrayMenuWindow() { // Pre-create menu window once - reuse to avoid crash on window creation after idle @@ -535,6 +569,8 @@ private void OnTrayMenuItemClicked(object? sender, string action) switch (action) { case "status": ShowStatusDetail(); break; + case "voice-settings": ShowVoiceModeSettings(); break; + case "voice-toggle-pause": _ = ToggleVoiceQuickPauseAsync(); break; case "dashboard": OpenDashboard(); break; case "webchat": ShowWebChat(); break; case "quicksend": ShowQuickSend(); break; @@ -742,6 +778,60 @@ private List GetRecentActivity(int maxItems) .ToList(); } + private string GetRunningVoiceModeLabel() + { + var status = _voiceService?.CurrentStatus; + if (status == null) + { + return "Off"; + } + + return VoiceDisplayHelper.GetRuntimeLabel(status); + } + + private bool CanQuickToggleVoiceMode() + { + if (_settings?.EnableNodeMode != true || _voiceService == null) + { + return false; + } + + var status = _voiceService.CurrentStatus; + if (status.State == VoiceRuntimeState.Paused) + { + return true; + } + + return _settings.Voice.Enabled && _settings.Voice.Mode != VoiceActivationMode.Off; + } + + private bool ShouldShowVoiceRepeaterAtStartup() + { + return 
_settings?.EnableNodeMode == true && + _settings.Voice.Enabled && + _settings.Voice.Mode != VoiceActivationMode.Off && + _settings.Voice.ShowRepeaterAtStartup; + } + + private string GetVoiceQuickToggleLabel() + { + var status = _voiceService?.CurrentStatus; + return status?.State == VoiceRuntimeState.Paused + ? "Resume Voice" + : "Pause Voice"; + } + + private string GetVoiceDeviceSummary() + { + var voice = _settings?.Voice; + if (voice == null) + return "Talk: system default · Listen: system default"; + + var talk = string.IsNullOrWhiteSpace(voice.OutputDeviceId) ? "system default" : "selected speaker"; + var listen = string.IsNullOrWhiteSpace(voice.InputDeviceId) ? "system default" : "selected microphone"; + return $"Talk: {talk} · Listen: {listen}"; + } + private void BuildTrayMenuPopup(TrayMenuWindow menu) { // Brand header @@ -758,6 +848,14 @@ private void BuildTrayMenuPopup(TrayMenuWindow menu) menu.AddMenuItem(_currentActivity.DisplayText, _currentActivity.Glyph, "", isEnabled: false); } + menu.AddMenuItem($"Voice Mode: {GetRunningVoiceModeLabel()}", "🎙️", "voice-settings"); + menu.AddMenuItem($"↳ {GetVoiceDeviceSummary()}", "", "", isEnabled: false, indent: true); + menu.AddMenuItem($"↳ {GetVoiceQuickToggleLabel()} (Ctrl+Alt+Shift+V)", "", "voice-toggle-pause", isEnabled: CanQuickToggleVoiceMode(), indent: true); + if (_settings?.EnableNodeMode != true) + { + menu.AddMenuItem("↳ Enable Node Mode to activate voice runtime", "", "", isEnabled: false, indent: true); + } + // Usage if (_lastUsage != null || _lastUsageStatus != null || _lastUsageCost != null) { @@ -1147,7 +1245,7 @@ private void InitializeNodeService() { Logger.Info("Initializing Windows Node service..."); - _nodeService = new NodeService(new AppLogger(), _dispatcherQueue, DataPath); + _nodeService = new NodeService(new AppLogger(), _dispatcherQueue, _voiceService!, DataPath); _nodeService.StatusChanged += OnNodeStatusChanged; _nodeService.NotificationRequested += OnNodeNotificationRequested; 
_nodeService.PairingStatusChanged += OnPairingStatusChanged; @@ -1558,13 +1656,7 @@ private void UpdateTrayIcon() { if (_trayIcon == null) return; - var status = _currentStatus; - if (_currentActivity != null && _currentActivity.Kind != OpenClaw.Shared.ActivityKind.Idle) - { - status = ConnectionStatus.Connecting; // Use connecting icon for activity - } - - var iconPath = IconHelper.GetStatusIconPath(status); + var iconPath = GetTrayIconPathForCurrentState(); var tooltip = $"OpenClaw Tray — {_currentStatus}"; if (_currentActivity != null && !string.IsNullOrEmpty(_currentActivity.DisplayText)) @@ -1576,7 +1668,11 @@ private void UpdateTrayIcon() try { - _trayIcon.SetIcon(iconPath); + if (!string.Equals(_lastTrayIconPath, iconPath, StringComparison.OrdinalIgnoreCase)) + { + _trayIcon.SetIcon(iconPath); + _lastTrayIconPath = iconPath; + } _trayIcon.Tooltip = tooltip; } catch (Exception ex) @@ -1585,15 +1681,60 @@ private void UpdateTrayIcon() } } + private string GetTrayIconPathForCurrentState() + { + var voiceIconState = GetVoiceTrayIconState(); + if (voiceIconState != VoiceTrayIconState.Off) + { + return VoiceTrayIconHelper.GetVoiceTrayIconPath(voiceIconState); + } + + if (_voiceService?.CurrentStatus.State == VoiceRuntimeState.Paused) + { + return VoiceTrayIconHelper.GetVoiceTrayIconPath(VoiceTrayIconState.Off); + } + + var status = _currentStatus; + if (_currentActivity != null && _currentActivity.Kind != OpenClaw.Shared.ActivityKind.Idle) + { + status = ConnectionStatus.Connecting; + } + + return AppIconHelper.GetStatusIconPath(status); + } + + private VoiceTrayIconState GetVoiceTrayIconState() + { + var voiceStatus = _voiceService?.CurrentStatus; + if (voiceStatus == null || !voiceStatus.Running) + { + return VoiceTrayIconState.Off; + } + + return voiceStatus.State switch + { + VoiceRuntimeState.PlayingResponse => VoiceTrayIconState.Speaking, + VoiceRuntimeState.ListeningForVoiceWake => VoiceTrayIconState.Listening, + VoiceRuntimeState.ListeningContinuously => 
VoiceTrayIconState.Listening, + VoiceRuntimeState.RecordingUtterance => VoiceTrayIconState.Listening, + VoiceRuntimeState.Paused => VoiceTrayIconState.Off, + _ when voiceStatus.Mode == VoiceActivationMode.Off => VoiceTrayIconState.Off, + _ => VoiceTrayIconState.Off + }; + } + #endregion #region Window Management private void ShowSettings() { + if (_settings == null || _voiceService == null) + return; + if (_settingsWindow == null || _settingsWindow.IsClosed) { - _settingsWindow = new SettingsWindow(_settings!); + _settingsWindow = new SettingsWindow(_settings, _voiceService); _settingsWindow.Closed += (s, e) => { _settingsWindow.SettingsSaved -= OnSettingsSaved; @@ -1604,40 +1745,143 @@ private void ShowSettings() _settingsWindow.Activate(); } - private void OnSettingsSaved(object? sender, EventArgs e) + private void ShowVoiceModeSettings() { - // Reconnect with new settings — mirror the startup if/else pattern - // to avoid dual connections that cause gateway conflicts. - UnsubscribeGatewayEvents(); - _gatewayClient?.Dispose(); - _gatewayClient = null; - var oldNodeService = _nodeService; - _nodeService = null; - try { oldNodeService?.Dispose(); } catch (Exception ex) { Logger.Warn($"Node dispose error: {ex.Message}"); } - if (_settings?.UseSshTunnel != true) + if (_settings == null || _voiceService == null) + return; + + if (_voiceRepeaterWindow == null || _voiceRepeaterWindow.IsClosed) { - _sshTunnelService?.Stop(); + _voiceRepeaterWindow = new VoiceRepeaterWindow(_settings, _voiceService); + _voiceRepeaterWindow.OpenVoiceStatusRequested += OnOpenVoiceStatusRequested; + _voiceRepeaterWindow.Closed += (s, e) => + { + _voiceChatCoordinator?.DetachWindow(_voiceRepeaterWindow); + _voiceRepeaterWindow.OpenVoiceStatusRequested -= OnOpenVoiceStatusRequested; + _voiceRepeaterWindow = null; + }; + _voiceChatCoordinator?.AttachWindow(_voiceRepeaterWindow); } - // Reset status so the tray doesn't show a stale "Connected" from the previous mode - _currentStatus = 
ConnectionStatus.Disconnected; - UpdateTrayIcon(); - - if (_settings?.EnableNodeMode == true) + _voiceRepeaterWindow.RefreshStatus(); + _voiceRepeaterWindow.Activate(); + } + + private void ShowVoiceStatusWindow() + { + if (_settings == null || _voiceService == null) { - InitializeNodeService(); + return; } - else + + if (_voiceModeWindow == null || _voiceModeWindow.IsClosed) { - InitializeGatewayClient(); + _voiceModeWindow = new VoiceModeWindow(_settings, _voiceService, _voiceService); + _voiceModeWindow.OpenSettingsRequested += OnVoiceModeOpenSettingsRequested; + _voiceModeWindow.Closed += (s, e) => + { + if (_voiceModeWindow != null) + { + _voiceModeWindow.OpenSettingsRequested -= OnVoiceModeOpenSettingsRequested; + } + + _voiceModeWindow = null; + }; + } + + _voiceModeWindow.RefreshStatus(); + _voiceModeWindow.Activate(); + } + + private void OnOpenVoiceStatusRequested(object? sender, EventArgs e) + { + ShowVoiceStatusWindow(); + } + + private void OnVoiceModeOpenSettingsRequested(object? sender, EventArgs e) + { + ShowSettings(); + } + + private async void OnSettingsSaved(object? sender, EventArgs e) + { + // Reconnect with new settings — mirror the startup if/else pattern + // to avoid dual connections that cause gateway conflicts. 
+ try + { + if (_gatewayClient != null) + { + try + { + await _gatewayClient.DisconnectAsync(); + } + catch (Exception ex) + { + Logger.Warn($"Gateway disconnect error: {ex.Message}"); + } + + _gatewayClient.Dispose(); + _gatewayClient = null; + } + + var oldNodeService = _nodeService; + _nodeService = null; + if (oldNodeService != null) + { + try + { + await oldNodeService.DisconnectAsync(); + } + catch (Exception ex) + { + Logger.Warn($"Node disconnect error: {ex.Message}"); + } + + try + { + oldNodeService.Dispose(); + } + catch (Exception ex) + { + Logger.Warn($"Node dispose error: {ex.Message}"); + } + } + + if (_settings?.UseSshTunnel != true) + { + _sshTunnelService?.Stop(); + } + + // Reset status so the tray doesn't show a stale "Connected" from the previous mode + _currentStatus = ConnectionStatus.Disconnected; + UpdateTrayIcon(); + + if (_settings?.EnableNodeMode == true) + { + InitializeNodeService(); + } + else + { + InitializeGatewayClient(); + if (_voiceService != null) + { + await _voiceService.StopAsync(new VoiceStopArgs { Reason = "Node mode disabled" }); + } + } + } + catch (Exception ex) + { + Logger.Warn($"Settings reconnect failed: {ex.Message}"); } // Update global hotkey if (_settings!.GlobalHotkeyEnabled) { _globalHotkey ??= new GlobalHotkeyService(); - _globalHotkey.HotkeyPressed -= OnGlobalHotkeyPressed; - _globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed; + _globalHotkey.QuickSendHotkeyPressed -= OnGlobalQuickSendHotkeyPressed; + _globalHotkey.QuickSendHotkeyPressed += OnGlobalQuickSendHotkeyPressed; + _globalHotkey.VoiceToggleHotkeyPressed -= OnGlobalVoiceToggleHotkeyPressed; + _globalHotkey.VoiceToggleHotkeyPressed += OnGlobalVoiceToggleHotkeyPressed; _globalHotkey.Register(); } else @@ -1645,6 +1889,9 @@ private void OnSettingsSaved(object? 
sender, EventArgs e) _globalHotkey?.Unregister(); } + _voiceRepeaterWindow?.RefreshStatus(); + _voiceModeWindow?.RefreshStatus(); + // Update auto-start AutoStartManager.SetAutoStart(_settings.AutoStart); } @@ -1656,8 +1903,15 @@ private void ShowWebChat() if (_webChatWindow == null || _webChatWindow.IsClosed) { - _webChatWindow = new WebChatWindow(_settings.GetEffectiveGatewayUrl(), _settings.Token); - _webChatWindow.Closed += (s, e) => _webChatWindow = null; + _webChatWindow = new WebChatWindow( + _settings.GetEffectiveGatewayUrl(), + _settings.Token); + _webChatWindow.Closed += (s, e) => + { + _voiceChatCoordinator?.DetachWindow(_webChatWindow); + _webChatWindow = null; + }; + _voiceChatCoordinator?.AttachWindow(_webChatWindow); } _webChatWindow.Activate(); } @@ -1874,7 +2128,7 @@ private void OpenLogFile() } } - private void OnGlobalHotkeyPressed(object? sender, EventArgs e) + private void OnGlobalQuickSendHotkeyPressed(object? sender, EventArgs e) { // Hotkey events are raised from a dedicated Win32 message-loop thread. // Creating/activating WinUI windows must happen on the app's UI thread. @@ -1891,6 +2145,137 @@ private void OnGlobalHotkeyPressed(object? sender, EventArgs e) } } + private void OnGlobalVoiceToggleHotkeyPressed(object? 
sender, EventArgs e) + { + if (_dispatcherQueue == null) + { + Logger.Warn("Voice hotkey pressed but DispatcherQueue is null"); + return; + } + + var enqueued = _dispatcherQueue.TryEnqueue(async () => await ToggleVoiceQuickPauseAsync()); + if (!enqueued) + { + Logger.Warn("Voice hotkey pressed but failed to enqueue Voice quick pause on UI thread"); + } + } + + private async Task ToggleVoiceQuickPauseAsync() + { + if (_voiceService == null) + { + return; + } + + if (_settings?.EnableNodeMode != true) + { + Logger.Warn("Voice quick pause blocked: Node Mode is disabled"); + return; + } + + if (!CanQuickToggleVoiceMode()) + { + Logger.Warn("Voice quick pause blocked: Voice Mode is off"); + return; + } + + try + { + var status = await _voiceService.ToggleQuickPauseAsync(); + _voiceRepeaterWindow?.RefreshStatus(); + _voiceModeWindow?.RefreshStatus(); + ShowVoiceQuickToggleToast(status); + } + catch (Exception ex) + { + Logger.Warn($"Voice quick pause failed: {ex.Message}"); + } + } + + private static void ShowVoiceQuickToggleToast(VoiceStatusInfo status) + { + try + { + var title = status.State == VoiceRuntimeState.Paused + ? "Voice paused" + : "Voice resumed"; + var detail = status.State == VoiceRuntimeState.Paused + ? $"{status.Mode} is paused. Press Ctrl+Alt+Shift+V to resume." + : $"{status.Mode} is active again."; + + new ToastContentBuilder() + .AddText(title) + .AddText(detail) + .Show(); + } + catch (Exception ex) + { + Logger.Warn($"Failed to show voice pause toast: {ex.Message}"); + } + } + + private void OnVoiceConversationTurnAvailable(object? sender, VoiceConversationTurnEventArgs args) + { + if (_dispatcherQueue == null) + { + return; + } + + _dispatcherQueue.TryEnqueue(() => ShowVoiceConversationToast(args)); + } + + private void ShowVoiceConversationToast(VoiceConversationTurnEventArgs args) + { + if (_settings?.Voice.ShowConversationToasts != true) + { + return; + } + + var title = args.Direction == VoiceConversationDirection.Outgoing + ? 
"Voice heard" + : "Voice reply"; + + AddRecentActivity( + $"voice: {title}", + category: "voice", + details: args.Message, + dashboardPath: "chat", + sessionKey: args.SessionKey); + + NotificationHistoryService.AddNotification(new Services.GatewayNotification + { + Title = title, + Message = args.Message, + Category = "voice" + }); + + if (_settings.ShowNotifications != true) + { + return; + } + + try + { + var builder = new ToastContentBuilder() + .AddText(title) + .AddText(args.Message); + + if (args.Direction == VoiceConversationDirection.Incoming) + { + builder.AddArgument("action", "open_chat") + .AddButton(new ToastButton() + .SetContent("Open Chat") + .AddArgument("action", "open_chat")); + } + + builder.Show(); + } + catch (Exception ex) + { + Logger.Warn($"Failed to show voice conversation toast: {ex.Message}"); + } + } + #endregion #region Updates @@ -2125,7 +2510,11 @@ private void ExitApplication() _sessionPollTimer?.Dispose(); _sessionPollTimer = null; }); - + SafeShutdownStep("voice tray icon timer", () => + { + _voiceTrayIconTimer?.Stop(); + _voiceTrayIconTimer = null; + }); // Cleanup hotkey SafeShutdownStep("global hotkey", () => { @@ -2191,6 +2580,22 @@ private void ExitApplication() _deepLinkCts = null; }); + SafeShutdownStep("voice chat coordinator", () => + { + if (_voiceChatCoordinator != null) + { + _voiceChatCoordinator.ConversationTurnAvailable -= OnVoiceConversationTurnAvailable; + _voiceChatCoordinator.Dispose(); + _voiceChatCoordinator = null; + } + }); + + SafeShutdownStep("voice service", () => + { + _voiceService?.Dispose(); + _voiceService = null; + }); + Logger.Info("Shutdown complete; calling Exit() now"); Exit(); } @@ -2262,7 +2667,6 @@ _settings.SshTunnelRemotePort is < 1 or > 65535 || return true; } - #endregion private async void OnSshTunnelExited(object? 
sender, int exitCode) diff --git a/src/OpenClaw.Tray.WinUI/Assets/voice-mode-feature.png b/src/OpenClaw.Tray.WinUI/Assets/voice-mode-feature.png new file mode 100644 index 0000000..04c239b Binary files /dev/null and b/src/OpenClaw.Tray.WinUI/Assets/voice-mode-feature.png differ diff --git a/src/OpenClaw.Tray.WinUI/Assets/voice-providers.json b/src/OpenClaw.Tray.WinUI/Assets/voice-providers.json new file mode 100644 index 0000000..3ffcc0b --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Assets/voice-providers.json @@ -0,0 +1,274 @@ +{ + "speechToTextProviders": [ + { + "id": "windows", + "name": "Windows Speech Recognition", + "runtime": "windows", + "enabled": true, + "description": "Built-in Windows.Media speech recognition, half-duplex, non-streamed." + }, + { + "id": "http-ws", + "name": "http/ws", + "runtime": "streaming", + "enabled": false, + "visibleInSettings": true, + "selectable": false, + "description": "Will support most cloud and local stand-alone models full or half-duplex, streaming." + }, + { + "id": "foundry-local", + "name": "Foundry Local", + "runtime": "streaming", + "enabled": false, + "visibleInSettings": false, + "selectable": false, + "description": "AudioGraph-fed streaming STT route for Foundry Local or compatible streaming adapters.", + "settings": [ + { + "key": "endpoint", + "label": "Endpoint", + "required": false, + "defaultValue": "http://localhost:5273", + "placeholder": "http://localhost:5273", + "description": "Local Foundry-compatible transcription endpoint for the AudioGraph streaming STT route." + }, + { + "key": "model", + "label": "Model", + "required": false, + "defaultValue": "whisper-tiny", + "placeholder": "whisper-tiny", + "description": "Transcription model identifier for the streaming STT adapter." 
+ } + ] + }, + { + "id": "openai-whisper", + "name": "OpenAI Whisper", + "runtime": "streaming", + "enabled": false, + "visibleInSettings": false, + "selectable": false, + "description": "AudioGraph-fed cloud STT route for the OpenAI Whisper transcription API.", + "settings": [ + { + "key": "apiKey", + "label": "API key", + "secret": true + }, + { + "key": "model", + "label": "Model", + "required": false, + "defaultValue": "whisper-1", + "placeholder": "whisper-1", + "description": "Transcription model identifier for the OpenAI speech-to-text adapter." + } + ] + }, + { + "id": "elevenlabs-stt", + "name": "ElevenLabs Speech to Text", + "runtime": "streaming", + "enabled": false, + "visibleInSettings": false, + "selectable": false, + "description": "AudioGraph-fed cloud STT route for the ElevenLabs speech-to-text API.", + "settings": [ + { + "key": "apiKey", + "label": "API key", + "secret": true + }, + { + "key": "model", + "label": "Model", + "required": false, + "defaultValue": "scribe_v1", + "placeholder": "scribe_v1", + "description": "Transcription model identifier for the ElevenLabs speech-to-text adapter." + } + ] + }, + { + "id": "azure-ai-speech", + "name": "Azure AI Speech", + "runtime": "streaming", + "enabled": false, + "visibleInSettings": false, + "selectable": false, + "description": "AudioGraph-fed cloud STT route for Azure AI Speech real-time transcription.", + "settings": [ + { + "key": "apiKey", + "label": "API key", + "secret": true + }, + { + "key": "endpoint", + "label": "Endpoint", + "required": false, + "defaultValue": "", + "placeholder": "https://your-speech-resource.cognitiveservices.azure.com", + "description": "Azure AI Speech endpoint for the streaming STT adapter." 
+ } + ] + }, + { + "id": "sherpa-onnx", + "name": "sherpa-onnx", + "runtime": "embedded", + "enabled": false, + "visibleInSettings": true, + "selectable": false, + "description": "Can load a variety of models including OpenAI/Whisper, full-duplex, streaming.", + "settings": [ + { + "key": "modelPath", + "label": "Model path", + "required": false, + "defaultValue": "", + "placeholder": "C:\\models\\sherpa-onnx\\model.onnx", + "description": "Path to the downloaded sherpa-onnx model bundle the embedded STT route should use." + }, + { + "key": "model", + "label": "Model preset", + "required": false, + "defaultValue": "", + "placeholder": "tiny / base / small / medium", + "description": "Optional human-readable model preset to help track which local bundle is selected." + } + ] + } + ], + "textToSpeechProviders": [ + { + "id": "windows", + "name": "Windows Speech Synthesis", + "runtime": "windows", + "enabled": true, + "description": "Built-in Windows text-to-speech playback." + }, + { + "id": "minimax", + "name": "MiniMax", + "runtime": "cloud", + "enabled": true, + "description": "Cloud TTS using the MiniMax HTTP text-to-speech API.", + "settings": [ + { + "key": "apiKey", + "label": "API key", + "secret": true + }, + { + "key": "model", + "label": "Model", + "defaultValue": "speech-2.8-turbo", + "options": [ + "speech-2.5-turbo-preview", + "speech-02-turbo", + "speech-02-hd", + "speech-2.6-turbo", + "speech-2.6-hd", + "speech-2.8-turbo", + "speech-2.8-hd" + ] + }, + { + "key": "voiceId", + "label": "Voice ID", + "required": false, + "defaultValue": "English_MatureBoss" + }, + { + "key": "voiceSettingsJson", + "label": "Voice settings JSON", + "required": false, + "jsonValue": true, + "defaultValue": "\"voice_setting\": { \"voice_id\": {{voiceId}}, \"speed\": 1, \"vol\": 1, \"pitch\": 0 }", + "placeholder": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }", + "description": "Optional full MiniMax request fragment. 
If present, it controls the full voice_setting payload." + } + ], + "textToSpeechWebSocket": { + "endpointTemplate": "wss://api.minimax.io/ws/v1/t2a_v2", + "authenticationHeaderName": "Authorization", + "authenticationScheme": "Bearer", + "apiKeySettingKey": "apiKey", + "connectSuccessEventName": "connected_success", + "startMessageTemplate": "{ \"event\": \"task_start\", \"model\": {{model}}, \"language_boost\": \"English\", {{voiceSettingsJson}}, \"audio_setting\": { \"sample_rate\": 32000, \"bitrate\": 128000, \"format\": \"mp3\", \"channel\": 1 } }", + "startSuccessEventName": "task_started", + "continueMessageTemplate": "{ \"event\": \"task_continue\", \"text\": {{text}} }", + "finishMessageTemplate": "{ \"event\": \"task_finish\" }", + "responseAudioMode": "hexJsonString", + "responseAudioJsonPath": "data.audio", + "responseStatusCodeJsonPath": "base_resp.status_code", + "responseStatusMessageJsonPath": "base_resp.status_msg", + "finalFlagJsonPath": "is_final", + "taskFailedEventName": "task_failed", + "successStatusValue": "0", + "outputContentType": "audio/mpeg" + } + }, + { + "id": "elevenlabs", + "name": "ElevenLabs", + "runtime": "cloud", + "enabled": true, + "description": "Cloud TTS using the ElevenLabs WebSocket stream-input API.", + "settings": [ + { + "key": "apiKey", + "label": "API key", + "secret": true + }, + { + "key": "model", + "label": "Model", + "defaultValue": "eleven_multilingual_v2", + "options": [ + "eleven_flash_v2_5", + "eleven_turbo_v2_5", + "eleven_multilingual_v2", + "eleven_monolingual_v1" + ] + }, + { + "key": "voiceId", + "label": "Voice ID", + "required": false, + "defaultValue": "6aDn1KB0hjpdcocrUkmq", + "placeholder": "Enter an ElevenLabs voice ID" + }, + { + "key": "voiceSettingsJson", + "label": "Voice settings JSON", + "required": false, + "jsonValue": true, + "defaultValue": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }", + "placeholder": "\"voice_settings\": { \"speed\": 0.9, 
\"stability\": 0.5, \"similarity_boost\": 0.75 }", + "description": "Optional full ElevenLabs request fragment. If present, it controls the full voice_settings payload." + } + ], + "textToSpeechWebSocket": { + "endpointTemplate": "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true", + "authenticationHeaderName": "xi-api-key", + "authenticationScheme": "", + "apiKeySettingKey": "apiKey", + "connectSuccessEventName": "", + "startMessageTemplate": "{ \"text\": \" \", {{voiceSettingsJson}}, \"xi_api_key\": {{apiKey}} }", + "startSuccessEventName": "", + "continueMessageTemplate": "{ \"text\": {{textWithTrailingSpace}}, \"try_trigger_generation\": true }", + "finishMessageTemplate": "{ \"text\": \"\" }", + "responseAudioMode": "base64JsonString", + "responseAudioJsonPath": "audio", + "finalFlagJsonPath": "isFinal", + "taskFailedEventName": "error", + "outputContentType": "audio/mpeg" + } + } + ] +} diff --git a/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml b/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml new file mode 100644 index 0000000..cadffd4 --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml @@ -0,0 +1,111 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml.cs new file mode 100644 index 0000000..67dde46 --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml.cs @@ -0,0 +1,563 @@ +using Microsoft.UI.Windowing; +using Microsoft.UI.Dispatching; +using Microsoft.UI.Xaml; +using Microsoft.UI.Xaml.Controls; +using OpenClaw.Shared; +using OpenClawTray.Helpers; +using OpenClawTray.Services; +using OpenClawTray.Services.Voice; +using System; +using System.Collections.ObjectModel; +using System.ComponentModel; +using 
System.Runtime.CompilerServices; +using System.Threading.Tasks; +using Windows.Graphics; +using WinUIEx; + +namespace OpenClawTray.Windows; + +public sealed partial class VoiceRepeaterWindow : WindowEx, IVoiceChatWindow +{ + private const int MaxConversationItems = 24; + private const int DefaultWidth = 360; + private const int DefaultHeight = 170; + private const int DefaultMargin = 12; + private const double DefaultTextSize = 13; + private const double DefaultCaptionSize = 10; + + private readonly SettingsManager _settings; + private readonly IVoiceRuntimeControlApi _voiceRuntimeControlApi; + private readonly ObservableCollection _conversationItems = []; + private readonly DispatcherQueueTimer? _refreshTimer; + private readonly DispatcherQueueTimer? _layoutSaveTimer; + + private bool _controlActionInFlight; + private bool _suppressSettingsEvents; + private bool _suppressPlacementSave = true; + private bool _initialPlacementPending = true; + private bool _placementDirty; + private bool _autoScrollEnabled; + private double _messageFontSize = DefaultTextSize; + private double _captionFontSize = DefaultCaptionSize; + + public bool IsClosed { get; private set; } + + public event EventHandler? 
OpenVoiceStatusRequested; + + public VoiceRepeaterWindow( + SettingsManager settings, + IVoiceRuntimeControlApi voiceRuntimeControlApi) + { + _settings = settings; + _voiceRuntimeControlApi = voiceRuntimeControlApi; + _autoScrollEnabled = _settings.VoiceRepeaterWindow.AutoScroll; + + InitializeComponent(); + + Title = "Voice Mode"; + ApplyStoredWindowPlacement(); + this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected)); + + ConversationItemsControl.ItemsSource = _conversationItems; + + Closed += OnWindowClosed; + Activated += OnWindowActivated; + + var dispatcherQueue = DispatcherQueue.GetForCurrentThread(); + if (dispatcherQueue != null) + { + _refreshTimer = dispatcherQueue.CreateTimer(); + _refreshTimer.Interval = TimeSpan.FromMilliseconds(400); + _refreshTimer.Tick += (_, _) => RefreshStatus(); + _refreshTimer.Start(); + + _layoutSaveTimer = dispatcherQueue.CreateTimer(); + _layoutSaveTimer.Interval = TimeSpan.FromMilliseconds(600); + _layoutSaveTimer.IsRepeating = false; + _layoutSaveTimer.Tick += (_, _) => + { + _layoutSaveTimer.Stop(); + SaveWindowPlacement(); + }; + } + + if (AppWindow is not null) + { + AppWindow.Changed += OnAppWindowChanged; + } + + ApplyViewSettings(); + RefreshStatus(); + UpdateConversationPlaceholder(); + } + + public void RefreshStatus() + { + var status = _voiceRuntimeControlApi.CurrentStatus; + ApplyStatus(status); + } + + public Task UpdateVoiceTranscriptDraftAsync(string text, bool clear) + { + var draftText = clear ? string.Empty : (text ?? string.Empty); + DraftTextBlock.Text = draftText; + DraftPanel.Visibility = string.IsNullOrWhiteSpace(draftText) + ? 
Visibility.Collapsed + : Visibility.Visible; + + UpdateConversationPlaceholder(); + ScrollConversationToEnd(); + return Task.CompletedTask; + } + + public Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args) + { + if (args == null || string.IsNullOrWhiteSpace(args.Message)) + { + return Task.CompletedTask; + } + + var item = new ConversationItem( + args.Direction == VoiceConversationDirection.Outgoing ? "You" : "Assistant", + DateTime.Now.ToString("HH:mm:ss"), + args.Message, + _messageFontSize, + _captionFontSize); + + _conversationItems.Add(item); + while (_conversationItems.Count > MaxConversationItems) + { + _conversationItems.RemoveAt(0); + } + + UpdateConversationPlaceholder(); + ScrollConversationToEnd(); + return Task.CompletedTask; + } + + private async void OnPauseResumeClick(object sender, RoutedEventArgs e) + { + if (_controlActionInFlight) + { + return; + } + + _controlActionInFlight = true; + ApplyStatus(_voiceRuntimeControlApi.CurrentStatus); + + try + { + var status = _voiceRuntimeControlApi.CurrentStatus; + if (status.State == VoiceRuntimeState.Paused) + { + await _voiceRuntimeControlApi.ResumeAsync(new VoiceResumeArgs { Reason = "Voice repeater resume button" }); + } + else + { + await _voiceRuntimeControlApi.PauseAsync(new VoicePauseArgs { Reason = "Voice repeater pause button" }); + } + } + finally + { + _controlActionInFlight = false; + RefreshStatus(); + } + } + + private async void OnSkipReplyClick(object sender, RoutedEventArgs e) + { + if (_controlActionInFlight || !_voiceRuntimeControlApi.CurrentStatus.CanSkipReply) + { + return; + } + + _controlActionInFlight = true; + ApplyStatus(_voiceRuntimeControlApi.CurrentStatus); + + try + { + await _voiceRuntimeControlApi.SkipCurrentReplyAsync(new VoiceSkipArgs + { + Reason = "Voice repeater skip button" + }); + } + finally + { + _controlActionInFlight = false; + RefreshStatus(); + } + } + + private void OnAutoScrollChanged(object sender, RoutedEventArgs e) + { + if 
(_suppressSettingsEvents) + { + return; + } + + _autoScrollEnabled = AutoScrollCheckBox.IsChecked == true; + _settings.VoiceRepeaterWindow.AutoScroll = _autoScrollEnabled; + _settings.Save(logSuccess: false); + + if (_autoScrollEnabled) + { + ScrollConversationToEnd(); + } + } + + private void OnTextSizeSelectionChanged(object sender, SelectionChangedEventArgs e) + { + if (_suppressSettingsEvents || TextSizeComboBox.SelectedItem is not ComboBoxItem item) + { + return; + } + + if (!double.TryParse(item.Tag?.ToString(), out var size)) + { + return; + } + + _settings.VoiceRepeaterWindow.TextSize = size; + ApplyViewSettings(); + _settings.Save(logSuccess: false); + } + + private void OnFloatingEnabledChanged(object sender, RoutedEventArgs e) + { + if (_suppressSettingsEvents) + { + return; + } + + var enabled = FloatingEnabledCheckBox.IsChecked == true; + _settings.VoiceRepeaterWindow.FloatingEnabled = enabled; + IsAlwaysOnTop = enabled; + _settings.Save(logSuccess: false); + } + + private void OnOpenVoiceStatusClick(object sender, RoutedEventArgs e) + { + OpenVoiceStatusRequested?.Invoke(this, EventArgs.Empty); + } + + private void OnWindowClosed(object sender, WindowEventArgs e) + { + if (_refreshTimer != null) + { + _refreshTimer.Stop(); + } + + if (_layoutSaveTimer != null) + { + _layoutSaveTimer.Stop(); + } + + if (AppWindow is not null) + { + AppWindow.Changed -= OnAppWindowChanged; + } + + Activated -= OnWindowActivated; + FlushWindowPlacement(); + IsClosed = true; + } + + private void OnWindowActivated(object sender, WindowActivatedEventArgs args) + { + if (!_initialPlacementPending) + { + return; + } + + _initialPlacementPending = false; + ApplyStoredWindowPlacement(); + + var dispatcherQueue = DispatcherQueue.GetForCurrentThread(); + _ = dispatcherQueue?.TryEnqueue(() => _suppressPlacementSave = false); + } + + private void OnAppWindowChanged(AppWindow sender, AppWindowChangedEventArgs args) + { + if (_suppressPlacementSave) + { + return; + } + + if 
(args.DidPositionChange || args.DidSizeChange) + { + _placementDirty = true; + _layoutSaveTimer?.Stop(); + _layoutSaveTimer?.Start(); + } + } + + private void ApplyStatus(VoiceStatusInfo status) + { + Title = $"Voice Mode ({GetWindowStateLabel(status)})"; + DraftCaptionTextBlock.Text = status.State == VoiceRuntimeState.RecordingUtterance + ? "You (speaking)" + : "You (draft)"; + + if (string.IsNullOrWhiteSpace(status.LastError)) + { + TroubleshootingTextBlock.Visibility = Visibility.Collapsed; + TroubleshootingTextBlock.Text = string.Empty; + } + else + { + TroubleshootingTextBlock.Visibility = Visibility.Visible; + TroubleshootingTextBlock.Text = status.LastError; + } + + var paused = status.State == VoiceRuntimeState.Paused; + PauseResumeButton.IsEnabled = !_controlActionInFlight && status.Mode != VoiceActivationMode.Off; + PauseResumeIcon.Symbol = paused ? Symbol.Play : Symbol.Pause; + ToolTipService.SetToolTip( + PauseResumeButton, + paused ? "Resume voice mode" : "Pause voice mode"); + + SkipReplyButton.IsEnabled = !_controlActionInFlight && status.CanSkipReply; + } + + private void ApplyStoredWindowPlacement() + { + if (AppWindow is null) + { + return; + } + + var prefs = _settings.VoiceRepeaterWindow; + var width = prefs.HasSavedPlacement + ? prefs.Width.GetValueOrDefault(DefaultWidth) + : DefaultWidth; + var height = prefs.HasSavedPlacement + ? prefs.Height.GetValueOrDefault(DefaultHeight) + : DefaultHeight; + var clampedWidth = Math.Max(width, 320); + var clampedHeight = Math.Max(height, 150); + + IsAlwaysOnTop = prefs.FloatingEnabled; + + var targetRect = prefs.HasSavedPlacement && prefs.X.HasValue && prefs.Y.HasValue + ? 
new RectInt32(prefs.X.Value, prefs.Y.Value, clampedWidth, clampedHeight) + : GetDefaultAnchorRect(clampedWidth, clampedHeight); + + if (!IsPlacementVisible(targetRect)) + { + targetRect = GetDefaultAnchorRect(clampedWidth, clampedHeight); + } + + try + { + AppWindow.MoveAndResize(targetRect); + } + catch + { + this.SetWindowSize(targetRect.Width, targetRect.Height); + AppWindow.Move(new PointInt32(targetRect.X, targetRect.Y)); + } + } + + private void ApplyViewSettings() + { + _suppressSettingsEvents = true; + try + { + _autoScrollEnabled = _settings.VoiceRepeaterWindow.AutoScroll; + _messageFontSize = Math.Clamp( + _settings.VoiceRepeaterWindow.TextSize > 0 ? _settings.VoiceRepeaterWindow.TextSize : DefaultTextSize, + 11, + 15); + _captionFontSize = Math.Max(9, _messageFontSize - 3); + + DraftTextBlock.FontSize = _messageFontSize; + DraftCaptionTextBlock.FontSize = _captionFontSize; + TroubleshootingTextBlock.FontSize = _captionFontSize; + + foreach (var item in _conversationItems) + { + item.MessageFontSize = _messageFontSize; + item.CaptionFontSize = _captionFontSize; + } + + AutoScrollCheckBox.IsChecked = _autoScrollEnabled; + FloatingEnabledCheckBox.IsChecked = _settings.VoiceRepeaterWindow.FloatingEnabled; + SelectTextSizeItem(_messageFontSize); + } + finally + { + _suppressSettingsEvents = false; + } + } + + private void SaveWindowPlacement() + { + if (IsClosed || AppWindow is null || _suppressPlacementSave) + { + return; + } + + var size = AppWindow.Size; + var position = AppWindow.Position; + _settings.VoiceRepeaterWindow.Width = size.Width; + _settings.VoiceRepeaterWindow.Height = size.Height; + _settings.VoiceRepeaterWindow.X = position.X; + _settings.VoiceRepeaterWindow.Y = position.Y; + _settings.VoiceRepeaterWindow.HasSavedPlacement = true; + _settings.Save(logSuccess: false); + _placementDirty = false; + } + + private void FlushWindowPlacement() + { + if (_placementDirty || !IsClosed) + { + SaveWindowPlacement(); + } + } + + private RectInt32 
GetDefaultAnchorRect(int width, int height) + { + var displayArea = DisplayArea.Primary; + var x = displayArea.WorkArea.X + DefaultMargin; + var y = displayArea.WorkArea.Y + Math.Max(DefaultMargin, displayArea.WorkArea.Height - height - DefaultMargin); + return new RectInt32(x, y, width, height); + } + + private static bool IsPlacementVisible(RectInt32 rect) + { + try + { + var displayArea = DisplayArea.GetFromRect(rect, DisplayAreaFallback.Nearest); + var workArea = displayArea.WorkArea; + return rect.Width > 0 && + rect.Height > 0 && + rect.X < workArea.X + workArea.Width && + rect.X + rect.Width > workArea.X && + rect.Y < workArea.Y + workArea.Height && + rect.Y + rect.Height > workArea.Y; + } + catch + { + return false; + } + } + + private void SelectTextSizeItem(double size) + { + var sizeTag = ((int)Math.Round(size)).ToString(); + foreach (var entry in TextSizeComboBox.Items) + { + if (entry is ComboBoxItem item && string.Equals(item.Tag?.ToString(), sizeTag, StringComparison.Ordinal)) + { + TextSizeComboBox.SelectedItem = item; + return; + } + } + + TextSizeComboBox.SelectedIndex = 2; + } + + private void UpdateConversationPlaceholder() + { + EmptyConversationTextBlock.Visibility = _conversationItems.Count == 0 && DraftPanel.Visibility != Visibility.Visible + ? 
Visibility.Visible + : Visibility.Collapsed; + } + + private void ScrollConversationToEnd() + { + if (!_autoScrollEnabled) + { + return; + } + + var dispatcherQueue = DispatcherQueue.GetForCurrentThread(); + _ = dispatcherQueue?.TryEnqueue(() => + { + ConversationScrollViewer.UpdateLayout(); + ConversationScrollViewer.ChangeView(null, ConversationScrollViewer.ScrollableHeight, null, true); + _ = dispatcherQueue.TryEnqueue(() => + ConversationScrollViewer.ChangeView(null, ConversationScrollViewer.ScrollableHeight, null, true)); + }); + } + + private static string GetWindowStateLabel(VoiceStatusInfo status) + { + return status.State switch + { + VoiceRuntimeState.ListeningForVoiceWake => "listening", + VoiceRuntimeState.ListeningContinuously => "listening", + VoiceRuntimeState.RecordingUtterance => "hearing you", + VoiceRuntimeState.AwaitingResponse => "waiting", + VoiceRuntimeState.PlayingResponse => "speaking", + VoiceRuntimeState.Paused => "paused", + VoiceRuntimeState.Arming => "starting", + VoiceRuntimeState.Error => "error", + _ when status.Mode == VoiceActivationMode.Off => "off", + _ => "idle" + }; + } + + private sealed class ConversationItem : INotifyPropertyChanged + { + private double _messageFontSize; + private double _captionFontSize; + + public ConversationItem( + string speaker, + string timestamp, + string message, + double messageFontSize, + double captionFontSize) + { + Speaker = speaker; + Timestamp = timestamp; + Message = message; + _messageFontSize = messageFontSize; + _captionFontSize = captionFontSize; + } + + public string Speaker { get; } + public string Timestamp { get; } + public string Message { get; } + public string Caption => $"{Speaker} · {Timestamp}"; + + public double MessageFontSize + { + get => _messageFontSize; + set + { + if (Math.Abs(_messageFontSize - value) < 0.01) + { + return; + } + + _messageFontSize = value; + OnPropertyChanged(); + } + } + + public double CaptionFontSize + { + get => _captionFontSize; + set + { + if 
(Math.Abs(_captionFontSize - value) < 0.01) + { + return; + } + + _captionFontSize = value; + OnPropertyChanged(); + } + } + + public event PropertyChangedEventHandler? PropertyChanged; + + private void OnPropertyChanged([CallerMemberName] string? propertyName = null) + { + PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(propertyName)); + } + } +} diff --git a/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomState.cs b/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomState.cs new file mode 100644 index 0000000..59cdcfe --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomState.cs @@ -0,0 +1,15 @@ +namespace OpenClawTray.Windows; + +internal sealed class WebChatVoiceDomState +{ + public WebChatVoiceDomState() + { + } + + public string PendingDraft { get; private set; } = string.Empty; + + public void SetDraft(string? text, bool clear) + { + PendingDraft = clear ? string.Empty : (text ?? string.Empty); + } +} diff --git a/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs index 8a6bc4b..dca6739 100644 --- a/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs @@ -3,6 +3,7 @@ using OpenClaw.Shared; using OpenClawTray.Helpers; using OpenClawTray.Services; +using OpenClawTray.Services.Voice; using System; using System.Diagnostics; using System.IO; @@ -14,14 +15,16 @@ namespace OpenClawTray.Windows; public sealed partial class WebChatWindow : WindowEx + , IVoiceChatWindow { private readonly string _gatewayUrl; private readonly string _token; - - // Store event handlers for cleanup + private readonly WebChatVoiceDomState _voiceDomState; + private bool _voiceDomReady; + private TypedEventHandler? _navigationCompletedHandler; private TypedEventHandler? 
_navigationStartingHandler; - + public bool IsClosed { get; private set; } public WebChatWindow(string gatewayUrl, string token) @@ -29,18 +32,18 @@ public WebChatWindow(string gatewayUrl, string token) Logger.Info($"WebChatWindow: Constructor called, gateway={gatewayUrl}"); _gatewayUrl = gatewayUrl; _token = token; - + _voiceDomState = new WebChatVoiceDomState(); + InitializeComponent(); - - // Window configuration + this.SetWindowSize(520, 750); this.MinWidth = 380; this.MinHeight = 450; this.CenterOnScreen(); - this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected)); - + this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected)); + Closed += OnWindowClosed; - + Logger.Info("WebChatWindow: Starting InitializeWebViewAsync"); _ = InitializeWebViewAsync(); } @@ -48,8 +51,8 @@ public WebChatWindow(string gatewayUrl, string token) private void OnWindowClosed(object sender, WindowEventArgs e) { IsClosed = true; - - // Cleanup WebView2 event handlers + _voiceDomReady = false; + if (WebView.CoreWebView2 != null) { if (_navigationCompletedHandler != null) @@ -64,35 +67,39 @@ private async Task InitializeWebViewAsync() try { Logger.Info("WebChatWindow: Initializing WebView2..."); - - // Set up user data folder for WebView2 + var userDataFolder = Path.Combine( Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "OpenClawTray", "WebView2"); - + Directory.CreateDirectory(userDataFolder); Logger.Info($"WebChatWindow: User data folder: {userDataFolder}"); - // Set environment variable for user data folder Environment.SetEnvironmentVariable("WEBVIEW2_USER_DATA_FOLDER", userDataFolder); - + Logger.Info("WebChatWindow: Calling EnsureCoreWebView2Async..."); await WebView.EnsureCoreWebView2Async(); Logger.Info("WebChatWindow: CoreWebView2 initialized successfully"); - - // Configure WebView2 + WebView.CoreWebView2.Settings.IsStatusBarEnabled = false; WebView.CoreWebView2.Settings.AreDefaultContextMenusEnabled = true; 
WebView.CoreWebView2.Settings.IsZoomControlEnabled = true; + await WebView.CoreWebView2.AddScriptToExecuteOnDocumentCreatedAsync(WebChatVoiceDomBridge.DocumentCreatedScript); + + _voiceDomReady = false; - // Handle navigation events (store for cleanup) _navigationCompletedHandler = (s, e) => { Logger.Info($"WebChatWindow: Navigation completed, success={e.IsSuccess}, status={e.WebErrorStatus}"); LoadingRing.IsActive = false; LoadingRing.Visibility = Visibility.Collapsed; - - // Show friendly error if connection failed + _voiceDomReady = e.IsSuccess; + + if (e.IsSuccess) + { + _ = RefreshTrayVoiceDomStateAsync(); + } + if (!e.IsSuccess && (e.WebErrorStatus == CoreWebView2WebErrorStatus.ConnectionAborted || e.WebErrorStatus == CoreWebView2WebErrorStatus.CannotConnect || e.WebErrorStatus == CoreWebView2WebErrorStatus.ConnectionReset || @@ -115,15 +122,14 @@ private async Task InitializeWebViewAsync() _navigationStartingHandler = (s, e) => { - // Strip query params to avoid logging tokens var safeUri = e.Uri?.Split('?')[0] ?? 
"unknown"; Logger.Info($"WebChatWindow: Navigation starting to {safeUri}"); + _voiceDomReady = false; LoadingRing.IsActive = true; LoadingRing.Visibility = Visibility.Visible; }; WebView.CoreWebView2.NavigationStarting += _navigationStartingHandler; - // Navigate to chat NavigateToChat(); } catch (Exception ex) @@ -135,13 +141,12 @@ private async Task InitializeWebViewAsync() Logger.Error($"WebView2 inner exception: {ex.InnerException.GetType().FullName}: {ex.InnerException.Message}"); } Logger.Error($"WebView2 stack trace: {ex.StackTrace}"); - - // Show error in the dialog instead of falling back to browser + LoadingRing.IsActive = false; LoadingRing.Visibility = Visibility.Collapsed; WebView.Visibility = Visibility.Collapsed; ErrorPanel.Visibility = Visibility.Visible; - + var errorDetails = $"Exception: {ex.GetType().FullName}\n" + $"HResult: 0x{ex.HResult:X8}\n" + $"Message: {ex.Message}\n\n" + @@ -149,17 +154,16 @@ private async Task InitializeWebViewAsync() $"Architecture: {RuntimeInformation.ProcessArchitecture}\n" + $"OS: {RuntimeInformation.OSDescription}\n\n" + $"Stack Trace:\n{ex.StackTrace}"; - + if (ex.InnerException != null) { errorDetails += $"\n\nInner Exception: {ex.InnerException.GetType().FullName}\n{ex.InnerException.Message}"; } - + ErrorText.Text = errorDetails; } } - // Set to a test URL to bypass gateway (e.g., "https://www.bing.com"), or null for normal operation private const string? 
DEBUG_TEST_URL = null; private static bool IsLocalHost(Uri uri) @@ -208,12 +212,11 @@ private void ShowErrorMessage(string message) ErrorPanel.Visibility = Visibility.Visible; ErrorText.Text = message; } - + private void NavigateToChat() { if (WebView.CoreWebView2 == null) return; - // If debug URL is set, use it instead of gateway if (!string.IsNullOrEmpty(DEBUG_TEST_URL)) { Logger.Info($"WebChatWindow: DEBUG MODE - Navigating to test URL: {DEBUG_TEST_URL}"); @@ -251,7 +254,7 @@ private void OnPopout(object sender, RoutedEventArgs e) ShowErrorMessage(errorMessage); return; } - + try { Process.Start(new ProcessStartInfo(url) { UseShellExecute = true }); @@ -266,4 +269,34 @@ private void OnDevTools(object sender, RoutedEventArgs e) { WebView.CoreWebView2?.OpenDevToolsWindow(); } + + public async Task UpdateVoiceTranscriptDraftAsync(string text, bool clear) + { + _voiceDomState.SetDraft(text, clear); + await RefreshTrayVoiceDomStateAsync(); + } + + public async Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args) + { + await Task.CompletedTask; + } + + private async Task RefreshTrayVoiceDomStateAsync() + { + if (WebView.CoreWebView2 == null || !_voiceDomReady || IsClosed) + { + return; + } + + try + { + await WebView.CoreWebView2.ExecuteScriptAsync( + WebChatVoiceDomBridge.BuildSetDraftScript(_voiceDomState.PendingDraft)); + await WebView.CoreWebView2.ExecuteScriptAsync(WebChatVoiceDomBridge.ClearLegacyTurnsScript); + } + catch (Exception ex) + { + Logger.Warn($"WebChatWindow: Failed to apply voice DOM state: {ex.Message}"); + } + } } diff --git a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs index 67de774..f1de44e 100644 --- a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs +++ b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs @@ -965,3 +965,234 @@ public async Task Snap_ReturnsError_WhenHandlerThrows() Assert.Contains("Camera access blocked", res.Error); } } + +public class VoiceCapabilityTests 
+{ + private static JsonElement Parse(string json) + { + using var doc = JsonDocument.Parse(json); + return doc.RootElement.Clone(); + } + + [Fact] + public void CanHandle_VoiceCommands() + { + var cap = new VoiceCapability(NullLogger.Instance); + Assert.True(cap.CanHandle(VoiceCommands.ListDevices)); + Assert.True(cap.CanHandle(VoiceCommands.GetSettings)); + Assert.True(cap.CanHandle(VoiceCommands.SetSettings)); + Assert.True(cap.CanHandle(VoiceCommands.GetStatus)); + Assert.True(cap.CanHandle(VoiceCommands.Start)); + Assert.True(cap.CanHandle(VoiceCommands.Stop)); + Assert.False(cap.CanHandle("voice.unknown")); + Assert.Equal("voice", cap.Category); + } + + [Fact] + public async Task ListDevices_ReturnsArrayFromHandler() + { + var cap = new VoiceCapability(NullLogger.Instance); + cap.ListDevicesRequested += () => Task.FromResult( + [ + new VoiceAudioDeviceInfo + { + DeviceId = "default-input", + Name = "System default microphone", + IsDefault = true, + IsInput = true + } + ]); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "voice1", + Command = VoiceCommands.ListDevices, + Args = Parse("""{}""") + }); + + Assert.True(res.Ok); + var json = JsonSerializer.Serialize(res.Payload); + using var doc = JsonDocument.Parse(json); + Assert.Equal(JsonValueKind.Array, doc.RootElement.ValueKind); + Assert.Equal("default-input", doc.RootElement[0].GetProperty("DeviceId").GetString()); + } + + [Fact] + public async Task GetSettings_ReturnsSettingsFromHandler() + { + var cap = new VoiceCapability(NullLogger.Instance); + cap.SettingsRequested += () => Task.FromResult(new VoiceSettings + { + Enabled = true, + Mode = VoiceActivationMode.VoiceWake + }); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "voice2", + Command = VoiceCommands.GetSettings, + Args = Parse("""{}""") + }); + + Assert.True(res.Ok); + var json = JsonSerializer.Serialize(res.Payload); + using var doc = JsonDocument.Parse(json); + 
Assert.True(doc.RootElement.GetProperty("Enabled").GetBoolean()); + Assert.Equal("VoiceWake", doc.RootElement.GetProperty("Mode").GetString()); + } + + [Fact] + public async Task SetSettings_UsesUpdateEnvelope_WhenPresent() + { + var cap = new VoiceCapability(NullLogger.Instance); + VoiceSettingsUpdateArgs? received = null; + cap.SettingsUpdateRequested += update => + { + received = update; + return Task.FromResult(update.Settings); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "voice3", + Command = VoiceCommands.SetSettings, + Args = Parse("""{"update":{"persist":false,"settings":{"enabled":true,"mode":"TalkMode"}}}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.False(received!.Persist); + Assert.Equal(VoiceActivationMode.TalkMode, received.Settings.Mode); + } + + [Fact] + public async Task GetStatus_ReturnsStatusFromHandler() + { + var cap = new VoiceCapability(NullLogger.Instance); + cap.StatusRequested += () => Task.FromResult(new VoiceStatusInfo + { + Available = true, + Running = true, + Mode = VoiceActivationMode.TalkMode, + State = VoiceRuntimeState.ListeningContinuously + }); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "voice4", + Command = VoiceCommands.GetStatus, + Args = Parse("""{}""") + }); + + Assert.True(res.Ok); + var json = JsonSerializer.Serialize(res.Payload); + using var doc = JsonDocument.Parse(json); + Assert.True(doc.RootElement.GetProperty("Running").GetBoolean()); + Assert.Equal("ListeningContinuously", doc.RootElement.GetProperty("State").GetString()); + } + + [Fact] + public async Task Start_PassesArgsToHandler() + { + var cap = new VoiceCapability(NullLogger.Instance); + VoiceStartArgs? received = null; + cap.StartRequested += args => + { + received = args; + return Task.FromResult(new VoiceStatusInfo + { + Available = true, + Running = true, + Mode = args.Mode ?? 
VoiceActivationMode.Off, + State = VoiceRuntimeState.ListeningForVoiceWake, + SessionKey = args.SessionKey + }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "voice5", + Command = VoiceCommands.Start, + Args = Parse("""{"mode":"VoiceWake","sessionKey":"session-123"}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal(VoiceActivationMode.VoiceWake, received!.Mode); + Assert.Equal("session-123", received.SessionKey); + } + + [Fact] + public async Task Stop_PassesReasonToHandler() + { + var cap = new VoiceCapability(NullLogger.Instance); + VoiceStopArgs? received = null; + cap.StopRequested += args => + { + received = args; + return Task.FromResult(new VoiceStatusInfo + { + Available = true, + Running = false, + Mode = VoiceActivationMode.Off, + State = VoiceRuntimeState.Stopped, + LastError = args.Reason + }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "voice6", + Command = VoiceCommands.Stop, + Args = Parse("""{"reason":"user requested"}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal("user requested", received!.Reason); + } + + [Fact] + public async Task Start_ReturnsError_WhenHandlerMissing() + { + var cap = new VoiceCapability(NullLogger.Instance); + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "voice7", + Command = VoiceCommands.Start, + Args = Parse("""{}""") + }); + + Assert.False(res.Ok); + Assert.Contains("not available", res.Error, StringComparison.OrdinalIgnoreCase); + } + + [Fact] + public async Task LegacyVoiceSkipCommand_RemainsAccepted() + { + var cap = new VoiceCapability(NullLogger.Instance); + VoiceSkipArgs? 
received = null; + cap.SkipRequested += args => + { + received = args; + return Task.FromResult(new VoiceStatusInfo + { + Available = true, + Running = true, + Mode = VoiceActivationMode.TalkMode, + State = VoiceRuntimeState.PlayingResponse + }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "voice8", + Command = "voice.skip", + Args = Parse("""{"reason":"legacy caller"}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal("legacy caller", received!.Reason); + } +} diff --git a/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs b/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs index d44f3fd..9cc95a6 100644 --- a/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs +++ b/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs @@ -1,6 +1,8 @@ using System; +using System.Collections; using System.Collections.Generic; using System.Linq; +using System.Reflection; using System.Text.Json; using Xunit; using OpenClaw.Shared; @@ -78,6 +80,54 @@ public SessionInfo[] GetSessionList() return _client.GetSessionList(); } + public string GetDefaultChatSessionKey() + { + return GetPrivateField("_defaultChatSessionKey"); + } + + public void UpdateDefaultChatSessionKeyFromHello(string payloadJson) + { + using var doc = JsonDocument.Parse(payloadJson); + var method = typeof(OpenClawGatewayClient).GetMethod( + "UpdateDefaultChatSessionKeyFromHello", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); + method!.Invoke(_client, new object[] { doc.RootElement.Clone() }); + } + + public string SerializeChatSendRequest(string message, string sessionKey, string idempotencyKey) + { + var parametersMethod = typeof(OpenClawGatewayClient).GetMethod( + "BuildChatSendParameters", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); + var parameters = parametersMethod!.Invoke(_client, new object[] { message, sessionKey, idempotencyKey }); + + var 
serializeMethod = typeof(OpenClawGatewayClient).GetMethod( + "SerializeRequest", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); + return (string)serializeMethod!.Invoke(null, new object[] { "request-123", "chat.send", parameters! })!; + } + + public string SerializeConnectRequest() + { + var parametersMethod = typeof(OpenClawGatewayClient).GetMethod( + "BuildConnectParameters", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); + var parameters = parametersMethod!.Invoke(_client, Array.Empty()); + + var serializeMethod = typeof(OpenClawGatewayClient).GetMethod( + "SerializeRequest", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); + return (string)serializeMethod!.Invoke(null, new object[] { "request-456", "connect", parameters! })!; + } + + public string NormalizeChatSessionKey(string? sessionKey) + { + var method = typeof(OpenClawGatewayClient).GetMethod( + "NormalizeChatSessionKey", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); + return (string)method!.Invoke(null, new object?[] { sessionKey })!; + } + public void SetUnsupportedMethodFlags(bool usageStatus, bool usageCost, bool sessionPreview, bool nodeList) { SetPrivateField("_usageStatusUnsupported", usageStatus); @@ -134,6 +184,70 @@ public SessionsPreviewPayloadInfo ParseSessionsPreviewPayload(string payloadJson return parsed ?? new SessionsPreviewPayloadInfo(); } + public ChatMessageEventArgs? HandleChatEventAndCaptureMessage(string payloadJson) + { + ChatMessageEventArgs? 
captured = null; + EventHandler handler = (_, args) => captured = args; + _client.ChatMessageReceived += handler; + + try + { + using var doc = JsonDocument.Parse(payloadJson); + var method = typeof(OpenClawGatewayClient).GetMethod( + "HandleChatEvent", + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); + method!.Invoke(_client, new object[] { doc.RootElement.Clone() }); + } + finally + { + _client.ChatMessageReceived -= handler; + } + + return captured; + } + + public int GetPendingChatPreviewSessionCount() + { + var pending = GetPrivateField("_pendingChatPreviewSessionKeys"); + return pending.Count; + } + + public void AddPendingChatPreviewSession(string sessionKey, string? lastKnownAssistantText = null, int attemptCount = 0) + { + var pending = GetPrivateField("_pendingChatPreviewSessionKeys"); + var stateType = typeof(OpenClawGatewayClient).GetNestedType( + "PendingChatPreviewState", + BindingFlags.NonPublic)!; + var state = Activator.CreateInstance(stateType)!; + stateType.GetProperty("LastKnownAssistantText")!.SetValue(state, lastKnownAssistantText); + stateType.GetProperty("AttemptCount")!.SetValue(state, attemptCount); + pending[sessionKey] = state; + } + + public void SetLastAssistantMessage(string sessionKey, string text) + { + var lastMessages = GetPrivateField("_lastAssistantMessagesBySession"); + lastMessages[sessionKey] = text; + } + + public ChatMessageEventArgs? ParseSessionsPreviewPayloadAndCaptureMessage(string payloadJson) + { + ChatMessageEventArgs? 
captured = null; + EventHandler handler = (_, args) => captured = args; + _client.ChatMessageReceived += handler; + + try + { + InvokePrivatePayloadParser("ParseSessionsPreview", payloadJson); + } + finally + { + _client.ChatMessageReceived -= handler; + } + + return captured; + } + public GatewayNodeInfo[] ParseNodeListPayload(string payloadJson) { GatewayNodeInfo[] parsed = Array.Empty(); @@ -834,6 +948,164 @@ public void ParseChannelHealth_StatusField_TakesPriorityOverDerivedStatus() Assert.Equal("degraded", channels[0].Status); } + [Fact] + public void UpdateDefaultChatSessionKeyFromHello_UsesSnapshotMainSessionKey() + { + var helper = new GatewayClientTestHelper(); + + helper.UpdateDefaultChatSessionKeyFromHello(""" + { + "type": "hello-ok", + "snapshot": { + "sessionDefaults": { + "mainSessionKey": "agent:main:main" + } + } + } + """); + + Assert.Equal("main", helper.GetDefaultChatSessionKey()); + } + + [Fact] + public void ParseSessions_MainSession_UpdatesDefaultChatSessionKey() + { + var helper = new GatewayClientTestHelper(); + + helper.ParseSessionsPayload(""" + { + "agent:main:main": { + "status": "active", + "displayName": "Main", + "isMain": true + }, + "agent:other:test": { + "status": "active" + } + } + """); + + Assert.Equal("main", helper.GetDefaultChatSessionKey()); + } + + [Fact] + public void SerializeChatSendRequest_IncludesSessionKeyAndIdempotencyKey() + { + var helper = new GatewayClientTestHelper(); + + var json = helper.SerializeChatSendRequest("hello", "main", "idem-123"); + using var doc = JsonDocument.Parse(json); + var parameters = doc.RootElement.GetProperty("params"); + + Assert.Equal("hello", parameters.GetProperty("message").GetString()); + Assert.Equal("main", parameters.GetProperty("sessionKey").GetString()); + Assert.Equal("idem-123", parameters.GetProperty("idempotencyKey").GetString()); + } + + [Fact] + public void NormalizeChatSessionKey_CollapsesExpandedMainKey() + { + var helper = new GatewayClientTestHelper(); + + 
Assert.Equal("main", helper.NormalizeChatSessionKey("agent:main:main")); + Assert.Equal("main", helper.NormalizeChatSessionKey("main")); + Assert.Equal("agent:sub:test", helper.NormalizeChatSessionKey("agent:sub:test")); + } + + [Fact] + public void HandleChatEvent_FinalWithoutMessage_QueuesPreviewLookup() + { + var helper = new GatewayClientTestHelper(); + + var captured = helper.HandleChatEventAndCaptureMessage(""" + { + "type": "event", + "event": "chat", + "payload": { + "sessionKey": "agent:main:main", + "state": "final" + } + } + """); + + Assert.Null(captured); + Assert.Equal(1, helper.GetPendingChatPreviewSessionCount()); + } + + [Fact] + public void ParseSessionsPreview_EmitsAssistantMessage_ForQueuedFinalPreview() + { + var helper = new GatewayClientTestHelper(); + helper.AddPendingChatPreviewSession("main"); + + var captured = helper.ParseSessionsPreviewPayloadAndCaptureMessage(""" + { + "ts": 1739760000000, + "previews": [ + { + "key": "agent:main:main", + "status": "ok", + "items": [ + { "role": "user", "text": "hello" }, + { "role": "assistant", "text": "world" } + ] + } + ] + } + """); + + Assert.NotNull(captured); + Assert.Equal("main", captured!.SessionKey); + Assert.Equal("assistant", captured.Role); + Assert.Equal("world", captured.Message); + Assert.True(captured.IsFinal); + Assert.Equal(0, helper.GetPendingChatPreviewSessionCount()); + } + + [Fact] + public void ParseSessionsPreview_DoesNotEmitStaleAssistantMessage_ForQueuedFinalPreview() + { + var helper = new GatewayClientTestHelper(); + helper.SetUnsupportedMethodFlags(usageStatus: false, usageCost: false, sessionPreview: true, nodeList: false); + helper.SetLastAssistantMessage("main", "world"); + helper.AddPendingChatPreviewSession("main", lastKnownAssistantText: "world"); + + var captured = helper.ParseSessionsPreviewPayloadAndCaptureMessage(""" + { + "ts": 1739760000000, + "previews": [ + { + "key": "agent:main:main", + "status": "ok", + "items": [ + { "role": "user", "text": "hello 
again" }, + { "role": "assistant", "text": "world" } + ] + } + ] + } + """); + + Assert.Null(captured); + Assert.Equal(1, helper.GetPendingChatPreviewSessionCount()); + } + + [Fact] + public void SerializeConnectRequest_UsesCliClientModeAndOperatorScopes() + { + var helper = new GatewayClientTestHelper(); + + var json = helper.SerializeConnectRequest(); + using var doc = JsonDocument.Parse(json); + var parameters = doc.RootElement.GetProperty("params"); + var client = parameters.GetProperty("client"); + var scopes = parameters.GetProperty("scopes").EnumerateArray().Select(item => item.GetString()).ToArray(); + + Assert.Equal("cli", client.GetProperty("mode").GetString()); + Assert.Contains("operator.read", scopes); + Assert.Contains("operator.write", scopes); + } + // ── BuildMissingScopeFixCommands tests ───────────────────────────────────── [Fact] diff --git a/tests/OpenClaw.Shared.Tests/VoiceModeSchemaTests.cs b/tests/OpenClaw.Shared.Tests/VoiceModeSchemaTests.cs new file mode 100644 index 0000000..eb48667 --- /dev/null +++ b/tests/OpenClaw.Shared.Tests/VoiceModeSchemaTests.cs @@ -0,0 +1,141 @@ +using OpenClaw.Shared; +using System.Text.Json; + +namespace OpenClaw.Shared.Tests; + +public class VoiceCommandsTests +{ + [Fact] + public void All_ContainsExpectedCommandsInStableOrder() + { + Assert.Equal( + [ + "voice.devices.list", + "voice.settings.get", + "voice.settings.set", + "voice.status.get", + "voice.start", + "voice.stop", + "voice.pause", + "voice.resume", + "voice.response.skip" + ], + VoiceCommands.All); + } +} + +public class VoiceSchemaDefaultsTests +{ + [Fact] + public void VoiceSettings_Defaults_AreConcreteAndProviderAgnostic() + { + var settings = new VoiceSettings(); + + Assert.False(settings.Enabled); + Assert.Equal(VoiceActivationMode.Off, settings.Mode); + Assert.True(settings.ShowRepeaterAtStartup); + Assert.False(settings.ShowConversationToasts); + Assert.Equal(VoiceProviderIds.Windows, settings.SpeechToTextProviderId); + 
Assert.Equal(VoiceProviderIds.Windows, settings.TextToSpeechProviderId); + Assert.Equal(16000, settings.SampleRateHz); + Assert.Equal(80, settings.CaptureChunkMs); + Assert.True(settings.BargeInEnabled); + Assert.Equal("NanoWakeWord", settings.VoiceWake.Engine); + Assert.Equal("hey_openclaw", settings.VoiceWake.ModelId); + Assert.Equal(0.65f, settings.VoiceWake.TriggerThreshold); + Assert.Equal(250, settings.TalkMode.MinSpeechMs); + } + + [Fact] + public void VoiceStatusInfo_Defaults_ToStopped() + { + var status = new VoiceStatusInfo(); + + Assert.False(status.Available); + Assert.False(status.Running); + Assert.Equal(VoiceActivationMode.Off, status.Mode); + Assert.Equal(VoiceRuntimeState.Stopped, status.State); + Assert.False(status.VoiceWakeLoaded); + Assert.Equal(0, status.PendingReplyCount); + Assert.False(status.CanSkipReply); + Assert.Null(status.CurrentReplyPreview); + Assert.Null(status.LastError); + } + + [Fact] + public void VoiceEnums_Serialize_AsStrings() + { + var json = JsonSerializer.Serialize(new VoiceStartArgs + { + Mode = VoiceActivationMode.VoiceWake + }); + + Assert.Contains("\"VoiceWake\"", json); + } + + [Fact] + public void VoiceProviderCatalog_Defaults_ToEmptyLists() + { + var catalog = new VoiceProviderCatalog(); + + Assert.Empty(catalog.SpeechToTextProviders); + Assert.Empty(catalog.TextToSpeechProviders); + } + + [Fact] + public void VoiceProviderIds_ExposeRequiredBuiltInProviders() + { + Assert.Equal("windows", VoiceProviderIds.Windows); + Assert.Equal("foundry-local", VoiceProviderIds.FoundryLocal); + Assert.Equal("openai-whisper", VoiceProviderIds.OpenAiWhisper); + Assert.Equal("elevenlabs-stt", VoiceProviderIds.ElevenLabsSpeechToText); + Assert.Equal("azure-ai-speech", VoiceProviderIds.AzureAiSpeech); + Assert.Equal("sherpa-onnx", VoiceProviderIds.SherpaOnnx); + Assert.Equal("minimax", VoiceProviderIds.MiniMax); + Assert.Equal("elevenlabs", VoiceProviderIds.ElevenLabs); + Assert.Equal("endpoint", VoiceProviderSettingKeys.Endpoint); + 
Assert.Equal("modelPath", VoiceProviderSettingKeys.ModelPath); + Assert.Equal("voiceSettingsJson", VoiceProviderSettingKeys.VoiceSettingsJson); + } + + [Fact] + public void VoiceProviderOption_Defaults_ToVisibleAndSelectable() + { + var option = new VoiceProviderOption { Name = "Provider" }; + + Assert.True(option.VisibleInSettings); + Assert.True(option.Selectable); + Assert.Equal("Provider", option.DisplayName); + Assert.Equal(1.0, option.DisplayOpacity); + } + + [Fact] + public void VoiceProviderConfigurationStore_Defaults_ToEmptyProviders() + { + var configuration = new VoiceProviderConfigurationStore(); + + Assert.Empty(configuration.Providers); + } + + [Fact] + public void VoiceProviderConfigurationStore_MigratesLegacyProviderCredentials() + { + var configuration = new VoiceProviderConfigurationStore(); + configuration.MigrateLegacyCredentials(new VoiceProviderCredentials + { + MiniMaxApiKey = "minimax-key", + MiniMaxModel = "speech-2.8-turbo", + MiniMaxVoiceId = "English_MatureBoss", + ElevenLabsApiKey = "eleven-key", + ElevenLabsModel = "eleven_multilingual_v2", + ElevenLabsVoiceId = "voice-42" + }); + + Assert.Equal("minimax-key", configuration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.ApiKey)); + Assert.Equal("speech-2.8-turbo", configuration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.Model)); + Assert.Equal("English_MatureBoss", configuration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceId)); + Assert.Equal("eleven-key", configuration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.ApiKey)); + Assert.Equal("eleven_multilingual_v2", configuration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.Model)); + Assert.Equal("voice-42", configuration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.VoiceId)); + } +} diff --git a/tests/OpenClaw.Shared.Tests/VoiceProviderConfigurationStoreExtensionsTests.cs 
b/tests/OpenClaw.Shared.Tests/VoiceProviderConfigurationStoreExtensionsTests.cs new file mode 100644 index 0000000..ca563a4 --- /dev/null +++ b/tests/OpenClaw.Shared.Tests/VoiceProviderConfigurationStoreExtensionsTests.cs @@ -0,0 +1,148 @@ +using OpenClaw.Shared; + +namespace OpenClaw.Shared.Tests; + +public class VoiceProviderConfigurationStoreExtensionsTests +{ + [Fact] + public void GetOrAddProvider_ReusesExistingProvider_CaseInsensitively() + { + var store = new VoiceProviderConfigurationStore + { + Providers = + [ + new VoiceProviderConfiguration { ProviderId = "MiniMax" } + ] + }; + + var provider = store.GetOrAddProvider("minimax"); + + Assert.Same(store.Providers[0], provider); + Assert.Single(store.Providers); + } + + [Fact] + public void FindProvider_MatchesProviderId_CaseInsensitively() + { + var store = new VoiceProviderConfigurationStore + { + Providers = + [ + new VoiceProviderConfiguration { ProviderId = "ElevenLabs" } + ] + }; + + var provider = store.FindProvider("elevenlabs"); + + Assert.NotNull(provider); + Assert.Equal("ElevenLabs", provider!.ProviderId); + } + + [Fact] + public void GetValue_MatchesSettingKey_CaseInsensitively() + { + var configuration = new VoiceProviderConfiguration + { + Values = new Dictionary + { + ["ApiKey"] = "secret" + } + }; + + var value = configuration.GetValue("apikey"); + + Assert.Equal("secret", value); + } + + [Fact] + public void StoreGetValue_MatchesProviderAndSetting_CaseInsensitively() + { + var store = new VoiceProviderConfigurationStore + { + Providers = + [ + new VoiceProviderConfiguration + { + ProviderId = "MiniMax", + Values = new Dictionary + { + ["VoiceId"] = "English_MatureBoss" + } + } + ] + }; + + var value = store.GetValue("minimax", "voiceid"); + + Assert.Equal("English_MatureBoss", value); + } + + [Fact] + public void SetValue_AddsProviderAndTrimsStoredValue() + { + var store = new VoiceProviderConfigurationStore(); + + store.SetValue("minimax", "apiKey", " secret-key "); + + var provider = 
Assert.Single(store.Providers); + Assert.Equal("minimax", provider.ProviderId); + Assert.Equal("secret-key", provider.Values["apiKey"]); + } + + [Fact] + public void SetValue_UpdatesExistingEntry_CaseInsensitively() + { + var configuration = new VoiceProviderConfiguration + { + Values = new Dictionary + { + ["ApiKey"] = "old-value" + } + }; + + configuration.SetValue("apikey", " new-value "); + + Assert.Single(configuration.Values); + Assert.Equal("new-value", configuration.Values["ApiKey"]); + } + + [Fact] + public void SetValue_RemovesExistingEntry_WhenValueIsBlank() + { + var configuration = new VoiceProviderConfiguration + { + Values = new Dictionary + { + ["ApiKey"] = "secret" + } + }; + + configuration.SetValue("apikey", " "); + + Assert.Empty(configuration.Values); + } + + [Fact] + public void StoreSetValue_RemovesSetting_WhenValueIsNull() + { + var store = new VoiceProviderConfigurationStore + { + Providers = + [ + new VoiceProviderConfiguration + { + ProviderId = "minimax", + Values = new Dictionary + { + ["apiKey"] = "secret" + } + } + ] + }; + + store.SetValue("MiniMax", "ApiKey", null); + + var provider = Assert.Single(store.Providers); + Assert.Empty(provider.Values); + } +} diff --git a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj index c322211..a23c0d0 100644 --- a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj +++ b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj @@ -1,7 +1,7 @@ - net10.0 + net10.0-windows10.0.19041.0 enable enable false @@ -20,6 +20,7 @@ + diff --git a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs index 4155812..15af4cc 100644 --- a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +++ b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs @@ -1,3 +1,4 @@ +using System.Collections.Generic; using System.Text.Json; using OpenClaw.Shared; @@ -34,6 +35,62 @@ public void RoundTrip_AllFields_Preserved() 
SkippedUpdateTag = "v1.2.3", NotifyChatResponses = false, PreferStructuredCategories = true, + Voice = new VoiceSettings + { + Enabled = true, + Mode = VoiceActivationMode.VoiceWake, + ShowRepeaterAtStartup = false, + ShowConversationToasts = true, + SpeechToTextProviderId = "windows", + TextToSpeechProviderId = "elevenlabs", + InputDeviceId = "mic-1", + OutputDeviceId = "spk-2", + SampleRateHz = 16000, + CaptureChunkMs = 80, + BargeInEnabled = false, + VoiceWake = new VoiceWakeSettings + { + Engine = "NanoWakeWord", + ModelId = "hey_openclaw", + TriggerThreshold = 0.72f, + TriggerCooldownMs = 2500, + PreRollMs = 1400, + EndSilenceMs = 1000 + }, + TalkMode = new TalkModeSettings + { + MinSpeechMs = 300, + EndSilenceMs = 1100, + MaxUtteranceMs = 18000 + } + }, + VoiceProviderConfiguration = new VoiceProviderConfigurationStore + { + Providers = + [ + new VoiceProviderConfiguration + { + ProviderId = VoiceProviderIds.MiniMax, + Values = new Dictionary + { + [VoiceProviderSettingKeys.ApiKey] = "minimax-key", + [VoiceProviderSettingKeys.Model] = "speech-2.8-turbo", + [VoiceProviderSettingKeys.VoiceId] = "English_MatureBoss", + [VoiceProviderSettingKeys.VoiceSettingsJson] = "{\"voice_id\":\"English_MatureBoss\",\"speed\":1.1}" + } + }, + new VoiceProviderConfiguration + { + ProviderId = VoiceProviderIds.ElevenLabs, + Values = new Dictionary + { + [VoiceProviderSettingKeys.ApiKey] = "eleven-key", + [VoiceProviderSettingKeys.Model] = "eleven_multilingual_v2", + [VoiceProviderSettingKeys.VoiceId] = "voice-42" + } + } + ] + }, UserRules = new List { new() { Pattern = "build.*fail", IsRegex = true, Category = "urgent", Enabled = true } @@ -68,6 +125,27 @@ public void RoundTrip_AllFields_Preserved() Assert.Equal(original.SkippedUpdateTag, restored.SkippedUpdateTag); Assert.Equal(original.NotifyChatResponses, restored.NotifyChatResponses); Assert.Equal(original.PreferStructuredCategories, restored.PreferStructuredCategories); + Assert.NotNull(restored.Voice); + 
Assert.True(restored.Voice.Enabled); + Assert.Equal(VoiceActivationMode.VoiceWake, restored.Voice.Mode); + Assert.False(restored.Voice.ShowRepeaterAtStartup); + Assert.True(restored.Voice.ShowConversationToasts); + Assert.Equal("windows", restored.Voice.SpeechToTextProviderId); + Assert.Equal("elevenlabs", restored.Voice.TextToSpeechProviderId); + Assert.Equal("mic-1", restored.Voice.InputDeviceId); + Assert.Equal("spk-2", restored.Voice.OutputDeviceId); + Assert.Equal("NanoWakeWord", restored.Voice.VoiceWake.Engine); + Assert.Equal("hey_openclaw", restored.Voice.VoiceWake.ModelId); + Assert.Equal(0.72f, restored.Voice.VoiceWake.TriggerThreshold); + Assert.Equal(300, restored.Voice.TalkMode.MinSpeechMs); + Assert.NotNull(restored.VoiceProviderConfiguration); + Assert.Equal("minimax-key", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.ApiKey)); + Assert.Equal("speech-2.8-turbo", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.Model)); + Assert.Equal("English_MatureBoss", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceId)); + Assert.Equal("{\"voice_id\":\"English_MatureBoss\",\"speed\":1.1}", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceSettingsJson)); + Assert.Equal("eleven-key", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.ApiKey)); + Assert.Equal("eleven_multilingual_v2", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.Model)); + Assert.Equal("voice-42", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.VoiceId)); Assert.NotNull(restored.UserRules); Assert.Single(restored.UserRules); Assert.Equal("build.*fail", restored.UserRules[0].Pattern); @@ -119,9 +197,42 @@ public void MissingFields_UseDefaults() 
Assert.Null(settings.SkippedUpdateTag); Assert.True(settings.NotifyChatResponses); Assert.True(settings.PreferStructuredCategories); + Assert.NotNull(settings.Voice); + Assert.False(settings.Voice.Enabled); + Assert.Equal(VoiceActivationMode.Off, settings.Voice.Mode); + Assert.True(settings.Voice.ShowRepeaterAtStartup); + Assert.False(settings.Voice.ShowConversationToasts); + Assert.Equal(VoiceProviderIds.Windows, settings.Voice.SpeechToTextProviderId); + Assert.Equal(VoiceProviderIds.Windows, settings.Voice.TextToSpeechProviderId); + Assert.NotNull(settings.VoiceProviderConfiguration); + Assert.Empty(settings.VoiceProviderConfiguration.Providers); + Assert.Equal(16000, settings.Voice.SampleRateHz); + Assert.Equal("NanoWakeWord", settings.Voice.VoiceWake.Engine); Assert.Null(settings.UserRules); } + [Fact] + public void LegacyVoiceProviderCredentials_Deserialize_ForMigration() + { + var json = """ + { + "VoiceProviderCredentials": { + "MiniMaxApiKey": "minimax-key", + "MiniMaxModel": "speech-2.8-turbo", + "MiniMaxVoiceId": "English_MatureBoss" + } + } + """; + + var settings = SettingsData.FromJson(json); + + Assert.NotNull(settings); + Assert.NotNull(settings.VoiceProviderCredentials); + Assert.Equal("minimax-key", settings.VoiceProviderCredentials.MiniMaxApiKey); + Assert.Equal("speech-2.8-turbo", settings.VoiceProviderCredentials.MiniMaxModel); + Assert.Equal("English_MatureBoss", settings.VoiceProviderCredentials.MiniMaxVoiceId); + } + [Fact] public void BackwardCompatibility_OldSettingsWithoutNewFields() { @@ -161,6 +272,13 @@ public void BackwardCompatibility_OldSettingsWithoutNewFields() Assert.False(settings.HasSeenActivityStreamTip); Assert.Null(settings.SkippedUpdateTag); Assert.True(settings.GlobalHotkeyEnabled); + Assert.NotNull(settings.Voice); + Assert.False(settings.Voice.Enabled); + Assert.Equal(VoiceActivationMode.Off, settings.Voice.Mode); + Assert.True(settings.Voice.ShowRepeaterAtStartup); + Assert.False(settings.Voice.ShowConversationToasts); + 
Assert.Equal(VoiceProviderIds.Windows, settings.Voice.SpeechToTextProviderId); + Assert.Equal(VoiceProviderIds.Windows, settings.Voice.TextToSpeechProviderId); Assert.Null(settings.UserRules); } diff --git a/tests/OpenClaw.Tray.Tests/VoiceChatCoordinatorTests.cs b/tests/OpenClaw.Tray.Tests/VoiceChatCoordinatorTests.cs new file mode 100644 index 0000000..379991d --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/VoiceChatCoordinatorTests.cs @@ -0,0 +1,221 @@ +using OpenClaw.Shared; +using OpenClawTray.Services.Voice; + +namespace OpenClaw.Tray.Tests; + +public class VoiceChatCoordinatorTests +{ + [Fact] + public async Task AttachWindow_ReplaysBufferedDraft() + { + var runtime = new FakeVoiceRuntime(); + using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); + + runtime.RaiseDraft("hello world", "main", clear: false); + + var window = new FakeVoiceChatWindow(); + coordinator.AttachWindow(window); + await Task.Yield(); + + Assert.Equal("hello world", window.LastDraftText); + Assert.False(window.LastDraftClear); + } + + [Fact] + public async Task DraftClear_IsReplayedWhenWindowAttachesLater() + { + var runtime = new FakeVoiceRuntime(); + using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); + + runtime.RaiseDraft("temporary draft", "main", clear: false); + runtime.RaiseDraft(string.Empty, "main", clear: true); + await Task.Yield(); + + var window = new FakeVoiceChatWindow(); + coordinator.AttachWindow(window); + await Task.Yield(); + + Assert.Equal(string.Empty, window.LastDraftText); + Assert.True(window.LastDraftClear); + } + + [Fact] + public async Task DraftUpdates_AreIgnoredForClosedWindow() + { + var runtime = new FakeVoiceRuntime(); + using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); + var window = new FakeVoiceChatWindow { IsClosed = true }; + coordinator.AttachWindow(window); + var updateCountAfterAttach = window.UpdateCallCount; + + runtime.RaiseDraft("headless 
text", "main", clear: false); + await Task.Yield(); + + Assert.Equal(updateCountAfterAttach, window.UpdateCallCount); + } + + [Fact] + public async Task DetachWindow_StopsFurtherDraftMirroring() + { + var runtime = new FakeVoiceRuntime(); + using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); + var window = new FakeVoiceChatWindow(); + coordinator.AttachWindow(window); + + coordinator.DetachWindow(window); + runtime.RaiseDraft("after detach", "main", clear: false); + await Task.Yield(); + + Assert.Equal(1, window.UpdateCallCount); + Assert.Equal(string.Empty, window.LastDraftText); + Assert.True(window.LastDraftClear); + } + + [Fact] + public void ConversationTurn_IsForwarded() + { + var runtime = new FakeVoiceRuntime(); + using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); + VoiceConversationTurnEventArgs? received = null; + coordinator.ConversationTurnAvailable += (_, args) => received = args; + + runtime.RaiseConversationTurn(new VoiceConversationTurnEventArgs + { + Direction = VoiceConversationDirection.Incoming, + Message = "reply", + SessionKey = "main" + }); + + Assert.NotNull(received); + Assert.Equal("reply", received!.Message); + Assert.Equal(VoiceConversationDirection.Incoming, received.Direction); + } + + [Fact] + public async Task ConversationTurn_IsMirroredToAttachedWindow() + { + var runtime = new FakeVoiceRuntime(); + using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); + var window = new FakeVoiceChatWindow(); + coordinator.AttachWindow(window); + + runtime.RaiseConversationTurn(new VoiceConversationTurnEventArgs + { + Direction = VoiceConversationDirection.Outgoing, + Message = "hello from voice", + SessionKey = "main" + }); + await Task.Yield(); + + Assert.Equal("hello from voice", window.LastTurnMessage); + Assert.Equal(VoiceConversationDirection.Outgoing, window.LastTurnDirection); + Assert.Equal(1, window.TurnCallCount); + } + + [Fact] + public 
async Task AttachWindow_ReplaysBufferedConversationTurns() + { + var runtime = new FakeVoiceRuntime(); + using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); + + runtime.RaiseConversationTurn(new VoiceConversationTurnEventArgs + { + Direction = VoiceConversationDirection.Outgoing, + Message = "replay this", + SessionKey = "main" + }); + await Task.Yield(); + + var window = new FakeVoiceChatWindow(); + coordinator.AttachWindow(window); + await Task.Yield(); + + Assert.Equal("replay this", window.LastTurnMessage); + Assert.Equal(VoiceConversationDirection.Outgoing, window.LastTurnDirection); + Assert.Equal(1, window.TurnCallCount); + } + + [Fact] + public async Task DraftAndTurns_AreBroadcastToAllAttachedWindows() + { + var runtime = new FakeVoiceRuntime(); + using var coordinator = new VoiceChatCoordinator(runtime, new ImmediateDispatcher()); + var firstWindow = new FakeVoiceChatWindow(); + var secondWindow = new FakeVoiceChatWindow(); + + coordinator.AttachWindow(firstWindow); + coordinator.AttachWindow(secondWindow); + + runtime.RaiseDraft("shared draft", "main", clear: false); + runtime.RaiseConversationTurn(new VoiceConversationTurnEventArgs + { + Direction = VoiceConversationDirection.Incoming, + Message = "shared reply", + SessionKey = "main" + }); + await Task.Yield(); + + Assert.Equal("shared draft", firstWindow.LastDraftText); + Assert.Equal("shared draft", secondWindow.LastDraftText); + Assert.Equal("shared reply", firstWindow.LastTurnMessage); + Assert.Equal("shared reply", secondWindow.LastTurnMessage); + } + + private sealed class ImmediateDispatcher : IUiDispatcher + { + public bool TryEnqueue(Action callback) + { + callback(); + return true; + } + } + + private sealed class FakeVoiceRuntime : IVoiceRuntime + { + public event EventHandler? ConversationTurnAvailable; + public event EventHandler? TranscriptDraftUpdated; + + public void RaiseDraft(string text, string? 
sessionKey, bool clear) + { + TranscriptDraftUpdated?.Invoke(this, new VoiceTranscriptDraftEventArgs + { + Text = text, + SessionKey = sessionKey ?? "main", + Clear = clear + }); + } + + public void RaiseConversationTurn(VoiceConversationTurnEventArgs args) + { + ConversationTurnAvailable?.Invoke(this, args); + } + } + + private sealed class FakeVoiceChatWindow : IVoiceChatWindow + { + public bool IsClosed { get; set; } + + public string LastDraftText { get; private set; } = string.Empty; + public bool LastDraftClear { get; private set; } + public int UpdateCallCount { get; private set; } + public string LastTurnMessage { get; private set; } = string.Empty; + public VoiceConversationDirection? LastTurnDirection { get; private set; } + public int TurnCallCount { get; private set; } + + public Task UpdateVoiceTranscriptDraftAsync(string text, bool clear) + { + UpdateCallCount++; + LastDraftText = text; + LastDraftClear = clear; + return Task.CompletedTask; + } + + public Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args) + { + TurnCallCount++; + LastTurnMessage = args.Message ?? 
string.Empty; + LastTurnDirection = args.Direction; + return Task.CompletedTask; + } + } +} diff --git a/tests/OpenClaw.Tray.Tests/VoiceCloudTextToSpeechClientTests.cs b/tests/OpenClaw.Tray.Tests/VoiceCloudTextToSpeechClientTests.cs new file mode 100644 index 0000000..75cefc0 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/VoiceCloudTextToSpeechClientTests.cs @@ -0,0 +1,75 @@ +using System; +using System.Reflection; +using System.Threading; +using System.Threading.Tasks; +using OpenClaw.Shared; +using OpenClawTray.Services.Voice; + +namespace OpenClaw.Tray.Tests; + +public class VoiceCloudTextToSpeechClientTests +{ + [Fact] + public async Task SynthesizeAsync_ThrowsOperationCanceled_WhenCallerTokenIsPreCancelled() + { + var client = new VoiceCloudTextToSpeechClient(); + var provider = new VoiceProviderOption + { + Id = "test-ws", + Name = "Test WS", + Settings = + [ + new VoiceProviderSettingDefinition { Key = "apiKey", Secret = true } + ], + TextToSpeechWebSocket = new VoiceTextToSpeechWebSocketContract + { + EndpointTemplate = "wss://127.0.0.1:0/tts" + } + }; + var store = new VoiceProviderConfigurationStore(); + store.SetValue("test-ws", "apiKey", "test-key"); + + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + await Assert.ThrowsAnyAsync( + () => client.SynthesizeAsync("hello", provider, store, cancellationToken: cts.Token)); + } + + [Fact] + public void DecodeAudioBytes_DecodesHexString() + { + var result = InvokeDecodeAudioBytes("hexJsonString", "48656c6c6f", "TestProvider"); + + Assert.Equal([72, 101, 108, 108, 111], result); // "Hello" + } + + [Fact] + public void DecodeAudioBytes_DecodesBase64String() + { + var result = InvokeDecodeAudioBytes("base64JsonString", "SGVsbG8=", "TestProvider"); + + Assert.Equal([72, 101, 108, 108, 111], result); // "Hello" + } + + [Fact] + public void DecodeAudioBytes_ThrowsForUnsupportedMode() + { + var method = GetDecodeAudioBytesMethod(); + + var ex = Assert.Throws( + () => method.Invoke(null, 
["unsupported", "data", "TestProvider"])); + + Assert.IsType(ex.InnerException); + } + + private static byte[] InvokeDecodeAudioBytes(string mode, string value, string providerName) + { + return (byte[])GetDecodeAudioBytesMethod().Invoke(null, [mode, value, providerName])!; + } + + private static MethodInfo GetDecodeAudioBytesMethod() => + typeof(VoiceCloudTextToSpeechClient).GetMethod( + "DecodeAudioBytes", + BindingFlags.NonPublic | BindingFlags.Static)!; +} diff --git a/tests/OpenClaw.Tray.Tests/VoiceProviderCatalogServiceTests.cs b/tests/OpenClaw.Tray.Tests/VoiceProviderCatalogServiceTests.cs new file mode 100644 index 0000000..ba56af0 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/VoiceProviderCatalogServiceTests.cs @@ -0,0 +1,131 @@ +using System; +using System.IO; +using OpenClaw.Shared; +using OpenClawTray.Helpers; +using OpenClawTray.Services.Voice; +using System.Linq; + +namespace OpenClaw.Tray.Tests; + +public class VoiceProviderCatalogServiceTests +{ + [Fact] + public void GetVoiceTrayIconPath_ReturnsBundledAppIconForOff() + { + var path = VoiceTrayIconHelper.GetVoiceTrayIconPath(VoiceTrayIconState.Off); + + Assert.Equal(VoiceTrayIconHelper.GetBaseAppIconPath(), path, ignoreCase: true); + } + + [Fact] + public void GetVoiceTrayIconPath_GeneratesListeningVariant() + { + var path = VoiceTrayIconHelper.GetVoiceTrayIconPath(VoiceTrayIconState.Listening); + + Assert.True(File.Exists(path)); + Assert.EndsWith(".ico", path, StringComparison.OrdinalIgnoreCase); + Assert.NotEqual(VoiceTrayIconHelper.GetBaseAppIconPath(), path, StringComparer.OrdinalIgnoreCase); + } + + [Fact] + public void CatalogFilePath_ResolvesToExistingBundledAsset() + { + Assert.EndsWith("voice-providers.json", VoiceProviderCatalogService.CatalogFilePath, StringComparison.OrdinalIgnoreCase); + Assert.True(File.Exists(VoiceProviderCatalogService.CatalogFilePath)); + } + + [Fact] + public void LoadCatalog_IncludesOnlySelectableAndVisibleSpeechProviders() + { + var catalog = 
VoiceProviderCatalogService.LoadCatalog(); + + Assert.Contains(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.Windows); + Assert.Contains(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.SherpaOnnx); + Assert.DoesNotContain(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.FoundryLocal); + Assert.DoesNotContain(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.OpenAiWhisper); + Assert.DoesNotContain(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.ElevenLabsSpeechToText); + Assert.DoesNotContain(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.AzureAiSpeech); + Assert.Contains(catalog.TextToSpeechProviders, p => p.Id == VoiceProviderIds.Windows); + Assert.Contains(catalog.TextToSpeechProviders, p => p.Id == VoiceProviderIds.MiniMax); + Assert.Contains(catalog.TextToSpeechProviders, p => p.Id == VoiceProviderIds.ElevenLabs); + } + + [Fact] + public void SupportsSpeechToTextRuntime_ReportsWindowsRouteSupportForConfiguredSpeechProviders() + { + Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.Windows)); + Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.FoundryLocal)); + Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.OpenAiWhisper)); + Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.ElevenLabsSpeechToText)); + Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.AzureAiSpeech)); + Assert.False(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.SherpaOnnx)); + } + + [Fact] + public void SupportsTextToSpeechRuntime_ReturnsTrueForImplementedProviders() + { + Assert.True(VoiceProviderCatalogService.SupportsTextToSpeechRuntime(VoiceProviderIds.Windows)); + Assert.True(VoiceProviderCatalogService.SupportsTextToSpeechRuntime(VoiceProviderIds.MiniMax)); + 
Assert.True(VoiceProviderCatalogService.SupportsTextToSpeechRuntime(VoiceProviderIds.ElevenLabs)); + } + + [Fact] + public void LoadCatalog_ExposesBuiltInCloudTtsContracts() + { + var catalog = VoiceProviderCatalogService.LoadCatalog(); + + var sherpaOnnx = Assert.Single(catalog.SpeechToTextProviders, p => p.Id == VoiceProviderIds.SherpaOnnx); + Assert.Equal(VoiceProviderRuntimeIds.Embedded, sherpaOnnx.Runtime); + Assert.False(sherpaOnnx.Enabled); + Assert.True(sherpaOnnx.VisibleInSettings); + Assert.False(sherpaOnnx.Selectable); + Assert.Equal(string.Empty, sherpaOnnx.Settings.Single(s => s.Key == VoiceProviderSettingKeys.ModelPath).DefaultValue); + + var minimax = Assert.Single(catalog.TextToSpeechProviders, p => p.Id == VoiceProviderIds.MiniMax); + Assert.Equal("MiniMax", minimax.Name); + Assert.NotNull(minimax.TextToSpeechWebSocket); + Assert.Equal("wss://api.minimax.io/ws/v1/t2a_v2", minimax.TextToSpeechWebSocket!.EndpointTemplate); + Assert.Equal("Authorization", minimax.TextToSpeechWebSocket.AuthenticationHeaderName); + Assert.Equal(VoiceTextToSpeechResponseModes.HexJsonString, minimax.TextToSpeechWebSocket.ResponseAudioMode); + Assert.Contains("\"event\": \"task_start\"", minimax.TextToSpeechWebSocket.StartMessageTemplate); + Assert.Contains("\"event\": \"task_continue\"", minimax.TextToSpeechWebSocket.ContinueMessageTemplate); + var minimaxModelSetting = minimax.Settings.Single(s => s.Key == VoiceProviderSettingKeys.Model); + Assert.Equal("speech-2.8-turbo", minimaxModelSetting.DefaultValue); + Assert.Contains("speech-2.8-turbo", minimaxModelSetting.Options); + Assert.Contains("speech-2.5-turbo-preview", minimaxModelSetting.Options); + Assert.Equal("English_MatureBoss", minimax.Settings.Single(s => s.Key == VoiceProviderSettingKeys.VoiceId).DefaultValue); + var minimaxVoiceSettingsJson = minimax.Settings.Single(s => s.Key == VoiceProviderSettingKeys.VoiceSettingsJson); + Assert.False(minimaxVoiceSettingsJson.Required); + 
Assert.True(minimaxVoiceSettingsJson.JsonValue); + Assert.Contains("\"voice_setting\":", minimaxVoiceSettingsJson.Placeholder); + Assert.Contains("{{voiceId}}", minimaxVoiceSettingsJson.DefaultValue); + + var elevenLabs = Assert.Single(catalog.TextToSpeechProviders, p => p.Id == VoiceProviderIds.ElevenLabs); + Assert.Equal("ElevenLabs", elevenLabs.Name); + Assert.NotNull(elevenLabs.TextToSpeechWebSocket); + Assert.Equal( + "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true", + elevenLabs.TextToSpeechWebSocket!.EndpointTemplate); + Assert.Equal("xi-api-key", elevenLabs.TextToSpeechWebSocket.AuthenticationHeaderName); + Assert.Equal(string.Empty, elevenLabs.TextToSpeechWebSocket.AuthenticationScheme); + Assert.Equal(string.Empty, elevenLabs.TextToSpeechWebSocket.ConnectSuccessEventName); + Assert.Equal(string.Empty, elevenLabs.TextToSpeechWebSocket.StartSuccessEventName); + Assert.Contains("\"xi_api_key\": {{apiKey}}", elevenLabs.TextToSpeechWebSocket.StartMessageTemplate); + Assert.Contains("\"try_trigger_generation\": true", elevenLabs.TextToSpeechWebSocket.ContinueMessageTemplate); + Assert.Contains("{{textWithTrailingSpace}}", elevenLabs.TextToSpeechWebSocket.ContinueMessageTemplate); + Assert.Equal("{ \"text\": \"\" }", elevenLabs.TextToSpeechWebSocket.FinishMessageTemplate); + Assert.Equal(VoiceTextToSpeechResponseModes.Base64JsonString, elevenLabs.TextToSpeechWebSocket.ResponseAudioMode); + Assert.Equal("audio", elevenLabs.TextToSpeechWebSocket.ResponseAudioJsonPath); + Assert.Equal("isFinal", elevenLabs.TextToSpeechWebSocket.FinalFlagJsonPath); + var elevenLabsModelSetting = elevenLabs.Settings.Single(s => s.Key == VoiceProviderSettingKeys.Model); + Assert.Equal("eleven_multilingual_v2", elevenLabsModelSetting.DefaultValue); + Assert.Contains("eleven_flash_v2_5", elevenLabsModelSetting.Options); + Assert.Contains("eleven_turbo_v2_5", elevenLabsModelSetting.Options); + 
Assert.Equal("6aDn1KB0hjpdcocrUkmq", elevenLabs.Settings.Single(s => s.Key == VoiceProviderSettingKeys.VoiceId).DefaultValue); + var elevenLabsVoiceSettingsJson = elevenLabs.Settings.Single(s => s.Key == VoiceProviderSettingKeys.VoiceSettingsJson); + Assert.False(elevenLabsVoiceSettingsJson.Required); + Assert.True(elevenLabsVoiceSettingsJson.JsonValue); + Assert.Contains("\"voice_settings\":", elevenLabsVoiceSettingsJson.DefaultValue); + Assert.Contains("\"speed\": 0.9", elevenLabsVoiceSettingsJson.DefaultValue); + } +} diff --git a/tests/OpenClaw.Tray.Tests/VoiceServiceTransportTests.cs b/tests/OpenClaw.Tray.Tests/VoiceServiceTransportTests.cs new file mode 100644 index 0000000..344a878 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/VoiceServiceTransportTests.cs @@ -0,0 +1,360 @@ +using OpenClaw.Shared; +using Windows.Media.Devices; +using Windows.Media.SpeechRecognition; +using OpenClawTray.Services.Voice; + +namespace OpenClaw.Tray.Tests; + +public class VoiceServiceTransportTests +{ + [Fact] + public void GetOrCreateTransportReadySource_ReusesExistingTaskWhileConnecting() + { + var existing = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var result = VoiceServiceTransportLogic.GetOrCreateTransportReadySource( + ConnectionStatus.Connecting, + existing, + out var shouldStartConnection); + + Assert.Same(existing, result); + Assert.False(shouldStartConnection); + } + + [Fact] + public void GetOrCreateTransportReadySource_CreatesFreshTaskWhenDisconnected() + { + var existing = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var result = VoiceServiceTransportLogic.GetOrCreateTransportReadySource( + ConnectionStatus.Disconnected, + existing, + out var shouldStartConnection); + + Assert.NotSame(existing, result); + Assert.True(shouldStartConnection); + } + + [Fact] + public void GetOrCreateTransportReadySource_CreatesFreshTaskAfterError() + { + var existing = new 
TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var result = VoiceServiceTransportLogic.GetOrCreateTransportReadySource( + ConnectionStatus.Error, + existing, + out var shouldStartConnection); + + Assert.NotSame(existing, result); + Assert.True(shouldStartConnection); + } + + [Fact] + public void UsesCloudTextToSpeechRuntime_ReturnsTrueForWebSocketProviders() + { + var provider = new VoiceProviderOption + { + Id = VoiceProviderIds.MiniMax, + TextToSpeechWebSocket = new VoiceTextToSpeechWebSocketContract + { + EndpointTemplate = "wss://example.test/tts" + } + }; + + var result = VoiceServiceTransportLogic.UsesCloudTextToSpeechRuntime(provider); + + Assert.True(result); + } + + [Theory] + [InlineData(true, false, 0, false, true)] + [InlineData(false, true, 0, false, true)] + [InlineData(false, false, 1, false, true)] + [InlineData(false, false, 0, true, true)] + [InlineData(false, false, 0, false, false)] + public void ShouldAcceptAssistantReply_MatchesPlaybackAndAwaitingState( + bool awaitingReply, + bool isSpeaking, + int queuedReplyCount, + bool acceptedViaLateReplyGrace, + bool expected) + { + var result = VoiceServiceTransportLogic.ShouldAcceptAssistantReply( + awaitingReply, + isSpeaking, + queuedReplyCount, + acceptedViaLateReplyGrace); + + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(false, false, 0, "main", "main", 30, true)] + [InlineData(false, false, 0, "main", "main", 121, false)] + [InlineData(true, false, 0, "main", "main", 30, false)] + [InlineData(false, true, 0, "main", "main", 30, false)] + [InlineData(false, false, 1, "main", "main", 30, false)] + [InlineData(false, false, 0, "main", "other", 30, false)] + public void ShouldAcceptLateAssistantReply_OnlyMatchesBoundedGraceWindow( + bool awaitingReply, + bool isSpeaking, + int queuedReplyCount, + string lateReplySessionKey, + string incomingSessionKey, + int secondsAfterTimeout, + bool expected) + { + var timeoutUtc = new DateTime(2026, 3, 25, 0, 0, 0, 
DateTimeKind.Utc); + var graceUntilUtc = timeoutUtc.AddMinutes(2); + var result = VoiceServiceTransportLogic.ShouldAcceptLateAssistantReply( + awaitingReply, + isSpeaking, + queuedReplyCount, + lateReplySessionKey, + graceUntilUtc, + incomingSessionKey, + timeoutUtc.AddSeconds(secondsAfterTimeout)); + + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(true, false, false)] + [InlineData(false, true, false)] + [InlineData(false, false, true)] + public void ShouldRestartRecognitionAfterCompletion_SuppressesControlledRecycle( + bool restartInProgress, + bool awaitingReply, + bool expected) + { + var result = VoiceServiceTransportLogic.ShouldRestartRecognitionAfterCompletion( + true, + VoiceActivationMode.TalkMode, + restartInProgress, + awaitingReply, + false); + + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(true, VoiceActivationMode.TalkMode, false, false, false, "eligible")] + [InlineData(true, VoiceActivationMode.VoiceWake, false, false, false, "mode=VoiceWake")] + [InlineData(false, VoiceActivationMode.TalkMode, false, false, false, "runtime-not-running")] + [InlineData(true, VoiceActivationMode.TalkMode, true, false, false, "controlled-restart-in-progress")] + [InlineData(true, VoiceActivationMode.TalkMode, false, true, false, "awaiting-reply")] + [InlineData(true, VoiceActivationMode.TalkMode, false, false, true, "speaking")] + public void DescribeRecognitionCompletionRestartDecision_ExplainsWhyRestartIsBlocked( + bool running, + VoiceActivationMode mode, + bool restartInProgress, + bool awaitingReply, + bool isSpeaking, + string expected) + { + var result = VoiceServiceTransportLogic.DescribeRecognitionCompletionRestartDecision( + running, + mode, + restartInProgress, + awaitingReply, + isSpeaking); + + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, false, false, true)] + [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, false, false, 
false, false, false)] + [InlineData(SpeechRecognitionResultStatus.Success, false, false, false, false, false, false)] + [InlineData(SpeechRecognitionResultStatus.Success, false, true, false, false, false, false)] + [InlineData(SpeechRecognitionResultStatus.UserCanceled, true, false, false, false, false, false)] + [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, true, false, false, false)] + [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, true, false, false)] + [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, false, true, false)] + public void ShouldRebuildRecognitionAfterCompletion_RebuildsOnlyForUserCanceledWithoutActivity( + SpeechRecognitionResultStatus status, + bool sessionHadActivity, + bool sessionHadCaptureSignal, + bool restartInProgress, + bool awaitingReply, + bool isSpeaking, + bool expected) + { + var result = VoiceServiceTransportLogic.ShouldRebuildRecognitionAfterCompletion( + status, + sessionHadActivity, + sessionHadCaptureSignal, + restartInProgress, + awaitingReply, + isSpeaking); + + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, false, false, false, "capture-signal-without-recognition")] + [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, false, false, "user-canceled-without-activity")] + [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, false, false, false, false, "disabled-official-session-restart-only (status=TimeoutExceeded)")] + [InlineData(SpeechRecognitionResultStatus.Success, false, false, false, false, false, "disabled-official-session-restart-only (status=Success)")] + [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, true, true, false, false, false, "session-had-activity")] + [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, true, false, false, "controlled-restart-in-progress")] + 
[InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, false, true, false, "awaiting-reply")] + [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, false, false, true, "speaking")] + public void DescribeRecognitionCompletionRebuildDecision_ExplainsWhyRebuildIsBlocked( + SpeechRecognitionResultStatus status, + bool sessionHadActivity, + bool sessionHadCaptureSignal, + bool restartInProgress, + bool awaitingReply, + bool isSpeaking, + string expected) + { + var result = VoiceServiceTransportLogic.DescribeRecognitionCompletionRebuildDecision( + status, + sessionHadActivity, + sessionHadCaptureSignal, + restartInProgress, + awaitingReply, + isSpeaking); + + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(16000, 80, 1280)] + [InlineData(16000, 0, 1280)] + [InlineData(0, 80, 1280)] + [InlineData(48000, 20, 960)] + public void ResolveDesiredSamplesPerQuantum_UsesSpeechFriendlyDefaults( + int sampleRateHz, + int chunkMs, + uint expected) + { + var result = VoiceCaptureMath.ResolveDesiredSamplesPerQuantum(sampleRateHz, chunkMs); + + Assert.Equal(expected, result); + } + + public static IEnumerable PeakLevelCases() + { + yield return [new byte[] { 0, 0, 0, 0 }, 0f]; + yield return [new byte[] { 0, 0, 0, 63 }, 0.5f]; + yield return [new byte[] { 0, 0, 128, 63, 0, 0, 0, 191 }, 1f]; + } + + [Theory] + [MemberData(nameof(PeakLevelCases))] + public void ComputePeakLevel_FindsLargestAbsoluteFloatSample(byte[] data, float expected) + { + var result = VoiceCaptureMath.ComputePeakLevel(data); + + Assert.Equal(expected, result, 3); + } + + [Theory] + [InlineData("Now again testing", "again testing", 1, true, "Now again testing")] + [InlineData("again testing", "again testing", 1, false, "again testing")] + [InlineData("Now again testing", "again testing", 3, false, "again testing")] + [InlineData("This is different", "again testing", 1, false, "again testing")] + public void 
SelectRecognizedText_PromotesRecentLongerHypothesisWhenFinalLooksTruncated( + string hypothesis, + string recognized, + int hypothesisAgeSeconds, + bool expectedPromoted, + string expected) + { + var now = new DateTime(2026, 3, 25, 16, 45, 30, DateTimeKind.Utc); + var result = VoiceServiceTransportLogic.SelectRecognizedText( + recognized, + hypothesis, + now.AddSeconds(-hypothesisAgeSeconds), + now, + out var promotedHypothesis); + + Assert.Equal(expected, result); + Assert.Equal(expectedPromoted, promotedHypothesis); + } + + [Theory] + [InlineData(true, "Now again testing", 1, "Now again testing")] + [InlineData(true, "Now again testing", 3, null)] + [InlineData(false, "Now again testing", 1, null)] + [InlineData(true, "", 1, null)] + public void SelectCompletionFallbackText_PromotesRecentHypothesisWhenSessionHadActivity( + bool sessionHadActivity, + string hypothesis, + int hypothesisAgeSeconds, + string? expected) + { + var now = new DateTime(2026, 3, 25, 21, 36, 35, DateTimeKind.Utc); + + var result = VoiceServiceTransportLogic.SelectCompletionFallbackText( + sessionHadActivity, + hypothesis, + now.AddSeconds(-hypothesisAgeSeconds), + now); + + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(false, false, false, true)] + [InlineData(true, false, false, false)] + [InlineData(false, true, false, false)] + [InlineData(false, false, true, false)] + public void ShouldClearTranscriptDraftAfterCompletion_ClearsOnlyWhenNoReplyOrFallbackInFlight( + bool awaitingReply, + bool isSpeaking, + bool usedFallbackTranscript, + bool expected) + { + var result = VoiceServiceTransportLogic.ShouldClearTranscriptDraftAfterCompletion( + awaitingReply, + isSpeaking, + usedFallbackTranscript); + + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(true, false, false, false, true)] + [InlineData(false, false, false, false, false)] + [InlineData(true, true, false, false, false)] + [InlineData(true, false, true, false, false)] + [InlineData(true, false, false, 
true, false)] + public void ShouldRepromptAfterIncompleteRecognition_OnlyPromptsWhenSpeechWasHeardButNothingUsableSurvived( + bool sessionHadActivity, + bool awaitingReply, + bool isSpeaking, + bool usedFallbackTranscript, + bool expected) + { + var result = VoiceServiceTransportLogic.ShouldRepromptAfterIncompleteRecognition( + sessionHadActivity, + awaitingReply, + isSpeaking, + usedFallbackTranscript); + + Assert.Equal(expected, result); + } + + [Theory] + [InlineData(true, VoiceActivationMode.TalkMode, null, AudioDeviceRole.Default, true)] + [InlineData(true, VoiceActivationMode.TalkMode, "", AudioDeviceRole.Default, true)] + [InlineData(true, VoiceActivationMode.TalkMode, "device-1", AudioDeviceRole.Default, false)] + [InlineData(true, VoiceActivationMode.VoiceWake, null, AudioDeviceRole.Default, false)] + [InlineData(false, VoiceActivationMode.TalkMode, null, AudioDeviceRole.Default, false)] + [InlineData(true, VoiceActivationMode.TalkMode, null, AudioDeviceRole.Communications, false)] + public void ShouldRefreshRecognitionForDefaultCaptureDeviceChange_OnlyRefreshesTalkModeUsingSystemDefaultMic( + bool running, + VoiceActivationMode mode, + string? 
configuredInputDeviceId, + AudioDeviceRole role, + bool expected) + { + var result = VoiceServiceTransportLogic.ShouldRefreshRecognitionForDefaultCaptureDeviceChange( + running, + mode, + configuredInputDeviceId, + role); + + Assert.Equal(expected, result); + } +} diff --git a/tests/OpenClaw.Tray.Tests/WebChatWindowDomBridgeTests.cs b/tests/OpenClaw.Tray.Tests/WebChatWindowDomBridgeTests.cs new file mode 100644 index 0000000..9fe2633 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/WebChatWindowDomBridgeTests.cs @@ -0,0 +1,31 @@ +using OpenClawTray.Windows; + +namespace OpenClaw.Tray.Tests; + +public class WebChatWindowDomBridgeTests +{ + [Fact] + public void BuildSetDraftScript_ClearsWhenDraftIsBlank() + { + var script = WebChatVoiceDomBridge.BuildSetDraftScript(string.Empty); + + Assert.Equal("window.__openClawTrayVoice?.clearDraft?.();", script); + } + + [Fact] + public void BuildSetDraftScript_SerializesDraftText() + { + var script = WebChatVoiceDomBridge.BuildSetDraftScript("hello from voice"); + + Assert.Contains("setDraft", script); + Assert.Contains("\"hello from voice\"", script); + } + + [Fact] + public void DocumentCreatedScript_ClearsLegacyTurnsHost() + { + Assert.Contains("openclaw-tray-voice-turns", WebChatVoiceDomBridge.DocumentCreatedScript); + Assert.Contains("clearLegacyTurnsHost", WebChatVoiceDomBridge.DocumentCreatedScript); + Assert.Equal("window.__openClawTrayVoice?.setTurns?.([]);", WebChatVoiceDomBridge.ClearLegacyTurnsScript); + } +}