diff --git a/.gitignore b/.gitignore
index 6b3d49e..0c4e131 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,6 +62,7 @@ BenchmarkDotNet.Artifacts/
project.lock.json
project.fragment.lock.json
artifacts/
+.env
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
@@ -344,3 +345,9 @@ MigrationBackup/
# Fody - auto-generated XML schema
FodyWeavers.xsd
Output/
+
+# Repo-local tool caches and workspace metadata
+.claude/
+.dotnet-cli/
+.playwright-cli/
+output/playwright/
diff --git a/README.md b/README.md
index 27290b2..2718eaf 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,7 @@ Modern Windows 11-style system tray companion that connects to your local OpenCl
- 🚀 **Auto-start** - Launch with Windows
- ⚙️ **Settings** - Full configuration dialog
- 🎯 **First-run experience** - Welcome dialog guides new users
+- 🎤 **Voice Mode (new)** - Talk to your Claw via your Windows node
#### Quick Send scope requirement
@@ -122,13 +123,14 @@ If Quick Send fails with `pairing required` / `NOT_PAIRED`, that is a **device a
### Menu Sections
- **Status** - Gateway connection status with click-to-view details
+- **Voice** - Access to Voice controls
- **Sessions** - Active agent sessions with preview and per-session controls
- **Usage** - Provider/cost summary with quick jump to activity details
- **Channels** - Telegram/WhatsApp status with toggle control
- **Nodes** - Online/offline node inventory and copyable summary
- **Recent Activity** - Timestamped event stream for sessions, usage, nodes, and notifications
- **Actions** - Dashboard, Web Chat, Quick Send, Activity Stream, History
-- **Settings** - Configuration, auto-start, logs
+- **Settings** - Configuration, auto-start, logs, voice
### Mac Parity Status
@@ -148,6 +150,7 @@ Comparing against [openclaw-menubar](https://github.com/magimetal/openclaw-menub
| Refresh | ✅ | ✅ | Auto-refresh on menu open |
| Launch at Login | ✅ | ✅ | |
| Notifications toggle | ✅ | ✅ | |
+| Voice Mode | ✅ | 🟡 | Talk Mode implemented (half-duplex); Wake Word, interrupt, etc. in progress |
### Windows-Only Features
@@ -281,6 +284,14 @@ OpenClaw registers the `openclaw://` URL scheme for automation and integration:
Deep links work even when Molty is already running - they're forwarded via IPC.
+### Voice Mode
+*contributed by NichUK and his colleagues @codex and @copilot*
+
+Currently supports Talk Mode - always-on talking with your Claw! Wake word and PTT modes are coming soon.
+- Uses internal Windows STT (cloud providers coming soon)
+- Windows/MiniMax/ElevenLabs TTS voices
+ - Give your Claw a voice!
+
## 📦 OpenClaw.CommandPalette
PowerToys Command Palette extension for quick OpenClaw access.
diff --git a/docs/VOICE-MODE.md b/docs/VOICE-MODE.md
new file mode 100644
index 0000000..87d6011
--- /dev/null
+++ b/docs/VOICE-MODE.md
@@ -0,0 +1,988 @@
+# Voice Mode Architecture
+*Author: Nich Overend (NichUK@GitHub) - with @codex and @copilot*
+https://github.com/openclaw/openclaw-windows-node
+
+
+This document defines the voice subsystem for the Windows node only. It introduces the command surface, persisted settings schema, and minimum runtime boundaries needed to add Windows voice support without reshaping the existing node architecture.
+
+## Goals
+
+- Add a node-local voice mode with two activation modes: `VoiceWake` and `TalkMode`
+- Utilise minimal touch points into the existing app, to reduce the risk of regressions
+- Use NanoWakeWord for wakeword detection on-device
+- Present the user-facing mode names as `Voice Wake` and `Talk Mode`
+- Keep STT/TTS provider selection configurable, with Windows implementations as the default built-in baseline
+- Implement `MiniMax` TTS and `ElevenLabs` TTS as required non-Windows providers after the Windows baseline
+- Make adding new voice providers a JSON catalog update rather than a code change, where possible
+- Reuse the existing node capability pattern instead of introducing a parallel control path
+- Ensure that the voice sub-system is extensible
+- Ensure that the voice sub-system is controllable from other applications
+
+## Non-Goals
+
+- True full-duplex or chunk-streaming audio transport between node and gateway
+- Substantial changes to the existing project
+
+## Design Position
+
+The Windows node should own device-local audio concerns:
+
+- microphone capture
+- wakeword detection
+- silence detection / utterance segmentation
+- speaker playback
+- device enumeration and persisted local settings
+
+OpenClaw remains responsible for conversation/session routing and upstream voice orchestration.
+
+This keeps the Windows node lean for the first implementation and avoids introducing provider-routing settings before they are needed.
+
+## Visible Mode Names
+
+The tray app now uses user-facing names (borrowed from the macOS app) rather than exposing the internal enum names directly:
+
+| Internal Mode | Visible Name | Availability |
+|---|---|---|
+| `Off` | Off | available |
+| `VoiceWake` | Voice Wake | visible but disabled for now |
+| `TalkMode` | Talk Mode | available |
+
+The contracts and persisted settings now use `VoiceWake` and `TalkMode` as well.
+
+## Transport Boundary
+
+`TalkMode` follows the current talk-mode style control flow:
+
+- the node captures audio locally
+- local speech recognition turns that audio into transcript text on the active STT route
+- interim hypotheses are surfaced live, but only final `Medium` or `High` confidence recognizer results are submitted
+- if speech activity ends without any usable final transcript surviving, Talk Mode now clears the draft and gives a short local repeat prompt instead of silently doing nothing
+- the compact voice repeater window, when open, shows the live transcript draft plus local sent/received turns in a single scrolling surface
+- the tray chat window, when open, mirrors the live transcript draft into the compose box only
+- the finalized transcript is always sent to OpenClaw via direct `chat.send` on the voice mode target session, which is currently hardcoded in the tray app to `agent:main:main`
+- OpenClaw returns the assistant reply as normal chat output
+- the node performs local or remote TTS playback of that reply
+- assistant replies are queued locally and spoken sequentially, with a short (500 ms currently) pause between queued replies so overlapping responses are not lost
+- if a reply arrives after the normal 45-second wait timeout, the tray still accepts and speaks that late reply for a short bounded grace window (currently 120s) so slow upstream responses are not silently lost
+- assistant replies are currently accepted from either `agent:main:main` or the `main` alias so the tray can tolerate upstream session-key normalisation differences
+
+To avoid obvious duplicate sends from the Windows recognizer, exact duplicate final transcripts are suppressed within a short 750 ms window.
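+
+A minimal sketch of that guard, assuming an illustrative class name and caller-supplied clock rather than the tray app's actual implementation:
+
+```csharp
+using System;
+
+// Illustrative duplicate-final-transcript guard (names are assumptions).
+sealed class DuplicateTranscriptGuard
+{
+    private static readonly TimeSpan Window = TimeSpan.FromMilliseconds(750);
+    private string? _lastTranscript;
+    private DateTimeOffset _lastAcceptedAt;
+
+    // Returns true when a final transcript should be submitted via chat.send.
+    public bool ShouldSubmit(string finalTranscript, DateTimeOffset now)
+    {
+        if (finalTranscript == _lastTranscript && now - _lastAcceptedAt < Window)
+            return false; // exact duplicate inside the suppression window
+
+        _lastTranscript = finalTranscript;
+        _lastAcceptedAt = now;
+        return true;
+    }
+}
+```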
+
+The current Windows implementation uses a voice-local operator connection inside the tray app while node mode is active. That connection carries assistant chat events for `TalkMode`, while the recognized transcript is always sent through the tray app's direct `chat.send` path.
+
+## Voice APIs
+
+The Windows tray implementation now has two API layers:
+
+- shared node-capability commands in `OpenClaw.Shared`
+- in-process tray interfaces used by the windows/forms
+
+### Shared Capability Commands
+
+The node capability command surface is:
+
+- `voice.devices.list`
+- `voice.settings.get`
+- `voice.settings.set`
+- `voice.status.get`
+- `voice.start`
+- `voice.stop`
+- `voice.pause`
+- `voice.resume`
+- `voice.response.skip`
+
+These commands are defined in [VoiceModeSchema.cs](../src/OpenClaw.Shared/VoiceModeSchema.cs) and handled by [VoiceCapability.cs](../src/OpenClaw.Shared/Capabilities/VoiceCapability.cs).
+
+`voice.settings.get` / `voice.settings.set` are the configuration API.
+
+`voice.start` / `voice.stop` / `voice.pause` / `voice.resume` / `voice.response.skip` are the runtime control API.
+
+### Status Surface
+
+`VoiceStatusInfo` now carries the basic state needed by control surfaces:
+
+- mode
+- runtime state
+- session key
+- input/output device ids
+- last wake / last utterance timestamps
+- pending reply count
+- whether a reply can currently be skipped
+- current reply preview
+- last error
+
+### In-Process Tray Interfaces
+
+The tray app also exposes in-process interfaces so its own windows do not need to bind directly to the concrete `VoiceService` implementation:
+
+- `IVoiceConfigurationApi`
+ - get voice settings
+ - update voice settings
+ - list devices
+ - get provider catalog
+ - get/set provider configuration
+- `IVoiceRuntimeControlApi`
+ - get runtime status
+ - start / stop
+ - pause / resume
+ - skip current reply
+- `IVoiceRuntime`
+ - transcript draft and conversation events for chat integration
+
+This now powers multiple tray-local voice surfaces, including the compact voice repeater window.
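+
+A hedged sketch of how the runtime-control interface might look; the member names below are inferred from the bullets above, not copied from the tray source:
+
+```csharp
+using System.Threading.Tasks;
+
+// Illustrative shape only; the payload types are the shared contracts
+// from VoiceModeSchema.cs, but the member names are assumptions.
+public interface IVoiceRuntimeControlApi
+{
+    Task<VoiceStatusInfo> GetStatusAsync();
+    Task<VoiceStatusInfo> StartAsync(VoiceStartArgs args);
+    Task<VoiceStatusInfo> StopAsync(VoiceStopArgs args);
+    Task<VoiceStatusInfo> PauseAsync();
+    Task<VoiceStatusInfo> ResumeAsync();
+    Task<VoiceStatusInfo> SkipCurrentReplyAsync();
+}
+```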
+
+### Can the Settings Form Use This API?
+
+Yes. The Settings form can use the configuration API cleanly.
+
+The current tray implementation now uses the voice configuration interface for:
+
+- provider catalog loading
+- device enumeration
+- applying updated voice settings / provider configuration on save
+
+That means the settings UI is no longer hard-wired only to concrete `VoiceService` internals for its voice-specific behavior.
+
+## Speech Output Implementation
+
+In order to reduce output latency as much as possible, the current Windows implementation has made the following implementation decisions:
+
+- the Windows `SpeechSynthesizer` is created once per `TalkMode` runtime and reused for subsequent replies
+ - Frankly, no one will probably use it, but everyone has it, so...
+- cloud TTS uses a shared static `HttpClient`, so HTTP/TLS connections can be reused across replies
+- cloud requests use `ResponseHeadersRead`, which lets the client observe response-header arrival without waiting for full buffering first
+- the tray app now logs per-reply synthesis timings for both Windows and cloud TTS paths so latency can be measured directly during testing
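+
+A minimal sketch of the shared-client / `ResponseHeadersRead` pattern from the list above (the class and method names here are illustrative):
+
+```csharp
+using System.IO;
+using System.Net.Http;
+using System.Threading;
+using System.Threading.Tasks;
+
+static class CloudTtsHttp
+{
+    // One shared client so HTTP/TLS connections are reused across replies.
+    private static readonly HttpClient s_http = new();
+
+    public static async Task<Stream> RequestAudioAsync(HttpRequestMessage request, CancellationToken ct)
+    {
+        // ResponseHeadersRead returns as soon as headers arrive, so the body
+        // can be consumed as it streams in rather than after full buffering.
+        var response = await s_http.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, ct);
+        response.EnsureSuccessStatusCode();
+        return await response.Content.ReadAsStreamAsync(ct);
+    }
+}
+```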
+
+The main remaining gap is streaming playback from the first audio chunk. Best practice recommends chunked playback as soon as the first audio arrives, but the current implementation still waits for a complete playable stream before starting output (but not for long...):
+
+- Windows `SpeechSynthesizer` is used through `SynthesizeTextToStreamAsync`, which returns a complete stream for playback
+- MiniMax now uses the provider catalog's WebSocket TTS contract, but the current player still waits for a complete playable stream before output starts
+- ElevenLabs now uses the provider catalog's `stream-input` WebSocket contract, but the current player still waits for a complete playable stream before output starts
+
+So the current design minimizes avoidable setup and connection latency but does not yet implement first-chunk playback streaming. This is, however, planned for an early release (I'm working on it next).
+
+## Tray Chat Integration Decision
+
+Ideally, voice mode and typed chat should remain part of the same user-visible conversation in the web chat UI. This proved difficult to achieve, however, because the gateway treated a message stream from the tray app separately from one coming from the WebUI, even with the same session key.
+
+The only way to achieve this even vaguely reliably seemed to be to insert messages into the DOM locally, but as that was a brittle, hacky solution, it was discarded.
+
+### Chosen Approach
+
+It was therefore decided to create a separate *voice repeater form* to serve as a message window for voice, as well as making the messages available via toasts.
+
+The tray app keeps a tray-local interim transcript buffer for the current utterance, independent of whether any chat window or voice repeater form is open.
+
+## Provider Selection
+
+Voice settings now carry explicit provider ids for both STT and TTS:
+
+- `Voice.SpeechToTextProviderId`
+- `Voice.TextToSpeechProviderId`
+
+The built-in default for both is `windows`.
+
+Runtime behavior in the current phase:
+
+- `windows` is implemented for both STT and TTS
+- the `windows` STT route is a pure `Windows.Media.SpeechRecognition.SpeechRecognizer` path with no `AudioGraph` dependency
+- `windows` STT is currently treated as `half-duplex, non-streamed`
+- `http/ws` is now catalogued as a visible "coming soon" STT slot for generic streaming HTTP/WebSocket adapters
+- built-in catalog entries exist for both `minimax` and `elevenlabs` TTS
+- `minimax` defaults to `speech-2.8-turbo` and `English_MatureBoss` at present
+- `minimax` now uses a catalog-driven WebSocket contract for synchronous TTS
+- `elevenlabs` defaults to `eleven_multilingual_v2` and voice id `6aDn1KB0hjpdcocrUkmq (Tiffany)` for now
+- only currently usable providers are selectable in Settings
+- `sherpa-onnx` is visible but greyed out as a coming-soon local embedded route
+- unsupported providers fall back to Windows at runtime with a status warning
+
+### Settings Surface Notes
+
+The Settings panel now shows short inline descriptions for:
+
+- the selected voice mode
+- the selected speech-to-text provider
+- the selected text-to-speech provider
+
+Those provider descriptions are drawn directly from the provider catalog.
+
+When `Windows Speech Recognition` is selected for STT, the Settings panel now forces both audio device pickers back to the system defaults and greys them out. That matches the current Windows route limitation and avoids advertising per-device microphone routing that does not exist on this route yet.
+
+### Provider Catalog
+
+The provider catalog now ships with the tray app as a bundled asset:
+
+- `Assets\\voice-providers.json`
+
+Example:
+
+```json
+{
+ "speechToTextProviders": [
+ {
+ "id": "windows",
+ "name": "Windows Speech Recognition",
+ "runtime": "windows",
+ "enabled": true,
+ "description": "Built-in Windows.Media speech recognition, half-duplex, non-streamed."
+ },
+ {
+ "id": "http-ws",
+ "name": "http/ws",
+ "runtime": "streaming",
+ "enabled": false,
+ "visibleInSettings": true,
+ "selectable": false,
+ "description": "Will support most cloud and local stand-alone models full or half-duplex, streaming."
+    }
+ ],
+ "textToSpeechProviders": [
+ {
+ "id": "windows",
+ "name": "Windows Speech Synthesis",
+ "runtime": "windows",
+ "enabled": true,
+ "description": "Built-in Windows text-to-speech playback."
+ },
+ {
+ "id": "minimax",
+ "name": "MiniMax",
+ "runtime": "cloud",
+ "enabled": true,
+ "description": "Cloud TTS using the MiniMax WebSocket text-to-speech API.",
+ "settings": [
+ { "key": "apiKey", "label": "API key", "secret": true },
+ {
+ "key": "model",
+ "label": "Model",
+ "defaultValue": "speech-2.8-turbo",
+ "options": [
+ "speech-2.5-turbo-preview",
+ "speech-02-turbo",
+ "speech-02-hd",
+ "speech-2.6-turbo",
+ "speech-2.6-hd",
+ "speech-2.8-turbo",
+ "speech-2.8-hd"
+ ]
+ },
+ { "key": "voiceId", "label": "Voice ID", "defaultValue": "English_MatureBoss" },
+ {
+ "key": "voiceSettingsJson",
+ "label": "Voice settings JSON",
+ "defaultValue": "\"voice_setting\": { \"voice_id\": {{voiceId}}, \"speed\": 1, \"vol\": 1, \"pitch\": 0 }",
+ "placeholder": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }"
+ }
+ ],
+ "textToSpeechWebSocket": {
+ "endpointTemplate": "wss://api.minimax.io/ws/v1/t2a_v2",
+ "authenticationHeaderName": "Authorization",
+ "authenticationScheme": "Bearer",
+ "apiKeySettingKey": "apiKey",
+ "connectSuccessEventName": "connected_success",
+ "startMessageTemplate": "{ \"event\": \"task_start\", \"model\": {{model}}, \"language_boost\": \"English\", {{voiceSettingsJson}}, \"audio_setting\": { \"sample_rate\": 32000, \"bitrate\": 128000, \"format\": \"mp3\", \"channel\": 1 } }",
+ "startSuccessEventName": "task_started",
+ "continueMessageTemplate": "{ \"event\": \"task_continue\", \"text\": {{text}} }",
+ "finishMessageTemplate": "{ \"event\": \"task_finish\" }",
+ "responseAudioMode": "hexJsonString",
+ "responseAudioJsonPath": "data.audio",
+ "responseStatusCodeJsonPath": "base_resp.status_code",
+ "responseStatusMessageJsonPath": "base_resp.status_msg",
+ "finalFlagJsonPath": "is_final",
+ "taskFailedEventName": "task_failed",
+ "successStatusValue": "0",
+ "outputContentType": "audio/mpeg"
+ }
+ },
+ {
+ "id": "elevenlabs",
+ "name": "ElevenLabs",
+ "runtime": "cloud",
+ "enabled": true,
+ "description": "Cloud TTS using the ElevenLabs WebSocket stream-input API.",
+ "settings": [
+ { "key": "apiKey", "label": "API key", "secret": true },
+ {
+ "key": "model",
+ "label": "Model",
+ "defaultValue": "eleven_multilingual_v2",
+ "options": [
+ "eleven_flash_v2_5",
+ "eleven_turbo_v2_5",
+ "eleven_multilingual_v2",
+ "eleven_monolingual_v1"
+ ]
+ },
+ { "key": "voiceId", "label": "Voice ID", "defaultValue": "6aDn1KB0hjpdcocrUkmq", "placeholder": "Enter an ElevenLabs voice ID" },
+ {
+ "key": "voiceSettingsJson",
+ "label": "Voice settings JSON",
+ "defaultValue": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }",
+ "placeholder": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }"
+ }
+ ],
+ "textToSpeechWebSocket": {
+ "endpointTemplate": "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true",
+ "authenticationHeaderName": "xi-api-key",
+ "authenticationScheme": "",
+ "apiKeySettingKey": "apiKey",
+ "connectSuccessEventName": "",
+ "startMessageTemplate": "{ \"text\": \" \", {{voiceSettingsJson}}, \"xi_api_key\": {{apiKey}} }",
+ "startSuccessEventName": "",
+ "continueMessageTemplate": "{ \"text\": {{textWithTrailingSpace}}, \"try_trigger_generation\": true }",
+ "finishMessageTemplate": "{ \"text\": \"\" }",
+ "responseAudioMode": "base64JsonString",
+ "responseAudioJsonPath": "audio",
+ "finalFlagJsonPath": "isFinal",
+ "taskFailedEventName": "error",
+ "outputContentType": "audio/mpeg"
+ }
+ }
+ ]
+}
+```
+
+For cloud-backed TTS providers, the catalog carries either an HTTP or WebSocket request/response contract. That allows a new provider to be added by shipping an updated catalog file with the app, as long as it follows the same general templated transport approach.
+
+This file defines provider metadata and transport contracts. It does not carry API keys; those are stored with the standard config.
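+
+As an illustration, the catalog can be deserialized into simple metadata records; the class and property names below are assumptions mirroring the JSON above, not the shipped types:
+
+```csharp
+using System.Collections.Generic;
+using System.IO;
+using System.Text.Json;
+
+public sealed class VoiceProviderCatalog
+{
+    public List<VoiceProviderEntry> SpeechToTextProviders { get; set; } = new();
+    public List<VoiceProviderEntry> TextToSpeechProviders { get; set; } = new();
+}
+
+public sealed class VoiceProviderEntry
+{
+    public string Id { get; set; } = "";
+    public string Name { get; set; } = "";
+    public string Runtime { get; set; } = "";
+    public bool Enabled { get; set; }
+    public string? Description { get; set; }
+}
+
+public static class VoiceProviderCatalogLoader
+{
+    private static readonly JsonSerializerOptions s_options = new() { PropertyNameCaseInsensitive = true };
+
+    public static VoiceProviderCatalog Load(string path) =>
+        JsonSerializer.Deserialize<VoiceProviderCatalog>(File.ReadAllText(path), s_options)
+            ?? new VoiceProviderCatalog();
+}
+```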
+
+### Local Provider Configuration
+
+That means the current design is:
+
+- local tray settings choose the preferred STT/TTS provider ids
+- provider API keys and editable values are stored in `%APPDATA%\\OpenClawTray\\settings.json` under `VoiceProviderConfiguration`
+- OpenClaw remains the conversation endpoint for `chat.send`
+- the shipped provider catalog remains metadata-only and must not contain secrets
+
+This is an intentional short-term design choice so the Windows tray app can use cloud TTS providers without inventing a second catalog file for secrets. It can be revisited later if provider ownership is split differently.
+
+Current configuration values are keyed by provider id. The built-in providers use:
+
+- `apiKey`
+- `model`
+- `voiceId`
+- `voiceSettingsJson`
+
+When the selected TTS provider in Settings is not `windows`, the tray app shows provider-specific fields in the configuration form so the user can enter or edit:
+
+- API key
+- model
+- voice id
+- voice settings JSON
+
+If a provider setting definition includes an `options` list, the settings UI renders that setting as a drop-down instead of a free-text field. That is how built-in cloud providers expose a provider-level choice plus a separate model choice without recompilation.
+
+If a provider setting definition is marked as JSON, the value is inserted into the provider request template as a raw JSON fragment rather than a quoted string. That allows the provider catalog to define whether the user is entering:
+
+- a bare object
+- or a full keyed fragment such as `"voice_setting": { ... }`
+
+without hard-coding provider-specific wrapper keys into the runtime.
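+
+A sketch of that substitution rule, assuming a simple `{{key}}` replacement helper (illustrative, not the shipped template engine):
+
+```csharp
+using System.Collections.Generic;
+using System.Text.Json;
+
+static class TemplateFiller
+{
+    // JSON-marked values are inserted raw; everything else is quoted/escaped.
+    public static string Fill(string template, IReadOnlyDictionary<string, (string Value, bool IsJson)> values)
+    {
+        foreach (var (key, setting) in values)
+        {
+            var replacement = setting.IsJson
+                ? setting.Value                            // raw JSON fragment
+                : JsonSerializer.Serialize(setting.Value); // quoted JSON string
+            template = template.Replace("{{" + key + "}}", replacement);
+        }
+        return template;
+    }
+}
+```
+
+With that rule, `{{model}}` in the MiniMax start template becomes the quoted string `"speech-2.8-turbo"`, while `{{voiceSettingsJson}}` is spliced in verbatim as the keyed `"voice_setting": { ... }` fragment.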
+
+The current cloud TTS transports are:
+
+- `MiniMax`: catalog-driven WebSocket synthesis
+- `ElevenLabs`: catalog-driven WebSocket synthesis (`stream-input`)
+
+For `VoiceWake`, trigger words are gateway-owned global state. The Windows node should eventually consume the same shared trigger list and keep only a local enabled/disabled toggle plus device/runtime settings.
+
+In-flight voice controls are supported where the chosen provider supports them, in that provider's own format, although an abstraction/translation layer is being considered to accompany support for OpenClaw voice directives in replies.
+Pronunciation dictionaries are likewise currently supported only directly on the voice provider; a centralised dictionary is possible, and a proposal is being considered.
+
+## Command Surface
+
+The voice subsystem is introduced as a new node capability category: `voice`.
+
+### Commands
+
+| Command | Purpose | Request Payload | Response Payload |
+|---|---|---|---|
+| `voice.devices.list` | Enumerate input/output audio devices | none | `VoiceAudioDeviceInfo[]` |
+| `voice.settings.get` | Return the effective voice configuration | none | `VoiceSettings` |
+| `voice.settings.set` | Update the voice configuration | `VoiceSettingsUpdateArgs` | `VoiceSettings` |
+| `voice.status.get` | Return runtime voice status | none | `VoiceStatusInfo` |
+| `voice.start` | Start the voice runtime with the supplied or persisted mode | `VoiceStartArgs` | `VoiceStatusInfo` |
+| `voice.stop` | Stop the voice runtime | `VoiceStopArgs` | `VoiceStatusInfo` |
+| `voice.pause` | Pause the active voice runtime | `VoicePauseArgs` | `VoiceStatusInfo` |
+| `voice.resume` | Resume a paused voice runtime | `VoiceResumeArgs` | `VoiceStatusInfo` |
+| `voice.response.skip` | Skip the currently spoken reply and advance the queue if another reply is pending | `VoiceSkipArgs` | `VoiceStatusInfo` |
+
+### Payload Types
+
+- `VoiceSettings`
+- `VoiceWakeSettings`
+- `TalkModeSettings`
+- `VoiceAudioDeviceInfo`
+- `VoiceStatusInfo`
+- `VoiceStartArgs`
+- `VoiceStopArgs`
+- `VoicePauseArgs`
+- `VoiceResumeArgs`
+- `VoiceSkipArgs`
+- `VoiceSettingsUpdateArgs`
+
+These contracts are defined in [VoiceModeSchema.cs](../src/OpenClaw.Shared/VoiceModeSchema.cs).
+
+## Settings Schema
+
+Voice settings are persisted as `SettingsData.Voice` in [SettingsData.cs](../src/OpenClaw.Shared/SettingsData.cs).
+Provider configuration is persisted as `SettingsData.VoiceProviderConfiguration` in the same local settings file.
+The compact repeater window state is persisted as `SettingsData.VoiceRepeaterWindow` in the same settings file.
+
+The editable voice configuration now lives in the main Settings window.
+The tray `Voice Mode` window is a read-only runtime status/detail surface with a shortcut back into Settings.
+
+### Voice Repeater Window Settings
+
+The compact repeater persists its own local UI state in `SettingsData.VoiceRepeaterWindow`:
+
+| Setting | Type | Default | Meaning |
+|---|---|---|---|
+| `VoiceRepeaterWindow.AutoScroll` | bool | `true` | Automatically scroll the transcript surface to the latest draft/reply |
+| `VoiceRepeaterWindow.FloatingEnabled` | bool | `true` | Keep the repeater floating above other windows |
+| `VoiceRepeaterWindow.TextSize` | double | `13` | Repeater transcript font size |
+| `VoiceRepeaterWindow.HasSavedPlacement` | bool | `false` | Whether a user placement has been persisted yet |
+| `VoiceRepeaterWindow.Width` | int? | `null` | Saved repeater width |
+| `VoiceRepeaterWindow.Height` | int? | `null` | Saved repeater height |
+| `VoiceRepeaterWindow.X` | int? | `null` | Saved repeater screen X coordinate |
+| `VoiceRepeaterWindow.Y` | int? | `null` | Saved repeater screen Y coordinate |
+
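+A plausible shape for the persisted class, mirroring the table above (the actual property layout in `SettingsData.cs` may differ):
+
+```csharp
+public class VoiceRepeaterWindowSettings
+{
+    public bool AutoScroll { get; set; } = true;
+    public bool FloatingEnabled { get; set; } = true;
+    public double TextSize { get; set; } = 13;
+    public bool HasSavedPlacement { get; set; }
+    public int? Width { get; set; }
+    public int? Height { get; set; }
+    public int? X { get; set; }
+    public int? Y { get; set; }
+}
+```
+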
+### Effective Schema
+
+```json
+{
+ "Voice": {
+ "Mode": "VoiceWake",
+ "Enabled": true,
+ "ShowRepeaterAtStartup": true,
+ "SpeechToTextProviderId": "windows",
+ "TextToSpeechProviderId": "windows",
+ "InputDeviceId": "default-mic",
+ "OutputDeviceId": "default-speaker",
+ "SampleRateHz": 16000,
+ "CaptureChunkMs": 80,
+ "BargeInEnabled": true,
+ "VoiceWake": {
+ "Engine": "NanoWakeWord",
+ "ModelId": "hey_openclaw",
+ "TriggerThreshold": 0.65,
+ "TriggerCooldownMs": 2000,
+ "PreRollMs": 1200,
+ "EndSilenceMs": 900
+ },
+ "TalkMode": {
+ "MinSpeechMs": 250,
+ "EndSilenceMs": 900,
+ "MaxUtteranceMs": 15000
+ }
+ },
+ "VoiceProviderConfiguration": {
+ "Providers": [
+ {
+ "ProviderId": "minimax",
+ "Values": {
+ "apiKey": "",
+ "model": "speech-2.8-turbo",
+ "voiceId": "English_MatureBoss",
+ "voiceSettingsJson": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }"
+ }
+ },
+ {
+ "ProviderId": "elevenlabs",
+ "Values": {
+ "apiKey": "",
+ "model": "eleven_multilingual_v2",
+ "voiceId": "voice-id",
+ "voiceSettingsJson": "\"voice_settings\": { \"stability\": 0.5, \"similarity_boost\": 0.8 }"
+ }
+ }
+ ]
+ }
+}
+```
+
+### Field Rationale
+
+| Field | Purpose |
+|---|---|
+| `Mode` | Top-level activation mode: `Off`, `VoiceWake`, `TalkMode` |
+| `Enabled` | Global feature kill-switch independent of mode |
+| `ShowRepeaterAtStartup` | Opens the compact Voice Mode repeater automatically when the app starts with voice mode active |
+| `SpeechToTextProviderId` | Selected STT provider id from the local provider catalog |
+| `TextToSpeechProviderId` | Selected TTS provider id from the local provider catalog |
+| `InputDeviceId` / `OutputDeviceId` | Preferred audio device binding, with selected-speaker support implemented first |
+| `SampleRateHz` | Shared capture sample rate, fixed to a speech-friendly default |
+| `CaptureChunkMs` | Frame size for capture, VAD, and wakeword processing |
+| `BargeInEnabled` | Allows microphone capture while audio playback is active |
+| `VoiceWake.*` | NanoWakeWord and post-trigger utterance capture tuning |
+| `TalkMode.*` | Continuous-listening segmentation tuning |
+
+### Complete Settings Definition
+
+| Setting | Type | Default | Applies To | Meaning |
+|---|---|---|---|---|
+| `Voice.Mode` | enum | `Off` | all | Activation mode: `Off`, `VoiceWake`, `TalkMode` |
+| `Voice.Enabled` | bool | `false` | all | Master enable/disable flag for voice mode |
+| `Voice.ShowRepeaterAtStartup` | bool | `true` | all | If `true`, the compact Voice Mode repeater opens automatically when the app starts with voice mode active |
+| `Voice.SpeechToTextProviderId` | string | `windows` | all | Preferred speech-to-text provider id |
+| `Voice.TextToSpeechProviderId` | string | `windows` | all | Preferred text-to-speech provider id |
+| `Voice.InputDeviceId` | string? | `null` | all | Preferred microphone device id; `null` means system default |
+| `Voice.OutputDeviceId` | string? | `null` | all | Preferred speaker device id; `null` means system default |
+| `Voice.SampleRateHz` | int | `16000` | all | Internal capture rate used for wakeword, VAD, and utterance assembly |
+| `Voice.CaptureChunkMs` | int | `80` | all | Audio frame duration used by the capture loop |
+| `Voice.BargeInEnabled` | bool | `true` | all | If `true`, microphone capture may continue while response audio is playing |
+| `Voice.VoiceWake.Engine` | string | `NanoWakeWord` | voice wake | Voice Wake engine identifier |
+| `Voice.VoiceWake.ModelId` | string | `hey_openclaw` | voice wake | Voice Wake model/profile identifier |
+| `Voice.VoiceWake.TriggerThreshold` | float | `0.65` | voice wake | Minimum score required to trigger Voice Wake activation |
+| `Voice.VoiceWake.TriggerCooldownMs` | int | `2000` | voice wake | Minimum delay before another Voice Wake trigger is accepted |
+| `Voice.VoiceWake.PreRollMs` | int | `1200` | voice wake | Buffered audio retained before the trigger point |
+| `Voice.VoiceWake.EndSilenceMs` | int | `900` | voice wake | Silence timeout used to finalize the post-trigger utterance |
+| `Voice.TalkMode.MinSpeechMs` | int | `250` | talk mode | Minimum detected speech duration before an utterance is treated as real input |
+| `Voice.TalkMode.EndSilenceMs` | int | `900` | talk mode | Silence timeout used to finalize an utterance |
+| `Voice.TalkMode.MaxUtteranceMs` | int | `15000` | talk mode | Hard cap on utterance length before forced submission/finalization |
+| `VoiceProviderConfiguration.Providers[].ProviderId` | string | none | cloud providers | Provider id matching an `Assets\\voice-providers.json` entry |
+| `VoiceProviderConfiguration.Providers[].Values["apiKey"]` | string? | `null` | cloud providers | API key sent using the provider contract's configured auth header |
+| `VoiceProviderConfiguration.Providers[].Values["model"]` | string? | provider default | cloud providers | Model identifier inserted into the configured request template |
+| `VoiceProviderConfiguration.Providers[].Values["voiceId"]` | string? | provider default | cloud providers | Voice id inserted into the configured request template or URL |
+| `VoiceProviderConfiguration.Providers[].Values["voiceSettingsJson"]` | string? | provider default | cloud providers | Raw JSON fragment inserted into the configured request template; may be a keyed fragment like `"voice_setting": { ... }` |
+
+At runtime today:
+
+- `Voice.OutputDeviceId` is applied to Talk Mode playback through `MediaPlayer.AudioDevice`
+- `VoiceCaptureService` now runs an `AudioGraph` capture pipeline in parallel with Talk Mode and binds it to the selected or default microphone device
+- `Voice.InputDeviceId` is now used by that `AudioGraph` capture path, but transcript generation still uses the Windows default speech input path until the STT adapter migration is complete
+- Talk Mode only advertises `ListeningContinuously` after the capture graph has produced live frames and the recognizer warm-up window has elapsed, so the status acts as a real “you can start talking now” signal instead of a timer-only guess
+- recognizer recovery is now speech-triggered rather than silence-triggered: the Windows recognizer is only recycled when sustained capture speech is present but no recognition activity follows
+- when a recognizer session ends after real hypothesis activity but before a final result arrives, Talk Mode now promotes the last recent hypothesis and submits it instead of dropping the utterance
+- the speech-mismatch recovery watchdog is single-owner and only armed from capture speech, so a new recognition session does not spawn overlapping recovery loops
+- when the system default capture device changes and Talk Mode is using the default mic, the recognizer is rebuilt so device switches such as AirPods are picked up without a full app restart
+- explicit non-default microphone transcript generation is still pending the planned STT adapter migration
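+
+The default-device rebuild described above can hang off `Windows.Media.Devices`; a minimal sketch follows, where the field and rebuild method are hypothetical stand-ins for the tray app's own state and logic:
+
+```csharp
+using Windows.Media.Devices;
+
+sealed class DefaultMicWatcher
+{
+    private readonly bool _usingDefaultMicrophone = true; // stand-in for tray state
+
+    public void Arm()
+    {
+        // Fires whenever the system default capture device changes (e.g. AirPods connect).
+        MediaDevice.DefaultAudioCaptureDeviceChanged += (sender, args) =>
+        {
+            // Only rebuild when Talk Mode is bound to the system default microphone;
+            // args.Id identifies the new default capture device.
+            if (_usingDefaultMicrophone)
+                RebuildRecognizerForDevice(args.Id);
+        };
+    }
+
+    private void RebuildRecognizerForDevice(string deviceId)
+    {
+        // Hypothetical hook: tear down and recreate the recognizer here.
+    }
+}
+```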
+
+## Current Runtime Architecture
+
+The current Windows implementation is still centred on `VoiceService`, with a few supporting seams around it:
+
+- `VoiceCapability`
+ exposes shared `voice.*` commands to the node/gateway surface
+- `VoiceCaptureService`
+ owns the new `AudioGraph` capture backbone, selected/default microphone binding, and live signal detection
+- `VoiceService`
+ owns Talk Mode runtime state, recognizer/TTS integration, reply queuing, timeouts, gateway reply handling, and the transition layer between `AudioGraph` capture and the current recognizer-owned STT path
+- `VoiceChatCoordinator`
+ mirrors interim transcript drafts and conversation turns into attached tray windows without making any window part of the transport path
+- `OpenClawGatewayClient`
+ carries direct `chat.send`, final chat events, and the `sessions.preview` fallback path for bare final markers
+- `WebChatWindow`
+ mirrors live transcript drafts into the WebChat compose box
+- `VoiceRepeaterWindow`
+ is the compact local transcript/reply/control surface for Talk Mode
+
+### Current End-to-End Talk Mode
+
+```mermaid
+flowchart LR
+ A["User speech"] --> B["VoiceCaptureService
AudioGraph on selected/default mic"]
+ A --> C["Windows SpeechRecognizer
continuous dictation on current default mic"]
+
+ B --> D["FrameCaptured / SignalDetected"]
+ D --> E["VoiceService
capture-backed health + device state"]
+
+ C --> F["HypothesisGenerated
interim text"]
+ F --> G["VoiceService
draft event"]
+ G --> H["VoiceChatCoordinator"]
+ H --> I["WebChatWindow
compose-box mirror only"]
+ H --> I2["VoiceRepeaterWindow
compact local draft surface"]
+
+ C --> J["ResultGenerated
final Medium/High text"]
+ J --> K["VoiceService
duplicate guard + late hypothesis promotion"]
+ K --> L["Stop recognition session"]
+ L --> M["OpenClawGatewayClient.SendChatMessageAsync
direct chat.send(agent:main:main, transcript)"]
+ M --> N["OpenClaw / session pipeline"]
+ K --> H2["VoiceChatCoordinator
outgoing turn event"]
+ H2 --> I2
+ N --> O["Chat final event"]
+ O --> P{"assistant text present?"}
+ P -- "yes" --> Q["assistant text"]
+ P -- "no" --> R["sessions.preview fallback
with stale-preview retry guard"]
+ R --> Q
+ Q --> H3["VoiceChatCoordinator
incoming turn event"]
+ H3 --> I2
+
+ Q --> S["VoiceService reply queue"]
+ S --> T{"TTS provider"}
+ T -- "windows" --> U["SpeechSynthesizer"]
+ T -- "cloud" --> V["VoiceCloudTextToSpeechClient
MiniMax websocket or other provider"]
+ U --> W["Complete playable stream"]
+ V --> W
+ W --> X["MediaPlayer
selected OutputDeviceId if set"]
+ X --> Y["Speaker / headset output"]
+ Y --> Z["Resume recognition when queue drains"]
+```
+
+### Current Processing Stages
+
+| Stage | Component | Input | Output |
+|---|---|---|---|
+| 1 | `VoiceCaptureService` | selected/default microphone device | continuous frame and signal events from `AudioGraph` |
+| 2 | `SpeechRecognizer` | Windows default speech-input path | interim/final transcript text |
+| 3 | `VoiceService` | capture signal + final transcript text | health/restart decisions, de-duplicated transcript, runtime state changes |
+| 4 | `VoiceChatCoordinator` | draft and conversation-turn events | mirrored draft for WebChat plus compact local transcript/reply updates |
+| 5 | `OpenClawGatewayClient` | transcript text + session key | `chat.send` request + assistant reply events |
+| 6 | `OpenClawGatewayClient` preview fallback | bare final chat marker | assistant preview text, guarded against stale replay |
+| 7 | `VoiceService` reply queue | assistant reply text | ordered reply playback work |
+| 8 | `VoiceCloudTextToSpeechClient` / `SpeechSynthesizer` | assistant reply text | complete playable audio stream |
+| 9 | `MediaPlayer` | complete playable audio stream | rendered audio on default or selected speaker |
+
+## Planned AudioGraph Input Architecture
+
+The next input-phase refactor will move microphone ownership away from `SpeechRecognizer` and into an explicit capture pipeline built around `AudioGraph`.
+
+The purpose of that change is to unlock:
+
+- true selected non-default microphone support
+- streaming rather than utterance-owned capture
+- a proper ring buffer and VAD pipeline
+- future non-Windows and streaming STT providers
+- future barge-in / full-duplex work
+
+### Target Input Stack
+
+```mermaid
+flowchart TD
+ A["Selected microphone device id
or system default mic"] --> B["VoiceCaptureService
AudioGraph input node"]
+ B --> C["PCM frame stream
fixed chunk duration"]
+ C --> D["Ring buffer
bounded pre-roll"]
+ C --> E["VoiceActivityDetector"]
+ C --> F["VoiceWake engine
later"]
+ C --> G["SpeechToText adapter"]
+ E --> H["UtteranceAssembler
for non-streaming STT adapters"]
+ D --> H
+ H --> G
+ G --> I["Transcript events
interim + final"]
+ I --> J["VoiceService / runtime controller"]
+ J --> K["OpenClawGatewayClient
chat.send + reply events"]
+```
+
+### Proposed Seams
+
+The target split should look like this:
+
+- `VoiceCaptureService`
+ - owns `AudioGraph`
+ - binds to an explicit input device id when one is selected
+ - emits continuous PCM frames
+- `IVoiceActivityDetector`
+ - emits speech / silence transitions from frame data
+- `IUtteranceAssembler`
+ - builds bounded utterances from frames for non-streaming STT backends
+- `ISpeechToTextAdapter`
+ - consumes either live frames or completed utterances
+ - emits interim and final transcript events
+- `VoiceService`
+ - remains the runtime orchestrator rather than the owner of low-level capture
+
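+These seams are design targets from this document rather than existing code; a hedged sketch of the STT adapter seam:
+
+```csharp
+using System;
+using System.Threading.Tasks;
+
+// Illustrative only - member names are assumptions based on the seam list above.
+public interface ISpeechToTextAdapter
+{
+    // Streaming backends consume live PCM frames directly...
+    void AcceptFrame(ReadOnlyMemory<byte> pcmFrame);
+
+    // ...while non-streaming backends receive completed utterances.
+    Task AcceptUtteranceAsync(ReadOnlyMemory<byte> utterancePcm);
+
+    event Action<string> InterimTranscript;
+    event Action<string> FinalTranscript;
+}
+```
+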
+## Selected-Device Roadmap
+
+The current selected-device position is now:
+
+- selected non-default speaker: implemented
+- selected/default microphone binding for `SpeechRecognizer` capture: implemented
+- selected non-default microphone for actual transcript generation: not implemented yet (requires `AudioGraph` support)
+
+## Control Flow
+
+```mermaid
+sequenceDiagram
+ participant Gateway as Gateway / Operator
+ participant VoiceCap as VoiceCapability
+ participant Coord as VoiceService
+ participant Store as SettingsData.Voice
+
+ Gateway->>VoiceCap: voice.settings.get
+ VoiceCap-->>Gateway: VoiceSettings
+
+ Gateway->>VoiceCap: voice.settings.set(settings, persist=true)
+ VoiceCap->>Store: save VoiceSettings
+ VoiceCap-->>Gateway: VoiceSettings
+
+    Gateway->>VoiceCap: voice.start(mode=TalkMode, sessionKey=...)
+    VoiceCap->>Coord: Start(VoiceStartArgs)
+    Coord-->>VoiceCap: VoiceStatusInfo(state=ListeningContinuously)
+ VoiceCap-->>Gateway: VoiceStatusInfo
+
+ Gateway->>VoiceCap: voice.status.get
+ VoiceCap-->>Gateway: VoiceStatusInfo
+
+ Gateway->>VoiceCap: voice.pause(reason=...)
+ VoiceCap->>Coord: Pause()
+ Coord-->>VoiceCap: VoiceStatusInfo(state=Paused)
+ VoiceCap-->>Gateway: VoiceStatusInfo
+
+ Gateway->>VoiceCap: voice.resume(reason=...)
+ VoiceCap->>Coord: Resume()
+ Coord-->>VoiceCap: VoiceStatusInfo(state=ListeningContinuously)
+ VoiceCap-->>Gateway: VoiceStatusInfo
+
+ Gateway->>VoiceCap: voice.response.skip(reason=...)
+ VoiceCap->>Coord: SkipCurrentReply()
+ Coord-->>VoiceCap: VoiceStatusInfo
+ VoiceCap-->>Gateway: VoiceStatusInfo
+
+ Gateway->>VoiceCap: voice.stop(reason=...)
+ VoiceCap->>Coord: Stop()
+ Coord-->>VoiceCap: VoiceStatusInfo(state=Stopped)
+ VoiceCap-->>Gateway: VoiceStatusInfo
+```
+
+## Integration Boundaries
+
+### Existing Components Reused
+
+- `NodeService` remains the capability registration and lifecycle owner
+- `SettingsData` remains the persisted JSON settings model
+- `WindowsNodeClient` remains the gateway/node transport
+- existing node capability registration remains the integration pattern
+- current request/response transport remains the v1 control plane
+
+### Supporting Components In Current Use
+
+- `VoiceCapability` in `OpenClaw.Shared.Capabilities`
+- `VoiceCaptureService` in `OpenClaw.Tray.WinUI.Services`
+- `VoiceChatCoordinator` in `OpenClaw.Tray.WinUI.Services`
+- `VoiceRepeaterWindow` in `OpenClaw.Tray.WinUI.Windows`
+- `WebChatWindow` in `OpenClaw.Tray.WinUI.Windows`
+
+### Components Still Expected Later
+
+- `VoiceWakeService` in `OpenClaw.Tray.WinUI.Services`
+- a dedicated `VoicePlaybackService` seam when playback is split out of `VoiceService`
+
+## Parity with macOS Node
+
+Status values used below:
+
+- `Supported`
+- `Partial`
+- `NotSupported (planned)`
+- `Exceeded*`
+
+| macOS feature | Current Windows state | Notes |
+|---|---|---|
+| Talk Mode continuous loop (`listen -> chat.send(main) -> wait -> speak`) | `Supported` | Windows Talk Mode uses direct `chat.send` on the tray voice target session (`agent:main:main` today, while still accepting the `main` alias on replies) and loops back to listening after reply playback. |
+| Talk Mode sends after a short silence window | `Supported` | The current runtime finalizes on recognition pause and uses configurable Talk Mode silence settings. |
+| Talk Mode visible phase transitions (`Listening -> Thinking -> Speaking`) | `Partial` | Runtime states, tray icon changes, and the compact voice repeater window exist, but there is no always-visible overlay yet. |
+| Talk Mode always-on overlay with click-to-stop / click-X controls | `NotSupported (planned)` | Windows currently has a tray icon, a manually-opened compact repeater window, and WebChat draft mirroring, but no always-on overlay surface. |
+| Talk Mode writes replies into WebChat the same way typed chat does | `Partial` | Replies appear in WebChat through normal session updates, but Talk Mode uses direct send rather than a same-as-typing transport path. |
+| Talk Mode interrupt-on-speech / barge-in | `NotSupported (planned)` | Windows is still half-duplex during reply playback. |
+| Talk Mode voice directives in replies | `NotSupported (planned)` | Windows does not yet parse or apply the JSON voice directive line described in the Talk Mode docs. |
+| Talk Mode true streaming TTS playback | `NotSupported (planned)` | MiniMax uses WebSocket transport, but playback still waits for a complete playable stream. |
+| Talk Mode cloud TTS provider flexibility | `Exceeded` | Windows already supports Windows built-in TTS plus catalog-driven cloud providers rather than being limited to a single provider path. This exceeds the documented macOS baseline on provider flexibility, but not yet on true streaming playback latency because incremental playback is still pending. |
+| Voice Wake wake-word runtime | `NotSupported (planned)` | `VoiceWake` remains a documented target mode, but there is no active wake-word runtime yet. |
+| Voice Wake push-to-talk capture | `NotSupported (planned)` | There is no Windows push-to-talk path yet. |
+| Voice Wake overlay with committed / volatile transcript states | `NotSupported (planned)` | No Voice Wake overlay exists on Windows yet. |
+| Voice Wake restart invariants when UI is dismissed | `NotSupported (planned)` | The macOS overlay-dismiss resilience behavior has no Windows equivalent yet because the overlay/runtime does not exist. |
+| Voice Wake forwarding to the active gateway / agent | `NotSupported (planned)` | Forwarding semantics are only implemented for Talk Mode today. |
+| Voice Wake machine-hint transcript prefixing | `NotSupported (planned)` | Windows does not currently prepend a machine hint on forwarded wake transcripts. |
+| Voice Wake mic picker, live level meter, trigger-word table, and tester | `NotSupported (planned)` | Windows has general voice settings and device lists, but not the Voice Wake-specific settings surface from macOS. |
+| Voice mic device selection | `Partial` | When `Windows Speech Recognition` is selected, Settings now locks both audio device pickers to the system defaults. Explicit per-device transcription routing remains a future AudioGraph/streaming-route feature. |
+| Voice Wake send / trigger chimes | `NotSupported (planned)` | Windows currently has no configurable trigger/send sounds. |
+
+## Feature List - Backlog - Not in Order, except maybe the first two ;)
+
+### Story: Streaming STT Capture Pipeline
+
+Implement `AudioGraph` to create an extensible streaming speech input pipeline, rather than the current self-contained `Windows.Media.SpeechRecognizer` pipeline.
+
+This will allow us to mix and match components and reduce latency.
+
+- Will support cloud or local http/ws providers (including Microsoft Foundry Local, OpenAI Whisper, etc.)
+- Will support an embedded sherpa-onnx engine for user-defined/downloaded models
+- This will enable selection of a best-of-class model for the required use/language
+
+### Story: True streaming TTS playback
+
+Start speaking assistant replies from the first usable audio chunk instead of waiting for a complete playable stream.
+
+Notes:
+
+- the current implementation uses WebSocket transport for MiniMax, but still buffers the entire audio response before playback begins
+- `firstChunk=...ms` in the log is currently provider-chunk arrival time, not actual speech-start time
+- implement a playback path that can consume incremental audio data as it arrives from the provider
+- the provider catalog contract should remain transport-driven and provider-agnostic, so streaming behavior should be expressed through the existing TTS contract model rather than hard-coded for MiniMax
+- preserve the existing queued reply behavior, skip support, and late-reply handling while switching playback to progressive output
+- add timing logs that separate `firstChunk`, `playbackStart`, and `playbackEnd` so latency improvements are measurable
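+
+A minimal sketch of the progressive-playback split, assuming a channel-based producer/consumer and stubbed audio output; only the timing names come from the notes above:
+
+```csharp
+using System;
+using System.Diagnostics;
+using System.Threading.Channels;
+using System.Threading.Tasks;
+
+var chunks = Channel.CreateUnbounded<byte[]>();
+var clock = Stopwatch.StartNew();
+
+// Producer: the provider WebSocket would write decoded audio chunks as they arrive.
+var producer = Task.Run(async () =>
+{
+    foreach (var chunk in new[] { new byte[4096], new byte[4096] }) // stand-in data
+        await chunks.Writer.WriteAsync(chunk);
+    chunks.Writer.Complete();
+});
+
+// Consumer: begin output on the first chunk instead of waiting for the full stream.
+var first = true;
+await foreach (var chunk in chunks.Reader.ReadAllAsync())
+{
+    if (first)
+    {
+        Console.WriteLine($"firstChunk={clock.ElapsedMilliseconds}ms");
+        Console.WriteLine($"playbackStart={clock.ElapsedMilliseconds}ms");
+        first = false;
+    }
+    _ = chunk; // feed the chunk to a progressive audio sink here
+}
+Console.WriteLine($"playbackEnd={clock.ElapsedMilliseconds}ms");
+await producer;
+```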
+
+### Story: True selected-microphone transcription support
+
+Make actual STT transcription follow the selected microphone device, not just the default device.
+
+- depends on `AudioGraph` support
+
+
+### Story: Talk Mode overlay and visible phase parity
+
+Add a Talk Mode overlay that makes `Listening`, `Thinking`, and `Speaking` visible to the user in the same way the macOS experience does. Probably via the current voice mode form. I haven't actually seen the macOS version, so not sure how they do it.
+
+
+### Story: Talk Mode overlay controls
+
+Add explicit Talk Mode overlay controls for stopping speech playback and exiting Talk Mode.
+
+Notes:
+
+- macOS exposes click-to-stop and click-to-exit controls directly on the overlay
+- Windows currently requires tray or settings interaction instead
+- this should plug into the shared runtime control API rather than directly manipulating `VoiceService`
+
+
+### Story: Voice directives in replies
+
+Support the Talk Mode reply-prefix JSON directive described in the OpenClaw docs.
+
+Notes:
+
+- parse only the first non-empty reply line
+- strip the directive before playback
+- support per-reply `once: true` and persistent default updates
+- supported keys should at least include voice, model, and the documented voice-shaping parameters
+- provider-specific validation should happen through the provider contract layer where possible
+
+### Story: Foundry Local STT provider
+
+Implement the AudioGraph-fed streaming STT adapter for Foundry Local.
+
+Notes:
+
+- provider metadata now lives in the provider catalog, but it should stay disabled in settings until the runtime adapter exists
+- this route should use the shared streaming STT path rather than the Windows.Media recognizer path
+- endpoint and model selection should come from the provider catalog settings contract
+
+### Story: OpenAI Whisper STT provider
+
+Implement the AudioGraph-fed streaming STT adapter for OpenAI Whisper transcription.
+
+Notes:
+
+- this should be catalog-driven and disabled in settings until the adapter is production-ready
+- the initial implementation only needs the basic transcription path, not translation or diarization
+- API key and model configuration should come from the provider catalog
+
+### Story: ElevenLabs Speech to Text provider
+
+Implement the AudioGraph-fed streaming STT adapter for ElevenLabs speech-to-text.
+
+Notes:
+
+- keep it catalog-driven and disabled in settings until the runtime path is implemented
+- match the same route abstraction used by the other non-Windows STT providers
+- any provider-specific partial/final transcript semantics should be normalized in the adapter layer
+
+### Story: Azure AI Speech STT provider
+
+Implement the AudioGraph-fed streaming STT adapter for Azure AI Speech.
+
+Notes:
+
+- use the official Azure AI Speech naming in settings and docs rather than an internal "Foundry Azure STT" label
+- keep the provider catalog entry disabled until the adapter is functional end to end
+- endpoint and credential handling should come from the provider settings contract
+
+### Story: sherpa-onnx embedded STT provider
+
+Implement the local embedded sherpa-onnx STT route for user-supplied model bundles.
+
+Notes:
+
+- keep this visible but greyed out in settings until the embedded runtime is implemented
+- the user should be able to choose their own downloaded model bundle and language-appropriate package
+- model lifecycle, validation, and error reporting should be handled in the embedded adapter rather than in the Windows.Media route
+
+
+### Story: Full-duplex / barge-in Talk Mode
+
+Allow the node to keep listening while it is speaking, so the user can interrupt or interleave speech without waiting for reply playback to finish.
+
+Notes:
+
+- the current Windows implementation is half-duplex: recognition is stopped or ignored while a reply is being spoken
+- practical requirements are likely to include:
+ - microphone capture that can remain active during playback
+ - acoustic echo cancellation / echo suppression
+ - barge-in detection and playback interruption rules
+ - a policy for whether interrupt speech cancels the current reply or queues behind it
+ - additional runtime control/status so the UI can show when barge-in is armed
+- this should be treated as a separate engineering phase, not a small extension of the current Talk Mode runtime
+
+### Story: Voice Wake wake-word runtime
+
+Implement the actual Windows Voice Wake runtime.
+
+Notes:
+
+- this should cover wake-word listening, trigger detection, post-trigger capture, silence finalization, hard-stop protection, and debounce between sessions
+- the runtime should restart cleanly after send and should remain armed whenever Voice Wake is enabled and permissions are available
+- the implementation should be based on the planned `AudioGraph` capture pipeline rather than a second unrelated microphone stack
+
+### Story: Voice Wake push-to-talk
+
+Implement a Windows push-to-talk capture path alongside wake-word activation.
+
+Notes:
+
+- this should support press-to-capture, release-to-finalize semantics
+- it should pause the wake runtime while push-to-talk capture is active, then resume it cleanly afterward
+- Windows-specific hotkey and permissions behavior should be documented explicitly once chosen
+
+### Story: Voice Wake settings parity
+
+Add the user-facing Voice Wake settings surface that exists on macOS.
+
+Notes:
+
+- include language and mic pickers
+- include a live level meter
+- include trigger-word editing or table management
+- include a local-only tester that does not forward
+- preserve the chosen mic if it disconnects, surface a disconnected hint, and fall back to the system default until it returns
+
+### Story: Voice Wake sounds and chimes
+
+Add configurable trigger and send sounds for Voice Wake.
+
+Notes:
+
+- trigger and send events should be independently configurable
+- support `No Sound`
+- keep the sound implementation distinct from assistant reply playback
+
+### Story: Voice Wake forwarding semantics
+
+Implement the documented Voice Wake forwarding behavior.
+
+Notes:
+
+- forwarded transcripts should go to the active gateway / agent path
+- reply delivery and logging behavior should match the rest of the node session model
+- the forwarding path should be resilient even when UI surfaces are closed
+
+### Story: Voice Wake machine-hint prefixing
+
+Implement the documented transcript prefixing / machine-hint behavior for forwarded Voice Wake utterances.
+
+Notes:
+
+- the prefixing rule should be explicit and testable
+- both wake-word and push-to-talk paths should share the same forwarding helper
+
+### Story: Voice Wake trigger tuning and pause semantics
+
+Implement the documented Voice Wake trigger-gap, silence-window, hard-stop, and debounce semantics.
+
+Notes:
+
+- include the wake-word gap behavior before command capture begins
+- support distinct silence windows for trigger-only vs flowing speech cases
+- include a hard maximum capture duration
+- expose the tuning through voice settings rather than hard-coded constants alone
+
diff --git a/moltbot-windows-hub.slnx b/moltbot-windows-hub.slnx
index 79eaf12..b83139f 100644
--- a/moltbot-windows-hub.slnx
+++ b/moltbot-windows-hub.slnx
@@ -5,6 +5,7 @@
+
diff --git a/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs b/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs
new file mode 100644
index 0000000..728b8fd
--- /dev/null
+++ b/src/OpenClaw.Shared/Capabilities/VoiceCapability.cs
@@ -0,0 +1,248 @@
+using System;
+using System.Collections.Generic;
+using System.Text.Json;
+using System.Threading.Tasks;
+
+namespace OpenClaw.Shared.Capabilities;
+
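+/// <summary>
+/// Exposes the shared voice.* command surface (device enumeration, settings,
+/// status, and runtime control) to the node/gateway capability layer.
+/// </summary>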
+public class VoiceCapability : NodeCapabilityBase
+{
+ private const string LegacySkipCommand = "voice.skip";
+
+ private static readonly JsonSerializerOptions s_jsonOptions = new()
+ {
+ PropertyNameCaseInsensitive = true
+ };
+
+ public override string Category => "voice";
+
+    public override IReadOnlyList<string> Commands => VoiceCommands.All;
+
+    public event Func<Task<VoiceAudioDeviceInfo[]>>? ListDevicesRequested;
+    public event Func<Task<VoiceSettings>>? SettingsRequested;
+    public event Func<VoiceSettingsUpdateArgs, Task<VoiceSettings>>? SettingsUpdateRequested;
+    public event Func<Task<VoiceStatusInfo>>? StatusRequested;
+    public event Func<VoiceStartArgs, Task<VoiceStatusInfo>>? StartRequested;
+    public event Func<VoiceStopArgs, Task<VoiceStatusInfo>>? StopRequested;
+    public event Func<VoicePauseArgs, Task<VoiceStatusInfo>>? PauseRequested;
+    public event Func<VoiceResumeArgs, Task<VoiceStatusInfo>>? ResumeRequested;
+    public event Func<VoiceSkipArgs, Task<VoiceStatusInfo>>? SkipRequested;
+
+ public VoiceCapability(IOpenClawLogger logger) : base(logger)
+ {
+ }
+
+    public override async Task<NodeInvokeResponse> ExecuteAsync(NodeInvokeRequest request)
+ {
+ return request.Command switch
+ {
+ VoiceCommands.ListDevices => await HandleListDevicesAsync(),
+ VoiceCommands.GetSettings => await HandleGetSettingsAsync(),
+ VoiceCommands.SetSettings => await HandleSetSettingsAsync(request),
+ VoiceCommands.GetStatus => await HandleGetStatusAsync(),
+ VoiceCommands.Start => await HandleStartAsync(request),
+ VoiceCommands.Stop => await HandleStopAsync(request),
+ VoiceCommands.Pause => await HandlePauseAsync(request),
+ VoiceCommands.Resume => await HandleResumeAsync(request),
+ VoiceCommands.Skip or LegacySkipCommand => await HandleSkipAsync(request),
+ _ => Error($"Unknown command: {request.Command}")
+ };
+ }
+
+    private async Task<NodeInvokeResponse> HandleListDevicesAsync()
+ {
+ Logger.Info(VoiceCommands.ListDevices);
+
+ if (ListDevicesRequested == null)
+ return Error("Voice device enumeration not available");
+
+ try
+ {
+ return Success(await ListDevicesRequested());
+ }
+ catch (Exception ex)
+ {
+ Logger.Error("Voice device enumeration failed", ex);
+ return Error($"Device enumeration failed: {ex.Message}");
+ }
+ }
+
+    private async Task<NodeInvokeResponse> HandleGetSettingsAsync()
+ {
+ Logger.Info(VoiceCommands.GetSettings);
+
+ if (SettingsRequested == null)
+ return Error("Voice settings not available");
+
+ try
+ {
+ return Success(await SettingsRequested());
+ }
+ catch (Exception ex)
+ {
+ Logger.Error("Voice settings get failed", ex);
+ return Error($"Get settings failed: {ex.Message}");
+ }
+ }
+
+    private async Task<NodeInvokeResponse> HandleSetSettingsAsync(NodeInvokeRequest request)
+ {
+ Logger.Info(VoiceCommands.SetSettings);
+
+ if (SettingsUpdateRequested == null)
+ return Error("Voice settings update not available");
+
+ try
+ {
+ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null
+ ? "{}"
+ : request.Args.GetRawText();
+ VoiceSettingsUpdateArgs? update = null;
+ if (request.Args.ValueKind == JsonValueKind.Object &&
+ request.Args.TryGetProperty("update", out var updateEl))
+ {
+                update = JsonSerializer.Deserialize<VoiceSettingsUpdateArgs>(updateEl.GetRawText(), s_jsonOptions);
+ }
+
+            update ??= JsonSerializer.Deserialize<VoiceSettingsUpdateArgs>(rawArgs, s_jsonOptions);
+
+ if (update == null)
+ return Error("Missing update payload");
+
+ return Success(await SettingsUpdateRequested(update));
+ }
+ catch (Exception ex)
+ {
+ Logger.Error("Voice settings update failed", ex);
+ return Error($"Set settings failed: {ex.Message}");
+ }
+ }
+
+    private async Task<NodeInvokeResponse> HandleGetStatusAsync()
+ {
+ Logger.Info(VoiceCommands.GetStatus);
+
+ if (StatusRequested == null)
+ return Error("Voice status not available");
+
+ try
+ {
+ return Success(await StatusRequested());
+ }
+ catch (Exception ex)
+ {
+ Logger.Error("Voice status get failed", ex);
+ return Error($"Get status failed: {ex.Message}");
+ }
+ }
+
+    private async Task<NodeInvokeResponse> HandleStartAsync(NodeInvokeRequest request)
+ {
+ Logger.Info(VoiceCommands.Start);
+
+ if (StartRequested == null)
+ return Error("Voice start not available");
+
+ try
+ {
+ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null
+ ? "{}"
+ : request.Args.GetRawText();
+            var args = JsonSerializer.Deserialize<VoiceStartArgs>(rawArgs, s_jsonOptions) ?? new VoiceStartArgs();
+ return Success(await StartRequested(args));
+ }
+ catch (Exception ex)
+ {
+ Logger.Error("Voice start failed", ex);
+ return Error($"Start failed: {ex.Message}");
+ }
+ }
+
+    private async Task<NodeInvokeResponse> HandleStopAsync(NodeInvokeRequest request)
+ {
+ Logger.Info(VoiceCommands.Stop);
+
+ if (StopRequested == null)
+ return Error("Voice stop not available");
+
+ try
+ {
+ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null
+ ? "{}"
+ : request.Args.GetRawText();
+            var args = JsonSerializer.Deserialize<VoiceStopArgs>(rawArgs, s_jsonOptions) ?? new VoiceStopArgs();
+ return Success(await StopRequested(args));
+ }
+ catch (Exception ex)
+ {
+ Logger.Error("Voice stop failed", ex);
+ return Error($"Stop failed: {ex.Message}");
+ }
+ }
+
+    private async Task<NodeInvokeResponse> HandlePauseAsync(NodeInvokeRequest request)
+ {
+ Logger.Info(VoiceCommands.Pause);
+
+ if (PauseRequested == null)
+ return Error("Voice pause not available");
+
+ try
+ {
+ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null
+ ? "{}"
+ : request.Args.GetRawText();
+ var args = JsonSerializer.Deserialize<VoicePauseArgs>(rawArgs, s_jsonOptions) ?? new VoicePauseArgs();
+ return Success(await PauseRequested(args));
+ }
+ catch (Exception ex)
+ {
+ Logger.Error("Voice pause failed", ex);
+ return Error($"Pause failed: {ex.Message}");
+ }
+ }
+
+ private async Task HandleResumeAsync(NodeInvokeRequest request)
+ {
+ Logger.Info(VoiceCommands.Resume);
+
+ if (ResumeRequested == null)
+ return Error("Voice resume not available");
+
+ try
+ {
+ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null
+ ? "{}"
+ : request.Args.GetRawText();
+ var args = JsonSerializer.Deserialize<VoiceResumeArgs>(rawArgs, s_jsonOptions) ?? new VoiceResumeArgs();
+ return Success(await ResumeRequested(args));
+ }
+ catch (Exception ex)
+ {
+ Logger.Error("Voice resume failed", ex);
+ return Error($"Resume failed: {ex.Message}");
+ }
+ }
+
+ private async Task HandleSkipAsync(NodeInvokeRequest request)
+ {
+ Logger.Info(VoiceCommands.Skip);
+
+ if (SkipRequested == null)
+ return Error("Voice skip not available");
+
+ try
+ {
+ var rawArgs = request.Args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null
+ ? "{}"
+ : request.Args.GetRawText();
+ var args = JsonSerializer.Deserialize<VoiceSkipArgs>(rawArgs, s_jsonOptions) ?? new VoiceSkipArgs();
+ return Success(await SkipRequested(args));
+ }
+ catch (Exception ex)
+ {
+ Logger.Error("Voice skip failed", ex);
+ return Error($"Skip failed: {ex.Message}");
+ }
+ }
+}
diff --git a/src/OpenClaw.Shared/Models.cs b/src/OpenClaw.Shared/Models.cs
index 15b1afb..a6feecc 100644
--- a/src/OpenClaw.Shared/Models.cs
+++ b/src/OpenClaw.Shared/Models.cs
@@ -88,6 +88,14 @@ public class OpenClawNotification
public string[]? Tags { get; set; } // free-form routing tags
}
+public class ChatMessageEventArgs : EventArgs
+{
+ public string SessionKey { get; set; } = "main";
+ public string Role { get; set; } = "";
+ public string Message { get; set; } = "";
+ public bool IsFinal { get; set; }
+}
+
/// <summary>
/// A user-defined notification categorization rule.
/// </summary>
diff --git a/src/OpenClaw.Shared/OpenClawGatewayClient.cs b/src/OpenClaw.Shared/OpenClawGatewayClient.cs
index 05189f5..2fec9e8 100644
--- a/src/OpenClaw.Shared/OpenClawGatewayClient.cs
+++ b/src/OpenClaw.Shared/OpenClawGatewayClient.cs
@@ -41,8 +41,11 @@ private enum SignatureTokenMode
private GatewayUsageStatusInfo? _usageStatus;
private GatewayCostUsageInfo? _usageCost;
private readonly Dictionary _pendingRequestMethods = new();
+ private readonly Dictionary<string, PendingChatPreviewState> _pendingChatPreviewSessionKeys = new();
+ private readonly Dictionary<string, string> _lastAssistantMessagesBySession = new();
private readonly Dictionary> _pendingChatSendRequests = new();
private readonly object _pendingRequestLock = new();
+ private readonly object _pendingChatPreviewLock = new();
private readonly object _pendingChatSendLock = new();
private readonly object _sessionsLock = new();
private readonly object _nodesLock = new();
@@ -58,11 +61,19 @@ private enum SignatureTokenMode
private bool _usageCostUnsupported;
private bool _sessionPreviewUnsupported;
private bool _nodeListUnsupported;
+ private string _defaultChatSessionKey = DefaultChatSessionKey;
private bool _operatorReadScopeUnavailable;
private bool _pairingRequiredAwaitingApproval;
private IReadOnlyList? _userRules;
private bool _preferStructuredCategories = true;
+ private const string DefaultChatSessionKey = "main";
+ private sealed class PendingChatPreviewState
+ {
+ public string? LastKnownAssistantText { get; init; }
+ public int AttemptCount { get; set; }
+ }
+
/// <summary>
/// Controls whether structured notification metadata (Intent, Channel) takes priority
/// over keyword-based classification. Call after construction and whenever settings change.
@@ -111,15 +122,18 @@ protected override bool ShouldAutoReconnect()
protected override void OnDisconnected()
{
ClearPendingRequests();
+ ClearPendingChatPreviewSessions();
}
protected override void OnDisposing()
{
ClearPendingRequests();
+ ClearPendingChatPreviewSessions();
}
// Events
public event EventHandler? NotificationReceived;
+ public event EventHandler<ChatMessageEventArgs>? ChatMessageReceived;
public event EventHandler? ActivityChanged;
public event EventHandler? ChannelHealthUpdated;
public event EventHandler? SessionsUpdated;
@@ -191,35 +205,32 @@ public async Task CheckHealthAsync()
}
}
- public async Task SendChatMessageAsync(string message, string? sessionKey = null)
+ public async Task SendChatMessageAsync(string message, string? sessionKey = null, string? idempotencyKey = null)
{
if (!IsConnected)
throw new InvalidOperationException("Gateway connection is not open");
if (string.IsNullOrWhiteSpace(message))
throw new ArgumentException("Message is required", nameof(message));
- var effectiveSessionKey = string.IsNullOrWhiteSpace(sessionKey)
- ? _mainSessionKey
- : sessionKey.Trim();
-
var requestId = Guid.NewGuid().ToString();
var completion = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
TrackPendingChatSend(requestId, completion);
+ var resolvedSessionKey = ResolveChatSessionKey(sessionKey);
+ var resolvedIdempotencyKey = string.IsNullOrWhiteSpace(idempotencyKey)
+ ? Guid.NewGuid().ToString()
+ : idempotencyKey;
+ var parameters = BuildChatSendParameters(message, resolvedSessionKey, resolvedIdempotencyKey);
- var req = new
+ TrackPendingRequest(requestId, "chat.send");
+ try
{
- type = "req",
- id = requestId,
- method = "chat.send",
- @params = new
- {
- sessionKey = effectiveSessionKey,
- message,
- idempotencyKey = Guid.NewGuid().ToString()
- }
- };
-
- await SendRawAsync(JsonSerializer.Serialize(req));
+ await SendRawAsync(SerializeRequest(requestId, "chat.send", parameters));
+ }
+ catch
+ {
+ RemovePendingRequest(requestId);
+ throw;
+ }
var completedTask = await Task.WhenAny(completion.Task, Task.Delay(5000, CancellationToken));
if (completedTask != completion.Task)
@@ -459,6 +470,31 @@ private async Task SendConnectMessageAsync(string? nonce = null)
}
}
+ private object BuildConnectParameters()
+ {
+ return new
+ {
+ minProtocol = 3,
+ maxProtocol = 3,
+ client = new
+ {
+ id = "cli",
+ version = "1.0.0",
+ platform = "windows",
+ mode = "cli",
+ displayName = "OpenClaw Windows Tray"
+ },
+ role = "operator",
+ scopes = new[] { "operator.read", "operator.write", "operator.admin", "operator.approvals", "operator.pairing" },
+ caps = Array.Empty<string>(),
+ commands = Array.Empty<string>(),
+ permissions = new { },
+ auth = new { token = _token },
+ locale = "en-US",
+ userAgent = "openclaw-windows-tray/1.0.0"
+ };
+ }
+
private async Task SendTrackedRequestAsync(string method, object? parameters = null)
{
if (!IsConnected) return;
@@ -666,6 +702,7 @@ private void HandleResponse(JsonElement root)
// Handle handshake acknowledgement payload.
if (payload.TryGetProperty("type", out var t) && t.GetString() == "hello-ok")
{
+ UpdateDefaultChatSessionKeyFromHello(payload);
_pairingRequiredAwaitingApproval = false;
_operatorDeviceId = TryGetHandshakeDeviceId(payload);
_grantedOperatorScopes = TryGetHandshakeScopes(payload);
@@ -677,7 +714,6 @@ private void HandleResponse(JsonElement root)
_connectAuthToken = newDeviceToken;
_logger.Info("Operator device token stored for reconnect");
}
-
_logger.Info("Handshake complete (hello-ok)");
if (!string.IsNullOrWhiteSpace(_operatorDeviceId))
{
@@ -1257,13 +1293,17 @@ private void HandleChatEvent(JsonElement root)
{
var rawText = root.GetRawText();
_logger.Debug($"Chat event received: {rawText.Substring(0, Math.Min(200, rawText.Length))}");
-
if (!root.TryGetProperty("payload", out var payload)) return;
+ var sessionKey = NormalizeChatSessionKey(TryGetSessionKey(root, payload));
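+ // Treat a payload without an explicit "state" property as final so older event shapes still emit replies.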
+ var isFinal = !payload.TryGetProperty("state", out var state) ||
+ string.Equals(state.GetString(), "final", StringComparison.OrdinalIgnoreCase);
+ var emittedAssistantText = false;
// Try new format: payload.message.role + payload.message.content[].text
if (payload.TryGetProperty("message", out var message))
{
- if (message.TryGetProperty("role", out var role) && role.GetString() == "assistant")
+ var roleName = GetString(message, "role");
+ if (roleName == "assistant")
{
// Extract text from content array
if (message.TryGetProperty("content", out var content) && content.ValueKind == JsonValueKind.Array)
@@ -1274,11 +1314,11 @@ private void HandleChatEvent(JsonElement root)
item.TryGetProperty("text", out var textProp))
{
var text = textProp.GetString() ?? "";
- if (!string.IsNullOrEmpty(text) &&
- payload.TryGetProperty("state", out var state) &&
- state.GetString() == "final")
+ if (!string.IsNullOrEmpty(text) && isFinal)
{
+ emittedAssistantText = true;
_logger.Info($"Assistant response: {text.Substring(0, Math.Min(100, text.Length))}...");
+ EmitChatMessage(sessionKey, roleName ?? "assistant", text, isFinal);
EmitChatNotification(text);
}
}
@@ -1291,14 +1331,40 @@ private void HandleChatEvent(JsonElement root)
else if (payload.TryGetProperty("text", out var textProp))
{
var text = textProp.GetString() ?? "";
- if (payload.TryGetProperty("role", out var role) &&
- role.GetString() == "assistant" &&
+ var roleName = GetString(payload, "role");
+ if (roleName == "assistant" &&
!string.IsNullOrEmpty(text))
{
+ emittedAssistantText = true;
_logger.Info($"Assistant response (legacy): {text.Substring(0, Math.Min(100, text.Length))}");
+ EmitChatMessage(sessionKey, roleName, text, isFinal: true);
EmitChatNotification(text);
}
}
+
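+ // A final event that carried no inline assistant text usually means the reply
+ // only surfaces via sessions.preview, so request the preview as a fallback.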
+ if (isFinal && !emittedAssistantText)
+ {
+ RequestChatPreviewForFinalState(sessionKey);
+ }
+ }
+
+ private void EmitChatMessage(string sessionKey, string role, string text, bool isFinal)
+ {
+ if (isFinal && string.Equals(role, "assistant", StringComparison.OrdinalIgnoreCase))
+ {
+ lock (_pendingChatPreviewLock)
+ {
+ _lastAssistantMessagesBySession[NormalizeChatSessionKey(sessionKey)] = text;
+ }
+ }
+
+ ChatMessageReceived?.Invoke(this, new ChatMessageEventArgs
+ {
+ SessionKey = sessionKey,
+ Role = role,
+ Message = text,
+ IsFinal = isFinal
+ });
}
private void EmitChatNotification(string text)
@@ -1512,6 +1578,7 @@ private void ParseSessions(JsonElement sessions)
}
snapshot = GetSessionListInternal();
+ UpdateDefaultChatSessionKeyFromSessions();
}
SessionsUpdated?.Invoke(this, snapshot);
@@ -1540,6 +1607,205 @@ private void ParseSessionItem(JsonElement item)
PopulateSessionFromObject(session, item);
_sessions[session.Key] = session;
+ if (session.IsMain)
+ {
+ UpdateDefaultChatSessionKey(session.Key);
+ }
+ }
+
+ private object BuildChatSendParameters(string message, string sessionKey, string idempotencyKey)
+ {
+ return new
+ {
+ message,
+ sessionKey,
+ idempotencyKey
+ };
+ }
+
+ private string ResolveChatSessionKey(string? sessionKey)
+ {
+ if (!string.IsNullOrWhiteSpace(sessionKey))
+ {
+ return NormalizeChatSessionKey(sessionKey);
+ }
+
+ return string.IsNullOrWhiteSpace(_defaultChatSessionKey)
+ ? DefaultChatSessionKey
+ : _defaultChatSessionKey;
+ }
+
+ private void UpdateDefaultChatSessionKeyFromHello(JsonElement payload)
+ {
+ if (!payload.TryGetProperty("snapshot", out var snapshot) ||
+ snapshot.ValueKind != JsonValueKind.Object ||
+ !snapshot.TryGetProperty("sessionDefaults", out var sessionDefaults) ||
+ sessionDefaults.ValueKind != JsonValueKind.Object)
+ {
+ return;
+ }
+
+ var mainSessionKey = GetString(sessionDefaults, "mainKey") ??
+ GetString(sessionDefaults, "mainSessionKey");
+ if (!string.IsNullOrWhiteSpace(mainSessionKey))
+ {
+ UpdateDefaultChatSessionKey(mainSessionKey);
+ }
+ }
+
+ private void UpdateDefaultChatSessionKeyFromSessions()
+ {
+ var mainSession = _sessions.Values.FirstOrDefault(s => s.IsMain && !string.IsNullOrWhiteSpace(s.Key));
+ if (!string.IsNullOrWhiteSpace(mainSession?.Key))
+ {
+ UpdateDefaultChatSessionKey(mainSession.Key);
+ }
+ }
+
+ private void UpdateDefaultChatSessionKey(string? sessionKey)
+ {
+ if (!string.IsNullOrWhiteSpace(sessionKey))
+ {
+ _defaultChatSessionKey = NormalizeChatSessionKey(sessionKey);
+ }
+ }
+
+ private void RequestChatPreviewForFinalState(string sessionKey)
+ {
+ if (string.IsNullOrWhiteSpace(sessionKey) || _sessionPreviewUnsupported)
+ {
+ return;
+ }
+
+ var normalizedSessionKey = NormalizeChatSessionKey(sessionKey);
+ string? lastKnownAssistantText;
+ lock (_pendingChatPreviewLock)
+ {
+ if (_pendingChatPreviewSessionKeys.ContainsKey(normalizedSessionKey))
+ {
+ return;
+ }
+
+ _lastAssistantMessagesBySession.TryGetValue(normalizedSessionKey, out lastKnownAssistantText);
+ _pendingChatPreviewSessionKeys[normalizedSessionKey] = new PendingChatPreviewState
+ {
+ LastKnownAssistantText = lastKnownAssistantText,
+ AttemptCount = 0
+ };
+ }
+
+ RequestChatPreviewForFinalStateAsync(normalizedSessionKey, delayMs: 0);
+ }
+
+ private void RequestChatPreviewForFinalStateAsync(string normalizedSessionKey, int delayMs)
+ {
+ _ = Task.Run(async () =>
+ {
+ try
+ {
+ if (delayMs > 0)
+ {
+ await Task.Delay(delayMs);
+ }
+
+ await RequestSessionPreviewAsync([normalizedSessionKey], limit: 2, maxChars: 4000);
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"sessions.preview request failed for {normalizedSessionKey}: {ex.Message}");
+ lock (_pendingChatPreviewLock)
+ {
+ _pendingChatPreviewSessionKeys.Remove(normalizedSessionKey);
+ }
+ }
+ });
+ }
+
+ private void EmitPendingChatPreviewMessages(SessionsPreviewPayloadInfo payload)
+ {
+ foreach (var preview in payload.Previews)
+ {
+ var normalizedSessionKey = NormalizeChatSessionKey(preview.Key);
+ PendingChatPreviewState? pendingState = null;
+
+ lock (_pendingChatPreviewLock)
+ {
+ _pendingChatPreviewSessionKeys.TryGetValue(normalizedSessionKey, out pendingState);
+ }
+
+ if (pendingState == null)
+ {
+ continue;
+ }
+
+ var assistantText = preview.Items
+ .LastOrDefault(item => string.Equals(item.Role, "assistant", StringComparison.OrdinalIgnoreCase))?
+ .Text?
+ .Trim();
+
+ if (string.IsNullOrWhiteSpace(assistantText))
+ {
+ continue;
+ }
+
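+ // sessions.preview can lag the event stream and still return the previous
+ // assistant reply; retry up to three times with increasing delay before accepting it.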
+ if (string.Equals(assistantText, pendingState.LastKnownAssistantText, StringComparison.Ordinal))
+ {
+ if (pendingState.AttemptCount < 3)
+ {
+ pendingState.AttemptCount++;
+ _logger.Warn(
+ $"sessions.preview returned the previous assistant reply for {normalizedSessionKey}; retrying preview ({pendingState.AttemptCount}/3)");
+ RequestChatPreviewForFinalStateAsync(normalizedSessionKey, delayMs: 400 * pendingState.AttemptCount);
+ continue;
+ }
+ }
+
+ lock (_pendingChatPreviewLock)
+ {
+ _pendingChatPreviewSessionKeys.Remove(normalizedSessionKey);
+ }
+
+ _logger.Info($"Assistant response (preview): {assistantText.Substring(0, Math.Min(100, assistantText.Length))}...");
+ EmitChatMessage(normalizedSessionKey, "assistant", assistantText, isFinal: true);
+ EmitChatNotification(assistantText);
+ }
+ }
+
+ private void ClearPendingChatPreviewSessions()
+ {
+ lock (_pendingChatPreviewLock)
+ {
+ _pendingChatPreviewSessionKeys.Clear();
+ _lastAssistantMessagesBySession.Clear();
+ }
+ }
+
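+ // Collapse the literal "main" key and scoped keys containing ":main:" onto the single default session key.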
+ private static string NormalizeChatSessionKey(string? sessionKey)
+ {
+ if (string.IsNullOrWhiteSpace(sessionKey))
+ {
+ return DefaultChatSessionKey;
+ }
+
+ return sessionKey == "main" || sessionKey.Contains(":main:", StringComparison.Ordinal)
+ ? DefaultChatSessionKey
+ : sessionKey;
+ }
+
+ private static string? TryGetSessionKey(JsonElement root, JsonElement payload)
+ {
+ if (root.TryGetProperty("sessionKey", out var rootSessionKey))
+ {
+ return rootSessionKey.GetString();
+ }
+
+ if (payload.ValueKind == JsonValueKind.Object &&
+ payload.TryGetProperty("sessionKey", out var payloadSessionKey))
+ {
+ return payloadSessionKey.GetString();
+ }
+
+ return null;
}
private void PopulateSessionFromObject(SessionInfo session, JsonElement item)
@@ -1853,6 +2119,7 @@ private void ParseSessionsPreview(JsonElement payload)
}
SessionPreviewUpdated?.Invoke(this, previewPayload);
+ EmitPendingChatPreviewMessages(previewPayload);
}
catch (Exception ex)
{
diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs
index 7dee87f..60e2939 100644
--- a/src/OpenClaw.Shared/SettingsData.cs
+++ b/src/OpenClaw.Shared/SettingsData.cs
@@ -1,3 +1,5 @@
+using System;
+using System.Text.Json.Serialization;
using System.Text.Json;
namespace OpenClaw.Shared;
@@ -32,6 +34,11 @@ public class SettingsData
public bool NotifyChatResponses { get; set; } = true;
public bool PreferStructuredCategories { get; set; } = true;
public List? UserRules { get; set; }
+ public VoiceSettings Voice { get; set; } = new();
+ public VoiceRepeaterWindowSettings VoiceRepeaterWindow { get; set; } = new();
+ public VoiceProviderConfigurationStore VoiceProviderConfiguration { get; set; } = new();
+ [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
+ public VoiceProviderCredentials? VoiceProviderCredentials { get; set; }
private static readonly JsonSerializerOptions s_options = new() { WriteIndented = true };
@@ -43,11 +50,39 @@ public class SettingsData
return null;
try
{
- return JsonSerializer.Deserialize<SettingsData>(json);
+ return JsonSerializer.Deserialize<SettingsData>(MigrateLegacyVoiceJson(json));
}
catch (JsonException)
{
return null;
}
}
+
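+ // Best-effort textual rename of legacy voice setting keys and enum values so
+ // settings files saved by older builds still deserialize.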
+ private static string MigrateLegacyVoiceJson(string json)
+ {
+ return json
+ .Replace("\"WakeWord\":", "\"VoiceWake\":", StringComparison.Ordinal)
+ .Replace("\"AlwaysOn\":", "\"TalkMode\":", StringComparison.Ordinal)
+ .Replace("\"WakeWordModelId\":", "\"VoiceWakeModelId\":", StringComparison.Ordinal)
+ .Replace("\"WakeWordLoaded\":", "\"VoiceWakeLoaded\":", StringComparison.Ordinal)
+ .Replace("\"LastWakeWordUtc\":", "\"LastVoiceWakeUtc\":", StringComparison.Ordinal)
+ .Replace("\"Mode\":\"WakeWord\"", "\"Mode\":\"VoiceWake\"", StringComparison.Ordinal)
+ .Replace("\"Mode\": \"WakeWord\"", "\"Mode\": \"VoiceWake\"", StringComparison.Ordinal)
+ .Replace("\"Mode\":\"AlwaysOn\"", "\"Mode\":\"TalkMode\"", StringComparison.Ordinal)
+ .Replace("\"Mode\": \"AlwaysOn\"", "\"Mode\": \"TalkMode\"", StringComparison.Ordinal)
+ .Replace("\"State\":\"ListeningForWakeWord\"", "\"State\":\"ListeningForVoiceWake\"", StringComparison.Ordinal)
+ .Replace("\"State\": \"ListeningForWakeWord\"", "\"State\": \"ListeningForVoiceWake\"", StringComparison.Ordinal);
+ }
+}
+
+public sealed class VoiceRepeaterWindowSettings
+{
+ public bool AutoScroll { get; set; } = true;
+ public bool FloatingEnabled { get; set; } = true;
+ public bool HasSavedPlacement { get; set; }
+ public double TextSize { get; set; } = 13;
+ public int? Width { get; set; }
+ public int? Height { get; set; }
+ public int? X { get; set; }
+ public int? Y { get; set; }
}
diff --git a/src/OpenClaw.Shared/VoiceModeSchema.cs b/src/OpenClaw.Shared/VoiceModeSchema.cs
new file mode 100644
index 0000000..e47af8c
--- /dev/null
+++ b/src/OpenClaw.Shared/VoiceModeSchema.cs
@@ -0,0 +1,354 @@
+using System;
+using System.Collections.ObjectModel;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+
+namespace OpenClaw.Shared;
+
+public static class VoiceCommands
+{
+ public const string ListDevices = "voice.devices.list";
+ public const string GetSettings = "voice.settings.get";
+ public const string SetSettings = "voice.settings.set";
+ public const string GetStatus = "voice.status.get";
+ public const string Start = "voice.start";
+ public const string Stop = "voice.stop";
+ public const string Pause = "voice.pause";
+ public const string Resume = "voice.resume";
+ public const string Skip = "voice.response.skip";
+
+ private static readonly ReadOnlyCollection<string> s_all = Array.AsReadOnly<string>(
+ [
+ ListDevices,
+ GetSettings,
+ SetSettings,
+ GetStatus,
+ Start,
+ Stop,
+ Pause,
+ Resume,
+ Skip
+ ]);
+
+ public static IReadOnlyList<string> All => s_all;
+}
+
+[JsonConverter(typeof(VoiceActivationModeJsonConverter))]
+public enum VoiceActivationMode
+{
+ Off,
+ VoiceWake,
+ TalkMode
+}
+
+[JsonConverter(typeof(VoiceRuntimeStateJsonConverter))]
+public enum VoiceRuntimeState
+{
+ Stopped,
+ Paused,
+ Idle,
+ Arming,
+ ListeningForVoiceWake,
+ ListeningContinuously,
+ RecordingUtterance,
+ SubmittingAudio,
+ AwaitingResponse,
+ PlayingResponse,
+ Error
+}
+
+public sealed class VoiceSettings
+{
+ public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off;
+ public bool Enabled { get; set; }
+ public bool ShowRepeaterAtStartup { get; set; } = true;
+ public bool ShowConversationToasts { get; set; }
+ public string SpeechToTextProviderId { get; set; } = VoiceProviderIds.Windows;
+ public string TextToSpeechProviderId { get; set; } = VoiceProviderIds.Windows;
+ public string? InputDeviceId { get; set; }
+ public string? OutputDeviceId { get; set; }
+ public int SampleRateHz { get; set; } = 16000;
+ public int CaptureChunkMs { get; set; } = 80;
+ public bool BargeInEnabled { get; set; } = true;
+ public VoiceWakeSettings VoiceWake { get; set; } = new();
+ public TalkModeSettings TalkMode { get; set; } = new();
+}
+
+public sealed class VoiceWakeSettings
+{
+ public string Engine { get; set; } = "NanoWakeWord";
+ public string ModelId { get; set; } = "hey_openclaw";
+ public float TriggerThreshold { get; set; } = 0.65f;
+ public int TriggerCooldownMs { get; set; } = 2000;
+ public int PreRollMs { get; set; } = 1200;
+ public int EndSilenceMs { get; set; } = 900;
+}
+
+public sealed class TalkModeSettings
+{
+ public int MinSpeechMs { get; set; } = 250;
+ public int EndSilenceMs { get; set; } = 900;
+ public int MaxUtteranceMs { get; set; } = 15000;
+}
+
+public sealed class VoiceAudioDeviceInfo
+{
+ public string DeviceId { get; set; } = "";
+ public string Name { get; set; } = "";
+ public bool IsDefault { get; set; }
+ public bool IsInput { get; set; }
+ public bool IsOutput { get; set; }
+}
+
+public sealed class VoiceStatusInfo
+{
+ public bool Available { get; set; }
+ public bool Running { get; set; }
+ public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off;
+ public VoiceRuntimeState State { get; set; } = VoiceRuntimeState.Stopped;
+ public string? SessionKey { get; set; }
+ public string? InputDeviceId { get; set; }
+ public string? OutputDeviceId { get; set; }
+ public string? VoiceWakeModelId { get; set; }
+ public bool VoiceWakeLoaded { get; set; }
+ public DateTime? LastVoiceWakeUtc { get; set; }
+ public DateTime? LastUtteranceUtc { get; set; }
+ public int PendingReplyCount { get; set; }
+ public bool CanSkipReply { get; set; }
+ public string? CurrentReplyPreview { get; set; }
+ public string? LastError { get; set; }
+}
+
+public sealed class VoiceStartArgs
+{
+ public VoiceActivationMode? Mode { get; set; }
+ public string? SessionKey { get; set; }
+}
+
+public sealed class VoiceStopArgs
+{
+ public string? Reason { get; set; }
+}
+
+public sealed class VoicePauseArgs
+{
+ public string? Reason { get; set; }
+}
+
+public sealed class VoiceResumeArgs
+{
+ public string? Reason { get; set; }
+}
+
+public sealed class VoiceSkipArgs
+{
+ public string? Reason { get; set; }
+}
+
+public sealed class VoiceSettingsUpdateArgs
+{
+ public VoiceSettings Settings { get; set; } = new();
+ public bool Persist { get; set; } = true;
+}
+
+public static class VoiceProviderIds
+{
+ public const string Windows = "windows";
+ public const string HttpWs = "http-ws";
+ public const string FoundryLocal = "foundry-local";
+ public const string OpenAiWhisper = "openai-whisper";
+ public const string ElevenLabsSpeechToText = "elevenlabs-stt";
+ public const string AzureAiSpeech = "azure-ai-speech";
+ public const string SherpaOnnx = "sherpa-onnx";
+ public const string MiniMax = "minimax";
+ public const string ElevenLabs = "elevenlabs";
+}
+
+public static class VoiceProviderRuntimeIds
+{
+ public const string Windows = "windows";
+ public const string Streaming = "streaming";
+ public const string Embedded = "embedded";
+ public const string Cloud = "cloud";
+}
+
+public static class VoiceProviderSettingKeys
+{
+ public const string ApiKey = "apiKey";
+ public const string Endpoint = "endpoint";
+ public const string Model = "model";
+ public const string ModelPath = "modelPath";
+ public const string VoiceId = "voiceId";
+ public const string VoiceSettingsJson = "voiceSettingsJson";
+}
+
+public static class VoiceTextToSpeechResponseModes
+{
+ public const string Binary = "binary";
+ public const string HexJsonString = "hexJsonString";
+ public const string Base64JsonString = "base64JsonString";
+}
+
+public sealed class VoiceProviderCredentials
+{
+ public string? MiniMaxApiKey { get; set; }
+ public string MiniMaxModel { get; set; } = "speech-2.8-turbo";
+ public string MiniMaxVoiceId { get; set; } = "English_MatureBoss";
+ public string? ElevenLabsApiKey { get; set; }
+ public string? ElevenLabsModel { get; set; }
+ public string? ElevenLabsVoiceId { get; set; }
+}
+
+public sealed class VoiceProviderConfigurationStore
+{
+ public List<VoiceProviderConfiguration> Providers { get; set; } = [];
+}
+
+public sealed class VoiceProviderConfiguration
+{
+ public string ProviderId { get; set; } = "";
+ public Dictionary<string, string> Values { get; set; } = [];
+}
+
+public sealed class VoiceProviderSettingDefinition
+{
+ public string Key { get; set; } = "";
+ public string Label { get; set; } = "";
+ public bool Secret { get; set; }
+ public bool Required { get; set; } = true;
+ public bool JsonValue { get; set; }
+ public string? DefaultValue { get; set; }
+ public string? Placeholder { get; set; }
+ public string? Description { get; set; }
+ public List Options { get; set; } = [];
+}
+
+public sealed class VoiceTextToSpeechHttpContract
+{
+ public string EndpointTemplate { get; set; } = "";
+ public string HttpMethod { get; set; } = "POST";
+ public string AuthenticationHeaderName { get; set; } = "Authorization";
+ public string? AuthenticationScheme { get; set; } = "Bearer";
+ public string ApiKeySettingKey { get; set; } = VoiceProviderSettingKeys.ApiKey;
+ public string RequestContentType { get; set; } = "application/json";
+ public string RequestBodyTemplate { get; set; } = "";
+ public string ResponseAudioMode { get; set; } = VoiceTextToSpeechResponseModes.Binary;
+ public string? ResponseAudioJsonPath { get; set; }
+ public string? ResponseStatusCodeJsonPath { get; set; }
+ public string? ResponseStatusMessageJsonPath { get; set; }
+ public string? SuccessStatusValue { get; set; }
+ public string OutputContentType { get; set; } = "audio/mpeg";
+}
+
+public sealed class VoiceTextToSpeechWebSocketContract
+{
+ public string EndpointTemplate { get; set; } = "";
+ public string AuthenticationHeaderName { get; set; } = "Authorization";
+ public string? AuthenticationScheme { get; set; } = "Bearer";
+ public string ApiKeySettingKey { get; set; } = VoiceProviderSettingKeys.ApiKey;
+ public string ConnectSuccessEventName { get; set; } = "connected_success";
+ public string StartMessageTemplate { get; set; } = "";
+ public string StartSuccessEventName { get; set; } = "task_started";
+ public string ContinueMessageTemplate { get; set; } = "";
+ public string FinishMessageTemplate { get; set; } = "{ \"event\": \"task_finish\" }";
+ public string ResponseAudioMode { get; set; } = VoiceTextToSpeechResponseModes.Binary;
+ public string? ResponseAudioJsonPath { get; set; } = "data.audio";
+ public string? ResponseStatusCodeJsonPath { get; set; } = "base_resp.status_code";
+ public string? ResponseStatusMessageJsonPath { get; set; } = "base_resp.status_msg";
+ public string? FinalFlagJsonPath { get; set; } = "is_final";
+ public string TaskFailedEventName { get; set; } = "task_failed";
+ public string? SuccessStatusValue { get; set; } = "0";
+ public string OutputContentType { get; set; } = "audio/mpeg";
+}
+
+public sealed class VoiceProviderOption
+{
+ public string Id { get; set; } = "";
+ public string Name { get; set; } = "";
+ public string Runtime { get; set; } = VoiceProviderRuntimeIds.Windows;
+ public bool Enabled { get; set; } = true;
+ public bool VisibleInSettings { get; set; } = true;
+ public bool Selectable { get; set; } = true;
+ public string? Description { get; set; }
+ public List<VoiceProviderSettingDefinition> Settings { get; set; } = [];
+ public VoiceTextToSpeechHttpContract? TextToSpeechHttp { get; set; }
+ public VoiceTextToSpeechWebSocketContract? TextToSpeechWebSocket { get; set; }
+
+ [JsonIgnore]
+ public string DisplayName => Selectable ? Name : $"{Name} (coming soon)";
+
+ [JsonIgnore]
+ public double DisplayOpacity => Selectable ? 1.0 : 0.55;
+}
+
+public sealed class VoiceProviderCatalog
+{
+ public List<VoiceProviderOption> SpeechToTextProviders { get; set; } = [];
+ public List<VoiceProviderOption> TextToSpeechProviders { get; set; } = [];
+}
+
+public sealed class VoiceActivationModeJsonConverter : JsonConverter<VoiceActivationMode>
+{
+ public override VoiceActivationMode Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
+ {
+ var value = reader.GetString();
+ return value switch
+ {
+ "VoiceWake" or "WakeWord" => VoiceActivationMode.VoiceWake,
+ "TalkMode" or "AlwaysOn" => VoiceActivationMode.TalkMode,
+ _ => VoiceActivationMode.Off
+ };
+ }
+
+ public override void Write(Utf8JsonWriter writer, VoiceActivationMode value, JsonSerializerOptions options)
+ {
+ writer.WriteStringValue(value switch
+ {
+ VoiceActivationMode.VoiceWake => "VoiceWake",
+ VoiceActivationMode.TalkMode => "TalkMode",
+ _ => "Off"
+ });
+ }
+}
+
+public sealed class VoiceRuntimeStateJsonConverter : JsonConverter<VoiceRuntimeState>
+{
+ public override VoiceRuntimeState Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
+ {
+ var value = reader.GetString();
+ return value switch
+ {
+ "ListeningForVoiceWake" or "ListeningForWakeWord" => VoiceRuntimeState.ListeningForVoiceWake,
+ "Stopped" => VoiceRuntimeState.Stopped,
+ "Paused" => VoiceRuntimeState.Paused,
+ "Idle" => VoiceRuntimeState.Idle,
+ "Arming" => VoiceRuntimeState.Arming,
+ "ListeningContinuously" => VoiceRuntimeState.ListeningContinuously,
+ "RecordingUtterance" => VoiceRuntimeState.RecordingUtterance,
+ "SubmittingAudio" => VoiceRuntimeState.SubmittingAudio,
+ "AwaitingResponse" => VoiceRuntimeState.AwaitingResponse,
+ "PlayingResponse" => VoiceRuntimeState.PlayingResponse,
+ "Error" => VoiceRuntimeState.Error,
+ _ => VoiceRuntimeState.Stopped
+ };
+ }
+
+ public override void Write(Utf8JsonWriter writer, VoiceRuntimeState value, JsonSerializerOptions options)
+ {
+ writer.WriteStringValue(value switch
+ {
+ VoiceRuntimeState.ListeningForVoiceWake => "ListeningForVoiceWake",
+ VoiceRuntimeState.Stopped => "Stopped",
+ VoiceRuntimeState.Paused => "Paused",
+ VoiceRuntimeState.Idle => "Idle",
+ VoiceRuntimeState.Arming => "Arming",
+ VoiceRuntimeState.ListeningContinuously => "ListeningContinuously",
+ VoiceRuntimeState.RecordingUtterance => "RecordingUtterance",
+ VoiceRuntimeState.SubmittingAudio => "SubmittingAudio",
+ VoiceRuntimeState.AwaitingResponse => "AwaitingResponse",
+ VoiceRuntimeState.PlayingResponse => "PlayingResponse",
+ VoiceRuntimeState.Error => "Error",
+ _ => "Stopped"
+ });
+ }
+}
diff --git a/src/OpenClaw.Shared/VoiceProviderConfigurationStoreExtensions.cs b/src/OpenClaw.Shared/VoiceProviderConfigurationStoreExtensions.cs
new file mode 100644
index 0000000..b1dfa41
--- /dev/null
+++ b/src/OpenClaw.Shared/VoiceProviderConfigurationStoreExtensions.cs
@@ -0,0 +1,161 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace OpenClaw.Shared;
+
+public static class VoiceProviderConfigurationStoreExtensions
+{
+ public static VoiceProviderConfiguration GetOrAddProvider(
+ this VoiceProviderConfigurationStore store,
+ string providerId)
+ {
+ ArgumentNullException.ThrowIfNull(store);
+
+ var existing = store.Providers.FirstOrDefault(p =>
+ string.Equals(p.ProviderId, providerId, StringComparison.OrdinalIgnoreCase));
+ if (existing != null)
+ {
+ return existing;
+ }
+
+ var created = new VoiceProviderConfiguration
+ {
+ ProviderId = providerId
+ };
+ store.Providers.Add(created);
+ return created;
+ }
+
+ public static VoiceProviderConfiguration? FindProvider(
+ this VoiceProviderConfigurationStore store,
+ string? providerId)
+ {
+ ArgumentNullException.ThrowIfNull(store);
+
+ if (string.IsNullOrWhiteSpace(providerId))
+ {
+ return null;
+ }
+
+ return store.Providers.FirstOrDefault(p =>
+ string.Equals(p.ProviderId, providerId, StringComparison.OrdinalIgnoreCase));
+ }
+
+ public static string? GetValue(
+ this VoiceProviderConfigurationStore store,
+ string? providerId,
+ string settingKey)
+ {
+ return store.FindProvider(providerId)?.GetValue(settingKey);
+ }
+
+ public static string? GetValue(this VoiceProviderConfiguration configuration, string settingKey)
+ {
+ ArgumentNullException.ThrowIfNull(configuration);
+
+ if (string.IsNullOrWhiteSpace(settingKey))
+ {
+ return null;
+ }
+
+ return configuration.Values.FirstOrDefault(entry =>
+ string.Equals(entry.Key, settingKey, StringComparison.OrdinalIgnoreCase)).Value;
+ }
+
+ public static void SetValue(
+ this VoiceProviderConfigurationStore store,
+ string providerId,
+ string settingKey,
+ string? value)
+ {
+ ArgumentNullException.ThrowIfNull(store);
+
+ var provider = store.GetOrAddProvider(providerId);
+ provider.SetValue(settingKey, value);
+ }
+
+ public static void SetValue(
+ this VoiceProviderConfiguration configuration,
+ string settingKey,
+ string? value)
+ {
+ ArgumentNullException.ThrowIfNull(configuration);
+
+ if (string.IsNullOrWhiteSpace(settingKey))
+ {
+ return;
+ }
+
+ var existingKey = configuration.Values.Keys.FirstOrDefault(key =>
+ string.Equals(key, settingKey, StringComparison.OrdinalIgnoreCase));
+
+ if (string.IsNullOrWhiteSpace(value))
+ {
+ if (existingKey != null)
+ {
+ configuration.Values.Remove(existingKey);
+ }
+
+ return;
+ }
+
+ if (existingKey != null)
+ {
+ configuration.Values[existingKey] = value.Trim();
+ return;
+ }
+
+ configuration.Values[settingKey] = value.Trim();
+ }
+
+ public static void MigrateLegacyCredentials(
+ this VoiceProviderConfigurationStore store,
+ VoiceProviderCredentials? legacy)
+ {
+ ArgumentNullException.ThrowIfNull(store);
+
+ if (legacy == null)
+ {
+ return;
+ }
+
+ var hasMiniMaxValues =
+ !string.IsNullOrWhiteSpace(legacy.MiniMaxApiKey) ||
+ !string.IsNullOrWhiteSpace(legacy.MiniMaxModel) ||
+ !string.IsNullOrWhiteSpace(legacy.MiniMaxVoiceId);
+ if (hasMiniMaxValues)
+ {
+ store.SetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.ApiKey, legacy.MiniMaxApiKey);
+ store.SetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.Model, legacy.MiniMaxModel);
+ store.SetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceId, legacy.MiniMaxVoiceId);
+ }
+
+ var hasElevenLabsValues =
+ !string.IsNullOrWhiteSpace(legacy.ElevenLabsApiKey) ||
+ !string.IsNullOrWhiteSpace(legacy.ElevenLabsModel) ||
+ !string.IsNullOrWhiteSpace(legacy.ElevenLabsVoiceId);
+ if (hasElevenLabsValues)
+ {
+ store.SetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.ApiKey, legacy.ElevenLabsApiKey);
+ store.SetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.Model, legacy.ElevenLabsModel);
+ store.SetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.VoiceId, legacy.ElevenLabsVoiceId);
+ }
+ }
+
+ public static VoiceProviderConfigurationStore Clone(this VoiceProviderConfigurationStore source)
+ {
+ ArgumentNullException.ThrowIfNull(source);
+
+ return new VoiceProviderConfigurationStore
+ {
+ Providers = source.Providers
+ .Select(provider => new VoiceProviderConfiguration
+ {
+ ProviderId = provider.ProviderId,
+ Values = new Dictionary<string, string>(provider.Values, StringComparer.OrdinalIgnoreCase)
+ })
+ .ToList()
+ };
+ }
+}
diff --git a/src/OpenClaw.Tray.Shared/Helpers/VoiceTrayIconHelper.cs b/src/OpenClaw.Tray.Shared/Helpers/VoiceTrayIconHelper.cs
new file mode 100644
index 0000000..2ff9d3e
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Helpers/VoiceTrayIconHelper.cs
@@ -0,0 +1,174 @@
+using System;
+using System.Drawing;
+using System.IO;
+using System.Runtime.InteropServices;
+
+namespace OpenClawTray.Helpers;
+
+public enum VoiceTrayIconState
+{
+ Off,
+ Armed,
+ Listening,
+ Speaking
+}
+
+public static class VoiceTrayIconHelper
+{
+ private static readonly string GeneratedIconsPath = Path.Combine(
+ Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData),
+ "OpenClawTray",
+ "GeneratedIcons");
+
+ private static string? _voiceArmedIconPath;
+ private static string? _voiceListeningIconPath;
+ private static string? _voiceSpeakingIconPath;
+
+ public static string GetBaseAppIconPath()
+ {
+ return Path.Combine(ResolveAssetsPath(), "openclaw.ico");
+ }
+
+ public static string GetVoiceTrayIconPath(VoiceTrayIconState state)
+ {
+ return state switch
+ {
+ VoiceTrayIconState.Armed => GetOrCreateVoiceIconPath(ref _voiceArmedIconPath, VoiceTrayIconState.Armed),
+ VoiceTrayIconState.Listening => GetOrCreateVoiceIconPath(ref _voiceListeningIconPath, VoiceTrayIconState.Listening),
+ VoiceTrayIconState.Speaking => GetOrCreateVoiceIconPath(ref _voiceSpeakingIconPath, VoiceTrayIconState.Speaking),
+ _ => GetBaseAppIconPath()
+ };
+ }
+
+ private static string GetOrCreateVoiceIconPath(ref string? cachedPath, VoiceTrayIconState state)
+ {
+ if (!string.IsNullOrWhiteSpace(cachedPath) && File.Exists(cachedPath))
+ {
+ return cachedPath;
+ }
+
+ Directory.CreateDirectory(GeneratedIconsPath);
+ var outputPath = Path.Combine(GeneratedIconsPath, $"voice-{state.ToString().ToLowerInvariant()}.ico");
+
+ using var bitmap = CreateVoiceTrayBitmap(state);
+ using var icon = CreateIcon(bitmap);
+ using var stream = File.Create(outputPath);
+ icon.Save(stream);
+
+ cachedPath = outputPath;
+ return outputPath;
+ }
+
+ private static Bitmap CreateVoiceTrayBitmap(VoiceTrayIconState state)
+ {
+ const int size = 32;
+ var bitmap = new Bitmap(size, size);
+ using var graphics = Graphics.FromImage(bitmap);
+
+ graphics.Clear(Color.Transparent);
+ graphics.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.AntiAlias;
+ graphics.InterpolationMode = System.Drawing.Drawing2D.InterpolationMode.HighQualityBicubic;
+
+ using (var baseIcon = new Icon(GetBaseAppIconPath(), size, size))
+ using (var baseBitmap = baseIcon.ToBitmap())
+ {
+ graphics.DrawImage(baseBitmap, 0, 0, size, size);
+ }
+
+ switch (state)
+ {
+ case VoiceTrayIconState.Armed:
+ DrawHeadphones(graphics);
+ break;
+ case VoiceTrayIconState.Listening:
+ DrawHeadphones(graphics);
+ DrawHeadphoneWaves(graphics);
+ break;
+ case VoiceTrayIconState.Speaking:
+ DrawMicrophone(graphics);
+ break;
+ }
+
+ return bitmap;
+ }
+
+ private static void DrawHeadphones(Graphics graphics)
+ {
+ using var shadowPen = new Pen(Color.FromArgb(96, 255, 255, 255), 4f);
+ using var bandPen = new Pen(Color.FromArgb(42, 48, 58), 3f);
+ using var earBrush = new SolidBrush(Color.FromArgb(42, 48, 58));
+
+ graphics.DrawArc(shadowPen, 6, 3, 20, 16, 180, 180);
+ graphics.DrawArc(bandPen, 6, 3, 20, 16, 180, 180);
+ graphics.FillPath(earBrush, CreateRoundedRectanglePath(4, 12, 5, 10, 3));
+ graphics.FillPath(earBrush, CreateRoundedRectanglePath(23, 12, 5, 10, 3));
+ }
+
+ private static void DrawMicrophone(Graphics graphics)
+ {
+ using var brush = new SolidBrush(Color.FromArgb(33, 150, 243));
+ using var pen = new Pen(Color.FromArgb(33, 150, 243), 2f);
+
+ graphics.FillPath(brush, CreateRoundedRectanglePath(22, 17, 6, 9, 3));
+ graphics.FillRectangle(brush, 24, 25, 2, 4);
+ graphics.DrawArc(pen, 21, 27, 8, 5, 0, 180);
+ graphics.DrawLine(pen, 20, 21, 15, 19);
+ }
+
+ private static void DrawHeadphoneWaves(Graphics graphics)
+ {
+ using var wavePen = new Pen(Color.FromArgb(76, 175, 80), 2f);
+ using var accentPen = new Pen(Color.FromArgb(76, 175, 80), 1.5f);
+
+ graphics.DrawArc(wavePen, 0, 12, 8, 8, 270, 180);
+ graphics.DrawArc(accentPen, 2, 14, 4, 4, 270, 180);
+ graphics.DrawArc(wavePen, 24, 12, 8, 8, 90, 180);
+ graphics.DrawArc(accentPen, 26, 14, 4, 4, 90, 180);
+ }
+
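+ // GetHicon hands back an unmanaged HICON; clone it into a managed Icon and
+ // destroy the native handle so repeated icon generation does not leak GDI objects.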
+ private static Icon CreateIcon(Bitmap bitmap)
+ {
+ var handle = bitmap.GetHicon();
+ var icon = Icon.FromHandle(handle);
+ var result = (Icon)icon.Clone();
+ DestroyIcon(handle);
+ return result;
+ }
+
+ private static System.Drawing.Drawing2D.GraphicsPath CreateRoundedRectanglePath(int x, int y, int width, int height, int radius)
+ {
+ var path = new System.Drawing.Drawing2D.GraphicsPath();
+ path.AddArc(x, y, radius, radius, 180, 90);
+ path.AddArc(x + width - radius, y, radius, radius, 270, 90);
+ path.AddArc(x + width - radius, y + height - radius, radius, radius, 0, 90);
+ path.AddArc(x, y + height - radius, radius, radius, 90, 90);
+ path.CloseFigure();
+ return path;
+ }
+
+ private static string ResolveAssetsPath()
+ {
+ var bundledPath = Path.Combine(AppContext.BaseDirectory, "Assets");
+ if (File.Exists(Path.Combine(bundledPath, "openclaw.ico")))
+ {
+ return bundledPath;
+ }
+
+ var current = new DirectoryInfo(AppContext.BaseDirectory);
+ while (current != null)
+ {
+ var sourcePath = Path.Combine(current.FullName, "src", "OpenClaw.Tray.WinUI", "Assets");
+ if (Directory.Exists(sourcePath))
+ {
+ return sourcePath;
+ }
+
+ current = current.Parent;
+ }
+
+ return bundledPath;
+ }
+
+ [DllImport("user32.dll", CharSet = CharSet.Auto)]
+ private static extern bool DestroyIcon(IntPtr handle);
+}
diff --git a/src/OpenClaw.Tray.Shared/OpenClaw.Tray.Shared.csproj b/src/OpenClaw.Tray.Shared/OpenClaw.Tray.Shared.csproj
new file mode 100644
index 0000000..cfd3156
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/OpenClaw.Tray.Shared.csproj
@@ -0,0 +1,19 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>net10.0-windows10.0.19041.0</TargetFramework>
+    <Nullable>enable</Nullable>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <RootNamespace>OpenClawTray</RootNamespace>
+  </PropertyGroup>
+
+</Project>
diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCaptureMath.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCaptureMath.cs
new file mode 100644
index 0000000..deac3d4
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCaptureMath.cs
@@ -0,0 +1,48 @@
+namespace OpenClawTray.Services.Voice;
+
+public static class VoiceCaptureMath
+{
+ private const float DefaultSignalThreshold = 0.015f;
+
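+ // Falls back to 16 kHz / 80 ms chunks for invalid inputs and never requests
+ // fewer than 128 samples per quantum.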
+ public static uint ResolveDesiredSamplesPerQuantum(int sampleRateHz, int chunkMs)
+ {
+ if (sampleRateHz <= 0)
+ {
+ sampleRateHz = 16000;
+ }
+
+ if (chunkMs <= 0)
+ {
+ chunkMs = 80;
+ }
+
+ var desired = (sampleRateHz * chunkMs) / 1000;
+ return (uint)Math.Max(desired, 128);
+ }
+
+ public static bool HasAudibleSignal(float peakLevel, float threshold = DefaultSignalThreshold)
+ {
+ return peakLevel >= threshold;
+ }
+
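+ // Assumes 32-bit float PCM: scans whole 4-byte samples for the largest
+ // magnitude and ignores any trailing partial sample.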
+ public static float ComputePeakLevel(byte[] data)
+ {
+ if (data.Length < sizeof(float))
+ {
+ return 0f;
+ }
+
+ float peak = 0f;
+ var alignedLength = data.Length - (data.Length % sizeof(float));
+ for (var offset = 0; offset < alignedLength; offset += sizeof(float))
+ {
+ var sample = Math.Abs(BitConverter.ToSingle(data, offset));
+ if (sample > peak)
+ {
+ peak = sample;
+ }
+ }
+
+ return float.IsFinite(peak) ? peak : 0f;
+ }
+}
diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatContracts.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatContracts.cs
new file mode 100644
index 0000000..106e258
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatContracts.cs
@@ -0,0 +1,43 @@
+using OpenClaw.Shared;
+
+namespace OpenClawTray.Services.Voice;
+
+public interface IUiDispatcher
+{
+ bool TryEnqueue(Action callback);
+}
+
+public interface IVoiceRuntime
+{
+ event EventHandler<VoiceConversationTurnEventArgs>? ConversationTurnAvailable;
+ event EventHandler<VoiceTranscriptDraftEventArgs>? TranscriptDraftUpdated;
+}
+
+public interface IVoiceConfigurationApi
+{
+ Task<VoiceSettings> GetSettingsAsync();
+ Task UpdateSettingsAsync(VoiceSettingsUpdateArgs update);
+ Task ListDevicesAsync();
+ VoiceProviderCatalog GetProviderCatalog();
+ VoiceProviderConfigurationStore GetProviderConfiguration();
+ void SetProviderConfiguration(VoiceProviderConfigurationStore configurationStore);
+}
+
+public interface IVoiceRuntimeControlApi
+{
+ VoiceStatusInfo CurrentStatus { get; }
+ Task<VoiceStatusInfo> GetStatusAsync();
+ Task StartAsync(VoiceStartArgs args);
+ Task StopAsync(VoiceStopArgs args);
+ Task PauseAsync(VoicePauseArgs? args = null);
+ Task ResumeAsync(VoiceResumeArgs? args = null);
+ Task SkipCurrentReplyAsync(VoiceSkipArgs? args = null);
+ Task ToggleQuickPauseAsync();
+}
+
+public interface IVoiceChatWindow
+{
+ bool IsClosed { get; }
+ Task UpdateVoiceTranscriptDraftAsync(string text, bool clear);
+ Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args);
+}
diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatCoordinator.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatCoordinator.cs
new file mode 100644
index 0000000..959b0dc
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceChatCoordinator.cs
@@ -0,0 +1,154 @@
+using System;
+using System.Collections.Generic;
+using System.Threading.Tasks;
+
+namespace OpenClawTray.Services.Voice;
+
+public sealed class VoiceChatCoordinator : IDisposable
+{
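+ // A short buffer of recent turns lets a repeater window attached mid-conversation replay context.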
+ private const int MaxBufferedConversationTurns = 8;
+ private readonly IVoiceRuntime _voiceService;
+ private readonly IUiDispatcher _dispatcher;
+ private readonly object _gate = new();
+
+ private readonly List<IVoiceChatWindow> _windows = [];
+ private string _voiceTranscriptDraftText = string.Empty;
+ private readonly List<VoiceConversationTurnEventArgs> _bufferedConversationTurns = [];
+ private bool _disposed;
+
+ public event EventHandler<VoiceConversationTurnEventArgs>? ConversationTurnAvailable;
+
+ public VoiceChatCoordinator(
+ IVoiceRuntime voiceService,
+ IUiDispatcher dispatcher)
+ {
+ _voiceService = voiceService;
+ _dispatcher = dispatcher;
+
+ _voiceService.ConversationTurnAvailable += OnVoiceConversationTurnAvailable;
+ _voiceService.TranscriptDraftUpdated += OnVoiceTranscriptDraftUpdated;
+ }
+
+ public void AttachWindow(IVoiceChatWindow window)
+ {
+ ArgumentNullException.ThrowIfNull(window);
+
+ lock (_gate)
+ {
+ if (_windows.Contains(window))
+ {
+ return;
+ }
+
+ _windows.Add(window);
+ }
+
+ _ = window.UpdateVoiceTranscriptDraftAsync(
+ _voiceTranscriptDraftText,
+ clear: string.IsNullOrWhiteSpace(_voiceTranscriptDraftText));
+
+ List<VoiceConversationTurnEventArgs> bufferedTurns;
+ lock (_gate)
+ {
+ bufferedTurns = [.. _bufferedConversationTurns];
+ }
+
+ foreach (var turn in bufferedTurns)
+ {
+ _ = window.AppendVoiceConversationTurnAsync(turn);
+ }
+ }
+
+ public void DetachWindow(IVoiceChatWindow? window)
+ {
+ lock (_gate)
+ {
+ if (_windows.Count == 0)
+ {
+ return;
+ }
+
+ if (window == null)
+ {
+ _windows.Clear();
+ return;
+ }
+
+ _windows.Remove(window);
+ }
+ }
+
+ public void Dispose()
+ {
+ if (_disposed)
+ {
+ return;
+ }
+
+ _disposed = true;
+ DetachWindow(null);
+ _voiceService.ConversationTurnAvailable -= OnVoiceConversationTurnAvailable;
+ _voiceService.TranscriptDraftUpdated -= OnVoiceTranscriptDraftUpdated;
+ }
+
+ private void OnVoiceConversationTurnAvailable(object? sender, VoiceConversationTurnEventArgs args)
+ {
+ _dispatcher.TryEnqueue(() =>
+ {
+ List<IVoiceChatWindow> windows;
+ lock (_gate)
+ {
+ _bufferedConversationTurns.Add(CloneTurn(args));
+ if (_bufferedConversationTurns.Count > MaxBufferedConversationTurns)
+ {
+ _bufferedConversationTurns.RemoveAt(0);
+ }
+
+ windows = [.. _windows];
+ }
+
+ foreach (var window in windows)
+ {
+ if (!window.IsClosed)
+ {
+ _ = window.AppendVoiceConversationTurnAsync(args);
+ }
+ }
+
+ ConversationTurnAvailable?.Invoke(this, args);
+ });
+ }
+
+ private void OnVoiceTranscriptDraftUpdated(object? sender, VoiceTranscriptDraftEventArgs args)
+ {
+ _dispatcher.TryEnqueue(() =>
+ {
+ _voiceTranscriptDraftText = args.Clear ? string.Empty : (args.Text ?? string.Empty);
+
+ List<IVoiceChatWindow> windows;
+ lock (_gate)
+ {
+ windows = [.. _windows];
+ }
+
+ foreach (var window in windows)
+ {
+ if (!window.IsClosed)
+ {
+ _ = window.UpdateVoiceTranscriptDraftAsync(_voiceTranscriptDraftText, args.Clear);
+ }
+ }
+ });
+ }
+
+ private static VoiceConversationTurnEventArgs CloneTurn(VoiceConversationTurnEventArgs args)
+ {
+ return new VoiceConversationTurnEventArgs
+ {
+ Direction = args.Direction,
+ Message = args.Message,
+ SessionKey = args.SessionKey,
+ Mode = args.Mode
+ };
+ }
+}
diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCloudTextToSpeechClient.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCloudTextToSpeechClient.cs
new file mode 100644
index 0000000..0399559
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceCloudTextToSpeechClient.cs
@@ -0,0 +1,592 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Net.Http;
+using System.Net.Http.Headers;
+using System.Net.WebSockets;
+using System.Runtime.InteropServices.WindowsRuntime;
+using System.Text;
+using System.Text.Json;
+using System.Threading;
+using System.Threading.Tasks;
+using OpenClaw.Shared;
+using Windows.Storage.Streams;
+
+namespace OpenClawTray.Services.Voice;
+
+public sealed class VoiceCloudTextToSpeechClient
+{
+ private static readonly HttpClient s_httpClient = CreateHttpClient();
+
+ public async Task SynthesizeAsync(
+ string text,
+ VoiceProviderOption provider,
+ VoiceProviderConfigurationStore configurationStore,
+ IOpenClawLogger? logger = null,
+ CancellationToken cancellationToken = default)
+ {
+ ArgumentException.ThrowIfNullOrWhiteSpace(text);
+ ArgumentNullException.ThrowIfNull(provider);
+ ArgumentNullException.ThrowIfNull(configurationStore);
+
+ if (provider.TextToSpeechWebSocket != null)
+ {
+ return await SynthesizeViaWebSocketAsync(text, provider, configurationStore, logger, cancellationToken);
+ }
+
+ var contract = provider.TextToSpeechHttp
+ ?? throw new InvalidOperationException($"TTS provider '{provider.Name}' does not expose an HTTP contract.");
+ var providerConfiguration = configurationStore.FindProvider(provider.Id);
+ var templateValues = BuildTemplateValues(text, provider, providerConfiguration, contract);
+ var endpoint = ApplyUrlTemplate(contract.EndpointTemplate, templateValues);
+ using var request = new HttpRequestMessage(ParseHttpMethod(contract.HttpMethod), endpoint);
+ ApplyAuthenticationHeader(request, contract, templateValues);
+
+ if (!string.IsNullOrWhiteSpace(contract.RequestBodyTemplate))
+ {
+ var requestBody = ApplyJsonTemplate(contract.RequestBodyTemplate, templateValues);
+ request.Content = new StringContent(
+ requestBody,
+ Encoding.UTF8,
+ string.IsNullOrWhiteSpace(contract.RequestContentType) ? "application/json" : contract.RequestContentType);
+ }
+
+ var stopwatch = Stopwatch.StartNew();
+ using var response = await s_httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken);
+ var headersElapsedMs = stopwatch.ElapsedMilliseconds;
+ if (!response.IsSuccessStatusCode)
+ {
+ throw new InvalidOperationException(
+ $"{provider.Name} TTS request failed: {(int)response.StatusCode} {response.ReasonPhrase}");
+ }
+
+ if (string.Equals(contract.ResponseAudioMode, VoiceTextToSpeechResponseModes.Binary, StringComparison.OrdinalIgnoreCase))
+ {
+ await using var responseStream = await response.Content.ReadAsStreamAsync(cancellationToken);
+ var result = await CreateResultAsync(responseStream, contract.OutputContentType);
+ logger?.Info($"{provider.Name} TTS latency: headers={headersElapsedMs}ms total={stopwatch.ElapsedMilliseconds}ms (binary)");
+ return result;
+ }
+
+ var responseText = await response.Content.ReadAsStringAsync(cancellationToken);
+ using var document = JsonDocument.Parse(responseText);
+ ValidateResponseStatus(provider, contract, document.RootElement);
+
+ var audioString = GetRequiredJsonString(document.RootElement, contract.ResponseAudioJsonPath);
+ var audioBytesFromJson = DecodeAudioBytes(contract.ResponseAudioMode, audioString, provider.Name);
+ var jsonResult = await CreateResultAsync(audioBytesFromJson, contract.OutputContentType);
+ logger?.Info($"{provider.Name} TTS latency: headers={headersElapsedMs}ms total={stopwatch.ElapsedMilliseconds}ms ({contract.ResponseAudioMode})");
+ return jsonResult;
+ }
+
+ private static async Task SynthesizeViaWebSocketAsync(
+ string text,
+ VoiceProviderOption provider,
+ VoiceProviderConfigurationStore configurationStore,
+ IOpenClawLogger? logger,
+ CancellationToken cancellationToken)
+ {
+ var contract = provider.TextToSpeechWebSocket
+ ?? throw new InvalidOperationException($"TTS provider '{provider.Name}' does not expose a WebSocket contract.");
+ var providerConfiguration = configurationStore.FindProvider(provider.Id);
+ var templateValues = BuildTemplateValues(text, provider, providerConfiguration, contract.ApiKeySettingKey);
+ var endpoint = ApplyUrlTemplate(contract.EndpointTemplate, templateValues);
+ using var socket = new ClientWebSocket();
+ ApplyAuthenticationHeader(socket.Options, contract, templateValues);
+
+ using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+ cts.CancelAfter(TimeSpan.FromSeconds(30));
+ var ct = cts.Token;
+
+ var stopwatch = Stopwatch.StartNew();
+ await socket.ConnectAsync(new Uri(endpoint), ct);
+
+ if (!string.IsNullOrWhiteSpace(contract.ConnectSuccessEventName))
+ {
+ var connectedMessage = await ReceiveJsonMessageAsync(socket, ct);
+ ValidateWebSocketEvent(provider.Name, contract.ConnectSuccessEventName, connectedMessage, contract);
+ }
+
+ var startMessage = ApplyJsonTemplate(contract.StartMessageTemplate, templateValues);
+ await SendTextMessageAsync(socket, startMessage, ct);
+
+ if (!string.IsNullOrWhiteSpace(contract.StartSuccessEventName))
+ {
+ var startedMessage = await ReceiveJsonMessageAsync(socket, ct);
+ ValidateWebSocketEvent(provider.Name, contract.StartSuccessEventName, startedMessage, contract);
+ }
+
+ var continueMessage = ApplyJsonTemplate(contract.ContinueMessageTemplate, templateValues);
+ await SendTextMessageAsync(socket, continueMessage, ct);
+
+ if (!string.IsNullOrWhiteSpace(contract.FinishMessageTemplate))
+ {
+ await SendTextMessageAsync(socket, ApplyJsonTemplate(contract.FinishMessageTemplate, templateValues), ct);
+ }
+
+ var audioBytes = new List<byte>();
+ long? firstChunkMs = null;
+
+ while (true)
+ {
+ var message = await ReceiveJsonMessageAsync(socket, ct);
+ EnsureWebSocketNotFailed(provider.Name, contract, message);
+
+ if (TryGetJsonString(message, contract.ResponseAudioJsonPath, out var audioChunk) &&
+ !string.IsNullOrWhiteSpace(audioChunk))
+ {
+ if (!firstChunkMs.HasValue)
+ {
+ firstChunkMs = stopwatch.ElapsedMilliseconds;
+ }
+
+ audioBytes.AddRange(DecodeAudioBytes(contract.ResponseAudioMode, audioChunk, provider.Name));
+ }
+
+ if (IsFinalWebSocketMessage(message, contract))
+ {
+ break;
+ }
+ }
+
+ try
+ {
+ await socket.CloseAsync(WebSocketCloseStatus.NormalClosure, "done", ct);
+ }
+ catch
+ {
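+ // Best-effort close; the audio is already buffered, so close failures are ignored.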
+ }
+
+ if (audioBytes.Count == 0)
+ {
+ throw new InvalidOperationException($"{provider.Name} TTS did not return any audio data.");
+ }
+
+ var result = await CreateResultAsync(audioBytes.ToArray(), contract.OutputContentType);
+ logger?.Info($"{provider.Name} TTS latency: firstChunk={(firstChunkMs?.ToString() ?? "n/a")}ms total={stopwatch.ElapsedMilliseconds}ms (websocket)");
+ return result;
+ }
+
+ private static Dictionary<string, TemplateValue> BuildTemplateValues(
+ string text,
+ VoiceProviderOption provider,
+ VoiceProviderConfiguration? providerConfiguration,
+ VoiceTextToSpeechHttpContract contract)
+ {
+ return BuildTemplateValues(text, provider, providerConfiguration, contract.ApiKeySettingKey);
+ }
+
+ private static Dictionary<string, TemplateValue> BuildTemplateValues(
+ string text,
+ VoiceProviderOption provider,
+ VoiceProviderConfiguration? providerConfiguration,
+ string apiKeySettingKey)
+ {
+ var values = new Dictionary<string, TemplateValue>(StringComparer.OrdinalIgnoreCase)
+ {
+ ["text"] = TemplateValue.FromString(text),
+ ["textWithTrailingSpace"] = TemplateValue.FromString(
+ text.EndsWith(' ') ? text : text + " ")
+ };
+
+ foreach (var setting in provider.Settings)
+ {
+ var configuredValue = providerConfiguration?.GetValue(setting.Key);
+ var effectiveValue = string.IsNullOrWhiteSpace(configuredValue)
+ ? setting.DefaultValue
+ : configuredValue.Trim();
+
+ if (string.IsNullOrWhiteSpace(effectiveValue))
+ {
+ if (setting.Secret || string.Equals(setting.Key, apiKeySettingKey, StringComparison.OrdinalIgnoreCase))
+ {
+ throw new InvalidOperationException(
+ $"{provider.Name} API key is not configured. Open Settings and complete the {provider.Name} voice provider fields.");
+ }
+
+ if (setting.Required)
+ {
+ throw new InvalidOperationException(
+ $"{provider.Name} setting '{setting.Label}' is required. Open Settings and complete the {provider.Name} voice provider fields.");
+ }
+
+ continue;
+ }
+
+ values[setting.Key] = setting.JsonValue
+ ? TemplateValue.FromJson(effectiveValue, provider.Name, setting.Label, values)
+ : TemplateValue.FromString(effectiveValue);
+ }
+
+ return values;
+ }
+
+ private static string ApplyUrlTemplate(string template, IReadOnlyDictionary<string, TemplateValue> values)
+ {
+ var result = template;
+ foreach (var entry in values)
+ {
+ result = result.Replace(
+ "{{" + entry.Key + "}}",
+ Uri.EscapeDataString(entry.Value.Value),
+ StringComparison.Ordinal);
+ }
+
+ return result;
+ }
+
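+ // Plain values are serialized with JsonSerializer so quoting and escaping stay valid;
+ // values flagged as JSON fragments are spliced in verbatim.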
+ private static string ApplyJsonTemplate(string template, IReadOnlyDictionary<string, TemplateValue> values)
+ {
+ var result = template;
+ foreach (var entry in values)
+ {
+ result = result.Replace(
+ "{{" + entry.Key + "}}",
+ entry.Value.JsonFragment ? entry.Value.Value : JsonSerializer.Serialize(entry.Value.Value),
+ StringComparison.Ordinal);
+ }
+
+ return result;
+ }
+
+ private static void ApplyAuthenticationHeader(
+ HttpRequestMessage request,
+ VoiceTextToSpeechHttpContract contract,
+ IReadOnlyDictionary<string, TemplateValue> values)
+ {
+ if (!values.TryGetValue(contract.ApiKeySettingKey, out var apiKey) || string.IsNullOrWhiteSpace(apiKey.Value))
+ {
+ throw new InvalidOperationException("Voice provider API key is not configured.");
+ }
+
+ if (string.Equals(contract.AuthenticationHeaderName, "Authorization", StringComparison.OrdinalIgnoreCase) &&
+ !string.IsNullOrWhiteSpace(contract.AuthenticationScheme))
+ {
+ request.Headers.Authorization = new AuthenticationHeaderValue(contract.AuthenticationScheme, apiKey.Value);
+ return;
+ }
+
+ var headerValue = string.IsNullOrWhiteSpace(contract.AuthenticationScheme)
+ ? apiKey.Value
+ : $"{contract.AuthenticationScheme} {apiKey.Value}";
+ request.Headers.TryAddWithoutValidation(contract.AuthenticationHeaderName, headerValue);
+ }
+
+ private static void ApplyAuthenticationHeader(
+ ClientWebSocketOptions options,
+ VoiceTextToSpeechWebSocketContract contract,
+ IReadOnlyDictionary<string, TemplateValue> values)
+ {
+ if (!values.TryGetValue(contract.ApiKeySettingKey, out var apiKey) || string.IsNullOrWhiteSpace(apiKey.Value))
+ {
+ throw new InvalidOperationException("Voice provider API key is not configured.");
+ }
+
+ var headerValue = string.IsNullOrWhiteSpace(contract.AuthenticationScheme)
+ ? apiKey.Value
+ : $"{contract.AuthenticationScheme} {apiKey.Value}";
+
+ options.SetRequestHeader(contract.AuthenticationHeaderName, headerValue);
+ }
+
+ private static HttpMethod ParseHttpMethod(string? method)
+ {
+ if (string.Equals(method, HttpMethod.Post.Method, StringComparison.OrdinalIgnoreCase))
+ {
+ return HttpMethod.Post;
+ }
+
+ return new HttpMethod(string.IsNullOrWhiteSpace(method) ? HttpMethod.Post.Method : method);
+ }
+
+ private static void ValidateResponseStatus(
+ VoiceProviderOption provider,
+ VoiceTextToSpeechHttpContract contract,
+ JsonElement root)
+ {
+ if (string.IsNullOrWhiteSpace(contract.ResponseStatusCodeJsonPath))
+ {
+ return;
+ }
+
+ var statusValue = GetJsonValue(root, contract.ResponseStatusCodeJsonPath);
+ var statusText = statusValue.HasValue ? JsonElementToString(statusValue.Value) : null;
+ var successValue = contract.SuccessStatusValue ?? "0";
+ if (string.Equals(statusText, successValue, StringComparison.OrdinalIgnoreCase))
+ {
+ return;
+ }
+
+ var statusMessage = string.IsNullOrWhiteSpace(contract.ResponseStatusMessageJsonPath)
+ ? null
+ : GetJsonValue(root, contract.ResponseStatusMessageJsonPath).HasValue
+ ? JsonElementToString(GetJsonValue(root, contract.ResponseStatusMessageJsonPath)!.Value)
+ : null;
+ throw new InvalidOperationException(
+ string.IsNullOrWhiteSpace(statusMessage)
+ ? $"{provider.Name} TTS returned an error."
+ : $"{provider.Name} TTS returned an error: {statusMessage}");
+ }
+
+ private static void ValidateWebSocketEvent(
+ string providerName,
+ string expectedEvent,
+ JsonElement message,
+ VoiceTextToSpeechWebSocketContract contract)
+ {
+ EnsureWebSocketNotFailed(providerName, contract, message);
+
+ if (!TryGetJsonString(message, "event", out var eventName) ||
+ !string.Equals(eventName, expectedEvent, StringComparison.OrdinalIgnoreCase))
+ {
+ throw new InvalidOperationException($"{providerName} TTS returned an unexpected WebSocket event.");
+ }
+ }
+
+ private static void EnsureWebSocketNotFailed(
+ string providerName,
+ VoiceTextToSpeechWebSocketContract contract,
+ JsonElement message)
+ {
+ if (TryGetJsonString(message, "event", out var eventName) &&
+ string.Equals(eventName, contract.TaskFailedEventName, StringComparison.OrdinalIgnoreCase))
+ {
+ var statusMessage = string.IsNullOrWhiteSpace(contract.ResponseStatusMessageJsonPath)
+ ? null
+ : TryGetJsonString(message, contract.ResponseStatusMessageJsonPath, out var value)
+ ? value
+ : null;
+
+ throw new InvalidOperationException(
+ string.IsNullOrWhiteSpace(statusMessage)
+ ? $"{providerName} TTS returned an error."
+ : $"{providerName} TTS returned an error: {statusMessage}");
+ }
+ }
+
+ private static JsonElement? GetJsonValue(JsonElement root, string? path)
+ {
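+ // Walks a dot-delimited property path (e.g. "base_resp.status_code") through nested JSON objects.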
+ if (string.IsNullOrWhiteSpace(path))
+ {
+ return null;
+ }
+
+ var current = root;
+ foreach (var segment in path.Split('.', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries))
+ {
+ if (current.ValueKind != JsonValueKind.Object || !current.TryGetProperty(segment, out current))
+ {
+ return null;
+ }
+ }
+
+ return current;
+ }
+
+ private static string GetRequiredJsonString(JsonElement root, string? path)
+ {
+ var value = GetJsonValue(root, path);
+ if (!value.HasValue)
+ {
+ throw new InvalidOperationException("Voice provider response did not contain audio data.");
+ }
+
+ var text = value.Value.GetString();
+ if (string.IsNullOrWhiteSpace(text))
+ {
+ throw new InvalidOperationException("Voice provider response did not contain audio data.");
+ }
+
+ return text;
+ }
+
+ private static bool TryGetJsonString(JsonElement root, string? path, out string value)
+ {
+ value = string.Empty;
+ var found = GetJsonValue(root, path);
+ if (!found.HasValue)
+ {
+ return false;
+ }
+
+ var text = JsonElementToString(found.Value);
+ if (string.IsNullOrWhiteSpace(text))
+ {
+ return false;
+ }
+
+ value = text;
+ return true;
+ }
+
+ private static bool IsFinalWebSocketMessage(JsonElement root, VoiceTextToSpeechWebSocketContract contract)
+ {
+ var finalFlag = GetJsonValue(root, contract.FinalFlagJsonPath);
+ return finalFlag.HasValue && finalFlag.Value.ValueKind == JsonValueKind.True;
+ }
+
+ private static string? JsonElementToString(JsonElement element)
+ {
+ return element.ValueKind switch
+ {
+ JsonValueKind.String => element.GetString(),
+ JsonValueKind.Number => element.ToString(),
+ JsonValueKind.True => bool.TrueString,
+ JsonValueKind.False => bool.FalseString,
+ _ => element.ToString()
+ };
+ }
+
+ private static byte[] DecodeAudioBytes(string responseAudioMode, string audioValue, string providerName)
+ {
+ try
+ {
+ if (string.Equals(responseAudioMode, VoiceTextToSpeechResponseModes.HexJsonString, StringComparison.OrdinalIgnoreCase))
+ {
+ return Convert.FromHexString(audioValue);
+ }
+
+ if (string.Equals(responseAudioMode, VoiceTextToSpeechResponseModes.Base64JsonString, StringComparison.OrdinalIgnoreCase))
+ {
+ return Convert.FromBase64String(audioValue);
+ }
+
+ throw new InvalidOperationException($"Unsupported TTS response mode '{responseAudioMode}'.");
+ }
+ catch (FormatException ex)
+ {
+ throw new InvalidOperationException($"{providerName} TTS returned invalid audio data.", ex);
+ }
+ }
+
+ private static async Task<VoiceCloudTextToSpeechResult> CreateResultAsync(byte[] audioBytes, string contentType)
+ {
+ var stream = new InMemoryRandomAccessStream();
+ await stream.WriteAsync(audioBytes.AsBuffer());
+ await stream.FlushAsync();
+ stream.Seek(0);
+
+ return new VoiceCloudTextToSpeechResult(stream, string.IsNullOrWhiteSpace(contentType) ? "audio/mpeg" : contentType);
+ }
+
+ private static async Task<VoiceCloudTextToSpeechResult> CreateResultAsync(Stream sourceStream, string contentType, CancellationToken cancellationToken = default)
+ {
+ var stream = new InMemoryRandomAccessStream();
+ await using (var output = stream.AsStreamForWrite())
+ {
+ await sourceStream.CopyToAsync(output, cancellationToken);
+ await output.FlushAsync(cancellationToken);
+ }
+
+ stream.Seek(0);
+ return new VoiceCloudTextToSpeechResult(stream, string.IsNullOrWhiteSpace(contentType) ? "audio/mpeg" : contentType);
+ }
+
+ private static async Task SendTextMessageAsync(ClientWebSocket socket, string message, CancellationToken cancellationToken)
+ {
+ var bytes = Encoding.UTF8.GetBytes(message);
+ await socket.SendAsync(bytes, WebSocketMessageType.Text, true, cancellationToken);
+ }
+
+ private static async Task<JsonElement> ReceiveJsonMessageAsync(ClientWebSocket socket, CancellationToken cancellationToken)
+ {
+ using var buffer = new MemoryStream();
+ var receiveBuffer = new byte[8192];
+
+ while (true)
+ {
+ var segment = new ArraySegment<byte>(receiveBuffer);
+ var result = await socket.ReceiveAsync(segment, cancellationToken);
+
+ if (result.MessageType == WebSocketMessageType.Close)
+ {
+ var closeStatus = socket.CloseStatus?.ToString() ?? "Unknown";
+ var closeDescription = string.IsNullOrWhiteSpace(socket.CloseStatusDescription)
+ ? null
+ : socket.CloseStatusDescription;
+ throw new InvalidOperationException(
+ string.IsNullOrWhiteSpace(closeDescription)
+ ? $"Voice provider closed the WebSocket unexpectedly ({closeStatus})."
+ : $"Voice provider closed the WebSocket unexpectedly ({closeStatus}: {closeDescription}).");
+ }
+
+ buffer.Write(receiveBuffer, 0, result.Count);
+ if (result.EndOfMessage)
+ {
+ break;
+ }
+ }
+
+ var text = Encoding.UTF8.GetString(buffer.ToArray());
+ using var document = JsonDocument.Parse(text);
+ return document.RootElement.Clone();
+ }
+
+ private static HttpClient CreateHttpClient()
+ {
+ return new HttpClient
+ {
+ Timeout = TimeSpan.FromSeconds(30)
+ };
+ }
+
+ private readonly record struct TemplateValue(string Value, bool JsonFragment)
+ {
+ public static TemplateValue FromString(string value) => new(value, false);
+
+ public static TemplateValue FromJson(
+ string json,
+ string providerName,
+ string label,
+ IReadOnlyDictionary<string, TemplateValue>? templateValues = null)
+ {
+ var substituted = templateValues == null
+ ? json
+ : ApplyJsonTemplate(json, templateValues);
+
+ try
+ {
+ using var document = JsonDocument.Parse(substituted);
+ return new(document.RootElement.GetRawText(), true);
+ }
+ catch (JsonException ex)
+ {
+ try
+ {
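+ // The value may be a bare object fragment like "voice_settings": { ... }; wrap it in braces to validate it, then strip the outer braces again.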
+ using var wrapped = JsonDocument.Parse("{ " + substituted + " }");
+ var wrappedJson = wrapped.RootElement.GetRawText();
+ return new(wrappedJson[1..^1], true);
+ }
+ catch (JsonException)
+ {
+ throw new InvalidOperationException(
+ $"{providerName} setting '{label}' must be valid JSON.",
+ ex);
+ }
+ }
+ }
+
+ public static implicit operator string(TemplateValue value) => value.Value;
+ }
+}
+
+public sealed class VoiceCloudTextToSpeechResult : IDisposable
+{
+ public VoiceCloudTextToSpeechResult(IRandomAccessStream stream, string contentType)
+ {
+ Stream = stream;
+ ContentType = contentType;
+ }
+
+ public IRandomAccessStream Stream { get; }
+ public string ContentType { get; }
+
+ public void Dispose()
+ {
+ Stream.Dispose();
+ }
+}
diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceConversationEvents.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceConversationEvents.cs
new file mode 100644
index 0000000..f68c32b
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceConversationEvents.cs
@@ -0,0 +1,25 @@
+using OpenClaw.Shared;
+
+namespace OpenClawTray.Services.Voice;
+
+public enum VoiceConversationDirection
+{
+ Outgoing,
+ Incoming
+}
+
+public sealed class VoiceConversationTurnEventArgs : EventArgs
+{
+ public VoiceConversationDirection Direction { get; set; }
+ public string SessionKey { get; set; } = "main";
+ public string Message { get; set; } = "";
+ public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off;
+}
+
+public sealed class VoiceTranscriptDraftEventArgs : EventArgs
+{
+ public string SessionKey { get; set; } = "main";
+ public string Text { get; set; } = "";
+ public bool Clear { get; set; }
+ public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off;
+}
diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceProviderCatalogService.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceProviderCatalogService.cs
new file mode 100644
index 0000000..3af3f2c
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceProviderCatalogService.cs
@@ -0,0 +1,256 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text.Json;
+using OpenClaw.Shared;
+
+namespace OpenClawTray.Services.Voice;
+
+public static class VoiceProviderCatalogService
+{
+ private const long MaxCatalogBytes = 256 * 1024;
+ private const string CatalogRelativePath = "Assets\\voice-providers.json";
+
+ private static readonly JsonSerializerOptions s_jsonOptions = new()
+ {
+ PropertyNameCaseInsensitive = true,
+ WriteIndented = true
+ };
+
+ public static string CatalogFilePath => ResolveCatalogFilePath();
+
+ public static VoiceProviderCatalog LoadCatalog(IOpenClawLogger? logger = null)
+ {
+ var catalogFilePath = ResolveCatalogFilePath();
+
+ try
+ {
+ if (!File.Exists(catalogFilePath))
+ {
+ throw new FileNotFoundException("Voice provider catalog asset not found.", catalogFilePath);
+ }
+
+ var fileInfo = new FileInfo(catalogFilePath);
+ if (fileInfo.Length > MaxCatalogBytes)
+ {
+ throw new InvalidOperationException($"Voice provider catalog exceeds {MaxCatalogBytes} bytes.");
+ }
+
+ var json = File.ReadAllText(catalogFilePath);
+ var catalog = JsonSerializer.Deserialize<VoiceProviderCatalog>(json, s_jsonOptions);
+ if (catalog == null)
+ {
+ throw new InvalidOperationException("Voice provider catalog asset is empty or invalid.");
+ }
+
+ return NormalizeCatalog(catalog);
+ }
+ catch (Exception ex)
+ {
+ throw new InvalidOperationException(
+ $"Failed to load voice provider catalog from '{catalogFilePath}': {ex.Message}",
+ ex);
+ }
+ }
+
+ public static VoiceProviderOption ResolveSpeechToTextProvider(string? providerId, IOpenClawLogger? logger = null)
+ {
+ var catalog = LoadCatalog(logger);
+ return ResolveProvider(catalog.SpeechToTextProviders, providerId);
+ }
+
+ public static VoiceProviderOption ResolveTextToSpeechProvider(string? providerId, IOpenClawLogger? logger = null)
+ {
+ var catalog = LoadCatalog(logger);
+ return ResolveProvider(catalog.TextToSpeechProviders, providerId);
+ }
+
+ public static bool SupportsWindowsRuntime(string? providerId)
+ {
+ return string.Equals(providerId, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase);
+ }
+
+ public static bool SupportsSpeechToTextRuntime(string? providerId)
+ {
+ try
+ {
+ var provider = ResolveSpeechToTextProvider(providerId);
+ return VoiceSpeechToTextRouteResolver.ResolveRouteKind(provider) == VoiceSpeechToTextRouteKind.WindowsMedia;
+ }
+ catch
+ {
+ return false;
+ }
+ }
+
+ public static bool SupportsTextToSpeechRuntime(string? providerId)
+ {
+ if (SupportsWindowsRuntime(providerId))
+ {
+ return true;
+ }
+
+ try
+ {
+ var provider = ResolveTextToSpeechProvider(providerId);
+ return provider.TextToSpeechHttp != null || provider.TextToSpeechWebSocket != null;
+ }
+ catch
+ {
+ return false;
+ }
+ }
+
+ private static VoiceProviderCatalog NormalizeCatalog(VoiceProviderCatalog catalog)
+ {
+ return new VoiceProviderCatalog
+ {
+ SpeechToTextProviders = NormalizeProviders(catalog.SpeechToTextProviders),
+ TextToSpeechProviders = NormalizeProviders(catalog.TextToSpeechProviders)
+ };
+ }
+
+ private static List<VoiceProviderOption> NormalizeProviders(List<VoiceProviderOption>? providers)
+ {
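+ // Keep the built-in Windows provider first so it remains the default baseline in Settings.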
+ return (providers ?? [])
+ .Where(p => !string.IsNullOrWhiteSpace(p.Id))
+ .Select(Clone)
+ .Where(p => p.Enabled || p.VisibleInSettings)
+ .OrderByDescending(p => string.Equals(p.Id, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase))
+ .ThenBy(p => p.Name, StringComparer.OrdinalIgnoreCase)
+ .ToList();
+ }
+
+ private static VoiceProviderOption ResolveProvider(IEnumerable<VoiceProviderOption> providers, string? providerId)
+ {
+ if (!string.IsNullOrWhiteSpace(providerId))
+ {
+ var configured = providers.FirstOrDefault(p => string.Equals(p.Id, providerId, StringComparison.OrdinalIgnoreCase));
+ if (configured != null)
+ {
+ return Clone(configured);
+ }
+ }
+
+ return providers
+ .Select(Clone)
+ .FirstOrDefault(p => string.Equals(p.Id, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase))
+ ?? new VoiceProviderOption
+ {
+ Id = VoiceProviderIds.Windows,
+ Name = "Windows Speech",
+ Runtime = "windows"
+ };
+ }
+
+ private static VoiceProviderOption Clone(VoiceProviderOption source)
+ {
+ return new VoiceProviderOption
+ {
+ Id = source.Id,
+ Name = source.Name,
+ Runtime = source.Runtime,
+ Enabled = source.Enabled,
+ VisibleInSettings = source.VisibleInSettings,
+ Selectable = source.Selectable,
+ Description = source.Description,
+ Settings = source.Settings.Select(Clone).ToList(),
+ TextToSpeechHttp = Clone(source.TextToSpeechHttp),
+ TextToSpeechWebSocket = Clone(source.TextToSpeechWebSocket)
+ };
+ }
+
+ private static VoiceProviderSettingDefinition Clone(VoiceProviderSettingDefinition source)
+ {
+ return new VoiceProviderSettingDefinition
+ {
+ Key = source.Key,
+ Label = source.Label,
+ Secret = source.Secret,
+ Required = source.Required,
+ JsonValue = source.JsonValue,
+ DefaultValue = source.DefaultValue,
+ Placeholder = source.Placeholder,
+ Description = source.Description,
+ Options = source.Options.ToList()
+ };
+ }
+
+ private static VoiceTextToSpeechHttpContract? Clone(VoiceTextToSpeechHttpContract? source)
+ {
+ if (source == null)
+ {
+ return null;
+ }
+
+ return new VoiceTextToSpeechHttpContract
+ {
+ EndpointTemplate = source.EndpointTemplate,
+ HttpMethod = source.HttpMethod,
+ AuthenticationHeaderName = source.AuthenticationHeaderName,
+ AuthenticationScheme = source.AuthenticationScheme,
+ ApiKeySettingKey = source.ApiKeySettingKey,
+ RequestContentType = source.RequestContentType,
+ RequestBodyTemplate = source.RequestBodyTemplate,
+ ResponseAudioMode = source.ResponseAudioMode,
+ ResponseAudioJsonPath = source.ResponseAudioJsonPath,
+ ResponseStatusCodeJsonPath = source.ResponseStatusCodeJsonPath,
+ ResponseStatusMessageJsonPath = source.ResponseStatusMessageJsonPath,
+ SuccessStatusValue = source.SuccessStatusValue,
+ OutputContentType = source.OutputContentType
+ };
+ }
+
+ private static VoiceTextToSpeechWebSocketContract? Clone(VoiceTextToSpeechWebSocketContract? source)
+ {
+ if (source == null)
+ {
+ return null;
+ }
+
+ return new VoiceTextToSpeechWebSocketContract
+ {
+ EndpointTemplate = source.EndpointTemplate,
+ AuthenticationHeaderName = source.AuthenticationHeaderName,
+ AuthenticationScheme = source.AuthenticationScheme,
+ ApiKeySettingKey = source.ApiKeySettingKey,
+ ConnectSuccessEventName = source.ConnectSuccessEventName,
+ StartMessageTemplate = source.StartMessageTemplate,
+ StartSuccessEventName = source.StartSuccessEventName,
+ ContinueMessageTemplate = source.ContinueMessageTemplate,
+ FinishMessageTemplate = source.FinishMessageTemplate,
+ ResponseAudioMode = source.ResponseAudioMode,
+ ResponseAudioJsonPath = source.ResponseAudioJsonPath,
+ ResponseStatusCodeJsonPath = source.ResponseStatusCodeJsonPath,
+ ResponseStatusMessageJsonPath = source.ResponseStatusMessageJsonPath,
+ FinalFlagJsonPath = source.FinalFlagJsonPath,
+ TaskFailedEventName = source.TaskFailedEventName,
+ SuccessStatusValue = source.SuccessStatusValue,
+ OutputContentType = source.OutputContentType
+ };
+ }
+
+ private static string ResolveCatalogFilePath()
+ {
+ var bundledPath = Path.Combine(AppContext.BaseDirectory, CatalogRelativePath);
+ if (File.Exists(bundledPath))
+ {
+ return bundledPath;
+ }
+
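+ // Development fallback: walk up from the build output directory to find the catalog in the source tree.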
+ var current = new DirectoryInfo(AppContext.BaseDirectory);
+ while (current != null)
+ {
+ var sourcePath = Path.Combine(current.FullName, "src", "OpenClaw.Tray.WinUI", CatalogRelativePath);
+ if (File.Exists(sourcePath))
+ {
+ return sourcePath;
+ }
+
+ current = current.Parent;
+ }
+
+ return bundledPath;
+ }
+}
diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceServiceTransportLogic.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceServiceTransportLogic.cs
new file mode 100644
index 0000000..9a3e57e
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceServiceTransportLogic.cs
@@ -0,0 +1,255 @@
+using OpenClaw.Shared;
+using Windows.Media.Devices;
+using Windows.Media.SpeechRecognition;
+
+namespace OpenClawTray.Services.Voice;
+
+public static class VoiceServiceTransportLogic
+{
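+ // Window during which a longer, still-fresh hypothesis may replace a truncated final recognition result.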
+ private static readonly TimeSpan HypothesisPromotionWindow = TimeSpan.FromSeconds(2);
+
+ public static TaskCompletionSource GetOrCreateTransportReadySource(
+ ConnectionStatus transportStatus,
+ TaskCompletionSource? existingReadySource,
+ out bool shouldStartConnection)
+ {
+ if (transportStatus == ConnectionStatus.Connecting && existingReadySource != null)
+ {
+ shouldStartConnection = false;
+ return existingReadySource;
+ }
+
+ shouldStartConnection = true;
+ return new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
+ }
+
+ public static bool UsesCloudTextToSpeechRuntime(VoiceProviderOption provider)
+ {
+ return provider.TextToSpeechHttp != null || provider.TextToSpeechWebSocket != null;
+ }
+
+ public static bool ShouldAcceptAssistantReply(
+ bool awaitingReply,
+ bool isSpeaking,
+ int queuedReplyCount,
+ bool acceptedViaLateReplyGrace = false)
+ {
+ return awaitingReply || isSpeaking || queuedReplyCount > 0 || acceptedViaLateReplyGrace;
+ }
+
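+ // Accepts a reply that arrives shortly after the awaiting-reply flag cleared, as long as it targets the same session and the grace deadline has not passed.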
+ public static bool ShouldAcceptLateAssistantReply(
+ bool awaitingReply,
+ bool isSpeaking,
+ int queuedReplyCount,
+ string? lateReplySessionKey,
+ DateTime? lateReplyGraceUntilUtc,
+ string? incomingSessionKey,
+ DateTime utcNow)
+ {
+ return !awaitingReply &&
+ !isSpeaking &&
+ queuedReplyCount == 0 &&
+ !string.IsNullOrWhiteSpace(lateReplySessionKey) &&
+ !string.IsNullOrWhiteSpace(incomingSessionKey) &&
+ IsMatchingSessionKey(incomingSessionKey, lateReplySessionKey) &&
+ lateReplyGraceUntilUtc.HasValue &&
+ utcNow <= lateReplyGraceUntilUtc.Value;
+ }
+
+ public static bool ShouldRestartRecognitionAfterCompletion(
+ bool running,
+ VoiceActivationMode mode,
+ bool restartInProgress,
+ bool awaitingReply,
+ bool isSpeaking)
+ {
+ return running &&
+ mode == VoiceActivationMode.TalkMode &&
+ !restartInProgress &&
+ !awaitingReply &&
+ !isSpeaking;
+ }
+
+ public static string DescribeRecognitionCompletionRestartDecision(
+ bool running,
+ VoiceActivationMode mode,
+ bool restartInProgress,
+ bool awaitingReply,
+ bool isSpeaking)
+ {
+ if (!running)
+ {
+ return "runtime-not-running";
+ }
+
+ if (mode != VoiceActivationMode.TalkMode)
+ {
+ return $"mode={mode}";
+ }
+
+ if (restartInProgress)
+ {
+ return "controlled-restart-in-progress";
+ }
+
+ if (awaitingReply)
+ {
+ return "awaiting-reply";
+ }
+
+ if (isSpeaking)
+ {
+ return "speaking";
+ }
+
+ return "eligible";
+ }
+
+ public static bool ShouldRebuildRecognitionAfterCompletion(
+ SpeechRecognitionResultStatus status,
+ bool sessionHadActivity,
+ bool sessionHadCaptureSignal,
+ bool restartInProgress,
+ bool awaitingReply,
+ bool isSpeaking)
+ {
+ if (restartInProgress || awaitingReply || isSpeaking || sessionHadActivity)
+ {
+ return false;
+ }
+
+ return status == SpeechRecognitionResultStatus.UserCanceled;
+ }
+
+ public static string DescribeRecognitionCompletionRebuildDecision(
+ SpeechRecognitionResultStatus status,
+ bool sessionHadActivity,
+ bool sessionHadCaptureSignal,
+ bool restartInProgress,
+ bool awaitingReply,
+ bool isSpeaking)
+ {
+ if (restartInProgress)
+ {
+ return "controlled-restart-in-progress";
+ }
+
+ if (awaitingReply)
+ {
+ return "awaiting-reply";
+ }
+
+ if (isSpeaking)
+ {
+ return "speaking";
+ }
+
+ if (sessionHadActivity)
+ {
+ return "session-had-activity";
+ }
+
+ if (sessionHadCaptureSignal)
+ {
+ return "capture-signal-without-recognition";
+ }
+
+ return status switch
+ {
+ SpeechRecognitionResultStatus.UserCanceled => "user-canceled-without-activity",
+ SpeechRecognitionResultStatus.TimeoutExceeded => "disabled-official-session-restart-only (status=TimeoutExceeded)",
+ _ => $"disabled-official-session-restart-only (status={status})"
+ };
+ }
+
+ public static string SelectRecognizedText(
+ string recognizedText,
+ string? latestHypothesisText,
+ DateTime latestHypothesisUtc,
+ DateTime utcNow,
+ out bool promotedHypothesis)
+ {
+ promotedHypothesis = false;
+
+ if (string.IsNullOrWhiteSpace(recognizedText) ||
+ string.IsNullOrWhiteSpace(latestHypothesisText) ||
+ utcNow - latestHypothesisUtc > HypothesisPromotionWindow)
+ {
+ return recognizedText;
+ }
+
+ var normalizedResult = recognizedText.Trim();
+ var normalizedHypothesis = latestHypothesisText.Trim();
+
+ if (normalizedHypothesis.Length <= normalizedResult.Length + 3)
+ {
+ return normalizedResult;
+ }
+
+ if (!normalizedHypothesis.EndsWith(normalizedResult, StringComparison.OrdinalIgnoreCase))
+ {
+ return normalizedResult;
+ }
+
+ promotedHypothesis = true;
+ return normalizedHypothesis;
+ }
+
+ public static string? SelectCompletionFallbackText(
+ bool sessionHadActivity,
+ string? latestHypothesisText,
+ DateTime latestHypothesisUtc,
+ DateTime utcNow)
+ {
+ if (!sessionHadActivity ||
+ string.IsNullOrWhiteSpace(latestHypothesisText) ||
+ utcNow - latestHypothesisUtc > HypothesisPromotionWindow)
+ {
+ return null;
+ }
+
+ return latestHypothesisText.Trim();
+ }
+
+ public static bool ShouldClearTranscriptDraftAfterCompletion(
+ bool awaitingReply,
+ bool isSpeaking,
+ bool usedFallbackTranscript)
+ {
+ return !awaitingReply &&
+ !isSpeaking &&
+ !usedFallbackTranscript;
+ }
+
+ public static bool ShouldRepromptAfterIncompleteRecognition(
+ bool sessionHadActivity,
+ bool awaitingReply,
+ bool isSpeaking,
+ bool usedFallbackTranscript)
+ {
+ return sessionHadActivity &&
+ !awaitingReply &&
+ !isSpeaking &&
+ !usedFallbackTranscript;
+ }
+
+ public static bool ShouldRefreshRecognitionForDefaultCaptureDeviceChange(
+ bool running,
+ VoiceActivationMode mode,
+ string? configuredInputDeviceId,
+ AudioDeviceRole role)
+ {
+ return running &&
+ mode == VoiceActivationMode.TalkMode &&
+ string.IsNullOrWhiteSpace(configuredInputDeviceId) &&
+ role == AudioDeviceRole.Default;
+ }
+
+ private static bool IsMatchingSessionKey(string? first, string? second)
+ {
+ return string.Equals(
+ string.IsNullOrWhiteSpace(first) ? "main" : first,
+ string.IsNullOrWhiteSpace(second) ? "main" : second,
+ StringComparison.OrdinalIgnoreCase);
+ }
+}
diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteKind.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteKind.cs
new file mode 100644
index 0000000..cfb5f95
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteKind.cs
@@ -0,0 +1,8 @@
+namespace OpenClawTray.Services.Voice;
+
+public enum VoiceSpeechToTextRouteKind
+{
+ WindowsMedia,
+ Streaming,
+ SherpaOnnx
+}
diff --git a/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteResolver.cs b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteResolver.cs
new file mode 100644
index 0000000..61aa6a8
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Services/Voice/VoiceSpeechToTextRouteResolver.cs
@@ -0,0 +1,28 @@
+using OpenClaw.Shared;
+
+namespace OpenClawTray.Services.Voice;
+
+public static class VoiceSpeechToTextRouteResolver
+{
+ public static VoiceSpeechToTextRouteKind ResolveRouteKind(VoiceProviderOption provider)
+ {
+ ArgumentNullException.ThrowIfNull(provider);
+
+ if (string.Equals(provider.Id, VoiceProviderIds.SherpaOnnx, StringComparison.OrdinalIgnoreCase))
+ {
+ return VoiceSpeechToTextRouteKind.SherpaOnnx;
+ }
+
+ if (string.Equals(provider.Runtime, VoiceProviderRuntimeIds.Streaming, StringComparison.OrdinalIgnoreCase))
+ {
+ return VoiceSpeechToTextRouteKind.Streaming;
+ }
+
+ if (string.Equals(provider.Runtime, VoiceProviderRuntimeIds.Embedded, StringComparison.OrdinalIgnoreCase))
+ {
+ return VoiceSpeechToTextRouteKind.SherpaOnnx;
+ }
+
+ return VoiceSpeechToTextRouteKind.WindowsMedia;
+ }
+}
diff --git a/src/OpenClaw.Tray.Shared/Windows/WebChatVoiceDomBridge.cs b/src/OpenClaw.Tray.Shared/Windows/WebChatVoiceDomBridge.cs
new file mode 100644
index 0000000..e1edb3d
--- /dev/null
+++ b/src/OpenClaw.Tray.Shared/Windows/WebChatVoiceDomBridge.cs
@@ -0,0 +1,95 @@
+using System.Text.Json;
+
+namespace OpenClawTray.Windows;
+
+public static class WebChatVoiceDomBridge
+{
+ public const string DocumentCreatedScript = """
+(() => {
+ const isVisible = (el) => !!el && !(el.disabled === true) && el.getClientRects().length > 0;
+ let desiredDraft = '';
+
+ const findComposer = () => {
+ const candidates = Array.from(document.querySelectorAll('textarea, input[type="text"], [contenteditable="true"], [contenteditable="plaintext-only"]'));
+ return candidates.find(isVisible) || null;
+ };
+
+ const setElementValue = (el, value) => {
+ const text = typeof value === 'string' ? value : '';
+ if ('value' in el) {
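+ // Frameworks that track the native value setter (React and similar) only notice changes made through the prototype setter.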
+ const proto = el.tagName === 'TEXTAREA' ? HTMLTextAreaElement.prototype : HTMLInputElement.prototype;
+ const descriptor = Object.getOwnPropertyDescriptor(proto, 'value');
+ if (descriptor && descriptor.set) {
+ descriptor.set.call(el, text);
+ } else {
+ el.value = text;
+ }
+ el.dispatchEvent(new InputEvent('input', { bubbles: true, data: text, inputType: 'insertText' }));
+ el.dispatchEvent(new Event('change', { bubbles: true }));
+ return;
+ }
+
+ if (el.isContentEditable) {
+ el.textContent = text;
+ el.dispatchEvent(new InputEvent('input', { bubbles: true, data: text, inputType: 'insertText' }));
+ el.dispatchEvent(new Event('change', { bubbles: true }));
+ }
+ };
+
+ const applyDraftIfPossible = () => {
+ const composer = findComposer();
+ if (!composer) return false;
+ setElementValue(composer, desiredDraft);
+ return true;
+ };
+
+ const clearLegacyTurnsHost = () => {
+ const host = document.getElementById('openclaw-tray-voice-turns');
+ if (host) {
+ host.remove();
+ }
+ };
+
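+ // The chat UI is a SPA; re-apply the draft whenever the DOM re-renders the composer.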
+ const observer = new MutationObserver(() => applyDraftIfPossible());
+ const start = () => {
+ if (!document.body) return;
+ observer.observe(document.body, { childList: true, subtree: true });
+ applyDraftIfPossible();
+ clearLegacyTurnsHost();
+ };
+
+ if (document.readyState === 'loading') {
+ document.addEventListener('DOMContentLoaded', start, { once: true });
+ } else {
+ start();
+ }
+
+ window.__openClawTrayVoice = {
+ setDraft(text) {
+ desiredDraft = text || '';
+ return applyDraftIfPossible();
+ },
+ clearDraft() {
+ desiredDraft = '';
+ return applyDraftIfPossible();
+ },
+ setTurns() {
+ clearLegacyTurnsHost();
+ return true;
+ }
+ };
+})();
+""";
+
+ public static string BuildSetDraftScript(string? text)
+ {
+ if (string.IsNullOrWhiteSpace(text))
+ {
+ return "window.__openClawTrayVoice?.clearDraft?.();";
+ }
+
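+ // JsonSerializer.Serialize produces a correctly escaped JavaScript string literal.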
+ return $"window.__openClawTrayVoice?.setDraft?.({JsonSerializer.Serialize(text)});";
+ }
+
+ public const string ClearLegacyTurnsScript = "window.__openClawTrayVoice?.setTurns?.([]);";
+}
diff --git a/src/OpenClaw.Tray.WinUI/App.xaml.cs b/src/OpenClaw.Tray.WinUI/App.xaml.cs
index 04f053d..5b67e10 100644
--- a/src/OpenClaw.Tray.WinUI/App.xaml.cs
+++ b/src/OpenClaw.Tray.WinUI/App.xaml.cs
@@ -6,6 +6,7 @@
using OpenClawTray.Dialogs;
using OpenClawTray.Helpers;
using OpenClawTray.Services;
+using OpenClawTray.Services.Voice;
using OpenClawTray.Windows;
using System;
using System.Collections.Frozen;
@@ -39,6 +40,7 @@ public partial class App : Application
private GlobalHotkeyService? _globalHotkey;
private System.Timers.Timer? _healthCheckTimer;
private System.Timers.Timer? _sessionPollTimer;
+ private Microsoft.UI.Dispatching.DispatcherQueueTimer? _voiceTrayIconTimer;
private Mutex? _mutex;
private Microsoft.UI.Dispatching.DispatcherQueue? _dispatcherQueue;
private CancellationTokenSource? _deepLinkCts;
@@ -57,6 +59,7 @@ public partial class App : Application
private GatewayCostUsageInfo? _lastUsageCost;
private DateTime _lastCheckTime = DateTime.Now;
private DateTime _lastUsageActivityLogUtc = DateTime.MinValue;
+ private string? _lastTrayIconPath;
// FrozenDictionary for O(1) case-insensitive notification type → setting lookup — no per-call allocation.
private static readonly System.Collections.Frozen.FrozenDictionary> s_notifTypeMap =
@@ -81,6 +84,8 @@ public partial class App : Application
// Windows (created on demand)
private SettingsWindow? _settingsWindow;
+ private VoiceRepeaterWindow? _voiceRepeaterWindow;
+ private VoiceModeWindow? _voiceModeWindow;
private WebChatWindow? _webChatWindow;
private StatusDetailWindow? _statusDetailWindow;
private NotificationHistoryWindow? _notificationHistoryWindow;
@@ -90,6 +95,8 @@ public partial class App : Application
// Node service (optional, enabled in settings)
private NodeService? _nodeService;
+ private VoiceService? _voiceService;
+ private VoiceChatCoordinator? _voiceChatCoordinator;
// Keep-alive window to anchor WinUI runtime (prevents GC/threading issues)
private Window? _keepAliveWindow;
@@ -269,6 +276,11 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args)
// Register toast activation handler
ToastNotificationManagerCompat.OnActivated += OnToastActivated;
+ _voiceService = new VoiceService(new AppLogger(), _settings);
+ _voiceChatCoordinator = new VoiceChatCoordinator(
+ _voiceService,
+ new DispatcherQueueAdapter(_dispatcherQueue!));
+ _voiceChatCoordinator.ConversationTurnAvailable += OnVoiceConversationTurnAvailable;
_sshTunnelService = new SshTunnelService(new AppLogger());
_sshTunnelService.TunnelExited += OnSshTunnelExited;
@@ -297,6 +309,7 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args)
// Start health check timer
StartHealthCheckTimer();
+ StartVoiceTrayIconTimer();
// Start deep link server
StartDeepLinkServer();
@@ -305,7 +318,8 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args)
if (_settings.GlobalHotkeyEnabled)
{
_globalHotkey = new GlobalHotkeyService();
- _globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed;
+ _globalHotkey.QuickSendHotkeyPressed += OnGlobalQuickSendHotkeyPressed;
+ _globalHotkey.VoiceToggleHotkeyPressed += OnGlobalVoiceToggleHotkeyPressed;
_globalHotkey.Register();
}
@@ -318,6 +332,11 @@ protected override async void OnLaunched(LaunchActivatedEventArgs args)
HandleDeepLink(startupDeepLink);
}
+ if (ShouldShowVoiceRepeaterAtStartup())
+ {
+ _dispatcherQueue?.TryEnqueue(ShowVoiceModeSettings);
+ }
+
Logger.Info("Application started (WinUI 3)");
}
@@ -341,13 +360,28 @@ private void InitializeTrayIcon()
// Pre-create tray menu window at startup to avoid creation crashes later
InitializeTrayMenuWindow();
- var iconPath = IconHelper.GetStatusIconPath(ConnectionStatus.Disconnected);
+ var iconPath = AppIconHelper.GetStatusIconPath(ConnectionStatus.Disconnected);
_trayIcon = new TrayIcon(1, iconPath, "OpenClaw Tray — Disconnected");
+ _lastTrayIconPath = iconPath;
_trayIcon.IsVisible = true;
_trayIcon.Selected += OnTrayIconSelected;
_trayIcon.ContextMenu += OnTrayContextMenu;
}
+ private void StartVoiceTrayIconTimer()
+ {
+ if (_dispatcherQueue == null || _voiceTrayIconTimer != null)
+ {
+ return;
+ }
+
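+ // Poll on the UI thread so the tray icon tracks listening/speaking state transitions promptly.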
+ _voiceTrayIconTimer = _dispatcherQueue.CreateTimer();
+ _voiceTrayIconTimer.Interval = TimeSpan.FromMilliseconds(250);
+ _voiceTrayIconTimer.IsRepeating = true;
+ _voiceTrayIconTimer.Tick += (s, e) => UpdateTrayIcon();
+ _voiceTrayIconTimer.Start();
+ }
+
private void InitializeTrayMenuWindow()
{
// Pre-create menu window once - reuse to avoid crash on window creation after idle
@@ -535,6 +569,8 @@ private void OnTrayMenuItemClicked(object? sender, string action)
switch (action)
{
case "status": ShowStatusDetail(); break;
+ case "voice-settings": ShowVoiceModeSettings(); break;
+ case "voice-toggle-pause": _ = ToggleVoiceQuickPauseAsync(); break;
case "dashboard": OpenDashboard(); break;
case "webchat": ShowWebChat(); break;
case "quicksend": ShowQuickSend(); break;
@@ -742,6 +778,60 @@ private List GetRecentActivity(int maxItems)
.ToList();
}
+ private string GetRunningVoiceModeLabel()
+ {
+ var status = _voiceService?.CurrentStatus;
+ if (status == null)
+ {
+ return "Off";
+ }
+
+ return VoiceDisplayHelper.GetRuntimeLabel(status);
+ }
+
+ private bool CanQuickToggleVoiceMode()
+ {
+ if (_settings?.EnableNodeMode != true || _voiceService == null)
+ {
+ return false;
+ }
+
+ var status = _voiceService.CurrentStatus;
+ if (status.State == VoiceRuntimeState.Paused)
+ {
+ return true;
+ }
+
+ return _settings.Voice.Enabled && _settings.Voice.Mode != VoiceActivationMode.Off;
+ }
+
+ private bool ShouldShowVoiceRepeaterAtStartup()
+ {
+ return _settings?.EnableNodeMode == true &&
+ _settings.Voice.Enabled &&
+ _settings.Voice.Mode != VoiceActivationMode.Off &&
+ _settings.Voice.ShowRepeaterAtStartup;
+ }
+
+ private string GetVoiceQuickToggleLabel()
+ {
+ var status = _voiceService?.CurrentStatus;
+ return status?.State == VoiceRuntimeState.Paused
+ ? "Resume Voice"
+ : "Pause Voice";
+ }
+
+ private string GetVoiceDeviceSummary()
+ {
+ var voice = _settings?.Voice;
+ if (voice == null)
+ return "Talk: system default · Listen: system default";
+
+ var talk = string.IsNullOrWhiteSpace(voice.OutputDeviceId) ? "system default" : "selected speaker";
+ var listen = string.IsNullOrWhiteSpace(voice.InputDeviceId) ? "system default" : "selected microphone";
+ return $"Talk: {talk} · Listen: {listen}";
+ }
+
private void BuildTrayMenuPopup(TrayMenuWindow menu)
{
// Brand header
@@ -758,6 +848,14 @@ private void BuildTrayMenuPopup(TrayMenuWindow menu)
menu.AddMenuItem(_currentActivity.DisplayText, _currentActivity.Glyph, "", isEnabled: false);
}
+ menu.AddMenuItem($"Voice Mode: {GetRunningVoiceModeLabel()}", "🎙️", "voice-settings");
+ menu.AddMenuItem($"↳ {GetVoiceDeviceSummary()}", "", "", isEnabled: false, indent: true);
+ menu.AddMenuItem($"↳ {GetVoiceQuickToggleLabel()} (Ctrl+Alt+Shift+V)", "", "voice-toggle-pause", isEnabled: CanQuickToggleVoiceMode(), indent: true);
+ if (_settings?.EnableNodeMode != true)
+ {
+ menu.AddMenuItem("↳ Enable Node Mode to activate voice runtime", "", "", isEnabled: false, indent: true);
+ }
+
// Usage
if (_lastUsage != null || _lastUsageStatus != null || _lastUsageCost != null)
{
@@ -1147,7 +1245,7 @@ private void InitializeNodeService()
{
Logger.Info("Initializing Windows Node service...");
- _nodeService = new NodeService(new AppLogger(), _dispatcherQueue, DataPath);
+ _nodeService = new NodeService(new AppLogger(), _dispatcherQueue, _voiceService!, DataPath);
_nodeService.StatusChanged += OnNodeStatusChanged;
_nodeService.NotificationRequested += OnNodeNotificationRequested;
_nodeService.PairingStatusChanged += OnPairingStatusChanged;
@@ -1558,13 +1656,7 @@ private void UpdateTrayIcon()
{
if (_trayIcon == null) return;
- var status = _currentStatus;
- if (_currentActivity != null && _currentActivity.Kind != OpenClaw.Shared.ActivityKind.Idle)
- {
- status = ConnectionStatus.Connecting; // Use connecting icon for activity
- }
-
- var iconPath = IconHelper.GetStatusIconPath(status);
+ var iconPath = GetTrayIconPathForCurrentState();
var tooltip = $"OpenClaw Tray — {_currentStatus}";
if (_currentActivity != null && !string.IsNullOrEmpty(_currentActivity.DisplayText))
@@ -1576,7 +1668,11 @@ private void UpdateTrayIcon()
try
{
- _trayIcon.SetIcon(iconPath);
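+ // UpdateTrayIcon now runs on a 250 ms timer; only push a new icon when the path actually changes.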
+ if (!string.Equals(_lastTrayIconPath, iconPath, StringComparison.OrdinalIgnoreCase))
+ {
+ _trayIcon.SetIcon(iconPath);
+ _lastTrayIconPath = iconPath;
+ }
_trayIcon.Tooltip = tooltip;
}
catch (Exception ex)
@@ -1585,15 +1681,60 @@ private void UpdateTrayIcon()
}
}
+ private string GetTrayIconPathForCurrentState()
+ {
+ var voiceIconState = GetVoiceTrayIconState();
+ if (voiceIconState != VoiceTrayIconState.Off)
+ {
+ return VoiceTrayIconHelper.GetVoiceTrayIconPath(voiceIconState);
+ }
+
+ if (_voiceService?.CurrentStatus.State == VoiceRuntimeState.Paused)
+ {
+ return VoiceTrayIconHelper.GetVoiceTrayIconPath(VoiceTrayIconState.Off);
+ }
+
+ var status = _currentStatus;
+ if (_currentActivity != null && _currentActivity.Kind != OpenClaw.Shared.ActivityKind.Idle)
+ {
+ status = ConnectionStatus.Connecting;
+ }
+
+ return AppIconHelper.GetStatusIconPath(status);
+ }
+
+ private VoiceTrayIconState GetVoiceTrayIconState()
+ {
+ var voiceStatus = _voiceService?.CurrentStatus;
+ if (voiceStatus == null || !voiceStatus.Running)
+ {
+ return VoiceTrayIconState.Off;
+ }
+
+ return voiceStatus.State switch
+ {
+ VoiceRuntimeState.PlayingResponse => VoiceTrayIconState.Speaking,
+ VoiceRuntimeState.ListeningForVoiceWake => VoiceTrayIconState.Listening,
+ VoiceRuntimeState.ListeningContinuously => VoiceTrayIconState.Listening,
+ VoiceRuntimeState.RecordingUtterance => VoiceTrayIconState.Listening,
+ VoiceRuntimeState.Paused => VoiceTrayIconState.Off,
+ _ when voiceStatus.Mode == VoiceActivationMode.Off => VoiceTrayIconState.Off,
+ _ => VoiceTrayIconState.Off
+ };
+ }
+
#endregion
#region Window Management
private void ShowSettings()
{
+ if (_settings == null || _voiceService == null)
+ return;
+
if (_settingsWindow == null || _settingsWindow.IsClosed)
{
- _settingsWindow = new SettingsWindow(_settings!);
+ _settingsWindow = new SettingsWindow(_settings, _voiceService);
_settingsWindow.Closed += (s, e) =>
{
_settingsWindow.SettingsSaved -= OnSettingsSaved;
@@ -1604,40 +1745,143 @@ private void ShowSettings()
_settingsWindow.Activate();
}
- private void OnSettingsSaved(object? sender, EventArgs e)
+ private void ShowVoiceModeSettings()
{
- // Reconnect with new settings — mirror the startup if/else pattern
- // to avoid dual connections that cause gateway conflicts.
- UnsubscribeGatewayEvents();
- _gatewayClient?.Dispose();
- _gatewayClient = null;
- var oldNodeService = _nodeService;
- _nodeService = null;
- try { oldNodeService?.Dispose(); } catch (Exception ex) { Logger.Warn($"Node dispose error: {ex.Message}"); }
- if (_settings?.UseSshTunnel != true)
+ if (_settings == null || _voiceService == null)
+ return;
+
+ if (_voiceRepeaterWindow == null || _voiceRepeaterWindow.IsClosed)
{
- _sshTunnelService?.Stop();
+ _voiceRepeaterWindow = new VoiceRepeaterWindow(_settings, _voiceService);
+ _voiceRepeaterWindow.OpenVoiceStatusRequested += OnOpenVoiceStatusRequested;
+ _voiceRepeaterWindow.Closed += (s, e) =>
+ {
+ _voiceChatCoordinator?.DetachWindow(_voiceRepeaterWindow);
+ _voiceRepeaterWindow.OpenVoiceStatusRequested -= OnOpenVoiceStatusRequested;
+ _voiceRepeaterWindow = null;
+ };
+ _voiceChatCoordinator?.AttachWindow(_voiceRepeaterWindow);
}
- // Reset status so the tray doesn't show a stale "Connected" from the previous mode
- _currentStatus = ConnectionStatus.Disconnected;
- UpdateTrayIcon();
-
- if (_settings?.EnableNodeMode == true)
+ _voiceRepeaterWindow.RefreshStatus();
+ _voiceRepeaterWindow.Activate();
+ }
+
+ private void ShowVoiceStatusWindow()
+ {
+ if (_settings == null || _voiceService == null)
{
- InitializeNodeService();
+ return;
}
- else
+
+ if (_voiceModeWindow == null || _voiceModeWindow.IsClosed)
{
- InitializeGatewayClient();
+ _voiceModeWindow = new VoiceModeWindow(_settings, _voiceService, _voiceService);
+ _voiceModeWindow.OpenSettingsRequested += OnVoiceModeOpenSettingsRequested;
+ _voiceModeWindow.Closed += (s, e) =>
+ {
+ if (_voiceModeWindow != null)
+ {
+ _voiceModeWindow.OpenSettingsRequested -= OnVoiceModeOpenSettingsRequested;
+ }
+
+ _voiceModeWindow = null;
+ };
+ }
+
+ _voiceModeWindow.RefreshStatus();
+ _voiceModeWindow.Activate();
+ }
+
+ private void OnOpenVoiceStatusRequested(object? sender, EventArgs e)
+ {
+ ShowVoiceStatusWindow();
+ }
+
+ private void OnVoiceModeOpenSettingsRequested(object? sender, EventArgs e)
+ {
+ ShowSettings();
+ }
+
+ private async void OnSettingsSaved(object? sender, EventArgs e)
+ {
+ // Reconnect with new settings — mirror the startup if/else pattern
+ // to avoid dual connections that cause gateway conflicts.
+ try
+ {
+ if (_gatewayClient != null)
+ {
+ try
+ {
+ await _gatewayClient.DisconnectAsync();
+ }
+ catch (Exception ex)
+ {
+ Logger.Warn($"Gateway disconnect error: {ex.Message}");
+ }
+
+ _gatewayClient.Dispose();
+ _gatewayClient = null;
+ }
+
+ var oldNodeService = _nodeService;
+ _nodeService = null;
+ if (oldNodeService != null)
+ {
+ try
+ {
+ await oldNodeService.DisconnectAsync();
+ }
+ catch (Exception ex)
+ {
+ Logger.Warn($"Node disconnect error: {ex.Message}");
+ }
+
+ try
+ {
+ oldNodeService.Dispose();
+ }
+ catch (Exception ex)
+ {
+ Logger.Warn($"Node dispose error: {ex.Message}");
+ }
+ }
+
+ if (_settings?.UseSshTunnel != true)
+ {
+ _sshTunnelService?.Stop();
+ }
+
+ // Reset status so the tray doesn't show a stale "Connected" from the previous mode
+ _currentStatus = ConnectionStatus.Disconnected;
+ UpdateTrayIcon();
+
+ if (_settings?.EnableNodeMode == true)
+ {
+ InitializeNodeService();
+ }
+ else
+ {
+ InitializeGatewayClient();
+ if (_voiceService != null)
+ {
+ await _voiceService.StopAsync(new VoiceStopArgs { Reason = "Node mode disabled" });
+ }
+ }
+ }
+ catch (Exception ex)
+ {
+ Logger.Warn($"Settings reconnect failed: {ex.Message}");
}
// Update global hotkey
if (_settings!.GlobalHotkeyEnabled)
{
_globalHotkey ??= new GlobalHotkeyService();
- _globalHotkey.HotkeyPressed -= OnGlobalHotkeyPressed;
- _globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed;
+ _globalHotkey.QuickSendHotkeyPressed -= OnGlobalQuickSendHotkeyPressed;
+ _globalHotkey.QuickSendHotkeyPressed += OnGlobalQuickSendHotkeyPressed;
+ _globalHotkey.VoiceToggleHotkeyPressed -= OnGlobalVoiceToggleHotkeyPressed;
+ _globalHotkey.VoiceToggleHotkeyPressed += OnGlobalVoiceToggleHotkeyPressed;
_globalHotkey.Register();
}
else
@@ -1645,6 +1889,9 @@ private void OnSettingsSaved(object? sender, EventArgs e)
_globalHotkey?.Unregister();
}
+ _voiceRepeaterWindow?.RefreshStatus();
+ _voiceModeWindow?.RefreshStatus();
+
// Update auto-start
AutoStartManager.SetAutoStart(_settings.AutoStart);
}
@@ -1656,8 +1903,15 @@ private void ShowWebChat()
if (_webChatWindow == null || _webChatWindow.IsClosed)
{
- _webChatWindow = new WebChatWindow(_settings.GetEffectiveGatewayUrl(), _settings.Token);
- _webChatWindow.Closed += (s, e) => _webChatWindow = null;
+ _webChatWindow = new WebChatWindow(
+ _settings.GetEffectiveGatewayUrl(),
+ _settings.Token);
+ _webChatWindow.Closed += (s, e) =>
+ {
+ _voiceChatCoordinator?.DetachWindow(_webChatWindow);
+ _webChatWindow = null;
+ };
+ _voiceChatCoordinator?.AttachWindow(_webChatWindow);
}
_webChatWindow.Activate();
}
@@ -1874,7 +2128,7 @@ private void OpenLogFile()
}
}
- private void OnGlobalHotkeyPressed(object? sender, EventArgs e)
+ private void OnGlobalQuickSendHotkeyPressed(object? sender, EventArgs e)
{
// Hotkey events are raised from a dedicated Win32 message-loop thread.
// Creating/activating WinUI windows must happen on the app's UI thread.
@@ -1891,6 +2145,137 @@ private void OnGlobalHotkeyPressed(object? sender, EventArgs e)
}
}
+ private void OnGlobalVoiceToggleHotkeyPressed(object? sender, EventArgs e)
+ {
+ if (_dispatcherQueue == null)
+ {
+ Logger.Warn("Voice hotkey pressed but DispatcherQueue is null");
+ return;
+ }
+
+ var enqueued = _dispatcherQueue.TryEnqueue(async () => await ToggleVoiceQuickPauseAsync());
+ if (!enqueued)
+ {
+ Logger.Warn("Voice hotkey pressed but failed to enqueue Voice quick pause on UI thread");
+ }
+ }
+
+ private async Task ToggleVoiceQuickPauseAsync()
+ {
+ if (_voiceService == null)
+ {
+ return;
+ }
+
+ if (_settings?.EnableNodeMode != true)
+ {
+ Logger.Warn("Voice quick pause blocked: Node Mode is disabled");
+ return;
+ }
+
+ if (!CanQuickToggleVoiceMode())
+ {
+ Logger.Warn("Voice quick pause blocked: Voice Mode is off");
+ return;
+ }
+
+ try
+ {
+ var status = await _voiceService.ToggleQuickPauseAsync();
+ _voiceRepeaterWindow?.RefreshStatus();
+ _voiceModeWindow?.RefreshStatus();
+ ShowVoiceQuickToggleToast(status);
+ }
+ catch (Exception ex)
+ {
+ Logger.Warn($"Voice quick pause failed: {ex.Message}");
+ }
+ }
+
+ private static void ShowVoiceQuickToggleToast(VoiceStatusInfo status)
+ {
+ try
+ {
+ var title = status.State == VoiceRuntimeState.Paused
+ ? "Voice paused"
+ : "Voice resumed";
+ var detail = status.State == VoiceRuntimeState.Paused
+ ? $"{status.Mode} is paused. Press Ctrl+Alt+Shift+V to resume."
+ : $"{status.Mode} is active again.";
+
+ new ToastContentBuilder()
+ .AddText(title)
+ .AddText(detail)
+ .Show();
+ }
+ catch (Exception ex)
+ {
+ Logger.Warn($"Failed to show voice pause toast: {ex.Message}");
+ }
+ }
+
+ private void OnVoiceConversationTurnAvailable(object? sender, VoiceConversationTurnEventArgs args)
+ {
+ if (_dispatcherQueue == null)
+ {
+ return;
+ }
+
+ _dispatcherQueue.TryEnqueue(() => ShowVoiceConversationToast(args));
+ }
+
+ private void ShowVoiceConversationToast(VoiceConversationTurnEventArgs args)
+ {
+ if (_settings?.Voice.ShowConversationToasts != true)
+ {
+ return;
+ }
+
+ var title = args.Direction == VoiceConversationDirection.Outgoing
+ ? "Voice heard"
+ : "Voice reply";
+
+ AddRecentActivity(
+ $"voice: {title}",
+ category: "voice",
+ details: args.Message,
+ dashboardPath: "chat",
+ sessionKey: args.SessionKey);
+
+ NotificationHistoryService.AddNotification(new Services.GatewayNotification
+ {
+ Title = title,
+ Message = args.Message,
+ Category = "voice"
+ });
+
+ if (_settings.ShowNotifications != true)
+ {
+ return;
+ }
+
+ try
+ {
+ var builder = new ToastContentBuilder()
+ .AddText(title)
+ .AddText(args.Message);
+
+ if (args.Direction == VoiceConversationDirection.Incoming)
+ {
+ builder.AddArgument("action", "open_chat")
+ .AddButton(new ToastButton()
+ .SetContent("Open Chat")
+ .AddArgument("action", "open_chat"));
+ }
+
+ builder.Show();
+ }
+ catch (Exception ex)
+ {
+ Logger.Warn($"Failed to show voice conversation toast: {ex.Message}");
+ }
+ }
+
#endregion
#region Updates
@@ -2125,7 +2510,11 @@ private void ExitApplication()
_sessionPollTimer?.Dispose();
_sessionPollTimer = null;
});
-
+ SafeShutdownStep("voice tray icon timer", () =>
+ {
+ _voiceTrayIconTimer?.Stop();
+ _voiceTrayIconTimer = null;
+ });
// Cleanup hotkey
SafeShutdownStep("global hotkey", () =>
{
@@ -2191,6 +2580,22 @@ private void ExitApplication()
_deepLinkCts = null;
});
+ SafeShutdownStep("voice chat coordinator", () =>
+ {
+ if (_voiceChatCoordinator != null)
+ {
+ _voiceChatCoordinator.ConversationTurnAvailable -= OnVoiceConversationTurnAvailable;
+ _voiceChatCoordinator.Dispose();
+ _voiceChatCoordinator = null;
+ }
+ });
+
+ SafeShutdownStep("voice service", () =>
+ {
+ _voiceService?.Dispose();
+ _voiceService = null;
+ });
+
Logger.Info("Shutdown complete; calling Exit() now");
Exit();
}
@@ -2262,7 +2667,6 @@ _settings.SshTunnelRemotePort is < 1 or > 65535 ||
return true;
}
-
#endregion
private async void OnSshTunnelExited(object? sender, int exitCode)
diff --git a/src/OpenClaw.Tray.WinUI/Assets/voice-mode-feature.png b/src/OpenClaw.Tray.WinUI/Assets/voice-mode-feature.png
new file mode 100644
index 0000000..04c239b
Binary files /dev/null and b/src/OpenClaw.Tray.WinUI/Assets/voice-mode-feature.png differ
diff --git a/src/OpenClaw.Tray.WinUI/Assets/voice-providers.json b/src/OpenClaw.Tray.WinUI/Assets/voice-providers.json
new file mode 100644
index 0000000..3ffcc0b
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Assets/voice-providers.json
@@ -0,0 +1,274 @@
+{
+ "speechToTextProviders": [
+ {
+ "id": "windows",
+ "name": "Windows Speech Recognition",
+ "runtime": "windows",
+ "enabled": true,
+ "description": "Built-in Windows.Media speech recognition, half-duplex, non-streamed."
+ },
+ {
+ "id": "http-ws",
+ "name": "http/ws",
+ "runtime": "streaming",
+ "enabled": false,
+ "visibleInSettings": true,
+ "selectable": false,
+ "description": "Will support most cloud and local stand-alone models full or half-duplex, streaming."
+ },
+ {
+ "id": "foundry-local",
+ "name": "Foundry Local",
+ "runtime": "streaming",
+ "enabled": false,
+ "visibleInSettings": false,
+ "selectable": false,
+ "description": "AudioGraph-fed streaming STT route for Foundry Local or compatible streaming adapters.",
+ "settings": [
+ {
+ "key": "endpoint",
+ "label": "Endpoint",
+ "required": false,
+ "defaultValue": "http://localhost:5273",
+ "placeholder": "http://localhost:5273",
+ "description": "Local Foundry-compatible transcription endpoint for the AudioGraph streaming STT route."
+ },
+ {
+ "key": "model",
+ "label": "Model",
+ "required": false,
+ "defaultValue": "whisper-tiny",
+ "placeholder": "whisper-tiny",
+ "description": "Transcription model identifier for the streaming STT adapter."
+ }
+ ]
+ },
+ {
+ "id": "openai-whisper",
+ "name": "OpenAI Whisper",
+ "runtime": "streaming",
+ "enabled": false,
+ "visibleInSettings": false,
+ "selectable": false,
+ "description": "AudioGraph-fed cloud STT route for the OpenAI Whisper transcription API.",
+ "settings": [
+ {
+ "key": "apiKey",
+ "label": "API key",
+ "secret": true
+ },
+ {
+ "key": "model",
+ "label": "Model",
+ "required": false,
+ "defaultValue": "whisper-1",
+ "placeholder": "whisper-1",
+ "description": "Transcription model identifier for the OpenAI speech-to-text adapter."
+ }
+ ]
+ },
+ {
+ "id": "elevenlabs-stt",
+ "name": "ElevenLabs Speech to Text",
+ "runtime": "streaming",
+ "enabled": false,
+ "visibleInSettings": false,
+ "selectable": false,
+ "description": "AudioGraph-fed cloud STT route for the ElevenLabs speech-to-text API.",
+ "settings": [
+ {
+ "key": "apiKey",
+ "label": "API key",
+ "secret": true
+ },
+ {
+ "key": "model",
+ "label": "Model",
+ "required": false,
+ "defaultValue": "scribe_v1",
+ "placeholder": "scribe_v1",
+ "description": "Transcription model identifier for the ElevenLabs speech-to-text adapter."
+ }
+ ]
+ },
+ {
+ "id": "azure-ai-speech",
+ "name": "Azure AI Speech",
+ "runtime": "streaming",
+ "enabled": false,
+ "visibleInSettings": false,
+ "selectable": false,
+ "description": "AudioGraph-fed cloud STT route for Azure AI Speech real-time transcription.",
+ "settings": [
+ {
+ "key": "apiKey",
+ "label": "API key",
+ "secret": true
+ },
+ {
+ "key": "endpoint",
+ "label": "Endpoint",
+ "required": false,
+ "defaultValue": "",
+ "placeholder": "https://your-speech-resource.cognitiveservices.azure.com",
+ "description": "Azure AI Speech endpoint for the streaming STT adapter."
+ }
+ ]
+ },
+ {
+ "id": "sherpa-onnx",
+ "name": "sherpa-onnx",
+ "runtime": "embedded",
+ "enabled": false,
+ "visibleInSettings": true,
+ "selectable": false,
+ "description": "Can load a variety of models including OpenAI/Whisper, full-duplex, streaming.",
+ "settings": [
+ {
+ "key": "modelPath",
+ "label": "Model path",
+ "required": false,
+ "defaultValue": "",
+ "placeholder": "C:\\models\\sherpa-onnx\\model.onnx",
+ "description": "Path to the downloaded sherpa-onnx model bundle the embedded STT route should use."
+ },
+ {
+ "key": "model",
+ "label": "Model preset",
+ "required": false,
+ "defaultValue": "",
+ "placeholder": "tiny / base / small / medium",
+ "description": "Optional human-readable model preset to help track which local bundle is selected."
+ }
+ ]
+ }
+ ],
+ "textToSpeechProviders": [
+ {
+ "id": "windows",
+ "name": "Windows Speech Synthesis",
+ "runtime": "windows",
+ "enabled": true,
+ "description": "Built-in Windows text-to-speech playback."
+ },
+ {
+ "id": "minimax",
+ "name": "MiniMax",
+ "runtime": "cloud",
+ "enabled": true,
+ "description": "Cloud TTS using the MiniMax HTTP text-to-speech API.",
+ "settings": [
+ {
+ "key": "apiKey",
+ "label": "API key",
+ "secret": true
+ },
+ {
+ "key": "model",
+ "label": "Model",
+ "defaultValue": "speech-2.8-turbo",
+ "options": [
+ "speech-2.5-turbo-preview",
+ "speech-02-turbo",
+ "speech-02-hd",
+ "speech-2.6-turbo",
+ "speech-2.6-hd",
+ "speech-2.8-turbo",
+ "speech-2.8-hd"
+ ]
+ },
+ {
+ "key": "voiceId",
+ "label": "Voice ID",
+ "required": false,
+ "defaultValue": "English_MatureBoss"
+ },
+ {
+ "key": "voiceSettingsJson",
+ "label": "Voice settings JSON",
+ "required": false,
+ "jsonValue": true,
+ "defaultValue": "\"voice_setting\": { \"voice_id\": {{voiceId}}, \"speed\": 1, \"vol\": 1, \"pitch\": 0 }",
+ "placeholder": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }",
+ "description": "Optional full MiniMax request fragment. If present, it controls the full voice_setting payload."
+ }
+ ],
+ "textToSpeechWebSocket": {
+ "endpointTemplate": "wss://api.minimax.io/ws/v1/t2a_v2",
+ "authenticationHeaderName": "Authorization",
+ "authenticationScheme": "Bearer",
+ "apiKeySettingKey": "apiKey",
+ "connectSuccessEventName": "connected_success",
+ "startMessageTemplate": "{ \"event\": \"task_start\", \"model\": {{model}}, \"language_boost\": \"English\", {{voiceSettingsJson}}, \"audio_setting\": { \"sample_rate\": 32000, \"bitrate\": 128000, \"format\": \"mp3\", \"channel\": 1 } }",
+ "startSuccessEventName": "task_started",
+ "continueMessageTemplate": "{ \"event\": \"task_continue\", \"text\": {{text}} }",
+ "finishMessageTemplate": "{ \"event\": \"task_finish\" }",
+ "responseAudioMode": "hexJsonString",
+ "responseAudioJsonPath": "data.audio",
+ "responseStatusCodeJsonPath": "base_resp.status_code",
+ "responseStatusMessageJsonPath": "base_resp.status_msg",
+ "finalFlagJsonPath": "is_final",
+ "taskFailedEventName": "task_failed",
+ "successStatusValue": "0",
+ "outputContentType": "audio/mpeg"
+ }
+ },
+ {
+ "id": "elevenlabs",
+ "name": "ElevenLabs",
+ "runtime": "cloud",
+ "enabled": true,
+ "description": "Cloud TTS using the ElevenLabs WebSocket stream-input API.",
+ "settings": [
+ {
+ "key": "apiKey",
+ "label": "API key",
+ "secret": true
+ },
+ {
+ "key": "model",
+ "label": "Model",
+ "defaultValue": "eleven_multilingual_v2",
+ "options": [
+ "eleven_flash_v2_5",
+ "eleven_turbo_v2_5",
+ "eleven_multilingual_v2",
+ "eleven_monolingual_v1"
+ ]
+ },
+ {
+ "key": "voiceId",
+ "label": "Voice ID",
+ "required": false,
+ "defaultValue": "6aDn1KB0hjpdcocrUkmq",
+ "placeholder": "Enter an ElevenLabs voice ID"
+ },
+ {
+ "key": "voiceSettingsJson",
+ "label": "Voice settings JSON",
+ "required": false,
+ "jsonValue": true,
+ "defaultValue": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }",
+ "placeholder": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }",
+ "description": "Optional full ElevenLabs request fragment. If present, it controls the full voice_settings payload."
+ }
+ ],
+ "textToSpeechWebSocket": {
+ "endpointTemplate": "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true",
+ "authenticationHeaderName": "xi-api-key",
+ "authenticationScheme": "",
+ "apiKeySettingKey": "apiKey",
+ "connectSuccessEventName": "",
+ "startMessageTemplate": "{ \"text\": \" \", {{voiceSettingsJson}}, \"xi_api_key\": {{apiKey}} }",
+ "startSuccessEventName": "",
+ "continueMessageTemplate": "{ \"text\": {{textWithTrailingSpace}}, \"try_trigger_generation\": true }",
+ "finishMessageTemplate": "{ \"text\": \"\" }",
+ "responseAudioMode": "base64JsonString",
+ "responseAudioJsonPath": "audio",
+ "finalFlagJsonPath": "isFinal",
+ "taskFailedEventName": "error",
+ "outputContentType": "audio/mpeg"
+ }
+ }
+ ]
+}
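
For orientation, here is a minimal sketch of how the `{{placeholder}}` tokens in the `textToSpeechWebSocket` templates above could be expanded at runtime. The helper below is illustrative, not the shipped implementation: the name, the dictionary shape, and the rule that `jsonValue` fragments (like `voiceSettingsJson`) are spliced in verbatim while plain settings are JSON-quoted are all assumptions inferred from the templates.

```csharp
using System.Collections.Generic;
using System.Text.Json;

static class TemplateSketch
{
    // Hypothetical expansion: replace each "{{key}}" token with either the
    // raw JSON fragment (for jsonValue settings) or a quoted/escaped string.
    public static string Expand(
        string template,
        IReadOnlyDictionary<string, string> values,
        ISet<string> jsonFragmentKeys)
    {
        foreach (var (key, value) in values)
        {
            var replacement = jsonFragmentKeys.Contains(key)
                ? value                            // already a JSON fragment
                : JsonSerializer.Serialize(value); // adds quotes and escaping
            template = template.Replace("{{" + key + "}}", replacement);
        }
        return template;
    }
}
```

Under that assumption, expanding the MiniMax `startMessageTemplate` with `model = "speech-2.8-turbo"` and the default `voiceSettingsJson` yields a complete `task_start` JSON message ready to send over the socket.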
diff --git a/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml b/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml
new file mode 100644
index 0000000..cadffd4
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml
@@ -0,0 +1,111 @@
+<!-- XAML markup stripped during extraction. The code-behind below references these named elements: VoiceModeComboBox, VoiceModeDescriptionTextBlock, VoiceSpeechToTextProviderComboBox and VoiceTextToSpeechProviderComboBox (with matching description TextBlocks), VoiceInputDeviceComboBox, VoiceOutputDeviceComboBox, VoiceShowRepeaterAtStartupCheckBox, VoiceConversationToastsCheckBox, VoiceSettingsInfoTextBlock, and VoiceTtsProviderSettingsPanel with its API key, model, voice ID, and voice-settings-JSON fields. -->
diff --git a/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml.cs b/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml.cs
new file mode 100644
index 0000000..30290a4
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Controls/VoiceSettingsPanel.xaml.cs
@@ -0,0 +1,574 @@
+using Microsoft.UI.Xaml;
+using Microsoft.UI.Xaml.Controls;
+using OpenClaw.Shared;
+using OpenClawTray.Services;
+using OpenClawTray.Services.Voice;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading.Tasks;
+
+namespace OpenClawTray.Controls;
+
+public sealed partial class VoiceSettingsPanel : UserControl
+{
+ private SettingsManager? _settings;
+ private IVoiceConfigurationApi? _voiceConfigurationApi;
+ private VoiceProviderConfigurationStore _voiceProviderConfigurationDraft = new();
+ private string _activeSttProviderId = VoiceProviderIds.Windows;
+ private string _activeTtsProviderId = VoiceProviderIds.Windows;
+ private bool _updatingVoiceProviderFields;
+ private List<VoiceProviderOption> _speechToTextOptions = new();
+ private List<VoiceProviderOption> _textToSpeechOptions = new();
+ private List<DeviceOption> _inputOptions = new();
+ private List<DeviceOption> _outputOptions = new();
+ private List<string> _activeTtsModelOptions = new();
+
+ public VoiceSettingsPanel()
+ {
+ InitializeComponent();
+ }
+
+ public void Initialize(SettingsManager settings, IVoiceConfigurationApi voiceConfigurationApi)
+ {
+ _settings = settings;
+ _voiceConfigurationApi = voiceConfigurationApi;
+
+ LoadVoiceSettings();
+ _ = LoadVoiceDevicesAsync();
+ }
+
+ public async Task ApplyAsync(SettingsManager settings)
+ {
+ CaptureSelectedVoiceProviderSettings();
+
+ var voiceSettings = new VoiceSettings
+ {
+ Mode = GetSelectedVoiceMode(),
+ Enabled = GetSelectedVoiceMode() != VoiceActivationMode.Off,
+ ShowRepeaterAtStartup = (VoiceShowRepeaterAtStartupCheckBox.IsChecked ?? true) && GetSelectedVoiceMode() != VoiceActivationMode.Off,
+ ShowConversationToasts = VoiceConversationToastsCheckBox.IsChecked ?? false,
+ SpeechToTextProviderId = (VoiceSpeechToTextProviderComboBox.SelectedItem as VoiceProviderOption)?.Id ?? VoiceProviderIds.Windows,
+ TextToSpeechProviderId = (VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption)?.Id ?? VoiceProviderIds.Windows,
+ InputDeviceId = (VoiceInputDeviceComboBox.SelectedItem as DeviceOption)?.DeviceId,
+ OutputDeviceId = (VoiceOutputDeviceComboBox.SelectedItem as DeviceOption)?.DeviceId,
+ SampleRateHz = settings.Voice.SampleRateHz,
+ CaptureChunkMs = settings.Voice.CaptureChunkMs,
+ BargeInEnabled = settings.Voice.BargeInEnabled,
+ VoiceWake = new VoiceWakeSettings
+ {
+ Engine = settings.Voice.VoiceWake.Engine,
+ ModelId = settings.Voice.VoiceWake.ModelId,
+ TriggerThreshold = settings.Voice.VoiceWake.TriggerThreshold,
+ TriggerCooldownMs = settings.Voice.VoiceWake.TriggerCooldownMs,
+ PreRollMs = settings.Voice.VoiceWake.PreRollMs,
+ EndSilenceMs = settings.Voice.VoiceWake.EndSilenceMs
+ },
+ TalkMode = new TalkModeSettings
+ {
+ MinSpeechMs = settings.Voice.TalkMode.MinSpeechMs,
+ EndSilenceMs = settings.Voice.TalkMode.EndSilenceMs,
+ MaxUtteranceMs = settings.Voice.TalkMode.MaxUtteranceMs
+ }
+ };
+ settings.Voice = voiceSettings;
+ settings.VoiceProviderConfiguration = _voiceProviderConfigurationDraft.Clone();
+
+ if (_voiceConfigurationApi != null)
+ {
+ _voiceConfigurationApi.SetProviderConfiguration(_voiceProviderConfigurationDraft);
+ await _voiceConfigurationApi.UpdateSettingsAsync(new VoiceSettingsUpdateArgs
+ {
+ Settings = voiceSettings,
+ Persist = false
+ });
+ }
+ }
+
+ private void LoadVoiceSettings()
+ {
+ if (_settings == null || _voiceConfigurationApi == null)
+ {
+ return;
+ }
+
+ _voiceProviderConfigurationDraft = _settings.VoiceProviderConfiguration.Clone();
+ LoadVoiceProviders();
+ SelectVoiceMode(_settings.Voice.Mode);
+ UpdateVoiceSelectionDescriptions();
+ VoiceShowRepeaterAtStartupCheckBox.IsChecked = _settings.Voice.Mode == VoiceActivationMode.Off
+ ? false
+ : _settings.Voice.ShowRepeaterAtStartup;
+ VoiceConversationToastsCheckBox.IsChecked = _settings.Voice.ShowConversationToasts;
+ UpdateVoiceProviderSettingsEditor();
+ UpdateVoiceSettingsInfo();
+ }
+
+ private void LoadVoiceProviders()
+ {
+ var catalog = _voiceConfigurationApi!.GetProviderCatalog();
+
+ _speechToTextOptions = catalog.SpeechToTextProviders
+ .Select(Clone)
+ .ToList();
+ _textToSpeechOptions = catalog.TextToSpeechProviders
+ .Select(Clone)
+ .ToList();
+
+ VoiceSpeechToTextProviderComboBox.ItemsSource = _speechToTextOptions;
+ VoiceTextToSpeechProviderComboBox.ItemsSource = _textToSpeechOptions;
+
+ VoiceSpeechToTextProviderComboBox.SelectedItem =
+ _speechToTextOptions.FirstOrDefault(p => p.Id == _settings!.Voice.SpeechToTextProviderId)
+ ?? _speechToTextOptions.FirstOrDefault();
+ VoiceTextToSpeechProviderComboBox.SelectedItem =
+ _textToSpeechOptions.FirstOrDefault(p => p.Id == _settings!.Voice.TextToSpeechProviderId)
+ ?? _textToSpeechOptions.FirstOrDefault();
+
+ _ = EnsureSelectableProviderSelection(VoiceSpeechToTextProviderComboBox, _speechToTextOptions, ref _activeSttProviderId);
+ _ = EnsureSelectableProviderSelection(VoiceTextToSpeechProviderComboBox, _textToSpeechOptions, ref _activeTtsProviderId);
+ UpdateVoiceSelectionDescriptions();
+ UpdateDeviceSelectionAvailability();
+ }
+
+ private async Task LoadVoiceDevicesAsync()
+ {
+ if (_settings == null || _voiceConfigurationApi == null)
+ {
+ return;
+ }
+
+ try
+ {
+ VoiceSettingsInfoTextBlock.Text = "Loading voice devices...";
+ var devices = await _voiceConfigurationApi.ListDevicesAsync();
+
+ _inputOptions =
+ [
+ new DeviceOption(null, "System default microphone")
+ ];
+ _inputOptions.AddRange(devices
+ .Where(d => d.IsInput)
+ .Select(d => new DeviceOption(d.DeviceId, d.Name)));
+
+ _outputOptions =
+ [
+ new DeviceOption(null, "System default speaker")
+ ];
+ _outputOptions.AddRange(devices
+ .Where(d => d.IsOutput)
+ .Select(d => new DeviceOption(d.DeviceId, d.Name)));
+
+ VoiceInputDeviceComboBox.ItemsSource = _inputOptions;
+ VoiceOutputDeviceComboBox.ItemsSource = _outputOptions;
+
+ VoiceInputDeviceComboBox.SelectedItem = _inputOptions.FirstOrDefault(o => o.DeviceId == _settings.Voice.InputDeviceId) ?? _inputOptions[0];
+ VoiceOutputDeviceComboBox.SelectedItem = _outputOptions.FirstOrDefault(o => o.DeviceId == _settings.Voice.OutputDeviceId) ?? _outputOptions[0];
+
+ UpdateDeviceSelectionAvailability();
+ UpdateVoiceSettingsInfo();
+ }
+ catch (Exception ex)
+ {
+ VoiceSettingsInfoTextBlock.Text = $"Failed to load voice devices: {ex.Message}";
+ }
+ }
+
+ private void SelectVoiceMode(VoiceActivationMode mode)
+ {
+ var target = mode switch
+ {
+ VoiceActivationMode.VoiceWake => "VoiceWake",
+ VoiceActivationMode.TalkMode => "TalkMode",
+ _ => "Off"
+ };
+
+ foreach (var item in VoiceModeComboBox.Items.OfType<ComboBoxItem>())
+ {
+ if (string.Equals(item.Tag?.ToString(), target, StringComparison.Ordinal))
+ {
+ VoiceModeComboBox.SelectedItem = item;
+ return;
+ }
+ }
+
+ VoiceModeComboBox.SelectedIndex = 0;
+ }
+
+ private VoiceActivationMode GetSelectedVoiceMode()
+ {
+ var tag = (VoiceModeComboBox.SelectedItem as ComboBoxItem)?.Tag?.ToString();
+ return tag switch
+ {
+ "VoiceWake" => VoiceActivationMode.VoiceWake,
+ "TalkMode" => VoiceActivationMode.TalkMode,
+ _ => VoiceActivationMode.Off
+ };
+ }
+
+ private void UpdateVoiceSelectionDescriptions()
+ {
+ VoiceModeDescriptionTextBlock.Text = GetVoiceModeDescription(GetSelectedVoiceMode());
+ VoiceSpeechToTextProviderDescriptionTextBlock.Text =
+ (VoiceSpeechToTextProviderComboBox.SelectedItem as VoiceProviderOption)?.Description ?? string.Empty;
+ VoiceTextToSpeechProviderDescriptionTextBlock.Text =
+ (VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption)?.Description ?? string.Empty;
+ }
+
+ private static string GetVoiceModeDescription(VoiceActivationMode mode)
+ {
+ return mode switch
+ {
+ VoiceActivationMode.TalkMode => "Continuous conversation mode. Listen after replies and send each completed utterance as a chat turn.",
+ VoiceActivationMode.VoiceWake => "Wake-word mode. Stays idle until the hotword is detected, then starts listening for a request.",
+ _ => "Voice features stay off until you start them manually."
+ };
+ }
+
+ private void UpdateVoiceSettingsInfo()
+ {
+ var stt = (VoiceSpeechToTextProviderComboBox.SelectedItem as VoiceProviderOption)?.Name ?? "Windows Speech Recognition";
+ var tts = (VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption)?.Name ?? "Windows Speech Synthesis";
+ var input = (VoiceInputDeviceComboBox.SelectedItem as DeviceOption)?.Name ?? "System default microphone";
+ var output = (VoiceOutputDeviceComboBox.SelectedItem as DeviceOption)?.Name ?? "System default speaker";
+ var fallbackNotice = string.Empty;
+
+ if (VoiceSpeechToTextProviderComboBox.SelectedItem is VoiceProviderOption sttOption &&
+ !VoiceProviderCatalogService.SupportsSpeechToTextRuntime(sttOption.Id))
+ {
+ fallbackNotice += " Selected non-Windows STT routes are scaffolded but not implemented yet.";
+ }
+
+ if (VoiceTextToSpeechProviderComboBox.SelectedItem is VoiceProviderOption ttsOption &&
+ !VoiceProviderCatalogService.SupportsTextToSpeechRuntime(ttsOption.Id))
+ {
+ fallbackNotice += " Unsupported TTS providers will fall back to Windows until their runtime adapters are added.";
+ }
+
+ VoiceSettingsInfoTextBlock.Text =
+ $"Mode: {VoiceDisplayHelper.GetModeLabel(GetSelectedVoiceMode())}. STT: {stt}. TTS: {tts}. Listen: {input}. Talk: {output}.{fallbackNotice}";
+ }
+
+ private void UpdateDeviceSelectionAvailability()
+ {
+ var lockToDefaultDevices = string.Equals(
+ (VoiceSpeechToTextProviderComboBox.SelectedItem as VoiceProviderOption)?.Id,
+ VoiceProviderIds.Windows,
+ StringComparison.OrdinalIgnoreCase);
+
+ if (lockToDefaultDevices)
+ {
+ if (_inputOptions.Count > 0)
+ {
+ VoiceInputDeviceComboBox.SelectedItem = _inputOptions[0];
+ }
+
+ if (_outputOptions.Count > 0)
+ {
+ VoiceOutputDeviceComboBox.SelectedItem = _outputOptions[0];
+ }
+ }
+
+ VoiceInputDeviceComboBox.IsEnabled = !lockToDefaultDevices;
+ VoiceOutputDeviceComboBox.IsEnabled = !lockToDefaultDevices;
+ }
+
+ private void UpdateVoiceProviderSettingsEditor()
+ {
+ var providerId = GetSelectedTextToSpeechProviderId();
+ var showProviderSettings = !string.Equals(providerId, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase);
+
+ VoiceTtsProviderSettingsPanel.Visibility = showProviderSettings ? Visibility.Visible : Visibility.Collapsed;
+ if (!showProviderSettings)
+ {
+ _activeTtsProviderId = VoiceProviderIds.Windows;
+ return;
+ }
+
+ var provider = GetSelectedTextToSpeechProvider();
+ var apiKeySetting = FindSetting(provider, VoiceProviderSettingKeys.ApiKey);
+ var modelSetting = FindSetting(provider, VoiceProviderSettingKeys.Model);
+ var voiceIdSetting = FindSetting(provider, VoiceProviderSettingKeys.VoiceId);
+ var voiceSettingsJsonSetting = FindSetting(provider, VoiceProviderSettingKeys.VoiceSettingsJson);
+ var modelValue = GetProviderValue(providerId, modelSetting) ?? string.Empty;
+
+ _updatingVoiceProviderFields = true;
+ try
+ {
+ VoiceTtsProviderSettingsTitleTextBlock.Text = $"{GetSelectedTextToSpeechProviderName().ToUpperInvariant()} SETTINGS";
+ VoiceTtsApiKeyPasswordBox.Header = apiKeySetting?.Label ?? "API key";
+ VoiceTtsApiKeyPasswordBox.Visibility = apiKeySetting != null ? Visibility.Visible : Visibility.Collapsed;
+ VoiceTtsApiKeyPasswordBox.Password = GetProviderValue(providerId, apiKeySetting) ?? string.Empty;
+
+ _activeTtsModelOptions = modelSetting?.Options
+ .Where(option => !string.IsNullOrWhiteSpace(option))
+ .Distinct(StringComparer.OrdinalIgnoreCase)
+ .ToList()
+ ?? [];
+
+ if (_activeTtsModelOptions.Count > 0)
+ {
+ if (!string.IsNullOrWhiteSpace(modelValue) &&
+ !_activeTtsModelOptions.Contains(modelValue, StringComparer.OrdinalIgnoreCase))
+ {
+ _activeTtsModelOptions.Insert(0, modelValue);
+ }
+
+ VoiceTtsModelComboBox.Header = modelSetting?.Label ?? "Model";
+ VoiceTtsModelComboBox.ItemsSource = _activeTtsModelOptions;
+ VoiceTtsModelComboBox.SelectedItem = _activeTtsModelOptions
+ .FirstOrDefault(option => string.Equals(option, modelValue, StringComparison.OrdinalIgnoreCase))
+ ?? _activeTtsModelOptions.FirstOrDefault();
+ VoiceTtsModelComboBox.Visibility = Visibility.Visible;
+ VoiceTtsModelTextBox.Visibility = Visibility.Collapsed;
+ }
+ else
+ {
+ VoiceTtsModelTextBox.Header = modelSetting?.Label ?? "Model";
+ VoiceTtsModelTextBox.PlaceholderText = modelSetting?.Placeholder ?? string.Empty;
+ VoiceTtsModelTextBox.Visibility = modelSetting != null ? Visibility.Visible : Visibility.Collapsed;
+ VoiceTtsModelTextBox.Text = modelValue;
+ VoiceTtsModelComboBox.ItemsSource = null;
+ VoiceTtsModelComboBox.SelectedItem = null;
+ VoiceTtsModelComboBox.Visibility = Visibility.Collapsed;
+ }
+
+ VoiceTtsVoiceIdTextBox.Header = voiceIdSetting?.Label ?? "Voice ID";
+ VoiceTtsVoiceIdTextBox.PlaceholderText = voiceIdSetting?.Placeholder ?? string.Empty;
+ VoiceTtsVoiceIdTextBox.Visibility = voiceIdSetting != null ? Visibility.Visible : Visibility.Collapsed;
+ VoiceTtsVoiceIdTextBox.Text = GetProviderValue(providerId, voiceIdSetting) ?? string.Empty;
+
+ VoiceTtsVoiceSettingsJsonTextBox.Header = voiceSettingsJsonSetting?.Label ?? "Voice settings JSON";
+ VoiceTtsVoiceSettingsJsonTextBox.PlaceholderText = voiceSettingsJsonSetting?.Placeholder ?? string.Empty;
+ VoiceTtsVoiceSettingsJsonTextBox.Visibility = voiceSettingsJsonSetting != null ? Visibility.Visible : Visibility.Collapsed;
+ VoiceTtsVoiceSettingsJsonTextBox.Text = GetProviderValue(providerId, voiceSettingsJsonSetting) ?? string.Empty;
+ _activeTtsProviderId = providerId;
+ }
+ finally
+ {
+ _updatingVoiceProviderFields = false;
+ }
+ }
+
+ private string GetSelectedTextToSpeechProviderId()
+ {
+ return (VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption)?.Id ?? VoiceProviderIds.Windows;
+ }
+
+ private string GetSelectedTextToSpeechProviderName()
+ {
+ return (VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption)?.Name ?? "Provider";
+ }
+
+ private VoiceProviderOption? GetSelectedTextToSpeechProvider()
+ {
+ return VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption;
+ }
+
+ private void CaptureSelectedVoiceProviderSettings()
+ {
+ if (_updatingVoiceProviderFields)
+ {
+ return;
+ }
+
+ var providerId = _activeTtsProviderId;
+ if (string.Equals(providerId, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase))
+ {
+ return;
+ }
+
+ var provider = _textToSpeechOptions.FirstOrDefault(option =>
+ string.Equals(option.Id, providerId, StringComparison.OrdinalIgnoreCase));
+ SetProviderValue(providerId, FindSetting(provider, VoiceProviderSettingKeys.ApiKey), VoiceTtsApiKeyPasswordBox.Password);
+ SetProviderValue(providerId, FindSetting(provider, VoiceProviderSettingKeys.Model), GetSelectedProviderModelValue());
+ SetProviderValue(providerId, FindSetting(provider, VoiceProviderSettingKeys.VoiceId), VoiceTtsVoiceIdTextBox.Text);
+ SetProviderValue(providerId, FindSetting(provider, VoiceProviderSettingKeys.VoiceSettingsJson), VoiceTtsVoiceSettingsJsonTextBox.Text);
+ }
+
+ private async void OnRefreshVoiceDevices(object sender, RoutedEventArgs e)
+ {
+ await LoadVoiceDevicesAsync();
+ }
+
+ private void OnVoiceModeChanged(object sender, SelectionChangedEventArgs e)
+ {
+ var mode = GetSelectedVoiceMode();
+ VoiceShowRepeaterAtStartupCheckBox.IsChecked = mode == VoiceActivationMode.Off
+ ? false
+ : (VoiceShowRepeaterAtStartupCheckBox.IsChecked ?? true);
+ VoiceShowRepeaterAtStartupCheckBox.IsEnabled = mode != VoiceActivationMode.Off;
+ UpdateVoiceSelectionDescriptions();
+ UpdateVoiceSettingsInfo();
+ }
+
+ private void OnVoiceProviderChanged(object sender, SelectionChangedEventArgs e)
+ {
+ if (ReferenceEquals(sender, VoiceSpeechToTextProviderComboBox) &&
+ !EnsureSelectableProviderSelection(VoiceSpeechToTextProviderComboBox, _speechToTextOptions, ref _activeSttProviderId))
+ {
+ return;
+ }
+
+ if (ReferenceEquals(sender, VoiceTextToSpeechProviderComboBox) &&
+ !EnsureSelectableProviderSelection(VoiceTextToSpeechProviderComboBox, _textToSpeechOptions, ref _activeTtsProviderId))
+ {
+ return;
+ }
+
+ CaptureSelectedVoiceProviderSettings();
+ UpdateVoiceSelectionDescriptions();
+ UpdateDeviceSelectionAvailability();
+ UpdateVoiceProviderSettingsEditor();
+ UpdateVoiceSettingsInfo();
+ }
+
+ private void OnVoiceProviderSettingsChanged(object sender, RoutedEventArgs e)
+ {
+ CaptureSelectedVoiceProviderSettings();
+ }
+
+ private string? GetProviderValue(string providerId, VoiceProviderSettingDefinition? setting)
+ {
+ if (setting == null)
+ {
+ return null;
+ }
+
+ return _voiceProviderConfigurationDraft.GetValue(providerId, setting.Key) ?? setting.DefaultValue;
+ }
+
+ private string? GetSelectedProviderModelValue()
+ {
+ if (VoiceTtsModelComboBox.Visibility == Visibility.Visible)
+ {
+ return VoiceTtsModelComboBox.SelectedItem?.ToString();
+ }
+
+ return VoiceTtsModelTextBox.Text;
+ }
+
+ private sealed record DeviceOption(string? DeviceId, string Name);
+
+ private void SetProviderValue(
+ string providerId,
+ VoiceProviderSettingDefinition? setting,
+ string? value)
+ {
+ if (setting == null)
+ {
+ return;
+ }
+
+ var normalized = string.IsNullOrWhiteSpace(value)
+ ? setting.DefaultValue
+ : value.Trim();
+ _voiceProviderConfigurationDraft.SetValue(providerId, setting.Key, normalized);
+ }
+
+ private static VoiceProviderSettingDefinition? FindSetting(VoiceProviderOption? provider, string settingKey)
+ {
+ return provider?.Settings.FirstOrDefault(setting =>
+ string.Equals(setting.Key, settingKey, StringComparison.OrdinalIgnoreCase));
+ }
+
+ private static VoiceProviderOption Clone(VoiceProviderOption source)
+ {
+ return new VoiceProviderOption
+ {
+ Id = source.Id,
+ Name = source.Name,
+ Runtime = source.Runtime,
+ Enabled = source.Enabled,
+ VisibleInSettings = source.VisibleInSettings,
+ Selectable = source.Selectable,
+ Description = source.Description,
+ Settings = source.Settings
+ .Select(setting => new VoiceProviderSettingDefinition
+ {
+ Key = setting.Key,
+ Label = setting.Label,
+ Secret = setting.Secret,
+ DefaultValue = setting.DefaultValue,
+ Placeholder = setting.Placeholder,
+ Description = setting.Description,
+ Required = setting.Required,
+ JsonValue = setting.JsonValue,
+ Options = setting.Options.ToList()
+ })
+ .ToList(),
+ TextToSpeechHttp = source.TextToSpeechHttp == null
+ ? null
+ : new VoiceTextToSpeechHttpContract
+ {
+ EndpointTemplate = source.TextToSpeechHttp.EndpointTemplate,
+ HttpMethod = source.TextToSpeechHttp.HttpMethod,
+ AuthenticationHeaderName = source.TextToSpeechHttp.AuthenticationHeaderName,
+ AuthenticationScheme = source.TextToSpeechHttp.AuthenticationScheme,
+ ApiKeySettingKey = source.TextToSpeechHttp.ApiKeySettingKey,
+ RequestContentType = source.TextToSpeechHttp.RequestContentType,
+ RequestBodyTemplate = source.TextToSpeechHttp.RequestBodyTemplate,
+ ResponseAudioMode = source.TextToSpeechHttp.ResponseAudioMode,
+ ResponseAudioJsonPath = source.TextToSpeechHttp.ResponseAudioJsonPath,
+ ResponseStatusCodeJsonPath = source.TextToSpeechHttp.ResponseStatusCodeJsonPath,
+ ResponseStatusMessageJsonPath = source.TextToSpeechHttp.ResponseStatusMessageJsonPath,
+ SuccessStatusValue = source.TextToSpeechHttp.SuccessStatusValue,
+ OutputContentType = source.TextToSpeechHttp.OutputContentType
+ },
+ TextToSpeechWebSocket = source.TextToSpeechWebSocket == null
+ ? null
+ : new VoiceTextToSpeechWebSocketContract
+ {
+ EndpointTemplate = source.TextToSpeechWebSocket.EndpointTemplate,
+ AuthenticationHeaderName = source.TextToSpeechWebSocket.AuthenticationHeaderName,
+ AuthenticationScheme = source.TextToSpeechWebSocket.AuthenticationScheme,
+ ApiKeySettingKey = source.TextToSpeechWebSocket.ApiKeySettingKey,
+ ConnectSuccessEventName = source.TextToSpeechWebSocket.ConnectSuccessEventName,
+ StartMessageTemplate = source.TextToSpeechWebSocket.StartMessageTemplate,
+ StartSuccessEventName = source.TextToSpeechWebSocket.StartSuccessEventName,
+ ContinueMessageTemplate = source.TextToSpeechWebSocket.ContinueMessageTemplate,
+ FinishMessageTemplate = source.TextToSpeechWebSocket.FinishMessageTemplate,
+ ResponseAudioMode = source.TextToSpeechWebSocket.ResponseAudioMode,
+ ResponseAudioJsonPath = source.TextToSpeechWebSocket.ResponseAudioJsonPath,
+ ResponseStatusCodeJsonPath = source.TextToSpeechWebSocket.ResponseStatusCodeJsonPath,
+ ResponseStatusMessageJsonPath = source.TextToSpeechWebSocket.ResponseStatusMessageJsonPath,
+ FinalFlagJsonPath = source.TextToSpeechWebSocket.FinalFlagJsonPath,
+ TaskFailedEventName = source.TextToSpeechWebSocket.TaskFailedEventName,
+ SuccessStatusValue = source.TextToSpeechWebSocket.SuccessStatusValue,
+ OutputContentType = source.TextToSpeechWebSocket.OutputContentType
+ }
+ };
+ }
+
+ private static bool EnsureSelectableProviderSelection(
+ ComboBox comboBox,
+ IReadOnlyList<VoiceProviderOption> options,
+ ref string activeProviderId)
+ {
+ var previousProviderId = activeProviderId;
+
+ if (comboBox.SelectedItem is VoiceProviderOption selected && selected.Selectable)
+ {
+ activeProviderId = selected.Id;
+ return true;
+ }
+
+ var fallback = options.FirstOrDefault(option =>
+ option.Selectable &&
+ string.Equals(option.Id, previousProviderId, StringComparison.OrdinalIgnoreCase))
+ ?? options.FirstOrDefault(option => option.Selectable);
+
+ if (fallback == null)
+ {
+ return false;
+ }
+
+ if (!ReferenceEquals(comboBox.SelectedItem, fallback))
+ {
+ comboBox.SelectedItem = fallback;
+ }
+
+ activeProviderId = fallback.Id;
+ return false;
+ }
+}
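
A brief usage sketch for the panel's lifecycle; the host dialog and the `settingsManager`/`voiceConfigurationApi` instances are assumed to exist elsewhere in the tray app:

```csharp
// Illustrative host wiring (the settings dialog itself is outside this diff).
var panel = new VoiceSettingsPanel();
panel.Initialize(settingsManager, voiceConfigurationApi); // loads draft + devices

// ... user edits mode, providers, devices, and TTS credentials ...

await panel.ApplyAsync(settingsManager); // copies the draft back, pushes to the voice API
settingsManager.Save();                  // persist, since ApplyAsync passes Persist = false
```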
diff --git a/src/OpenClaw.Tray.WinUI/Dialogs/QuickSendDialog.cs b/src/OpenClaw.Tray.WinUI/Dialogs/QuickSendDialog.cs
index 1c54f20..8b23808 100644
--- a/src/OpenClaw.Tray.WinUI/Dialogs/QuickSendDialog.cs
+++ b/src/OpenClaw.Tray.WinUI/Dialogs/QuickSendDialog.cs
@@ -55,7 +55,7 @@ public QuickSendDialog(OpenClawGatewayClient client, string? prefillMessage = nu
Title = LocalizationHelper.GetString("WindowTitle_QuickSend");
this.SetWindowSize(420, 260);
this.CenterOnScreen();
- this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected));
+ this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
// Apply Acrylic via controller to keep IsInputActive=true.
// This avoids focus/activation oddities on Windows 10 for hotkey-launched windows.
diff --git a/src/OpenClaw.Tray.WinUI/Helpers/AppIconHelper.cs b/src/OpenClaw.Tray.WinUI/Helpers/AppIconHelper.cs
new file mode 100644
index 0000000..1cf815c
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Helpers/AppIconHelper.cs
@@ -0,0 +1,64 @@
+using OpenClaw.Shared;
+using System;
+using System.IO;
+
+namespace OpenClawTray.Helpers;
+
+public static class AppIconHelper
+{
+ private static readonly string AssetsPath = ResolveAssetsPath();
+ private static readonly string IconsPath = Path.Combine(AssetsPath, "Icons");
+
+ public static string GetStatusIconPath(ConnectionStatus status)
+ {
+ var iconName = status switch
+ {
+ ConnectionStatus.Connected => "StatusConnected.ico",
+ ConnectionStatus.Connecting => "StatusConnecting.ico",
+ ConnectionStatus.Error => "StatusError.ico",
+ _ => "StatusDisconnected.ico"
+ };
+
+ var path = Path.Combine(IconsPath, iconName);
+ if (!File.Exists(path))
+ {
+ path = GetAppIconPath();
+ }
+
+ return path;
+ }
+
+ public static string GetAppIconPath()
+ {
+ var path = Path.Combine(AssetsPath, "openclaw.ico");
+ if (!File.Exists(path))
+ {
+ throw new FileNotFoundException(
+ $"Application icon was not found at '{path}'. Ensure the Assets folder is packaged correctly and contains 'openclaw.ico'.",
+ path);
+ }
+
+ return path;
+ }
+
+ private static string ResolveAssetsPath()
+ {
+ var bundledPath = Path.Combine(AppContext.BaseDirectory, "Assets");
+ if (Directory.Exists(bundledPath))
+ {
+ return bundledPath;
+ }
+
+ var current = new DirectoryInfo(AppContext.BaseDirectory);
+ while (current != null)
+ {
+ var sourcePath = Path.Combine(current.FullName, "src", "OpenClaw.Tray.WinUI", "Assets");
+ if (Directory.Exists(sourcePath))
+ {
+ return sourcePath;
+ }
+
+ current = current.Parent;
+ }
+
+ return bundledPath;
+ }
+}
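
Usage mirrors the old `IconHelper` path API; a quick sketch (the `SetIcon` extension is the one already used by the dialogs above):

```csharp
// Resolve a per-status icon, falling back to openclaw.ico when the
// status-specific .ico is missing; GetAppIconPath throws if even that is absent.
string iconPath = AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected);
window.SetIcon(iconPath);
```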
diff --git a/src/OpenClaw.Tray.WinUI/Helpers/IconHelper.cs b/src/OpenClaw.Tray.WinUI/Helpers/IconHelper.cs
deleted file mode 100644
index d2181cd..0000000
--- a/src/OpenClaw.Tray.WinUI/Helpers/IconHelper.cs
+++ /dev/null
@@ -1,145 +0,0 @@
-using OpenClaw.Shared;
-using System;
-using System.Drawing;
-using System.IO;
-using System.Runtime.InteropServices;
-
-namespace OpenClawTray.Helpers;
-
-/// <summary>
-/// Provides icon resources for the tray application.
-/// Creates dynamic status icons with lobster pixel art.
-/// </summary>
-public static class IconHelper
-{
- private static readonly string AssetsPath = Path.Combine(AppContext.BaseDirectory, "Assets");
- private static readonly string IconsPath = Path.Combine(AssetsPath, "Icons");
-
- // Icon cache
- private static Icon? _connectedIcon;
- private static Icon? _disconnectedIcon;
- private static Icon? _activityIcon;
- private static Icon? _errorIcon;
- private static Icon? _appIcon;
-
- public static string GetStatusIconPath(ConnectionStatus status)
- {
- var iconName = status switch
- {
- ConnectionStatus.Connected => "StatusConnected.ico",
- ConnectionStatus.Connecting => "StatusConnecting.ico",
- ConnectionStatus.Error => "StatusError.ico",
- _ => "StatusDisconnected.ico"
- };
-
- var path = Path.Combine(IconsPath, iconName);
-
- // If specific icon doesn't exist, fall back to main icon
- if (!File.Exists(path))
- {
- path = Path.Combine(AssetsPath, "openclaw.ico");
- }
-
- return path;
- }
-
- public static Icon GetStatusIcon(ConnectionStatus status)
- {
- return status switch
- {
- ConnectionStatus.Connected => GetOrCreateIcon(ref _connectedIcon, ConnectionStatus.Connected),
- ConnectionStatus.Connecting => GetOrCreateIcon(ref _activityIcon, ConnectionStatus.Connecting),
- ConnectionStatus.Error => GetOrCreateIcon(ref _errorIcon, ConnectionStatus.Error),
- _ => GetOrCreateIcon(ref _disconnectedIcon, ConnectionStatus.Disconnected)
- };
- }
-
- public static Icon GetAppIcon()
- {
- if (_appIcon != null) return _appIcon;
-
- var iconPath = Path.Combine(AssetsPath, "openclaw.ico");
- if (File.Exists(iconPath))
- {
- _appIcon = new Icon(iconPath);
- }
- else
- {
- _appIcon = CreateLobsterIcon(Color.FromArgb(255, 99, 71)); // Lobster red
- }
-
- return _appIcon;
- }
-
- private static Icon GetOrCreateIcon(ref Icon? cached, ConnectionStatus status)
- {
- if (cached != null) return cached;
-
- var iconPath = GetStatusIconPath(status);
- if (File.Exists(iconPath))
- {
- cached = new Icon(iconPath);
- }
- else
- {
- // Generate dynamic icon
- var color = status switch
- {
- ConnectionStatus.Connected => Color.FromArgb(76, 175, 80), // Green
- ConnectionStatus.Connecting => Color.FromArgb(255, 193, 7), // Amber
- ConnectionStatus.Error => Color.FromArgb(244, 67, 54), // Red
- _ => Color.FromArgb(158, 158, 158) // Gray
- };
- cached = CreateLobsterIcon(color);
- }
-
- return cached;
- }
-
- /// <summary>
- /// Creates a simple colored lobster icon programmatically.
- /// Uses pixel art style matching the original WinForms version.
- /// </summary>
- public static Icon CreateLobsterIcon(Color color)
- {
- const int size = 16;
- using var bitmap = new Bitmap(size, size);
- using var g = Graphics.FromImage(bitmap);
-
- g.Clear(Color.Transparent);
-
- // Simple lobster silhouette (pixel art style)
- using var brush = new SolidBrush(color);
-
- // Body
- g.FillRectangle(brush, 6, 6, 4, 6);
-
- // Claws
- g.FillRectangle(brush, 3, 4, 2, 2);
- g.FillRectangle(brush, 11, 4, 2, 2);
- g.FillRectangle(brush, 4, 6, 2, 2);
- g.FillRectangle(brush, 10, 6, 2, 2);
-
- // Tail
- g.FillRectangle(brush, 7, 12, 2, 3);
- g.FillRectangle(brush, 5, 14, 6, 1);
-
- // Eyes
- using var eyeBrush = new SolidBrush(Color.White);
- g.FillRectangle(eyeBrush, 6, 5, 1, 1);
- g.FillRectangle(eyeBrush, 9, 5, 1, 1);
-
- // Convert bitmap to icon
- var hIcon = bitmap.GetHicon();
- var icon = Icon.FromHandle(hIcon);
-
- // Clone to own the icon data
- var result = (Icon)icon.Clone();
- DestroyIcon(hIcon);
-
- return result;
- }
-
- [DllImport("user32.dll", CharSet = CharSet.Auto)]
- private static extern bool DestroyIcon(IntPtr handle);
-}
diff --git a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj
index ee86169..83811cf 100644
--- a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj
+++ b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj
@@ -33,6 +33,7 @@
+
@@ -61,4 +62,3 @@
-
diff --git a/src/OpenClaw.Tray.WinUI/Properties/AssemblyInfo.cs b/src/OpenClaw.Tray.WinUI/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..a3f7e8f
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Properties/AssemblyInfo.cs
@@ -0,0 +1,3 @@
+using System.Runtime.CompilerServices;
+
+[assembly: InternalsVisibleTo("OpenClaw.Tray.Tests")]
diff --git a/src/OpenClaw.Tray.WinUI/Services/GlobalHotkeyService.cs b/src/OpenClaw.Tray.WinUI/Services/GlobalHotkeyService.cs
index d0e5f93..49fe829 100644
--- a/src/OpenClaw.Tray.WinUI/Services/GlobalHotkeyService.cs
+++ b/src/OpenClaw.Tray.WinUI/Services/GlobalHotkeyService.cs
@@ -7,15 +7,19 @@ namespace OpenClawTray.Services;
/// <summary>
/// Registers and handles global hotkeys using P/Invoke.
-/// Default: Ctrl+Alt+Shift+C for Quick Send.
+/// Defaults:
+/// - Ctrl+Alt+Shift+C for Quick Send
+/// - Ctrl+Alt+Shift+V for Voice pause/resume
/// </summary>
public class GlobalHotkeyService : IDisposable
{
- private const int HOTKEY_ID = 9001;
+ private const int QUICK_SEND_HOTKEY_ID = 9001;
+ private const int VOICE_TOGGLE_HOTKEY_ID = 9002;
private const uint MOD_CONTROL = 0x0002;
private const uint MOD_ALT = 0x0001;
private const uint MOD_SHIFT = 0x0004;
private const uint VK_C = 0x43;
+ private const uint VK_V = 0x56;
private const int WM_HOTKEY = 0x0312;
[DllImport("user32.dll", SetLastError = true)]
@@ -105,6 +109,7 @@ private struct POINT
private IntPtr _hwnd;
private bool _registered;
private bool _disposed;
+ private readonly object _sync = new();
private Thread? _messageThread;
private WndProcDelegate? _wndProcDelegate; // prevent GC collection
private volatile bool _running;
@@ -113,7 +118,8 @@ private struct POINT
private readonly ManualResetEventSlim _windowReady = new(false);
private readonly ManualResetEventSlim _opCompleted = new(false);
- public event EventHandler? HotkeyPressed;
+ public event EventHandler? QuickSendHotkeyPressed;
+ public event EventHandler? VoiceToggleHotkeyPressed;
public GlobalHotkeyService()
{
@@ -121,12 +127,15 @@ public GlobalHotkeyService()
public bool Register()
{
- if (_registered) return true;
-
try
{
- // Create message window on a dedicated thread with message loop
- EnsureMessageLoop();
+ lock (_sync)
+ {
+ if (_registered) return true;
+
+ // Create message window on a dedicated thread with message loop
+ EnsureMessageLoop();
+ }
if (!_windowReady.Wait(TimeSpan.FromSeconds(2)))
{
@@ -134,18 +143,21 @@ public bool Register()
return false;
}
- if (_hwnd == IntPtr.Zero)
+ lock (_sync)
{
- Logger.Warn("Failed to create hotkey message window");
- return false;
- }
+ if (_hwnd == IntPtr.Zero)
+ {
+ Logger.Warn("Failed to create hotkey message window");
+ return false;
+ }
- _opCompleted.Reset();
- if (!PostMessage(_hwnd, WM_APP_REGISTER, IntPtr.Zero, IntPtr.Zero))
- {
- Logger.Warn("Failed to post WM_APP_REGISTER message for hotkey registration");
- _registered = false;
- return false;
+ _opCompleted.Reset();
+ if (!PostMessage(_hwnd, WM_APP_REGISTER, IntPtr.Zero, IntPtr.Zero))
+ {
+ Logger.Warn("Failed to post WM_APP_REGISTER message for hotkey registration");
+ _registered = false;
+ return false;
+ }
}
if (!_opCompleted.Wait(TimeSpan.FromSeconds(2)))
@@ -225,19 +237,34 @@ private IntPtr WndProc(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam)
if (msg == WM_APP_REGISTER)
{
// Register from the message-loop thread that owns hWnd.
- _registered = RegisterHotKey(hWnd, HOTKEY_ID,
+ var quickSendRegistered = RegisterHotKey(hWnd, QUICK_SEND_HOTKEY_ID,
MOD_CONTROL | MOD_ALT | MOD_SHIFT | MOD_NOREPEAT,
VK_C);
+ var voiceToggleRegistered = RegisterHotKey(hWnd, VOICE_TOGGLE_HOTKEY_ID,
+ MOD_CONTROL | MOD_ALT | MOD_SHIFT | MOD_NOREPEAT,
+ VK_V);
+
+ _registered = quickSendRegistered && voiceToggleRegistered;
if (_registered)
{
- Logger.Info("Global hotkey registered: Ctrl+Alt+Shift+C");
+ Logger.Info("Global hotkeys registered: Ctrl+Alt+Shift+C (Quick Send), Ctrl+Alt+Shift+V (Voice Pause)");
}
else
{
+ if (quickSendRegistered)
+ {
+ UnregisterHotKey(hWnd, QUICK_SEND_HOTKEY_ID);
+ }
+
+ if (voiceToggleRegistered)
+ {
+ UnregisterHotKey(hWnd, VOICE_TOGGLE_HOTKEY_ID);
+ }
+
var err = Marshal.GetLastWin32Error();
var errMsg = new Win32Exception(err).Message;
- Logger.Warn($"Failed to register global hotkey (Win32Error={err}: {errMsg})");
+ Logger.Warn($"Failed to register one or more global hotkeys (Win32Error={err}: {errMsg})");
}
_opCompleted.Set();
@@ -250,9 +277,10 @@ private IntPtr WndProc(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam)
{
if (_registered)
{
- UnregisterHotKey(hWnd, HOTKEY_ID);
+ UnregisterHotKey(hWnd, QUICK_SEND_HOTKEY_ID);
+ UnregisterHotKey(hWnd, VOICE_TOGGLE_HOTKEY_ID);
_registered = false;
- Logger.Info("Global hotkey unregistered");
+ Logger.Info("Global hotkeys unregistered");
}
}
catch (Exception ex)
@@ -266,10 +294,15 @@ private IntPtr WndProc(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam)
return IntPtr.Zero;
}
- if (msg == WM_HOTKEY && wParam.ToInt32() == HOTKEY_ID)
+ if (msg == WM_HOTKEY && wParam.ToInt32() == QUICK_SEND_HOTKEY_ID)
{
Logger.Info("Hotkey pressed: Ctrl+Alt+Shift+C");
- OnHotkeyPressed();
+ OnQuickSendHotkeyPressed();
+ }
+ else if (msg == WM_HOTKEY && wParam.ToInt32() == VOICE_TOGGLE_HOTKEY_ID)
+ {
+ Logger.Info("Hotkey pressed: Ctrl+Alt+Shift+V");
+ OnVoiceToggleHotkeyPressed();
}
return DefWindowProc(hWnd, msg, wParam, lParam);
}
@@ -302,9 +335,14 @@ public void Unregister()
}
}
- internal void OnHotkeyPressed()
+ internal void OnQuickSendHotkeyPressed()
+ {
+ QuickSendHotkeyPressed?.Invoke(this, EventArgs.Empty);
+ }
+
+ internal void OnVoiceToggleHotkeyPressed()
{
- HotkeyPressed?.Invoke(this, EventArgs.Empty);
+ VoiceToggleHotkeyPressed?.Invoke(this, EventArgs.Empty);
}
public void Dispose()
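
A minimal consumer sketch for the split hotkey events; the two handler bodies and `dispatcherQueue` are placeholders for whatever the tray app wires in:

```csharp
var hotkeys = new GlobalHotkeyService();
hotkeys.QuickSendHotkeyPressed += (_, _) => dispatcherQueue.TryEnqueue(ShowQuickSendDialog);
hotkeys.VoiceToggleHotkeyPressed += (_, _) => dispatcherQueue.TryEnqueue(ToggleVoicePause);

// Register() is all-or-nothing: if either hotkey fails, both are unregistered.
if (!hotkeys.Register())
{
    Logger.Warn("Hotkeys unavailable; another app may own Ctrl+Alt+Shift+C/V.");
}
```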
diff --git a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs
index 731359f..da95c19 100644
--- a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs
+++ b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs
@@ -5,6 +5,7 @@
using OpenClaw.Shared;
using OpenClaw.Shared.Capabilities;
using OpenClawTray.Helpers;
+using OpenClawTray.Services.Voice;
using OpenClawTray.Windows;
using Microsoft.UI.Xaml;
@@ -21,6 +22,7 @@ public class NodeService : IDisposable
private CanvasWindow? _canvasWindow;
private ScreenCaptureService? _screenCaptureService;
private CameraCaptureService? _cameraCaptureService;
+ private VoiceService? _voiceService;
private DateTime _lastScreenCaptureNotification = DateTime.MinValue;
private string? _a2uiHostUrl;
@@ -29,6 +31,7 @@ public class NodeService : IDisposable
private CanvasCapability? _canvasCapability;
private ScreenCapability? _screenCapability;
private CameraCapability? _cameraCapability;
+ private VoiceCapability? _voiceCapability;
private readonly string _dataPath;
// Events
@@ -44,13 +47,14 @@ public class NodeService : IDisposable
public string? FullDeviceId => _nodeClient?.FullDeviceId;
public string? GatewayUrl => _nodeClient?.GatewayUrl;
- public NodeService(IOpenClawLogger logger, DispatcherQueue dispatcherQueue, string dataPath)
+ public NodeService(IOpenClawLogger logger, DispatcherQueue dispatcherQueue, VoiceService voiceService, string dataPath)
{
_logger = logger;
_dispatcherQueue = dispatcherQueue;
_dataPath = dataPath;
_screenCaptureService = new ScreenCaptureService(logger);
_cameraCaptureService = new CameraCaptureService(logger);
+ _voiceService = voiceService;
}
///
@@ -79,6 +83,34 @@ public async Task ConnectAsync(string gatewayUrl, string token)
await _nodeClient.ConnectAsync();
_a2uiHostUrl = BuildA2UIHostUrl(_nodeClient.GatewayUrl);
+
+ if (_voiceService != null)
+ {
+ var settings = await _voiceService.GetSettingsAsync();
+ if (settings.Enabled && settings.Mode != VoiceActivationMode.Off)
+ {
+ var startTcs = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
+ var enqueued = _dispatcherQueue.TryEnqueue(async () =>
+ {
+ try
+ {
+ await _voiceService.StartAsync(new VoiceStartArgs { Mode = settings.Mode });
+ startTcs.TrySetResult(true);
+ }
+ catch (Exception ex)
+ {
+ startTcs.TrySetException(ex);
+ }
+ });
+
+ if (!enqueued)
+ {
+ throw new InvalidOperationException("Dispatcher queue unavailable for voice startup.");
+ }
+
+ await startTcs.Task;
+ }
+ }
}
///
@@ -92,6 +124,30 @@ public async Task DisconnectAsync()
_nodeClient.Dispose();
_nodeClient = null;
}
+
+ if (_voiceService != null)
+ {
+ var stopTcs = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
+ var enqueued = _dispatcherQueue.TryEnqueue(async () =>
+ {
+ try
+ {
+ await _voiceService.StopAsync(new VoiceStopArgs { Reason = "Node disconnected" });
+ stopTcs.TrySetResult(true);
+ }
+ catch (Exception ex)
+ {
+ stopTcs.TrySetException(ex);
+ }
+ });
+
+ if (!enqueued)
+ {
+ throw new InvalidOperationException("Dispatcher queue unavailable for voice shutdown.");
+ }
+
+ await stopTcs.Task;
+ }
// Close canvas window
if (_canvasWindow != null && !_canvasWindow.IsClosed)
@@ -134,6 +190,19 @@ private void RegisterCapabilities()
_cameraCapability.ListRequested += OnCameraList;
_cameraCapability.SnapRequested += OnCameraSnap;
_nodeClient.RegisterCapability(_cameraCapability);
+
+ // Voice capability
+ _voiceCapability = new VoiceCapability(_logger);
+ _voiceCapability.ListDevicesRequested += OnVoiceListDevices;
+ _voiceCapability.SettingsRequested += OnVoiceGetSettings;
+ _voiceCapability.SettingsUpdateRequested += OnVoiceSetSettings;
+ _voiceCapability.StatusRequested += OnVoiceGetStatus;
+ _voiceCapability.StartRequested += OnVoiceStart;
+ _voiceCapability.StopRequested += OnVoiceStop;
+ _voiceCapability.PauseRequested += OnVoicePause;
+ _voiceCapability.ResumeRequested += OnVoiceResume;
+ _voiceCapability.SkipRequested += OnVoiceSkip;
+ _nodeClient.RegisterCapability(_voiceCapability);
_logger.Info("All capabilities registered");
}
@@ -475,6 +544,82 @@ private async Task OnCameraSnap(CameraSnapArgs args)
}
}
+ #endregion
+
+ #region Voice Capability Handlers
+
+ private Task OnVoiceListDevices()
+ {
+ if (_voiceService == null)
+ throw new InvalidOperationException("Voice service not available");
+
+ return _voiceService.ListDevicesAsync();
+ }
+
+ private Task OnVoiceGetSettings()
+ {
+ if (_voiceService == null)
+ throw new InvalidOperationException("Voice service not available");
+
+ return _voiceService.GetSettingsAsync();
+ }
+
+ private Task OnVoiceSetSettings(VoiceSettingsUpdateArgs args)
+ {
+ if (_voiceService == null)
+ throw new InvalidOperationException("Voice service not available");
+
+ return _voiceService.UpdateSettingsAsync(args);
+ }
+
+ private Task OnVoiceGetStatus()
+ {
+ if (_voiceService == null)
+ throw new InvalidOperationException("Voice service not available");
+
+ return _voiceService.GetStatusAsync();
+ }
+
+ private Task OnVoiceStart(VoiceStartArgs args)
+ {
+ if (_voiceService == null)
+ throw new InvalidOperationException("Voice service not available");
+
+ return _voiceService.StartAsync(args);
+ }
+
+ private Task OnVoiceStop(VoiceStopArgs args)
+ {
+ if (_voiceService == null)
+ throw new InvalidOperationException("Voice service not available");
+
+ return _voiceService.StopAsync(args);
+ }
+
+ private Task OnVoicePause(VoicePauseArgs args)
+ {
+ if (_voiceService == null)
+ throw new InvalidOperationException("Voice service not available");
+
+ return _voiceService.PauseAsync(args);
+ }
+
+ private Task OnVoiceResume(VoiceResumeArgs args)
+ {
+ if (_voiceService == null)
+ throw new InvalidOperationException("Voice service not available");
+
+ return _voiceService.ResumeAsync(args);
+ }
+
+ private Task OnVoiceSkip(VoiceSkipArgs args)
+ {
+ if (_voiceService == null)
+ throw new InvalidOperationException("Voice service not available");
+
+ return _voiceService.SkipCurrentReplyAsync(args);
+ }
+
#endregion
public void Dispose()
@@ -484,7 +629,6 @@ public void Dispose()
try { client?.Dispose(); } catch { /* ignore */ }
try { _cameraCaptureService?.Dispose(); } catch { /* ignore */ }
-
if (_canvasWindow != null && !_canvasWindow.IsClosed)
{
var window = _canvasWindow;
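
The connect/disconnect paths above marshal voice start/stop onto the UI thread and surface failures back to the caller through a `TaskCompletionSource`. The same pattern extracted into a standalone sketch (names generalized; not part of the diff):

```csharp
using System;
using System.Threading.Tasks;
using Microsoft.UI.Dispatching;

// Run an async action on the DispatcherQueue and await its outcome here.
static Task RunOnDispatcherAsync(DispatcherQueue dispatcherQueue, Func<Task> action)
{
    var tcs = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
    var enqueued = dispatcherQueue.TryEnqueue(async () =>
    {
        try { await action(); tcs.TrySetResult(true); }
        catch (Exception ex) { tcs.TrySetException(ex); }
    });

    if (!enqueued)
    {
        tcs.TrySetException(new InvalidOperationException("Dispatcher queue unavailable."));
    }

    return tcs.Task;
}
```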
diff --git a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs
index f89e513..f9d0e82 100644
--- a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs
+++ b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs
@@ -47,6 +47,9 @@ public class SettingsManager
public bool NotifyChatResponses { get; set; } = true;
public bool PreferStructuredCategories { get; set; } = true;
public List UserRules { get; set; } = new();
+ public VoiceSettings Voice { get; set; } = new();
+ public VoiceRepeaterWindowSettings VoiceRepeaterWindow { get; set; } = new();
+ public VoiceProviderConfigurationStore VoiceProviderConfiguration { get; set; } = new();
// Node mode (enables Windows as a node, not just operator)
public bool EnableNodeMode { get; set; } = false;
@@ -94,6 +97,10 @@ public void Load()
PreferStructuredCategories = loaded.PreferStructuredCategories;
if (loaded.UserRules != null)
UserRules = loaded.UserRules;
+ Voice = loaded.Voice ?? new VoiceSettings();
+ VoiceRepeaterWindow = loaded.VoiceRepeaterWindow ?? new VoiceRepeaterWindowSettings();
+ VoiceProviderConfiguration = loaded.VoiceProviderConfiguration?.Clone() ?? new VoiceProviderConfigurationStore();
+ VoiceProviderConfiguration.MigrateLegacyCredentials(loaded.VoiceProviderCredentials);
}
}
}
@@ -103,7 +110,7 @@ public void Load()
}
}
- public void Save()
+ public void Save(bool logSuccess = true)
{
try
{
@@ -135,13 +142,19 @@ public void Save()
SkippedUpdateTag = string.IsNullOrWhiteSpace(SkippedUpdateTag) ? null : SkippedUpdateTag,
NotifyChatResponses = NotifyChatResponses,
PreferStructuredCategories = PreferStructuredCategories,
- UserRules = UserRules
+ UserRules = UserRules,
+ Voice = Voice,
+ VoiceRepeaterWindow = VoiceRepeaterWindow,
+ VoiceProviderConfiguration = VoiceProviderConfiguration.Clone()
};
var json = data.ToJson();
File.WriteAllText(SettingsFilePath, json);
- Logger.Info("Settings saved");
+ if (logSuccess)
+ {
+ Logger.Info("Settings saved");
+ }
}
catch (Exception ex)
{
diff --git a/src/OpenClaw.Tray.WinUI/Services/Voice/AudioGraphStreamingSpeechToTextRoute.cs b/src/OpenClaw.Tray.WinUI/Services/Voice/AudioGraphStreamingSpeechToTextRoute.cs
new file mode 100644
index 0000000..088200a
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Services/Voice/AudioGraphStreamingSpeechToTextRoute.cs
@@ -0,0 +1,29 @@
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+using OpenClaw.Shared;
+
+namespace OpenClawTray.Services.Voice;
+
+internal sealed class AudioGraphStreamingSpeechToTextRoute : IVoiceSpeechToTextRoute
+{
+ private readonly IOpenClawLogger _logger;
+
+ public AudioGraphStreamingSpeechToTextRoute(IOpenClawLogger logger)
+ {
+ _logger = logger;
+ }
+
+ public VoiceSpeechToTextRouteKind Kind => VoiceSpeechToTextRouteKind.Streaming;
+
+ public Task StartAsync(
+ VoiceProviderOption provider,
+ VoiceSettings settings,
+ CancellationToken cancellationToken)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+ _logger.Info($"Selected streaming STT route for provider '{provider.Name}'.");
+ throw new NotSupportedException(
+ $"STT provider '{provider.Name}' is assigned to the AudioGraph streaming route, but that adapter is not implemented yet.");
+ }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Services/Voice/DispatcherQueueAdapter.cs b/src/OpenClaw.Tray.WinUI/Services/Voice/DispatcherQueueAdapter.cs
new file mode 100644
index 0000000..6c51f55
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Services/Voice/DispatcherQueueAdapter.cs
@@ -0,0 +1,16 @@
+namespace OpenClawTray.Services.Voice;
+
+public sealed class DispatcherQueueAdapter : IUiDispatcher
+{
+ private readonly Microsoft.UI.Dispatching.DispatcherQueue _dispatcherQueue;
+
+ public DispatcherQueueAdapter(Microsoft.UI.Dispatching.DispatcherQueue dispatcherQueue)
+ {
+ _dispatcherQueue = dispatcherQueue;
+ }
+
+ public bool TryEnqueue(Action callback)
+ {
+ return _dispatcherQueue.TryEnqueue(() => callback());
+ }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Services/Voice/IVoiceSpeechToTextRoute.cs b/src/OpenClaw.Tray.WinUI/Services/Voice/IVoiceSpeechToTextRoute.cs
new file mode 100644
index 0000000..16e3350
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Services/Voice/IVoiceSpeechToTextRoute.cs
@@ -0,0 +1,15 @@
+using System.Threading;
+using System.Threading.Tasks;
+using OpenClaw.Shared;
+
+namespace OpenClawTray.Services.Voice;
+
+internal interface IVoiceSpeechToTextRoute
+{
+ VoiceSpeechToTextRouteKind Kind { get; }
+
+ Task StartAsync(
+ VoiceProviderOption provider,
+ VoiceSettings settings,
+ CancellationToken cancellationToken);
+}
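
The route interface leaves provider-to-route mapping to a selector that is not part of this diff. A plausible sketch based on the catalog's `runtime` values (`"embedded"` for sherpa-onnx, `"cloud"` for the Azure streaming adapter); the mapping itself is an assumption:

```csharp
// Hypothetical selector: pick an STT route from the provider's runtime tag.
IVoiceSpeechToTextRoute SelectRoute(VoiceProviderOption provider) => provider.Runtime switch
{
    "embedded" => new SherpaOnnxSpeechToTextRoute(_logger),
    "cloud"    => new AudioGraphStreamingSpeechToTextRoute(_logger),
    _ => throw new NotSupportedException($"No STT route for runtime '{provider.Runtime}'.")
};
```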
diff --git a/src/OpenClaw.Tray.WinUI/Services/Voice/SherpaOnnxSpeechToTextRoute.cs b/src/OpenClaw.Tray.WinUI/Services/Voice/SherpaOnnxSpeechToTextRoute.cs
new file mode 100644
index 0000000..3698d51
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Services/Voice/SherpaOnnxSpeechToTextRoute.cs
@@ -0,0 +1,29 @@
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+using OpenClaw.Shared;
+
+namespace OpenClawTray.Services.Voice;
+
+internal sealed class SherpaOnnxSpeechToTextRoute : IVoiceSpeechToTextRoute
+{
+ private readonly IOpenClawLogger _logger;
+
+ public SherpaOnnxSpeechToTextRoute(IOpenClawLogger logger)
+ {
+ _logger = logger;
+ }
+
+ public VoiceSpeechToTextRouteKind Kind => VoiceSpeechToTextRouteKind.SherpaOnnx;
+
+ public Task StartAsync(
+ VoiceProviderOption provider,
+ VoiceSettings settings,
+ CancellationToken cancellationToken)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+ _logger.Info($"Selected embedded sherpa-onnx STT route for provider '{provider.Name}'.");
+ throw new NotSupportedException(
+ "The sherpa-onnx STT route is not implemented yet. This route will require a user-provided local model bundle.");
+ }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceCaptureService.cs b/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceCaptureService.cs
new file mode 100644
index 0000000..c4e053f
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceCaptureService.cs
@@ -0,0 +1,431 @@
+using System;
+using System.Linq;
+using System.Runtime.InteropServices;
+using System.Threading;
+using System.Threading.Tasks;
+using OpenClaw.Shared;
+using WinRT;
+using Windows.Devices.Enumeration;
+using Windows.Media;
+using Windows.Media.Audio;
+using Windows.Media.Capture;
+using Windows.Media.Devices;
+using Windows.Media.Render;
+
+namespace OpenClawTray.Services.Voice;
+
+public sealed class VoiceAudioFrameEventArgs : EventArgs
+{
+ public VoiceAudioFrameEventArgs(
+ string? deviceId,
+ string? deviceName,
+ DateTime utcTimestamp,
+ int sampleRateHz,
+ int channelCount,
+ byte[] data,
+ float peakLevel)
+ {
+ DeviceId = deviceId;
+ DeviceName = deviceName;
+ UtcTimestamp = utcTimestamp;
+ SampleRateHz = sampleRateHz;
+ ChannelCount = channelCount;
+ Data = data;
+ PeakLevel = peakLevel;
+ }
+
+ public string? DeviceId { get; }
+ public string? DeviceName { get; }
+ public DateTime UtcTimestamp { get; }
+ public int SampleRateHz { get; }
+ public int ChannelCount { get; }
+ public byte[] Data { get; }
+ public float PeakLevel { get; }
+}
+
+public sealed class VoiceCaptureSignalEventArgs : EventArgs
+{
+ public VoiceCaptureSignalEventArgs(
+ string? deviceId,
+ string? deviceName,
+ DateTime utcTimestamp,
+ float peakLevel)
+ {
+ DeviceId = deviceId;
+ DeviceName = deviceName;
+ UtcTimestamp = utcTimestamp;
+ PeakLevel = peakLevel;
+ }
+
+ public string? DeviceId { get; }
+ public string? DeviceName { get; }
+ public DateTime UtcTimestamp { get; }
+ public float PeakLevel { get; }
+}
+
+public sealed class VoiceCaptureService : IAsyncDisposable
+{
+ private const float DefaultSignalThreshold = 0.015f;
+
+ private readonly IOpenClawLogger _logger;
+ private readonly object _gate = new();
+
+ private AudioGraph? _audioGraph;
+ private AudioDeviceInputNode? _deviceInputNode;
+ private AudioFrameOutputNode? _frameOutputNode;
+ private DeviceInformation? _activeCaptureDevice;
+ private int _sampleRateHz;
+ private int _channelCount;
+ private bool _captureReady;
+ private TaskCompletionSource<bool> _captureReadyTcs = CreateCaptureReadyTcs();
+
+ public VoiceCaptureService(IOpenClawLogger logger)
+ {
+ _logger = logger;
+ }
+
+ public event EventHandler? FrameCaptured;
+ public event EventHandler? SignalDetected;
+
+ public bool IsRunning
+ {
+ get
+ {
+ lock (_gate)
+ {
+ return _audioGraph != null;
+ }
+ }
+ }
+
+ public string? ActiveDeviceId
+ {
+ get
+ {
+ lock (_gate)
+ {
+ return _activeCaptureDevice?.Id;
+ }
+ }
+ }
+
+ public string? ActiveDeviceName
+ {
+ get
+ {
+ lock (_gate)
+ {
+ return _activeCaptureDevice?.Name;
+ }
+ }
+ }
+
+ public async Task StartAsync(VoiceSettings settings, CancellationToken cancellationToken)
+ {
+ ArgumentNullException.ThrowIfNull(settings);
+
+ await StopAsync();
+ cancellationToken.ThrowIfCancellationRequested();
+
+ lock (_gate)
+ {
+ _captureReady = false;
+ _captureReadyTcs = CreateCaptureReadyTcs();
+ }
+
+ AudioGraph? audioGraph = null;
+ AudioDeviceInputNode? deviceInputNode = null;
+ AudioFrameOutputNode? frameOutputNode = null;
+
+ try
+ {
+ var graphSettings = new AudioGraphSettings(AudioRenderCategory.Speech)
+ {
+ QuantumSizeSelectionMode = QuantumSizeSelectionMode.ClosestToDesired,
+ DesiredSamplesPerQuantum = (int)ResolveDesiredSamplesPerQuantum(settings.SampleRateHz, settings.CaptureChunkMs)
+ };
+
+ var graphCreation = await AudioGraph.CreateAsync(graphSettings);
+ if (graphCreation.Status != AudioGraphCreationStatus.Success || graphCreation.Graph == null)
+ {
+ throw new InvalidOperationException($"AudioGraph unavailable: {graphCreation.Status}");
+ }
+
+ audioGraph = graphCreation.Graph;
+ var captureDevice = await ResolveCaptureDeviceAsync(settings.InputDeviceId);
+ var inputCreation = await audioGraph.CreateDeviceInputNodeAsync(
+ MediaCategory.Speech,
+ audioGraph.EncodingProperties,
+ captureDevice);
+
+ if (inputCreation.Status != AudioDeviceNodeCreationStatus.Success || inputCreation.DeviceInputNode == null)
+ {
+ throw new InvalidOperationException($"Audio input node unavailable: {inputCreation.Status}");
+ }
+
+ deviceInputNode = inputCreation.DeviceInputNode;
+ frameOutputNode = audioGraph.CreateFrameOutputNode(audioGraph.EncodingProperties);
+ deviceInputNode.AddOutgoingConnection(frameOutputNode);
+
+ audioGraph.QuantumStarted += OnAudioGraphQuantumStarted;
+ audioGraph.UnrecoverableErrorOccurred += OnAudioGraphUnrecoverableErrorOccurred;
+
+ lock (_gate)
+ {
+ _audioGraph = audioGraph;
+ _deviceInputNode = deviceInputNode;
+ _frameOutputNode = frameOutputNode;
+ _activeCaptureDevice = captureDevice;
+ _sampleRateHz = (int)audioGraph.EncodingProperties.SampleRate;
+ _channelCount = (int)audioGraph.EncodingProperties.ChannelCount;
+ }
+
+ frameOutputNode.Start();
+ deviceInputNode.Start();
+ audioGraph.Start();
+
+ audioGraph = null;
+ deviceInputNode = null;
+ frameOutputNode = null;
+
+ _logger.Info(
+ $"Voice capture graph started on {(captureDevice?.Name ?? "system default microphone")} ({captureDevice?.Id ?? "default"})");
+ }
+ finally
+ {
+ if (frameOutputNode != null)
+ {
+ try { frameOutputNode.Stop(); } catch { }
+ try { frameOutputNode.Dispose(); } catch { }
+ }
+
+ if (deviceInputNode != null)
+ {
+ try { deviceInputNode.Stop(); } catch { }
+ try { deviceInputNode.Dispose(); } catch { }
+ }
+
+ if (audioGraph != null)
+ {
+ audioGraph.QuantumStarted -= OnAudioGraphQuantumStarted;
+ audioGraph.UnrecoverableErrorOccurred -= OnAudioGraphUnrecoverableErrorOccurred;
+ try { audioGraph.Stop(); } catch { }
+ try { audioGraph.Dispose(); } catch { }
+ }
+ }
+ }
+
+ public ValueTask DisposeAsync()
+ {
+ return new ValueTask(StopAsync());
+ }
+
+ public async Task StopAsync()
+ {
+ AudioGraph? audioGraph;
+ AudioDeviceInputNode? deviceInputNode;
+ AudioFrameOutputNode? frameOutputNode;
+ string? deviceName;
+
+ lock (_gate)
+ {
+ audioGraph = _audioGraph;
+ _audioGraph = null;
+ deviceInputNode = _deviceInputNode;
+ _deviceInputNode = null;
+ frameOutputNode = _frameOutputNode;
+ _frameOutputNode = null;
+ deviceName = _activeCaptureDevice?.Name;
+ _activeCaptureDevice = null;
+ _sampleRateHz = 0;
+ _channelCount = 0;
+ }
+
+ if (audioGraph == null && deviceInputNode == null && frameOutputNode == null)
+ {
+ return;
+ }
+
+ if (audioGraph != null)
+ {
+ audioGraph.QuantumStarted -= OnAudioGraphQuantumStarted;
+ audioGraph.UnrecoverableErrorOccurred -= OnAudioGraphUnrecoverableErrorOccurred;
+ }
+
+ try { frameOutputNode?.Stop(); } catch { }
+ try { deviceInputNode?.Stop(); } catch { }
+ try { audioGraph?.Stop(); } catch { }
+
+ try { frameOutputNode?.Dispose(); } catch { }
+ try { deviceInputNode?.Dispose(); } catch { }
+ try { audioGraph?.Dispose(); } catch { }
+
+ await Task.CompletedTask;
+ _logger.Info($"Voice capture graph stopped{(string.IsNullOrWhiteSpace(deviceName) ? string.Empty : $" ({deviceName})")}");
+ }
+
+ public Task WaitForCaptureReadyAsync(CancellationToken cancellationToken)
+ {
+ Task readinessTask;
+
+ lock (_gate)
+ {
+ readinessTask = _captureReady ? Task.CompletedTask : _captureReadyTcs.Task;
+ }
+
+ return readinessTask.WaitAsync(cancellationToken);
+ }
+
+ internal static uint ResolveDesiredSamplesPerQuantum(int sampleRateHz, int chunkMs)
+ {
+ return VoiceCaptureMath.ResolveDesiredSamplesPerQuantum(sampleRateHz, chunkMs);
+ }
+
+ internal static bool HasAudibleSignal(float peakLevel, float threshold = DefaultSignalThreshold)
+ {
+ return VoiceCaptureMath.HasAudibleSignal(peakLevel, threshold);
+ }
+
+ internal static float ComputePeakLevel(byte[] data)
+ {
+ return VoiceCaptureMath.ComputePeakLevel(data);
+ }
+
+ private async Task ResolveCaptureDeviceAsync(string? preferredInputDeviceId)
+ {
+ var devices = await DeviceInformation.FindAllAsync(DeviceClass.AudioCapture);
+ if (devices.Count == 0)
+ {
+ throw new InvalidOperationException("No audio capture devices are available.");
+ }
+
+ if (!string.IsNullOrWhiteSpace(preferredInputDeviceId))
+ {
+ var selected = devices.FirstOrDefault(device =>
+ string.Equals(device.Id, preferredInputDeviceId, StringComparison.Ordinal));
+
+ if (selected != null)
+ {
+ return selected;
+ }
+
+ throw new InvalidOperationException($"Selected input device '{preferredInputDeviceId}' was not found.");
+ }
+
+ var defaultId = MediaDevice.GetDefaultAudioCaptureId(AudioDeviceRole.Default);
+ var defaultDevice = devices.FirstOrDefault(device =>
+ string.Equals(device.Id, defaultId, StringComparison.Ordinal));
+
+ return defaultDevice ?? devices[0];
+ }
+
+ private void OnAudioGraphUnrecoverableErrorOccurred(AudioGraph sender, AudioGraphUnrecoverableErrorOccurredEventArgs args)
+ {
+ _logger.Warn($"Voice capture graph unrecoverable error: {args.Error}");
+ }
+
+ private void OnAudioGraphQuantumStarted(AudioGraph sender, object args)
+ {
+ try
+ {
+ AudioFrameOutputNode? frameOutputNode;
+ string? deviceId;
+ string? deviceName;
+ int sampleRateHz;
+ int channelCount;
+
+ lock (_gate)
+ {
+ frameOutputNode = _frameOutputNode;
+ deviceId = _activeCaptureDevice?.Id;
+ deviceName = _activeCaptureDevice?.Name;
+ sampleRateHz = _sampleRateHz;
+ channelCount = _channelCount;
+ }
+
+ if (frameOutputNode == null)
+ {
+ return;
+ }
+
+ using var frame = frameOutputNode.GetFrame();
+ if (!TryCopyAudioFrame(frame, out var bytes) || bytes.Length == 0)
+ {
+ return;
+ }
+
+ TaskCompletionSource<bool>? captureReadyTcs = null;
+
+ lock (_gate)
+ {
+ if (!_captureReady)
+ {
+ _captureReady = true;
+ captureReadyTcs = _captureReadyTcs;
+ }
+ }
+
+ captureReadyTcs?.TrySetResult(true);
+
+ var utcNow = DateTime.UtcNow;
+ var peak = ComputePeakLevel(bytes);
+ FrameCaptured?.Invoke(
+ this,
+ new VoiceAudioFrameEventArgs(
+ deviceId,
+ deviceName,
+ utcNow,
+ sampleRateHz,
+ channelCount,
+ bytes,
+ peak));
+
+ if (HasAudibleSignal(peak))
+ {
+ SignalDetected?.Invoke(
+ this,
+ new VoiceCaptureSignalEventArgs(
+ deviceId,
+ deviceName,
+ utcNow,
+ peak));
+ }
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Voice capture quantum processing failed: {ex.Message}");
+ }
+ }
+
+ private static bool TryCopyAudioFrame(AudioFrame frame, out byte[] bytes)
+ {
+ bytes = [];
+
+ using var buffer = frame.LockBuffer(AudioBufferAccessMode.Read);
+ using var reference = buffer.CreateReference();
+ var access = reference.As<IMemoryBufferByteAccess>();
+ access.GetBuffer(out var data, out var capacity);
+
+ if (data == IntPtr.Zero || capacity == 0)
+ {
+ return false;
+ }
+
+ bytes = new byte[capacity];
+ Marshal.Copy(data, bytes, 0, (int)capacity);
+ return true;
+ }
+
+ private static TaskCompletionSource<bool> CreateCaptureReadyTcs()
+ {
+ return new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
+ }
+
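+ // Projection of the WinRT IMemoryBufferByteAccess COM interface, used to read raw audio frame bytes.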
+ [ComImport]
+ [Guid("5B0D3235-4DBA-4D44-865E-8F1D0E4FD04D")]
+ [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
+ private interface IMemoryBufferByteAccess
+ {
+ void GetBuffer(out IntPtr buffer, out uint capacity);
+ }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceDisplayHelper.cs b/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceDisplayHelper.cs
new file mode 100644
index 0000000..a671cf0
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceDisplayHelper.cs
@@ -0,0 +1,49 @@
+using OpenClaw.Shared;
+
+namespace OpenClawTray.Services.Voice;
+
+public static class VoiceDisplayHelper
+{
+ public static string GetModeLabel(VoiceActivationMode mode)
+ {
+ return mode switch
+ {
+ VoiceActivationMode.VoiceWake => "Voice Wake",
+ VoiceActivationMode.TalkMode => "Talk Mode",
+ _ => "Off"
+ };
+ }
+
+ public static string GetStateLabel(VoiceRuntimeState state)
+ {
+ return state switch
+ {
+ VoiceRuntimeState.Arming => "Starting",
+ VoiceRuntimeState.ListeningForVoiceWake => "Listening",
+ VoiceRuntimeState.ListeningContinuously => "Listening",
+ VoiceRuntimeState.RecordingUtterance => "Recording",
+ VoiceRuntimeState.SubmittingAudio => "Sending",
+ VoiceRuntimeState.AwaitingResponse => "Waiting for reply",
+ VoiceRuntimeState.PlayingResponse => "Speaking",
+ VoiceRuntimeState.Paused => "Paused",
+ VoiceRuntimeState.Error => "Error",
+ VoiceRuntimeState.Idle => "Idle",
+ _ => "Stopped"
+ };
+ }
+
+ public static string GetRuntimeLabel(VoiceStatusInfo status)
+ {
+ if (status.State == VoiceRuntimeState.Paused)
+ {
+ return $"{GetModeLabel(status.Mode)} (Paused)";
+ }
+
+ if (status.Running)
+ {
+ return $"{GetModeLabel(status.Mode)} ({GetStateLabel(status.State)})";
+ }
+
+ return "Off";
+ }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceService.cs b/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceService.cs
new file mode 100644
index 0000000..646bb25
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceService.cs
@@ -0,0 +1,2589 @@
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Linq;
+using System.Diagnostics;
+using System.IO;
+using System.Net.Http;
+using System.Runtime.InteropServices.WindowsRuntime;
+using System.Text.Json;
+using System.Threading;
+using System.Threading.Tasks;
+using OpenClaw.Shared;
+using OpenClawTray.Helpers;
+using Windows.Devices.Enumeration;
+using Windows.Foundation;
+using Windows.Media.Capture;
+using Windows.Media.Core;
+using Windows.Media.Devices;
+using Windows.Media.Playback;
+using Windows.Media.SpeechRecognition;
+using Windows.Media.SpeechSynthesis;
+using Windows.Storage.Streams;
+
+namespace OpenClawTray.Services.Voice;
+
+public sealed class VoiceService : IVoiceRuntime, IVoiceConfigurationApi, IVoiceRuntimeControlApi, IDisposable
+{
+ private const string DefaultSessionKey = "agent:main:main";
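+ // HRESULT surfaced by SpeechRecognizer when the Windows online speech privacy setting is declined.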
+ private const int HResultSpeechPrivacyDeclined = unchecked((int)0x80045509);
+ private static readonly TimeSpan TransportConnectTimeout = TimeSpan.FromSeconds(10);
+ private static readonly TimeSpan ReplyTimeout = TimeSpan.FromSeconds(45);
+ private static readonly TimeSpan LateReplyGraceWindow = TimeSpan.FromMinutes(2);
+ private static readonly TimeSpan InitialRecognitionReadyDelay = TimeSpan.FromMilliseconds(500);
+ private static readonly TimeSpan DuplicateTranscriptWindow = TimeSpan.FromMilliseconds(750);
+ private static readonly TimeSpan DuplicateAssistantReplyWindow = TimeSpan.FromSeconds(5);
+ private static readonly TimeSpan HypothesisPromotionWindow = TimeSpan.FromSeconds(2);
+ private static readonly TimeSpan RecognitionResumeRetryDelay = TimeSpan.FromMilliseconds(500);
+ private static readonly TimeSpan QueuedReplyPlaybackGap = TimeSpan.FromMilliseconds(500);
+ private const string LowConfidenceRepeatPrompt = "Sorry, I didn't catch that. Could you say it again?";
+
+ private readonly IOpenClawLogger _logger;
+ private readonly SettingsManager _settings;
+ private readonly VoiceCloudTextToSpeechClient _cloudTextToSpeechClient;
+ private readonly object _gate = new();
+
+ private VoiceStatusInfo _status;
+ private VoiceActivationMode? _runtimeModeOverride;
+ private CancellationTokenSource? _runtimeCts;
+ private OpenClawGatewayClient? _chatClient;
+ private ConnectionStatus _chatTransportStatus = ConnectionStatus.Disconnected;
+ private TaskCompletionSource<bool>? _transportReadyTcs;
+ private VoiceCaptureService? _voiceCaptureService;
+ private IVoiceSpeechToTextRoute? _speechToTextRoute;
+ private SpeechRecognizer? _speechRecognizer;
+ private SpeechSynthesizer? _speechSynthesizer;
+ private MediaPlayer? _mediaPlayer;
+ private bool _recognitionActive;
+ private int _recognitionSessionGeneration;
+ private bool _recognitionSessionHadActivity;
+ private bool _recognitionSessionHadCaptureSignal;
+ private bool _recognitionRestartInProgress;
+ private bool _awaitingReply;
+ private bool _isSpeaking;
+ private bool _replyPlaybackLoopActive;
+ private bool _quickPaused;
+ private string? _lastTranscript;
+ private DateTime _lastTranscriptUtc;
+ private string? _lastHypothesisText;
+ private DateTime _lastHypothesisUtc;
+ private readonly Queue<(string Text, string? SessionKey)> _pendingAssistantReplies = new();
+ private CancellationTokenSource? _playbackSkipCts;
+ private string? _currentReplyPreview;
+ private string? _lateReplySessionKey;
+ private DateTime? _lateReplyGraceUntilUtc;
+ private string? _lastAcceptedAssistantReplyText;
+ private string? _lastAcceptedAssistantReplySessionKey;
+ private DateTime _lastAcceptedAssistantReplyUtc;
+ private bool _disposed;
+
+ public event EventHandler? ConversationTurnAvailable;
+ public event EventHandler? TranscriptDraftUpdated;
+
+ public VoiceService(IOpenClawLogger logger, SettingsManager settings)
+ {
+ _logger = logger;
+ _settings = settings;
+ _cloudTextToSpeechClient = new VoiceCloudTextToSpeechClient();
+ _status = BuildStoppedStatus(null, null);
+ MediaDevice.DefaultAudioCaptureDeviceChanged += OnDefaultAudioCaptureDeviceChanged;
+ }
+
+ public VoiceStatusInfo CurrentStatus
+ {
+ get
+ {
+ lock (_gate)
+ {
+ return Clone(_status);
+ }
+ }
+ }
+
+ public Task<VoiceSettings> GetSettingsAsync()
+ {
+ lock (_gate)
+ {
+ return Task.FromResult(Clone(_settings.Voice));
+ }
+ }
+
+ public Task<VoiceSettings> UpdateSettingsAsync(VoiceSettingsUpdateArgs update)
+ {
+ ArgumentNullException.ThrowIfNull(update);
+
+ lock (_gate)
+ {
+ _settings.Voice = Clone(update.Settings);
+ if (update.Persist)
+ {
+ _settings.Save();
+ }
+
+ if (!_settings.Voice.Enabled || _settings.Voice.Mode == VoiceActivationMode.Off)
+ {
+ _quickPaused = false;
+ _status = BuildStoppedStatus(_status.SessionKey, _status.LastError);
+ }
+ else if (_quickPaused || _status.State == VoiceRuntimeState.Paused)
+ {
+ _status = BuildPausedStatus(
+ _runtimeModeOverride ?? _settings.Voice.Mode,
+ _status.SessionKey,
+ _status.LastError);
+ }
+ else if (_status.Running)
+ {
+ _status = BuildRunningStatus(
+ _runtimeModeOverride ?? _settings.Voice.Mode,
+ _status.SessionKey,
+ _status.State,
+ _status.LastError);
+ }
+ else
+ {
+ _status = BuildStoppedStatus(_status.SessionKey, _status.LastError);
+ }
+
+ return Task.FromResult(Clone(_settings.Voice));
+ }
+ }
+
+ public VoiceProviderConfigurationStore GetProviderConfiguration()
+ {
+ lock (_gate)
+ {
+ return _settings.VoiceProviderConfiguration.Clone();
+ }
+ }
+
+ public void SetProviderConfiguration(VoiceProviderConfigurationStore configurationStore)
+ {
+ ArgumentNullException.ThrowIfNull(configurationStore);
+
+ lock (_gate)
+ {
+ _settings.VoiceProviderConfiguration = configurationStore.Clone();
+ }
+ }
+
+ public Task<VoiceStatusInfo> GetStatusAsync()
+ {
+ lock (_gate)
+ {
+ return Task.FromResult(Clone(_status));
+ }
+ }
+
+ public async Task<VoiceStatusInfo> ToggleQuickPauseAsync()
+ {
+ ObjectDisposedException.ThrowIf(_disposed, this);
+
+ VoiceActivationMode mode;
+ string? sessionKey;
+ bool shouldResume;
+
+ lock (_gate)
+ {
+ mode = _runtimeModeOverride ?? _settings.Voice.Mode;
+ sessionKey = _status.SessionKey;
+
+ if (!_settings.Voice.Enabled || mode == VoiceActivationMode.Off)
+ {
+ _quickPaused = false;
+ _status = BuildStoppedStatus(sessionKey, "Voice mode is disabled");
+ return Clone(_status);
+ }
+
+ shouldResume = _quickPaused || _status.State == VoiceRuntimeState.Paused;
+ if (!shouldResume)
+ {
+ _quickPaused = true;
+ }
+ }
+
+ if (shouldResume)
+ {
+ lock (_gate)
+ {
+ _quickPaused = false;
+ }
+
+ var resumed = await StartAsync(new VoiceStartArgs
+ {
+ Mode = mode,
+ SessionKey = sessionKey
+ });
+ _logger.Info($"Voice runtime resumed via quick toggle ({mode})");
+ return resumed;
+ }
+
+ await StopRuntimeResourcesAsync(updateStoppedStatus: false);
+
+ lock (_gate)
+ {
+ _status = BuildPausedStatus(mode, sessionKey, null);
+ _logger.Info($"Voice runtime paused via quick toggle ({mode})");
+ return Clone(_status);
+ }
+ }
+
+ public async Task<VoiceStatusInfo> StartAsync(VoiceStartArgs args)
+ {
+ ObjectDisposedException.ThrowIf(_disposed, this);
+
+ args ??= new VoiceStartArgs();
+
+ VoiceSettings effectiveSettings;
+ VoiceActivationMode requestedMode;
+ string? sessionKey;
+
+ lock (_gate)
+ {
+ effectiveSettings = Clone(_settings.Voice);
+ requestedMode = args.Mode ?? effectiveSettings.Mode;
+ sessionKey = args.SessionKey ?? _status.SessionKey;
+
+ if (args.Mode.HasValue && args.Mode.Value != VoiceActivationMode.Off)
+ {
+ effectiveSettings.Enabled = true;
+ effectiveSettings.Mode = args.Mode.Value;
+ _runtimeModeOverride = args.Mode.Value;
+ }
+ else if (args.Mode == VoiceActivationMode.Off)
+ {
+ _runtimeModeOverride = null;
+ }
+
+ if (!effectiveSettings.Enabled || requestedMode == VoiceActivationMode.Off)
+ {
+ _quickPaused = false;
+ _status = BuildStoppedStatus(sessionKey, "Voice mode is disabled");
+ return Clone(_status);
+ }
+
+ if (_quickPaused)
+ {
+ _status = BuildPausedStatus(requestedMode, sessionKey, _status.LastError);
+ return Clone(_status);
+ }
+ }
+
+ await StopRuntimeResourcesAsync(updateStoppedStatus: false);
+
+ try
+ {
+ switch (requestedMode)
+ {
+ case VoiceActivationMode.TalkMode:
+ await StartTalkModeRuntimeAsync(effectiveSettings, sessionKey);
+ break;
+ case VoiceActivationMode.VoiceWake:
+ lock (_gate)
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.VoiceWake,
+ sessionKey,
+ VoiceRuntimeState.ListeningForVoiceWake,
+ "Voice Wake capture is not implemented yet");
+ }
+ _logger.Info("Voice runtime started in mode VoiceWake");
+ break;
+ default:
+ lock (_gate)
+ {
+ _status = BuildStoppedStatus(sessionKey, "Voice mode is disabled");
+ }
+ break;
+ }
+ }
+ catch (Exception ex)
+ {
+ _logger.Error("Voice runtime start failed", ex);
+ lock (_gate)
+ {
+ _status = BuildErrorStatus(requestedMode, sessionKey, GetUserFacingErrorMessage(ex));
+ }
+ }
+
+ return CurrentStatus;
+ }
+
+ public async Task<VoiceStatusInfo> StopAsync(VoiceStopArgs args)
+ {
+ args ??= new VoiceStopArgs();
+
+ await StopRuntimeResourcesAsync(updateStoppedStatus: false);
+
+ lock (_gate)
+ {
+ _quickPaused = false;
+ _runtimeModeOverride = null;
+ _status = BuildStoppedStatus(_status.SessionKey, args.Reason);
+ _logger.Info($"Voice runtime stopped{(string.IsNullOrWhiteSpace(args.Reason) ? string.Empty : $": {args.Reason}")}");
+ return Clone(_status);
+ }
+ }
+
+ public async Task<VoiceAudioDeviceInfo[]> ListDevicesAsync()
+ {
+ try
+ {
+ var inputDefaultId = MediaDevice.GetDefaultAudioCaptureId(AudioDeviceRole.Default);
+ var outputDefaultId = MediaDevice.GetDefaultAudioRenderId(AudioDeviceRole.Default);
+ var results = new List<VoiceAudioDeviceInfo>();
+
+ var inputDevices = await DeviceInformation.FindAllAsync(DeviceClass.AudioCapture);
+ foreach (var device in inputDevices)
+ {
+ results.Add(new VoiceAudioDeviceInfo
+ {
+ DeviceId = device.Id,
+ Name = device.Name,
+ IsDefault = string.Equals(device.Id, inputDefaultId, StringComparison.Ordinal),
+ IsInput = true
+ });
+ }
+
+ var outputDevices = await DeviceInformation.FindAllAsync(DeviceClass.AudioRender);
+ foreach (var device in outputDevices)
+ {
+ results.Add(new VoiceAudioDeviceInfo
+ {
+ DeviceId = device.Id,
+ Name = device.Name,
+ IsDefault = string.Equals(device.Id, outputDefaultId, StringComparison.Ordinal),
+ IsOutput = true
+ });
+ }
+
+ return results
+ .OrderByDescending(d => d.IsDefault)
+ .ThenBy(d => d.IsInput ? 0 : 1)
+ .ThenBy(d => d.Name, StringComparer.OrdinalIgnoreCase)
+ .ToArray();
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Voice device enumeration failed: {ex.Message}");
+ return
+ [
+ new VoiceAudioDeviceInfo
+ {
+ DeviceId = "default-input",
+ Name = "System default microphone",
+ IsDefault = true,
+ IsInput = true
+ },
+ new VoiceAudioDeviceInfo
+ {
+ DeviceId = "default-output",
+ Name = "System default speaker",
+ IsDefault = true,
+ IsOutput = true
+ }
+ ];
+ }
+ }
+
+ public VoiceProviderCatalog GetProviderCatalog()
+ {
+ return VoiceProviderCatalogService.LoadCatalog(_logger);
+ }
+
+ public void Dispose()
+ {
+ if (_disposed)
+ {
+ return;
+ }
+
+ _disposed = true;
+ MediaDevice.DefaultAudioCaptureDeviceChanged -= OnDefaultAudioCaptureDeviceChanged;
+ try
+ {
+ Task.Run(() => StopRuntimeResourcesAsync(updateStoppedStatus: true)).GetAwaiter().GetResult();
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Voice runtime dispose cleanup failed: {ex.Message}");
+ }
+ }
+
+ public async Task<VoiceStatusInfo> PauseAsync(VoicePauseArgs? args = null)
+ {
+ ObjectDisposedException.ThrowIf(_disposed, this);
+ args ??= new VoicePauseArgs();
+
+ VoiceActivationMode mode;
+ string? sessionKey;
+
+ lock (_gate)
+ {
+ mode = _runtimeModeOverride ?? _settings.Voice.Mode;
+ sessionKey = _status.SessionKey;
+
+ if (!_settings.Voice.Enabled || mode == VoiceActivationMode.Off)
+ {
+ _quickPaused = false;
+ _status = BuildStoppedStatus(sessionKey, "Voice mode is disabled");
+ return Clone(_status);
+ }
+
+ if (_quickPaused || _status.State == VoiceRuntimeState.Paused)
+ {
+ return Clone(_status);
+ }
+
+ _quickPaused = true;
+ }
+
+ await StopRuntimeResourcesAsync(updateStoppedStatus: false);
+
+ lock (_gate)
+ {
+ _status = BuildPausedStatus(mode, sessionKey, args.Reason);
+ _logger.Info($"Voice runtime paused{(string.IsNullOrWhiteSpace(args.Reason) ? string.Empty : $": {args.Reason}")}");
+ return Clone(_status);
+ }
+ }
+
+ public async Task<VoiceStatusInfo> ResumeAsync(VoiceResumeArgs? args = null)
+ {
+ ObjectDisposedException.ThrowIf(_disposed, this);
+ args ??= new VoiceResumeArgs();
+
+ VoiceActivationMode mode;
+ string? sessionKey;
+
+ lock (_gate)
+ {
+ mode = _runtimeModeOverride ?? _settings.Voice.Mode;
+ sessionKey = _status.SessionKey;
+ _quickPaused = false;
+ }
+
+ var resumed = await StartAsync(new VoiceStartArgs
+ {
+ Mode = mode,
+ SessionKey = sessionKey
+ });
+
+ _logger.Info($"Voice runtime resumed{(string.IsNullOrWhiteSpace(args.Reason) ? string.Empty : $": {args.Reason}")}");
+ return resumed;
+ }
+
+ public async Task<VoiceStatusInfo> SkipCurrentReplyAsync(VoiceSkipArgs? args = null)
+ {
+ args ??= new VoiceSkipArgs();
+
+ CancellationTokenSource? playbackSkipCts;
+
+ lock (_gate)
+ {
+ playbackSkipCts = _playbackSkipCts;
+ if (playbackSkipCts == null && _pendingAssistantReplies.Count == 0)
+ {
+ return Clone(_status);
+ }
+ }
+
+ playbackSkipCts?.Cancel();
+
+ await Task.Yield();
+
+ lock (_gate)
+ {
+ _logger.Info($"Voice reply skipped{(string.IsNullOrWhiteSpace(args.Reason) ? string.Empty : $": {args.Reason}")}");
+ return Clone(_status);
+ }
+ }
+
+ private async Task StartTalkModeRuntimeAsync(VoiceSettings settings, string? sessionKey)
+ {
+ var effectiveSessionKey = string.IsNullOrWhiteSpace(sessionKey) ? DefaultSessionKey : sessionKey;
+ var selectedSpeechToText = VoiceProviderCatalogService.ResolveSpeechToTextProvider(
+ settings.SpeechToTextProviderId,
+ _logger);
+ var selectedTextToSpeech = VoiceProviderCatalogService.ResolveTextToSpeechProvider(
+ settings.TextToSpeechProviderId,
+ _logger);
+ var fallbackMessage = BuildProviderFallbackMessage(selectedSpeechToText, selectedTextToSpeech);
+
+ await EnsureMicrophoneConsentAsync();
+
+ CancellationTokenSource? runtimeCts = null;
+ IVoiceSpeechToTextRoute? speechToTextRoute = null;
+ VoiceCaptureService? captureService = null;
+ SpeechRecognizer? recognizer = null;
+ SpeechSynthesizer? synthesizer = null;
+ MediaPlayer? player = null;
+
+ try
+ {
+ runtimeCts = new CancellationTokenSource();
+ speechToTextRoute = VoiceSpeechToTextRouteFactory.Create(selectedSpeechToText, _logger);
+ var speechToTextResources = await speechToTextRoute.StartAsync(selectedSpeechToText, settings, runtimeCts.Token);
+ captureService = speechToTextResources.CaptureService;
+ recognizer = speechToTextResources.SpeechRecognizer;
+ synthesizer = new SpeechSynthesizer();
+ player = new MediaPlayer();
+ await ConfigurePlaybackOutputDeviceAsync(player, settings);
+ await WarmSpeechPlaybackPipelineAsync(player, synthesizer, selectedTextToSpeech, runtimeCts.Token);
+
+ if (recognizer != null)
+ {
+ recognizer.HypothesisGenerated += OnSpeechHypothesisGenerated;
+ recognizer.ContinuousRecognitionSession.ResultGenerated += OnSpeechResultGenerated;
+ recognizer.ContinuousRecognitionSession.Completed += OnSpeechRecognitionCompleted;
+ }
+
+ lock (_gate)
+ {
+ _runtimeCts = runtimeCts;
+ _voiceCaptureService = captureService;
+ _speechToTextRoute = speechToTextRoute;
+ _speechRecognizer = recognizer;
+ _speechSynthesizer = synthesizer;
+ _mediaPlayer = player;
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ effectiveSessionKey,
+ VoiceRuntimeState.Arming,
+ fallbackMessage);
+ }
+
+ await EnsureChatTransportAsync(runtimeCts.Token);
+ await StartRecognitionSessionAsync();
+ _logger.Info("Voice runtime started in mode TalkMode");
+ }
+ catch
+ {
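+ // If the runtime fields were already published, run the shared teardown;
+ // otherwise dispose only the locals created by this failed attempt.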
+ var cleanupStoredState = false;
+ lock (_gate)
+ {
+ cleanupStoredState = ReferenceEquals(_runtimeCts, runtimeCts);
+ }
+
+ if (cleanupStoredState)
+ {
+ await StopRuntimeResourcesAsync(updateStoppedStatus: false);
+ }
+ else
+ {
+ if (captureService != null)
+ {
+ try { await captureService.StopAsync(); } catch { }
+ try { await captureService.DisposeAsync(); } catch { }
+ }
+
+ if (recognizer != null)
+ {
+ try { recognizer.HypothesisGenerated -= OnSpeechHypothesisGenerated; } catch { }
+ try { recognizer.ContinuousRecognitionSession.ResultGenerated -= OnSpeechResultGenerated; } catch { }
+ try { recognizer.ContinuousRecognitionSession.Completed -= OnSpeechRecognitionCompleted; } catch { }
+ try { recognizer.Dispose(); } catch { }
+ }
+
+ try { player?.Dispose(); } catch { }
+ try { synthesizer?.Dispose(); } catch { }
+ try { runtimeCts?.Dispose(); } catch { }
+ }
+
+ throw;
+ }
+ }
+
+ private async Task ConfigurePlaybackOutputDeviceAsync(MediaPlayer player, VoiceSettings settings)
+ {
+ if (string.IsNullOrWhiteSpace(settings.OutputDeviceId))
+ {
+ return;
+ }
+
+ try
+ {
+ var renderSelector = MediaDevice.GetAudioRenderSelector();
+ var renderDevices = await DeviceInformation.FindAllAsync(renderSelector);
+ var selectedRenderDevice = renderDevices.FirstOrDefault(device =>
+ string.Equals(device.Id, settings.OutputDeviceId, StringComparison.Ordinal));
+
+ if (selectedRenderDevice == null)
+ {
+ _logger.Warn(
+ $"Selected output device '{settings.OutputDeviceId}' was not found; falling back to the system default speaker.");
+ return;
+ }
+
+ player.AudioDevice = selectedRenderDevice;
+ _logger.Info($"Voice playback output device set to {selectedRenderDevice.Name}");
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Failed to configure selected output device: {ex.Message}");
+ }
+ }
+
+ private async Task EnsureMicrophoneConsentAsync()
+ {
+ if (!PackageHelper.IsPackaged)
+ {
+ return;
+ }
+
+ using var capture = new MediaCapture();
+ var initSettings = new MediaCaptureInitializationSettings
+ {
+ StreamingCaptureMode = StreamingCaptureMode.Audio,
+ SharingMode = MediaCaptureSharingMode.SharedReadOnly,
+ MemoryPreference = MediaCaptureMemoryPreference.Cpu
+ };
+
+ await capture.InitializeAsync(initSettings);
+ }
+
+ private async Task EnsureChatTransportAsync(CancellationToken cancellationToken)
+ {
+ OpenClawGatewayClient? existingClient;
+ TaskCompletionSource<bool> readySource;
+ bool shouldStartConnection;
+
+ lock (_gate)
+ {
+ existingClient = _chatClient;
+ if (_chatTransportStatus == ConnectionStatus.Connected)
+ {
+ return;
+ }
+
+ readySource = GetOrCreateTransportReadySource(
+ _chatTransportStatus,
+ _transportReadyTcs,
+ out shouldStartConnection);
+ _transportReadyTcs = readySource;
+
+ if (shouldStartConnection)
+ {
+ _chatTransportStatus = ConnectionStatus.Connecting;
+
+ if (existingClient == null)
+ {
+ _chatClient = new OpenClawGatewayClient(_settings.GatewayUrl, _settings.Token, _logger);
+ _chatClient.StatusChanged += OnChatTransportStatusChanged;
+ _chatClient.ChatMessageReceived += OnChatMessageReceived;
+ _chatClient.SessionPreviewUpdated += OnSessionPreviewUpdated;
+ existingClient = _chatClient;
+ }
+ }
+ }
+
+ if (shouldStartConnection)
+ {
+ await existingClient!.ConnectAsync();
+ }
+
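+ // Race the transport-ready signal against the connect timeout; cancellation flows through the delay task.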
+ var readyTask = readySource.Task;
+ var timeoutTask = Task.Delay(TransportConnectTimeout, cancellationToken);
+ var completed = await Task.WhenAny(readyTask, timeoutTask);
+ if (completed != readyTask)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+ throw new TimeoutException("Timed out connecting voice chat transport.");
+ }
+
+ await readyTask;
+ }
+
+ private static TaskCompletionSource<bool> GetOrCreateTransportReadySource(
+ ConnectionStatus transportStatus,
+ TaskCompletionSource<bool>? existingReadySource,
+ out bool shouldStartConnection)
+ {
+ return VoiceServiceTransportLogic.GetOrCreateTransportReadySource(
+ transportStatus,
+ existingReadySource,
+ out shouldStartConnection);
+ }
+
+ private async Task StartRecognitionSessionAsync(bool updateListeningStatus = true)
+ {
+ SpeechRecognizer? recognizer;
+ CancellationToken runtimeToken;
+ int generation;
+
+ lock (_gate)
+ {
+ recognizer = _speechRecognizer;
+ if (recognizer == null || _recognitionActive || _runtimeCts == null)
+ {
+ return;
+ }
+
+ runtimeToken = _runtimeCts.Token;
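+ // Bump the session generation so readiness monitors from earlier sessions become no-ops.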
+ generation = ++_recognitionSessionGeneration;
+ _recognitionSessionHadCaptureSignal = false;
+ }
+
+ _logger.Info("Starting speech recognition session");
+ await recognizer.ContinuousRecognitionSession.StartAsync();
+
+ lock (_gate)
+ {
+ _recognitionActive = true;
+ _recognitionRestartInProgress = false;
+ _recognitionSessionHadActivity = false;
+ _lastHypothesisText = null;
+ _lastHypothesisUtc = default;
+ if (updateListeningStatus && _status.Running && !_awaitingReply && !_isSpeaking)
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ _status.LastError);
+ }
+ }
+
+ _logger.Info("Speech recognition session started");
+ if (updateListeningStatus)
+ {
+ _ = MonitorListeningReadyAsync(generation, runtimeToken);
+ }
+ }
+
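+ // Promotes status to "Listening" only after capture frames are observed and the recognizer warm-up delay elapses.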
+ private async Task MonitorListeningReadyAsync(int generation, CancellationToken cancellationToken)
+ {
+ try
+ {
+ VoiceCaptureService? captureService;
+
+ lock (_gate)
+ {
+ captureService = _voiceCaptureService;
+ }
+
+ if (captureService != null)
+ {
+ await captureService.WaitForCaptureReadyAsync(cancellationToken);
+ }
+ await Task.Delay(InitialRecognitionReadyDelay, cancellationToken);
+
+ var transitionedToListening = false;
+
+ lock (_gate)
+ {
+ if (_runtimeCts == null ||
+ _runtimeCts.IsCancellationRequested ||
+ !_status.Running ||
+ _status.Mode != VoiceActivationMode.TalkMode ||
+ !_recognitionActive ||
+ _recognitionSessionGeneration != generation ||
+ _awaitingReply ||
+ _isSpeaking)
+ {
+ return;
+ }
+
+ if (_status.State != VoiceRuntimeState.ListeningContinuously)
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.ListeningContinuously,
+ _status.LastError);
+ transitionedToListening = true;
+ }
+ }
+
+ if (transitionedToListening)
+ {
+ var readinessSource = captureService == null
+ ? "recognizer warm-up completed"
+ : "capture frames observed and recognizer warm-up completed";
+ _logger.Info(
+ $"Speech pipeline ready; {readinessSource} ({InitialRecognitionReadyDelay.TotalMilliseconds:0}ms)");
+ }
+ }
+ catch (OperationCanceledException)
+ {
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Voice listening readiness check failed: {ex.Message}");
+ }
+ }
+
+ private async Task ResumeRecognitionSessionAsync(
+ CancellationToken cancellationToken,
+ string reason,
+ string? lastError = null,
+ bool rebuildRecognizer = false)
+ {
+ const int maxAttempts = 2;
+ string? currentError = lastError;
+
+ for (var attempt = 1; attempt <= maxAttempts; attempt++)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+
+ try
+ {
+ if (rebuildRecognizer && attempt == 1)
+ {
+ await RebuildSpeechRecognizerAsync(reason, cancellationToken);
+ }
+
+ await StartRecognitionSessionAsync();
+ return;
+ }
+ catch (OperationCanceledException)
+ {
+ throw;
+ }
+ catch (Exception ex)
+ {
+ currentError = GetUserFacingErrorMessage(ex);
+ _logger.Warn(
+ $"Voice recognition resume failed ({reason}, attempt {attempt}/{maxAttempts}): {ex.Message}");
+
+ lock (_gate)
+ {
+ _recognitionRestartInProgress = false;
+ if (_runtimeCts == null ||
+ !_status.Running ||
+ _status.Mode != VoiceActivationMode.TalkMode ||
+ _awaitingReply ||
+ _isSpeaking)
+ {
+ return;
+ }
+
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ currentError);
+ }
+
+ if (attempt == maxAttempts)
+ {
+ return;
+ }
+
+ await Task.Delay(RecognitionResumeRetryDelay, cancellationToken);
+ }
+ }
+ }
+
+ private async Task StopRecognitionSessionAsync()
+ {
+ SpeechRecognizer? recognizer;
+
+ lock (_gate)
+ {
+ recognizer = _speechRecognizer;
+ if (recognizer == null || !_recognitionActive)
+ {
+ return;
+ }
+
+ _recognitionActive = false;
+ _recognitionSessionHadCaptureSignal = false;
+ _lastHypothesisText = null;
+ _lastHypothesisUtc = default;
+ }
+
+ try
+ {
+ await recognizer.ContinuousRecognitionSession.CancelAsync();
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Voice recognition stop failed: {ex.Message}");
+ }
+ }
+
+ private async void OnSpeechResultGenerated(
+ SpeechContinuousRecognitionSession sender,
+ SpeechContinuousRecognitionResultGeneratedEventArgs args)
+ {
+ try
+ {
+ var result = args.Result;
+ var text = result.Text?.Trim();
+ var promotedHypothesis = false;
+ if (string.IsNullOrWhiteSpace(text))
+ {
+ return;
+ }
+
+ if (result.Status != SpeechRecognitionResultStatus.Success ||
+ result.Confidence == SpeechRecognitionConfidence.Rejected ||
+ result.Confidence == SpeechRecognitionConfidence.Low)
+ {
+ _logger.Info($"Voice recognition ignored result with confidence {result.Confidence}: {text}");
+ return;
+ }
+
+ lock (_gate)
+ {
+ text = SelectRecognizedText(
+ text,
+ _lastHypothesisText,
+ _lastHypothesisUtc,
+ DateTime.UtcNow,
+ out promotedHypothesis);
+ }
+
+ if (promotedHypothesis)
+ {
+ _logger.Info($"Voice recognition promoted recent hypothesis to recover truncated final result: {text}");
+ }
+
+ _logger.Info($"Voice recognition result ({result.Confidence}): {text}");
+ await HandleRecognizedTextAsync(text);
+ }
+ catch (Exception ex)
+ {
+ _logger.Error("Voice recognition handler failed", ex);
+ CancellationToken cancellationToken;
+ var shouldResume = false;
+ var userMessage = GetUserFacingErrorMessage(ex);
+ lock (_gate)
+ {
+ if (_runtimeCts != null &&
+ _status.Running &&
+ _status.Mode == VoiceActivationMode.TalkMode)
+ {
+ cancellationToken = _runtimeCts.Token;
+ _awaitingReply = false;
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ userMessage);
+ shouldResume = true;
+ }
+ else
+ {
+ return;
+ }
+ }
+
+ if (shouldResume)
+ {
+ try
+ {
+ await ResumeRecognitionSessionAsync(cancellationToken, "result handler failure", userMessage);
+ }
+ catch (OperationCanceledException)
+ {
+ }
+ }
+ }
+ }
+
+ private void OnSpeechHypothesisGenerated(SpeechRecognizer sender, SpeechRecognitionHypothesisGeneratedEventArgs args)
+ {
+ string? sessionKey = null;
+ string? text = null;
+
+ lock (_gate)
+ {
+ if (_runtimeCts == null ||
+ _status.Mode != VoiceActivationMode.TalkMode ||
+ !_status.Running ||
+ _awaitingReply ||
+ _isSpeaking)
+ {
+ return;
+ }
+
+ text = args.Hypothesis?.Text?.Trim();
+ sessionKey = GetCurrentVoiceSessionKey();
+ _recognitionSessionHadActivity = true;
+ _lastHypothesisText = text;
+ _lastHypothesisUtc = DateTime.UtcNow;
+ if (_status.State != VoiceRuntimeState.RecordingUtterance)
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.RecordingUtterance,
+ _status.LastError);
+ }
+ }
+
+ if (string.IsNullOrWhiteSpace(text))
+ {
+ return;
+ }
+
+ RaiseTranscriptDraft(text, sessionKey, clear: false);
+ }
+
+ private async Task HandleRecognizedTextAsync(string text)
+ {
+ CancellationToken cancellationToken;
+ string sessionKey;
+ var pipelineStopwatch = Stopwatch.StartNew();
+ long recognitionStopElapsedMs = 0;
+ long transportReadyElapsedMs = 0;
+ long directSendElapsedMs = 0;
+
+ lock (_gate)
+ {
+ if (_runtimeCts == null || _status.Mode != VoiceActivationMode.TalkMode || !_status.Running)
+ {
+ return;
+ }
+
+ if (_awaitingReply || _isSpeaking)
+ {
+ return;
+ }
+
+ if (string.Equals(text, _lastTranscript, StringComparison.OrdinalIgnoreCase) &&
+ DateTime.UtcNow - _lastTranscriptUtc < DuplicateTranscriptWindow)
+ {
+ _logger.Info($"Voice recognition suppressed duplicate transcript: {text}");
+ return;
+ }
+
+ _lastTranscript = text;
+ _lastTranscriptUtc = DateTime.UtcNow;
+ _recognitionSessionHadActivity = true;
+ _recognitionSessionHadCaptureSignal = false;
+ _lastHypothesisText = null;
+ _lastHypothesisUtc = default;
+ cancellationToken = _runtimeCts.Token;
+ sessionKey = GetCurrentVoiceSessionKey();
+ }
+
+ RaiseTranscriptDraft(text, sessionKey, clear: false);
+
+ await StopRecognitionSessionAsync();
+ recognitionStopElapsedMs = pipelineStopwatch.ElapsedMilliseconds;
+
+ try
+ {
+ await EnsureChatTransportAsync(cancellationToken);
+ transportReadyElapsedMs = pipelineStopwatch.ElapsedMilliseconds - recognitionStopElapsedMs;
+
+ OpenClawGatewayClient? client;
+ lock (_gate)
+ {
+ client = _chatClient;
+ }
+
+ if (client == null)
+ {
+ throw new InvalidOperationException("Voice chat transport is unavailable.");
+ }
+
+ _logger.Info($"Voice transcript captured: {text}");
+ var directSendStopwatch = Stopwatch.StartNew();
+ await client.SendChatMessageAsync(text, sessionKey);
+ directSendElapsedMs = directSendStopwatch.ElapsedMilliseconds;
+ _logger.Info($"Voice direct send path: elapsed={directSendElapsedMs}ms");
+
+ _logger.Info(
+ $"Voice pre-response latency: recognitionStop={recognitionStopElapsedMs}ms transportReady={transportReadyElapsedMs}ms directSend={directSendElapsedMs}ms total={pipelineStopwatch.ElapsedMilliseconds}ms");
+ lock (_gate)
+ {
+ _awaitingReply = true;
+ _lateReplySessionKey = null;
+ _lateReplyGraceUntilUtc = null;
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.AwaitingResponse,
+ _status.LastError);
+ _status.LastUtteranceUtc = DateTime.UtcNow;
+ }
+
+ _logger.Info("Voice response wait started");
+ RaiseConversationTurn(VoiceConversationDirection.Outgoing, text, sessionKey);
+ RaiseTranscriptDraft(string.Empty, sessionKey, clear: true);
+ _ = MonitorReplyTimeoutAsync(text, cancellationToken);
+ }
+ catch (Exception ex)
+ {
+ _logger.Error("Voice transcript submit failed", ex);
+ var userMessage = GetUserFacingErrorMessage(ex);
+
+ lock (_gate)
+ {
+ _awaitingReply = false;
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ userMessage);
+ }
+
+ await ResumeRecognitionSessionAsync(cancellationToken, "transcript submit failure", userMessage);
+ }
+ }
+
+ private async Task MonitorReplyTimeoutAsync(string transcript, CancellationToken cancellationToken)
+ {
+ try
+ {
+ await Task.Delay(ReplyTimeout, cancellationToken);
+
+ var shouldResume = false;
+ string? lateReplySessionKey = null;
+ lock (_gate)
+ {
+ if (_awaitingReply &&
+ string.Equals(_lastTranscript, transcript, StringComparison.OrdinalIgnoreCase))
+ {
+ _awaitingReply = false;
+ lateReplySessionKey = GetCurrentVoiceSessionKey();
+ _lateReplySessionKey = lateReplySessionKey;
+ _lateReplyGraceUntilUtc = DateTime.UtcNow.Add(LateReplyGraceWindow);
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ "Timed out waiting for an assistant reply.");
+ shouldResume = true;
+ }
+ }
+
+ if (shouldResume)
+ {
+ _logger.Warn(
+ $"Voice reply wait timed out after {ReplyTimeout.TotalSeconds:0}s; accepting late replies for {LateReplyGraceWindow.TotalSeconds:0}s on session {lateReplySessionKey ?? "(none)"}");
+ await ResumeRecognitionSessionAsync(cancellationToken, "reply timeout");
+ }
+ }
+ catch (OperationCanceledException)
+ {
+ }
+ }
+
+ private async void OnChatMessageReceived(object? sender, ChatMessageEventArgs args)
+ {
+ try
+ {
+ if (!args.IsFinal ||
+ !string.Equals(args.Role, "assistant", StringComparison.OrdinalIgnoreCase) ||
+ string.IsNullOrWhiteSpace(args.Message))
+ {
+ return;
+ }
+
+ await AcceptAssistantReplyAsync(args.SessionKey, args.Message, "chat event");
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Voice chat message handler failed: {ex.Message}");
+ }
+ }
+
+ private async void OnSessionPreviewUpdated(object? sender, SessionsPreviewPayloadInfo payload)
+ {
+ try
+ {
+ if (payload.Previews == null || payload.Previews.Count == 0)
+ {
+ return;
+ }
+
+ string? expectedSessionKey;
+ lock (_gate)
+ {
+ if (!_status.Running || _status.Mode != VoiceActivationMode.TalkMode)
+ {
+ return;
+ }
+
+ expectedSessionKey = GetCurrentVoiceSessionKey();
+ }
+
+ foreach (var preview in payload.Previews)
+ {
+ if (!IsMatchingSessionKey(preview.Key, expectedSessionKey))
+ {
+ continue;
+ }
+
+ var assistantText = preview.Items
+ .LastOrDefault(item =>
+ string.Equals(item.Role, "assistant", StringComparison.OrdinalIgnoreCase) &&
+ !string.IsNullOrWhiteSpace(item.Text))
+ ?.Text;
+
+ if (string.IsNullOrWhiteSpace(assistantText))
+ {
+ continue;
+ }
+
+ await AcceptAssistantReplyAsync(preview.Key, assistantText, "session preview");
+ return;
+ }
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Voice session preview handler failed: {ex.Message}");
+ }
+ }
+
+ private async Task AcceptAssistantReplyAsync(string? sessionKey, string? rawText, string source)
+ {
+ if (string.IsNullOrWhiteSpace(rawText))
+ {
+ return;
+ }
+
+ string text;
+ bool acceptedViaLateReplyGrace;
+ bool shouldResumeRecognition = false;
+ bool shouldStartPlaybackLoop = false;
+ var utcNow = DateTime.UtcNow;
+
+ lock (_gate)
+ {
+ if (!_status.Running || _status.Mode != VoiceActivationMode.TalkMode)
+ {
+ return;
+ }
+
+ if (!IsMatchingSessionKey(sessionKey, GetCurrentVoiceSessionKey()))
+ {
+ return;
+ }
+
+ acceptedViaLateReplyGrace = ShouldAcceptLateAssistantReply(
+ _awaitingReply,
+ _isSpeaking,
+ _pendingAssistantReplies.Count,
+ _lateReplySessionKey,
+ _lateReplyGraceUntilUtc,
+ sessionKey,
+ utcNow);
+
+ if (!ShouldAcceptAssistantReply(_awaitingReply, _isSpeaking, _pendingAssistantReplies.Count, acceptedViaLateReplyGrace))
+ {
+ return;
+ }
+
+ text = PrepareReplyForSpeech(rawText);
+ if (ShouldSuppressDuplicateAssistantReply(sessionKey, text, utcNow))
+ {
+ return;
+ }
+
+ _awaitingReply = false;
+ if (acceptedViaLateReplyGrace)
+ {
+ _lateReplySessionKey = null;
+ _lateReplyGraceUntilUtc = null;
+ }
+
+ RememberAcceptedAssistantReply(sessionKey, text, utcNow);
+
+ if (string.IsNullOrWhiteSpace(text))
+ {
+ if (_status.Running && !_replyPlaybackLoopActive)
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ _status.LastError);
+ shouldResumeRecognition = true;
+ }
+ }
+ else
+ {
+ QueueAssistantReplyForPlayback(text, sessionKey, out shouldStartPlaybackLoop);
+ }
+ }
+
+ if (acceptedViaLateReplyGrace)
+ {
+ _logger.Warn($"Voice accepted late assistant reply after timeout for session {sessionKey} via {source}");
+ }
+
+ if (string.IsNullOrWhiteSpace(text) && shouldResumeRecognition)
+ {
+ await ResumeRecognitionSessionAsync(CancellationToken.None, "empty assistant reply");
+ }
+ }
+
+ private void QueueAssistantReplyForPlayback(string text, string? sessionKey, out bool shouldStartPlaybackLoop)
+ {
+ shouldStartPlaybackLoop = false;
+ RaiseConversationTurn(VoiceConversationDirection.Incoming, text, sessionKey);
+
+ lock (_gate)
+ {
+ _pendingAssistantReplies.Enqueue((text, sessionKey));
+ _logger.Info($"Voice reply queued: pending={_pendingAssistantReplies.Count}");
+
+ if (!_replyPlaybackLoopActive)
+ {
+ _replyPlaybackLoopActive = true;
+ _isSpeaking = true;
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.PlayingResponse,
+ _status.LastError);
+ shouldStartPlaybackLoop = true;
+ }
+ else
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.PlayingResponse,
+ _status.LastError);
+ }
+ }
+
+ if (shouldStartPlaybackLoop)
+ {
+ _ = ProcessQueuedAssistantRepliesAsync();
+ }
+ }
+
+ private async Task ProcessQueuedAssistantRepliesAsync()
+ {
+ try
+ {
+ while (true)
+ {
+ (string Text, string? SessionKey) reply;
+ var shouldPauseBeforeNextReply = false;
+ CancellationTokenSource? playbackSkipCts = null;
+
+ lock (_gate)
+ {
+ if (_pendingAssistantReplies.Count == 0)
+ {
+ _replyPlaybackLoopActive = false;
+ _isSpeaking = false;
+ _currentReplyPreview = null;
+
+ if (_status.Running)
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ _status.LastError);
+ }
+
+ break;
+ }
+
+ reply = _pendingAssistantReplies.Dequeue();
+ shouldPauseBeforeNextReply = _pendingAssistantReplies.Count > 0;
+ _currentReplyPreview = CreateReplyPreview(reply.Text);
+ _isSpeaking = true;
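+ // A fresh skip token per reply lets SkipCurrentReplyAsync cancel only the playback in flight.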
+ _playbackSkipCts = playbackSkipCts = new CancellationTokenSource();
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.PlayingResponse,
+ _status.LastError);
+ }
+
+ try
+ {
+ await SpeakTextAsync(reply.Text, playbackSkipCts.Token);
+ }
+ catch (OperationCanceledException)
+ {
+ _logger.Info($"Voice reply playback canceled: remainingQueue={CurrentStatus.PendingReplyCount}");
+ }
+ catch (Exception ex)
+ {
+ _logger.Error("Voice reply playback failed", ex);
+ lock (_gate)
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ shouldPauseBeforeNextReply ? VoiceRuntimeState.PlayingResponse : VoiceRuntimeState.Arming,
+ GetUserFacingErrorMessage(ex));
+ }
+ }
+ finally
+ {
+ lock (_gate)
+ {
+ if (ReferenceEquals(_playbackSkipCts, playbackSkipCts))
+ {
+ _playbackSkipCts = null;
+ }
+
+ _currentReplyPreview = null;
+ }
+
+ playbackSkipCts?.Dispose();
+ }
+
+ if (shouldPauseBeforeNextReply)
+ {
+ _logger.Info($"Voice reply playback paused before next queued response ({QueuedReplyPlaybackGap.TotalMilliseconds}ms)");
+ await Task.Delay(QueuedReplyPlaybackGap);
+ }
+ }
+ }
+ finally
+ {
+ lock (_gate)
+ {
+ _replyPlaybackLoopActive = false;
+ _isSpeaking = false;
+ _currentReplyPreview = null;
+ if (_status.Running)
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ _status.LastError);
+ }
+ }
+
+ try
+ {
+ await ResumeRecognitionSessionAsync(CancellationToken.None, "queued assistant reply playback completed");
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Voice recognition resume failed: {ex.Message}");
+ }
+ }
+ }
+
+ private async Task SpeakTextAsync(string text, CancellationToken cancellationToken)
+ {
+ VoiceSettings settings;
+ VoiceProviderConfigurationStore providerConfiguration;
+ SpeechSynthesizer? synthesizer;
+ MediaPlayer? player;
+
+ lock (_gate)
+ {
+ settings = Clone(_settings.Voice);
+ providerConfiguration = _settings.VoiceProviderConfiguration.Clone();
+ synthesizer = _speechSynthesizer;
+ player = _mediaPlayer;
+ }
+
+ if (player == null)
+ {
+ throw new InvalidOperationException("Speech playback is not ready.");
+ }
+
+ var provider = VoiceProviderCatalogService.ResolveTextToSpeechProvider(
+ settings.TextToSpeechProviderId,
+ _logger);
+
+ if (UsesCloudTextToSpeechRuntime(provider))
+ {
+ using var result = await _cloudTextToSpeechClient.SynthesizeAsync(text, provider, providerConfiguration, _logger, cancellationToken);
+ await PlayStreamAsync(player, result.Stream, result.ContentType, cancellationToken);
+ return;
+ }
+
+ if (synthesizer == null)
+ {
+ throw new InvalidOperationException("Speech playback is not ready.");
+ }
+
+ var stopwatch = Stopwatch.StartNew();
+ using var stream = await synthesizer.SynthesizeTextToStreamAsync(text);
+ _logger.Info($"Windows TTS latency: total={stopwatch.ElapsedMilliseconds}ms");
+ await PlayStreamAsync(player, stream, stream.ContentType, cancellationToken);
+ }
+
+ private async Task WarmSpeechPlaybackPipelineAsync(
+ MediaPlayer player,
+ SpeechSynthesizer? synthesizer,
+ VoiceProviderOption provider,
+ CancellationToken cancellationToken)
+ {
+ var stopwatch = Stopwatch.StartNew();
+
+ try
+ {
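+ // Prime the player (and, for local TTS, the synthesizer) so the first real reply avoids cold-start latency.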
+ using var silentStream = CreateSilentWaveStream();
+ await PreloadStreamAsync(player, silentStream, "audio/wav", cancellationToken);
+
+ if (!UsesCloudTextToSpeechRuntime(provider) && synthesizer != null)
+ {
+ using var warmupStream = await synthesizer.SynthesizeTextToStreamAsync(" ");
+ }
+
+ _logger.Info($"Voice playback warm-up completed: total={stopwatch.ElapsedMilliseconds}ms");
+ }
+ catch (OperationCanceledException)
+ {
+ }
+ catch (ObjectDisposedException)
+ {
+ _logger.Info("Voice playback warm-up skipped because playback resources were disposed during initialization.");
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Voice playback warm-up failed: {ex.Message}");
+ }
+ }
+
+ private static bool UsesCloudTextToSpeechRuntime(VoiceProviderOption provider)
+ {
+ return VoiceServiceTransportLogic.UsesCloudTextToSpeechRuntime(provider);
+ }
+
+ internal static bool ShouldAcceptAssistantReply(
+ bool awaitingReply,
+ bool isSpeaking,
+ int queuedReplyCount,
+ bool acceptedViaLateReplyGrace = false)
+ {
+ return VoiceServiceTransportLogic.ShouldAcceptAssistantReply(
+ awaitingReply,
+ isSpeaking,
+ queuedReplyCount,
+ acceptedViaLateReplyGrace);
+ }
+
+ internal static bool ShouldAcceptLateAssistantReply(
+ bool awaitingReply,
+ bool isSpeaking,
+ int queuedReplyCount,
+ string? lateReplySessionKey,
+ DateTime? lateReplyGraceUntilUtc,
+ string? incomingSessionKey,
+ DateTime utcNow)
+ {
+ return VoiceServiceTransportLogic.ShouldAcceptLateAssistantReply(
+ awaitingReply,
+ isSpeaking,
+ queuedReplyCount,
+ lateReplySessionKey,
+ lateReplyGraceUntilUtc,
+ incomingSessionKey,
+ utcNow);
+ }
+
+ internal static bool ShouldRestartRecognitionAfterCompletion(
+ bool running,
+ VoiceActivationMode mode,
+ bool restartInProgress,
+ bool awaitingReply,
+ bool isSpeaking)
+ {
+ return VoiceServiceTransportLogic.ShouldRestartRecognitionAfterCompletion(
+ running,
+ mode,
+ restartInProgress,
+ awaitingReply,
+ isSpeaking);
+ }
+
+ internal static string DescribeRecognitionCompletionRestartDecision(
+ bool running,
+ VoiceActivationMode mode,
+ bool restartInProgress,
+ bool awaitingReply,
+ bool isSpeaking)
+ {
+ return VoiceServiceTransportLogic.DescribeRecognitionCompletionRestartDecision(
+ running,
+ mode,
+ restartInProgress,
+ awaitingReply,
+ isSpeaking);
+ }
+
+ internal static bool ShouldRebuildRecognitionAfterCompletion(
+ SpeechRecognitionResultStatus status,
+ bool sessionHadActivity,
+ bool sessionHadCaptureSignal,
+ bool restartInProgress,
+ bool awaitingReply,
+ bool isSpeaking)
+ {
+ return VoiceServiceTransportLogic.ShouldRebuildRecognitionAfterCompletion(
+ status,
+ sessionHadActivity,
+ sessionHadCaptureSignal,
+ restartInProgress,
+ awaitingReply,
+ isSpeaking);
+ }
+
+ internal static string DescribeRecognitionCompletionRebuildDecision(
+ SpeechRecognitionResultStatus status,
+ bool sessionHadActivity,
+ bool sessionHadCaptureSignal,
+ bool restartInProgress,
+ bool awaitingReply,
+ bool isSpeaking)
+ {
+ return VoiceServiceTransportLogic.DescribeRecognitionCompletionRebuildDecision(
+ status,
+ sessionHadActivity,
+ sessionHadCaptureSignal,
+ restartInProgress,
+ awaitingReply,
+ isSpeaking);
+ }
+
+ internal static string SelectRecognizedText(
+ string recognizedText,
+ string? latestHypothesisText,
+ DateTime latestHypothesisUtc,
+ DateTime utcNow,
+ out bool promotedHypothesis)
+ {
+ return VoiceServiceTransportLogic.SelectRecognizedText(
+ recognizedText,
+ latestHypothesisText,
+ latestHypothesisUtc,
+ utcNow,
+ out promotedHypothesis);
+ }
+
+ internal static string? SelectCompletionFallbackText(
+ bool sessionHadActivity,
+ string? latestHypothesisText,
+ DateTime latestHypothesisUtc,
+ DateTime utcNow)
+ {
+ return VoiceServiceTransportLogic.SelectCompletionFallbackText(
+ sessionHadActivity,
+ latestHypothesisText,
+ latestHypothesisUtc,
+ utcNow);
+ }
+
+ internal static bool ShouldClearTranscriptDraftAfterCompletion(
+ bool awaitingReply,
+ bool isSpeaking,
+ bool usedFallbackTranscript)
+ {
+ return VoiceServiceTransportLogic.ShouldClearTranscriptDraftAfterCompletion(
+ awaitingReply,
+ isSpeaking,
+ usedFallbackTranscript);
+ }
+
+ internal static bool ShouldRepromptAfterIncompleteRecognition(
+ bool sessionHadActivity,
+ bool awaitingReply,
+ bool isSpeaking,
+ bool usedFallbackTranscript)
+ {
+ return VoiceServiceTransportLogic.ShouldRepromptAfterIncompleteRecognition(
+ sessionHadActivity,
+ awaitingReply,
+ isSpeaking,
+ usedFallbackTranscript);
+ }
+
+ private static string CreateReplyPreview(string text)
+ {
+ var trimmed = text.Trim();
+ if (trimmed.Length <= 120)
+ {
+ return trimmed;
+ }
+
+ return $"{trimmed[..117]}...";
+ }
+
+ private static InMemoryRandomAccessStream CreateSilentWaveStream()
+ {
+ const int sampleRate = 16000;
+ const short bitsPerSample = 16;
+ const short channels = 1;
+ const int durationMs = 120;
+
+ var bytesPerSample = bitsPerSample / 8;
+ var sampleCount = sampleRate * durationMs / 1000;
+ var dataSize = sampleCount * channels * bytesPerSample;
+ var byteRate = sampleRate * channels * bytesPerSample;
+ var blockAlign = (short)(channels * bytesPerSample);
+
+ var buffer = new byte[44 + dataSize];
+ using var writer = new BinaryWriter(new MemoryStream(buffer, writable: true));
+
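+ // Canonical 44-byte PCM WAV header; the zero-initialized payload is the silence itself.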
+ writer.Write(System.Text.Encoding.ASCII.GetBytes("RIFF"));
+ writer.Write(36 + dataSize); // RIFF chunk size: everything after this 8-byte header
+ writer.Write(System.Text.Encoding.ASCII.GetBytes("WAVE"));
+ writer.Write(System.Text.Encoding.ASCII.GetBytes("fmt "));
+ writer.Write(16); // fmt chunk size for PCM
+ writer.Write((short)1); // wFormatTag = 1 (PCM)
+ writer.Write(channels);
+ writer.Write(sampleRate);
+ writer.Write(byteRate);
+ writer.Write(blockAlign);
+ writer.Write(bitsPerSample);
+ writer.Write(System.Text.Encoding.ASCII.GetBytes("data"));
+ writer.Write(dataSize);
+
+ var stream = new InMemoryRandomAccessStream();
+ using (var output = stream.AsStreamForWrite())
+ {
+ output.Write(buffer, 0, buffer.Length);
+ output.Flush();
+ }
+
+ stream.Seek(0);
+ return stream;
+ }
+
+ private static async Task PreloadStreamAsync(
+ MediaPlayer player,
+ IRandomAccessStream stream,
+ string contentType,
+ CancellationToken cancellationToken)
+ {
+ stream.Seek(0);
+ var mediaOpened = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
+
+ TypedEventHandler<MediaPlayer, object>? openedHandler = null;
+ TypedEventHandler<MediaPlayer, MediaPlayerFailedEventArgs>? failedHandler = null;
+
+ openedHandler = (sender, _) => mediaOpened.TrySetResult(true);
+ failedHandler = (sender, args) =>
+ {
+ var errorMessage = string.IsNullOrWhiteSpace(args.ErrorMessage)
+ ? "Media preload failed."
+ : args.ErrorMessage;
+ mediaOpened.TrySetException(new InvalidOperationException(errorMessage));
+ };
+
+ player.MediaOpened += openedHandler;
+ player.MediaFailed += failedHandler;
+ using var registration = cancellationToken.Register(() =>
+ {
+ try { player.Source = null; } catch { }
+ mediaOpened.TrySetCanceled(cancellationToken);
+ });
+
+ try
+ {
+ player.Source = MediaSource.CreateFromStream(stream, contentType);
+ await mediaOpened.Task;
+ }
+ finally
+ {
+ try { player.MediaOpened -= openedHandler; } catch { }
+ try { player.MediaFailed -= failedHandler; } catch { }
+ try { player.Source = null; } catch { }
+ }
+ }
+
+ private static async Task PlayStreamAsync(
+ MediaPlayer player,
+ IRandomAccessStream stream,
+ string contentType,
+ CancellationToken cancellationToken)
+ {
+ stream.Seek(0);
+ var mediaOpened = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
+ var playbackEnded = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
+
+ TypedEventHandler<MediaPlayer, object>? openedHandler = null;
+ TypedEventHandler<MediaPlayer, object>? endedHandler = null;
+ TypedEventHandler<MediaPlayer, MediaPlayerFailedEventArgs>? failedHandler = null;
+
+ openedHandler = (sender, _) => mediaOpened.TrySetResult(true);
+ endedHandler = (sender, _) => playbackEnded.TrySetResult(true);
+ failedHandler = (sender, args) =>
+ {
+ var errorMessage = string.IsNullOrWhiteSpace(args.ErrorMessage)
+ ? "Media playback failed."
+ : args.ErrorMessage;
+ var exception = new InvalidOperationException(errorMessage);
+
+ if (!mediaOpened.Task.IsCompleted)
+ {
+ mediaOpened.TrySetException(exception);
+ return;
+ }
+
+ playbackEnded.TrySetException(exception);
+ };
+
+ player.MediaOpened += openedHandler;
+ player.MediaEnded += endedHandler;
+ player.MediaFailed += failedHandler;
+ using var registration = cancellationToken.Register(() =>
+ {
+ try { player.Pause(); } catch { }
+ try { player.Source = null; } catch { }
+ mediaOpened.TrySetCanceled(cancellationToken);
+ playbackEnded.TrySetCanceled(cancellationToken);
+ });
+
+ try
+ {
+ player.Source = MediaSource.CreateFromStream(stream, contentType);
+ await mediaOpened.Task;
+ player.Play();
+ await playbackEnded.Task;
+ }
+ catch
+ {
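+ // Observe the fault so it cannot resurface later as an unobserved task exception.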
+ if (playbackEnded.Task.IsFaulted)
+ {
+ _ = playbackEnded.Task.Exception;
+ }
+
+ throw;
+ }
+ finally
+ {
+ try { player.MediaOpened -= openedHandler; } catch { }
+ try { player.MediaEnded -= endedHandler; } catch { }
+ try { player.MediaFailed -= failedHandler; } catch { }
+ try { player.Source = null; } catch { }
+ }
+ }
+
+ private async void OnSpeechRecognitionCompleted(
+ SpeechContinuousRecognitionSession sender,
+ SpeechContinuousRecognitionCompletedEventArgs args)
+ {
+ try
+ {
+ CancellationToken token;
+ var shouldRestart = false;
+ var shouldRebuildRecognizer = false;
+ var restartInProgress = false;
+ var sessionHadActivity = false;
+ var sessionHadCaptureSignal = false;
+ var restartDecisionReason = string.Empty;
+ var rebuildDecisionReason = string.Empty;
+ string? fallbackText = null;
+ string? sessionKey = null;
+ var shouldClearDraft = false;
+ var shouldReprompt = false;
+
+ lock (_gate)
+ {
+ if (_runtimeCts == null || _runtimeCts.IsCancellationRequested)
+ {
+ return;
+ }
+
+ _recognitionActive = false;
+ sessionHadActivity = _recognitionSessionHadActivity;
+ sessionHadCaptureSignal = _recognitionSessionHadCaptureSignal;
+ fallbackText = SelectCompletionFallbackText(
+ sessionHadActivity,
+ _lastHypothesisText,
+ _lastHypothesisUtc,
+ DateTime.UtcNow);
+ sessionKey = GetCurrentVoiceSessionKey();
+ _recognitionSessionHadActivity = false;
+ _recognitionSessionHadCaptureSignal = false;
+ restartInProgress = _recognitionRestartInProgress;
+ if (restartInProgress)
+ {
+ _recognitionRestartInProgress = false;
+ }
+ token = _runtimeCts.Token;
+ shouldRestart = ShouldRestartRecognitionAfterCompletion(
+ _status.Running,
+ _status.Mode,
+ restartInProgress,
+ _awaitingReply,
+ _isSpeaking);
+ restartDecisionReason = DescribeRecognitionCompletionRestartDecision(
+ _status.Running,
+ _status.Mode,
+ restartInProgress,
+ _awaitingReply,
+ _isSpeaking);
+ shouldRebuildRecognizer = ShouldRebuildRecognitionAfterCompletion(
+ args.Status,
+ sessionHadActivity,
+ sessionHadCaptureSignal,
+ restartInProgress,
+ _awaitingReply,
+ _isSpeaking);
+ rebuildDecisionReason = DescribeRecognitionCompletionRebuildDecision(
+ args.Status,
+ sessionHadActivity,
+ sessionHadCaptureSignal,
+ restartInProgress,
+ _awaitingReply,
+ _isSpeaking);
+ shouldClearDraft = ShouldClearTranscriptDraftAfterCompletion(
+ _awaitingReply,
+ _isSpeaking,
+ !string.IsNullOrWhiteSpace(fallbackText));
+ shouldReprompt = ShouldRepromptAfterIncompleteRecognition(
+ sessionHadActivity,
+ _awaitingReply,
+ _isSpeaking,
+ !string.IsNullOrWhiteSpace(fallbackText));
+ }
+
+ _logger.Warn(
+ $"Speech recognition session completed with status {args.Status}; restart={shouldRestart} ({restartDecisionReason}); rebuild={shouldRebuildRecognizer} ({rebuildDecisionReason}); hadActivity={sessionHadActivity}; hadCaptureSignal={sessionHadCaptureSignal}");
+
+ if (!string.IsNullOrWhiteSpace(fallbackText) &&
+ !_awaitingReply &&
+ !_isSpeaking &&
+ !token.IsCancellationRequested)
+ {
+ _logger.Warn(
+ $"Voice recognition completed without a final result; promoting recent hypothesis as fallback transcript: {fallbackText}");
+ await HandleRecognizedTextAsync(fallbackText);
+ return;
+ }
+
+ if (shouldClearDraft)
+ {
+ RaiseTranscriptDraft(string.Empty, sessionKey, clear: true);
+ }
+
+ if (shouldReprompt)
+ {
+ _logger.Warn("Voice recognition session ended after speech activity but without a usable transcript; prompting user to repeat.");
+ QueueAssistantReplyForPlayback(LowConfidenceRepeatPrompt, sessionKey, out _);
+ return;
+ }
+
+ if (shouldRestart && !token.IsCancellationRequested)
+ {
+ await Task.Delay(250, token);
+ await ResumeRecognitionSessionAsync(
+ token,
+ $"recognition completed ({args.Status})",
+ rebuildRecognizer: shouldRebuildRecognizer);
+ }
+ }
+ catch (OperationCanceledException)
+ {
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Voice recognition completion handler failed: {ex.Message}");
+ }
+ }
+
+ private void OnChatTransportStatusChanged(object? sender, ConnectionStatus status)
+ {
+ lock (_gate)
+ {
+ _chatTransportStatus = status;
+
+ if (status == ConnectionStatus.Connected)
+ {
+ _transportReadyTcs?.TrySetResult(true);
+ }
+ else if (status == ConnectionStatus.Error)
+ {
+ _transportReadyTcs?.TrySetException(
+ new InvalidOperationException("Voice chat transport failed to connect."));
+
+ if (_status.Running && _status.Mode == VoiceActivationMode.TalkMode)
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ "Voice chat transport failed.");
+ }
+ }
+ else if (status == ConnectionStatus.Disconnected)
+ {
+ if (_status.Running && _status.Mode == VoiceActivationMode.TalkMode)
+ {
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ "Voice chat transport disconnected.");
+ }
+ }
+ }
+ }
+
+ private async Task StopRuntimeResourcesAsync(bool updateStoppedStatus)
+ {
+ CancellationTokenSource? runtimeCts;
+ CancellationTokenSource? playbackSkipCts;
+ OpenClawGatewayClient? chatClient;
+ VoiceCaptureService? captureService;
+ SpeechRecognizer? recognizer;
+ SpeechSynthesizer? synthesizer;
+ MediaPlayer? player;
+ var sessionKey = CurrentStatus.SessionKey;
+
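+        // Snapshot every runtime member under the lock and null out the fields,
+        // then tear the snapshots down outside the lock to avoid blocking callers.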
+ lock (_gate)
+ {
+ runtimeCts = _runtimeCts;
+ _runtimeCts = null;
+
+ chatClient = _chatClient;
+ _chatClient = null;
+ _chatTransportStatus = ConnectionStatus.Disconnected;
+ _transportReadyTcs = null;
+
+ captureService = _voiceCaptureService;
+ _voiceCaptureService = null;
+ _speechToTextRoute = null;
+
+ recognizer = _speechRecognizer;
+ _speechRecognizer = null;
+ _recognitionActive = false;
+ _recognitionSessionHadActivity = false;
+ _recognitionSessionHadCaptureSignal = false;
+ _recognitionRestartInProgress = false;
+ _lastHypothesisText = null;
+ _lastHypothesisUtc = default;
+
+ synthesizer = _speechSynthesizer;
+ _speechSynthesizer = null;
+
+ player = _mediaPlayer;
+ _mediaPlayer = null;
+
+ _awaitingReply = false;
+ _isSpeaking = false;
+ _replyPlaybackLoopActive = false;
+ _pendingAssistantReplies.Clear();
+ _currentReplyPreview = null;
+ _lateReplySessionKey = null;
+ _lateReplyGraceUntilUtc = null;
+ _lastAcceptedAssistantReplyText = null;
+ _lastAcceptedAssistantReplySessionKey = null;
+ _lastAcceptedAssistantReplyUtc = default;
+ playbackSkipCts = _playbackSkipCts;
+ _playbackSkipCts = null;
+ }
+
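+        // Cancel both token sources first so in-flight loops observe shutdown
+        // before their resources are disposed below.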
+ try { runtimeCts?.Cancel(); } catch { }
+ try { playbackSkipCts?.Cancel(); } catch { }
+
+ if (captureService != null)
+ {
+ try { await captureService.StopAsync(); } catch { }
+ try { await captureService.DisposeAsync(); } catch { }
+ }
+
+ if (recognizer != null)
+ {
+ recognizer.HypothesisGenerated -= OnSpeechHypothesisGenerated;
+ recognizer.ContinuousRecognitionSession.ResultGenerated -= OnSpeechResultGenerated;
+ recognizer.ContinuousRecognitionSession.Completed -= OnSpeechRecognitionCompleted;
+
+ try { await recognizer.ContinuousRecognitionSession.CancelAsync(); } catch { }
+ try { recognizer.Dispose(); } catch { }
+ }
+
+ if (player != null)
+ {
+ try { player.Pause(); } catch { }
+ try { player.Source = null; } catch { }
+ try { player.Dispose(); } catch { }
+ }
+
+ try { synthesizer?.Dispose(); } catch { }
+
+ if (chatClient != null)
+ {
+ chatClient.StatusChanged -= OnChatTransportStatusChanged;
+ chatClient.ChatMessageReceived -= OnChatMessageReceived;
+ chatClient.SessionPreviewUpdated -= OnSessionPreviewUpdated;
+ try { await chatClient.DisconnectAsync(); } catch { }
+ try { chatClient.Dispose(); } catch { }
+ }
+
+ try { runtimeCts?.Dispose(); } catch { }
+ try { playbackSkipCts?.Dispose(); } catch { }
+
+ if (updateStoppedStatus)
+ {
+ lock (_gate)
+ {
+ _status = BuildStoppedStatus(sessionKey, "Disposed");
+ }
+ }
+
+ RaiseTranscriptDraft(string.Empty, sessionKey, clear: true);
+ }
+
+ private string GetCurrentVoiceSessionKey()
+ {
+ return string.IsNullOrWhiteSpace(_status.SessionKey) ? DefaultSessionKey : _status.SessionKey!;
+ }
+
+ private static bool IsMatchingSessionKey(string? actualSessionKey, string? expectedSessionKey)
+ {
+ actualSessionKey = string.IsNullOrWhiteSpace(actualSessionKey) ? DefaultSessionKey : actualSessionKey;
+ expectedSessionKey = string.IsNullOrWhiteSpace(expectedSessionKey) ? DefaultSessionKey : expectedSessionKey;
+
+ if (string.Equals(actualSessionKey, expectedSessionKey, StringComparison.Ordinal))
+ {
+ return true;
+ }
+
+ return IsMainSessionKey(actualSessionKey) && IsMainSessionKey(expectedSessionKey);
+ }
+
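+    // Treats "main", the default key, and any agent-scoped key containing ":main:"
+    // as the same primary session for matching purposes.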
+ private static bool IsMainSessionKey(string sessionKey)
+ {
+ return string.Equals(sessionKey, "main", StringComparison.OrdinalIgnoreCase) ||
+ string.Equals(sessionKey, DefaultSessionKey, StringComparison.OrdinalIgnoreCase) ||
+ sessionKey.Contains(":main:", StringComparison.OrdinalIgnoreCase);
+ }
+
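+    // Drops an assistant reply when identical text arrived for the same session within
+    // DuplicateAssistantReplyWindow, e.g. when one reply surfaces via two transport events.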
+ private bool ShouldSuppressDuplicateAssistantReply(string? sessionKey, string text, DateTime utcNow)
+ {
+ if (string.IsNullOrWhiteSpace(text))
+ {
+ return false;
+ }
+
+ return string.Equals(_lastAcceptedAssistantReplySessionKey, string.IsNullOrWhiteSpace(sessionKey) ? DefaultSessionKey : sessionKey, StringComparison.OrdinalIgnoreCase) &&
+ string.Equals(_lastAcceptedAssistantReplyText, text, StringComparison.Ordinal) &&
+ utcNow - _lastAcceptedAssistantReplyUtc <= DuplicateAssistantReplyWindow;
+ }
+
+ private void RememberAcceptedAssistantReply(string? sessionKey, string text, DateTime utcNow)
+ {
+ _lastAcceptedAssistantReplySessionKey = string.IsNullOrWhiteSpace(sessionKey) ? DefaultSessionKey : sessionKey;
+ _lastAcceptedAssistantReplyText = string.IsNullOrWhiteSpace(text) ? null : text;
+ _lastAcceptedAssistantReplyUtc = utcNow;
+ }
+
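+    // Thin wrapper kept for test visibility; the actual policy lives in
+    // VoiceServiceTransportLogic so it can be exercised without a live runtime.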
+ internal static bool ShouldRefreshRecognitionForDefaultCaptureDeviceChange(
+ bool running,
+ VoiceActivationMode mode,
+ string? configuredInputDeviceId,
+ AudioDeviceRole role)
+ {
+ return VoiceServiceTransportLogic.ShouldRefreshRecognitionForDefaultCaptureDeviceChange(
+ running,
+ mode,
+ configuredInputDeviceId,
+ role);
+ }
+
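+    // If the reply starts with a single-line JSON object carrying a voice directive
+    // (a "voice"/"voiceId"/"voice_id" property), strip that line so it is not spoken.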
+ private static string PrepareReplyForSpeech(string text)
+ {
+ var trimmed = text.Trim();
+ if (string.IsNullOrWhiteSpace(trimmed))
+ {
+ return string.Empty;
+ }
+
+ var firstNewline = trimmed.IndexOf('\n');
+ if (firstNewline <= 0)
+ {
+ return trimmed;
+ }
+
+ var firstLine = trimmed[..firstNewline].Trim();
+ if (!firstLine.StartsWith("{", StringComparison.Ordinal))
+ {
+ return trimmed;
+ }
+
+ try
+ {
+ using var doc = JsonDocument.Parse(firstLine);
+            if (doc.RootElement.ValueKind != JsonValueKind.Object ||
+                (!doc.RootElement.TryGetProperty("voice", out _) &&
+                 !doc.RootElement.TryGetProperty("voiceId", out _) &&
+                 !doc.RootElement.TryGetProperty("voice_id", out _)))
+ {
+ return trimmed;
+ }
+
+ return trimmed[(firstNewline + 1)..].TrimStart();
+ }
+ catch (JsonException)
+ {
+ return trimmed;
+ }
+ }
+
+ private VoiceStatusInfo BuildRunningStatus(
+ VoiceActivationMode mode,
+ string? sessionKey,
+ VoiceRuntimeState state,
+ string? lastError)
+ {
+ var settings = _settings.Voice;
+ return new VoiceStatusInfo
+ {
+ Available = true,
+ Running = true,
+ Mode = mode,
+ State = state,
+ SessionKey = sessionKey,
+ InputDeviceId = settings.InputDeviceId,
+ OutputDeviceId = settings.OutputDeviceId,
+ VoiceWakeModelId = settings.VoiceWake.ModelId,
+ VoiceWakeLoaded = mode == VoiceActivationMode.VoiceWake,
+ LastVoiceWakeUtc = _status.LastVoiceWakeUtc,
+ LastUtteranceUtc = _status.LastUtteranceUtc,
+ PendingReplyCount = _pendingAssistantReplies.Count,
+ CanSkipReply = _isSpeaking || _pendingAssistantReplies.Count > 0,
+ CurrentReplyPreview = _currentReplyPreview,
+ LastError = lastError
+ };
+ }
+
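+    // Assumption: the Windows.Media recognizer binds its capture device at creation,
+    // so a default-device change requires rebuilding both capture graph and recognizer.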
+ private async void OnDefaultAudioCaptureDeviceChanged(object sender, DefaultAudioCaptureDeviceChangedEventArgs args)
+ {
+ try
+ {
+ CancellationToken token;
+ bool shouldRefresh;
+ bool shouldRestartListening;
+ string? newDeviceId = args.Id;
+
+ lock (_gate)
+ {
+ if (_runtimeCts == null || _runtimeCts.IsCancellationRequested)
+ {
+ return;
+ }
+
+ shouldRefresh = ShouldRefreshRecognitionForDefaultCaptureDeviceChange(
+ _status.Running,
+ _status.Mode,
+ _settings.Voice.InputDeviceId,
+ args.Role);
+ shouldRestartListening = shouldRefresh && _recognitionActive && !_awaitingReply && !_isSpeaking;
+ token = _runtimeCts.Token;
+
+ if (shouldRefresh)
+ {
+ _recognitionRestartInProgress = shouldRestartListening;
+ _status = BuildRunningStatus(
+ VoiceActivationMode.TalkMode,
+ _status.SessionKey,
+ VoiceRuntimeState.Arming,
+ "Microphone device changed; refreshing speech recognition.");
+ }
+ }
+
+ if (!shouldRefresh)
+ {
+ return;
+ }
+
+ _logger.Info(
+ $"Default capture device changed to {newDeviceId ?? "(unknown)"}; refreshing TalkMode recognizer");
+
+ if (shouldRestartListening)
+ {
+ await StopRecognitionSessionAsync();
+ }
+
+ await RebuildVoiceCaptureAsync("default capture device changed", token);
+ await RebuildSpeechRecognizerAsync("default capture device changed", token);
+
+ if (shouldRestartListening && !token.IsCancellationRequested)
+ {
+ await ResumeRecognitionSessionAsync(token, "default capture device changed");
+ }
+ }
+ catch (OperationCanceledException)
+ {
+ }
+ catch (Exception ex)
+ {
+ _logger.Warn($"Default capture device refresh failed: {ex.Message}");
+ }
+ }
+
+ private async Task RebuildSpeechRecognizerAsync(string reason, CancellationToken cancellationToken)
+ {
+ SpeechRecognizer? oldRecognizer;
+ SpeechRecognizer? newRecognizer = null;
+ IVoiceSpeechToTextRoute? speechToTextRoute;
+ VoiceSettings settings;
+
+ lock (_gate)
+ {
+ if (_runtimeCts == null || _runtimeCts.IsCancellationRequested)
+ {
+ return;
+ }
+
+ oldRecognizer = _speechRecognizer;
+ speechToTextRoute = _speechToTextRoute;
+ settings = Clone(_settings.Voice);
+ _speechRecognizer = null;
+ _recognitionActive = false;
+ _recognitionSessionHadActivity = false;
+ _recognitionSessionHadCaptureSignal = false;
+ }
+
+ if (oldRecognizer != null)
+ {
+ try { oldRecognizer.HypothesisGenerated -= OnSpeechHypothesisGenerated; } catch { }
+ try { oldRecognizer.ContinuousRecognitionSession.ResultGenerated -= OnSpeechResultGenerated; } catch { }
+ try { oldRecognizer.ContinuousRecognitionSession.Completed -= OnSpeechRecognitionCompleted; } catch { }
+ try { await oldRecognizer.ContinuousRecognitionSession.CancelAsync(); } catch { }
+ try { oldRecognizer.Dispose(); } catch { }
+ }
+
+ try
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+ if (speechToTextRoute is not WindowsMediaSpeechToTextRoute windowsRoute)
+ {
+ throw new InvalidOperationException("Speech recognizer rebuild is only available for the Windows.Media STT route.");
+ }
+
+ newRecognizer = await windowsRoute.CreateRecognizerAsync(settings);
+ newRecognizer.HypothesisGenerated += OnSpeechHypothesisGenerated;
+ newRecognizer.ContinuousRecognitionSession.ResultGenerated += OnSpeechResultGenerated;
+ newRecognizer.ContinuousRecognitionSession.Completed += OnSpeechRecognitionCompleted;
+
+ lock (_gate)
+ {
+ if (_runtimeCts == null || _runtimeCts.IsCancellationRequested || !_status.Running)
+ {
+ return;
+ }
+
+ _speechRecognizer = newRecognizer;
+ newRecognizer = null;
+ }
+
+ _logger.Warn($"Speech recognizer rebuilt ({reason})");
+ }
+ finally
+ {
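+            // Ownership handoff: on success newRecognizer was moved into _speechRecognizer
+            // and nulled; anything still held here lost the race and must be unhooked and disposed.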
+ if (newRecognizer != null)
+ {
+ try { newRecognizer.HypothesisGenerated -= OnSpeechHypothesisGenerated; } catch { }
+ try { newRecognizer.ContinuousRecognitionSession.ResultGenerated -= OnSpeechResultGenerated; } catch { }
+ try { newRecognizer.ContinuousRecognitionSession.Completed -= OnSpeechRecognitionCompleted; } catch { }
+ try { newRecognizer.Dispose(); } catch { }
+ }
+ }
+ }
+
+ private async Task RebuildVoiceCaptureAsync(string reason, CancellationToken cancellationToken)
+ {
+ VoiceCaptureService? captureService;
+ VoiceSettings settings;
+
+ lock (_gate)
+ {
+ if (_runtimeCts == null || _runtimeCts.IsCancellationRequested)
+ {
+ return;
+ }
+
+ captureService = _voiceCaptureService;
+ settings = Clone(_settings.Voice);
+ _recognitionSessionHadCaptureSignal = false;
+ }
+
+ if (captureService == null)
+ {
+ return;
+ }
+
+ cancellationToken.ThrowIfCancellationRequested();
+ await captureService.StartAsync(settings, cancellationToken);
+ _logger.Info($"Voice capture graph rebuilt ({reason})");
+ }
+
+ private VoiceStatusInfo BuildStoppedStatus(string? sessionKey, string? reason)
+ {
+ var settings = _settings.Voice;
+ return new VoiceStatusInfo
+ {
+ Available = true,
+ Running = false,
+ Mode = _runtimeModeOverride ?? settings.Mode,
+ State = VoiceRuntimeState.Stopped,
+ SessionKey = sessionKey,
+ InputDeviceId = settings.InputDeviceId,
+ OutputDeviceId = settings.OutputDeviceId,
+ VoiceWakeModelId = settings.VoiceWake.ModelId,
+ VoiceWakeLoaded = false,
+ LastVoiceWakeUtc = _status.LastVoiceWakeUtc,
+ LastUtteranceUtc = _status.LastUtteranceUtc,
+ PendingReplyCount = _pendingAssistantReplies.Count,
+ CanSkipReply = _isSpeaking || _pendingAssistantReplies.Count > 0,
+ CurrentReplyPreview = _currentReplyPreview,
+ LastError = reason
+ };
+ }
+
+ private VoiceStatusInfo BuildPausedStatus(VoiceActivationMode mode, string? sessionKey, string? reason)
+ {
+ var settings = _settings.Voice;
+ return new VoiceStatusInfo
+ {
+ Available = true,
+ Running = false,
+ Mode = mode,
+ State = VoiceRuntimeState.Paused,
+ SessionKey = sessionKey,
+ InputDeviceId = settings.InputDeviceId,
+ OutputDeviceId = settings.OutputDeviceId,
+ VoiceWakeModelId = settings.VoiceWake.ModelId,
+ VoiceWakeLoaded = false,
+ LastVoiceWakeUtc = _status.LastVoiceWakeUtc,
+ LastUtteranceUtc = _status.LastUtteranceUtc,
+ PendingReplyCount = _pendingAssistantReplies.Count,
+ CanSkipReply = _isSpeaking || _pendingAssistantReplies.Count > 0,
+ CurrentReplyPreview = _currentReplyPreview,
+ LastError = reason
+ };
+ }
+
+ private VoiceStatusInfo BuildErrorStatus(VoiceActivationMode mode, string? sessionKey, string? reason)
+ {
+ var status = BuildRunningStatus(mode, sessionKey, VoiceRuntimeState.Error, reason);
+ status.Running = false;
+ return status;
+ }
+
+ private static VoiceSettings Clone(VoiceSettings source)
+ {
+ return new VoiceSettings
+ {
+ Mode = source.Mode,
+ Enabled = source.Enabled,
+ ShowRepeaterAtStartup = source.ShowRepeaterAtStartup,
+ ShowConversationToasts = source.ShowConversationToasts,
+ SpeechToTextProviderId = source.SpeechToTextProviderId,
+ TextToSpeechProviderId = source.TextToSpeechProviderId,
+ InputDeviceId = source.InputDeviceId,
+ OutputDeviceId = source.OutputDeviceId,
+ SampleRateHz = source.SampleRateHz,
+ CaptureChunkMs = source.CaptureChunkMs,
+ BargeInEnabled = source.BargeInEnabled,
+ VoiceWake = new VoiceWakeSettings
+ {
+ Engine = source.VoiceWake.Engine,
+ ModelId = source.VoiceWake.ModelId,
+ TriggerThreshold = source.VoiceWake.TriggerThreshold,
+ TriggerCooldownMs = source.VoiceWake.TriggerCooldownMs,
+ PreRollMs = source.VoiceWake.PreRollMs,
+ EndSilenceMs = source.VoiceWake.EndSilenceMs
+ },
+ TalkMode = new TalkModeSettings
+ {
+ MinSpeechMs = source.TalkMode.MinSpeechMs,
+ EndSilenceMs = source.TalkMode.EndSilenceMs,
+ MaxUtteranceMs = source.TalkMode.MaxUtteranceMs
+ }
+ };
+ }
+
+ private static VoiceStatusInfo Clone(VoiceStatusInfo source)
+ {
+ return new VoiceStatusInfo
+ {
+ Available = source.Available,
+ Running = source.Running,
+ Mode = source.Mode,
+ State = source.State,
+ SessionKey = source.SessionKey,
+ InputDeviceId = source.InputDeviceId,
+ OutputDeviceId = source.OutputDeviceId,
+ VoiceWakeModelId = source.VoiceWakeModelId,
+ VoiceWakeLoaded = source.VoiceWakeLoaded,
+ LastVoiceWakeUtc = source.LastVoiceWakeUtc,
+ LastUtteranceUtc = source.LastUtteranceUtc,
+ PendingReplyCount = source.PendingReplyCount,
+ CanSkipReply = source.CanSkipReply,
+ CurrentReplyPreview = source.CurrentReplyPreview,
+ LastError = source.LastError
+ };
+ }
+
+ private static string? BuildProviderFallbackMessage(
+ VoiceProviderOption speechToTextProvider,
+ VoiceProviderOption textToSpeechProvider)
+ {
+        var fallbacks = new List<string>();
+
+ if (!VoiceProviderCatalogService.SupportsSpeechToTextRuntime(speechToTextProvider.Id))
+ {
+ fallbacks.Add($"STT '{speechToTextProvider.Name}' is not implemented yet.");
+ }
+
+ if (!VoiceProviderCatalogService.SupportsTextToSpeechRuntime(textToSpeechProvider.Id))
+ {
+ fallbacks.Add($"TTS '{textToSpeechProvider.Name}' is not implemented yet; using Windows Speech Synthesis.");
+ }
+
+ return fallbacks.Count == 0 ? null : string.Join(" ", fallbacks);
+ }
+
+ private static string GetUserFacingErrorMessage(Exception ex)
+ {
+ if (IsSpeechPrivacyDeclined(ex))
+ {
+ return "Windows online speech recognition is disabled. Open Settings > Privacy & security > Speech and turn on Online speech recognition, then restart Voice Mode.";
+ }
+
+ if (ex is UnauthorizedAccessException)
+ {
+ return "Microphone access is blocked. Open Settings > Privacy & security > Microphone and allow desktop apps to use the microphone.";
+ }
+
+ return ex.Message;
+ }
+
+ private static bool IsSpeechPrivacyDeclined(Exception ex)
+ {
+ if (ex.HResult == HResultSpeechPrivacyDeclined)
+ {
+ return true;
+ }
+
+ return ex.Message.Contains("speech privacy policy", StringComparison.OrdinalIgnoreCase) ||
+ ex.Message.Contains("online speech recognition", StringComparison.OrdinalIgnoreCase);
+ }
+
+ private void RaiseConversationTurn(VoiceConversationDirection direction, string text, string? sessionKey)
+ {
+ if (string.IsNullOrWhiteSpace(text))
+ {
+ return;
+ }
+
+ ConversationTurnAvailable?.Invoke(this, new VoiceConversationTurnEventArgs
+ {
+ Direction = direction,
+ Message = text,
+ SessionKey = string.IsNullOrWhiteSpace(sessionKey) ? DefaultSessionKey : sessionKey,
+ Mode = _runtimeModeOverride ?? _settings.Voice.Mode
+ });
+ }
+
+ private void RaiseTranscriptDraft(string text, string? sessionKey, bool clear)
+ {
+ TranscriptDraftUpdated?.Invoke(this, new VoiceTranscriptDraftEventArgs
+ {
+ SessionKey = string.IsNullOrWhiteSpace(sessionKey) ? DefaultSessionKey : sessionKey,
+ Text = clear ? string.Empty : text,
+ Clear = clear,
+ Mode = _runtimeModeOverride ?? _settings.Voice.Mode
+ });
+ }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceSpeechToTextRouteFactory.cs b/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceSpeechToTextRouteFactory.cs
new file mode 100644
index 0000000..2c23175
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceSpeechToTextRouteFactory.cs
@@ -0,0 +1,28 @@
+using System;
+using OpenClaw.Shared;
+
+namespace OpenClawTray.Services.Voice;
+
+internal static class VoiceSpeechToTextRouteFactory
+{
+ public static IVoiceSpeechToTextRoute Create(
+ VoiceProviderOption provider,
+ IOpenClawLogger logger)
+ {
+ ArgumentNullException.ThrowIfNull(provider);
+ ArgumentNullException.ThrowIfNull(logger);
+
+ return ResolveRouteKind(provider) switch
+ {
+ VoiceSpeechToTextRouteKind.WindowsMedia => new WindowsMediaSpeechToTextRoute(logger),
+ VoiceSpeechToTextRouteKind.Streaming => new AudioGraphStreamingSpeechToTextRoute(logger),
+ VoiceSpeechToTextRouteKind.SherpaOnnx => new SherpaOnnxSpeechToTextRoute(logger),
+ _ => new WindowsMediaSpeechToTextRoute(logger)
+ };
+ }
+
+ public static VoiceSpeechToTextRouteKind ResolveRouteKind(VoiceProviderOption provider)
+ {
+ return VoiceSpeechToTextRouteResolver.ResolveRouteKind(provider);
+ }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceSpeechToTextRouteResources.cs b/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceSpeechToTextRouteResources.cs
new file mode 100644
index 0000000..c3b5f54
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Services/Voice/VoiceSpeechToTextRouteResources.cs
@@ -0,0 +1,9 @@
+using Windows.Media.SpeechRecognition;
+
+namespace OpenClawTray.Services.Voice;
+
+internal sealed class VoiceSpeechToTextRouteResources
+{
+ public VoiceCaptureService? CaptureService { get; init; }
+ public SpeechRecognizer? SpeechRecognizer { get; init; }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Services/Voice/WindowsMediaSpeechToTextRoute.cs b/src/OpenClaw.Tray.WinUI/Services/Voice/WindowsMediaSpeechToTextRoute.cs
new file mode 100644
index 0000000..248e0a3
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Services/Voice/WindowsMediaSpeechToTextRoute.cs
@@ -0,0 +1,55 @@
+using System;
+using System.Threading;
+using System.Threading.Tasks;
+using OpenClaw.Shared;
+using Windows.Media.SpeechRecognition;
+
+namespace OpenClawTray.Services.Voice;
+
+internal sealed class WindowsMediaSpeechToTextRoute : IVoiceSpeechToTextRoute
+{
+ private static readonly TimeSpan InitialSilenceTimeout = TimeSpan.FromSeconds(30);
+ private static readonly TimeSpan BabbleTimeout = TimeSpan.FromSeconds(4);
+
+ private readonly IOpenClawLogger _logger;
+
+ public WindowsMediaSpeechToTextRoute(IOpenClawLogger logger)
+ {
+ _logger = logger;
+ }
+
+ public VoiceSpeechToTextRouteKind Kind => VoiceSpeechToTextRouteKind.WindowsMedia;
+
+    public async Task<VoiceSpeechToTextRouteResources> StartAsync(
+ VoiceProviderOption provider,
+ VoiceSettings settings,
+ CancellationToken cancellationToken)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+
+ return new VoiceSpeechToTextRouteResources
+ {
+ SpeechRecognizer = await CreateRecognizerAsync(settings)
+ };
+ }
+
+    public async Task<SpeechRecognizer> CreateRecognizerAsync(VoiceSettings settings)
+ {
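+        // EndSilenceTimeout tracks the TalkMode setting; the initial-silence and babble
+        // timeouts are fixed constants for the always-on dictation session.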
+ var recognizer = new SpeechRecognizer();
+ recognizer.Timeouts.EndSilenceTimeout = TimeSpan.FromMilliseconds(settings.TalkMode.EndSilenceMs);
+ recognizer.Timeouts.InitialSilenceTimeout = InitialSilenceTimeout;
+ recognizer.Timeouts.BabbleTimeout = BabbleTimeout;
+ recognizer.Constraints.Add(new SpeechRecognitionTopicConstraint(SpeechRecognitionScenario.Dictation, "always-on-dictation"));
+
+ var compilation = await recognizer.CompileConstraintsAsync();
+ if (compilation.Status != SpeechRecognitionResultStatus.Success)
+ {
+ recognizer.Dispose();
+ throw new InvalidOperationException($"Speech recognizer unavailable: {compilation.Status}");
+ }
+
+ _logger.Debug(
+ $"Speech recognizer compiled successfully ({compilation.Status}); endSilenceMs={recognizer.Timeouts.EndSilenceTimeout.TotalMilliseconds:0}; initialSilenceMs={recognizer.Timeouts.InitialSilenceTimeout.TotalMilliseconds:0}; babbleMs={recognizer.Timeouts.BabbleTimeout.TotalMilliseconds:0}");
+ return recognizer;
+ }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Windows/ActivityStreamWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/ActivityStreamWindow.xaml.cs
index 27e0cc1..dd77b8e 100644
--- a/src/OpenClaw.Tray.WinUI/Windows/ActivityStreamWindow.xaml.cs
+++ b/src/OpenClaw.Tray.WinUI/Windows/ActivityStreamWindow.xaml.cs
@@ -26,7 +26,7 @@ public ActivityStreamWindow(Action openDashboard)
this.SetWindowSize(520, 640);
this.CenterOnScreen();
- this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected));
+ this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
Closed += OnClosed;
ActivityStreamService.Updated += OnActivityUpdated;
diff --git a/src/OpenClaw.Tray.WinUI/Windows/NotificationHistoryWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/NotificationHistoryWindow.xaml.cs
index 92f88ce..1173ff2 100644
--- a/src/OpenClaw.Tray.WinUI/Windows/NotificationHistoryWindow.xaml.cs
+++ b/src/OpenClaw.Tray.WinUI/Windows/NotificationHistoryWindow.xaml.cs
@@ -23,7 +23,7 @@ public NotificationHistoryWindow()
// Window configuration
this.SetWindowSize(450, 600);
this.CenterOnScreen();
- this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected));
+ this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
Closed += (s, e) => IsClosed = true;
diff --git a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml
index e0f15ac..2df3820 100644
--- a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml
+++ b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml
@@ -3,6 +3,7 @@
x:Class="OpenClawTray.Windows.SettingsWindow"
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
+ xmlns:controls="using:OpenClawTray.Controls"
xmlns:winex="using:WinUIEx"
Title="Settings — OpenClaw Tray"
MinWidth="400" MinHeight="500">
@@ -112,7 +113,7 @@
+        <controls:VoiceSettingsPanel x:Name="VoiceSettingsPanel" />
diff --git a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs
index 2308c6f..8e5eff3 100644
--- a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs
+++ b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs
@@ -1,8 +1,10 @@
using Microsoft.Toolkit.Uwp.Notifications;
using Microsoft.UI.Xaml;
+using Microsoft.UI.Xaml.Controls;
using OpenClaw.Shared;
using OpenClawTray.Helpers;
using OpenClawTray.Services;
+using OpenClawTray.Services.Voice;
using System;
using System.Threading.Tasks;
using WinUIEx;
@@ -17,22 +19,22 @@ public sealed partial class SettingsWindow : WindowEx
public event EventHandler? SettingsSaved;
- public SettingsWindow(SettingsManager settings)
+ public SettingsWindow(SettingsManager settings, IVoiceConfigurationApi voiceConfigurationApi)
{
_settings = settings;
InitializeComponent();
-
+
Title = LocalizationHelper.GetString("WindowTitle_Settings");
-
- // Window configuration
- this.SetWindowSize(480, 700);
+
+ this.SetWindowSize(560, 860);
this.CenterOnScreen();
- this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected));
-
+ this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
+
LoadSettings();
-
+ VoiceSettingsPanel.Initialize(_settings, voiceConfigurationApi);
+
Closed += (s, e) => IsClosed = true;
-
+
Logger.Info("[Settings] Window opened");
}
@@ -50,11 +52,10 @@ private void LoadSettings()
AutoStartToggle.IsOn = _settings.AutoStart;
GlobalHotkeyToggle.IsOn = _settings.GlobalHotkeyEnabled;
NotificationsToggle.IsOn = _settings.ShowNotifications;
-
- // Set sound combo — match by Tag (stable persistence key), not Content (display text)
+
for (int i = 0; i < NotificationSoundComboBox.Items.Count; i++)
{
- if (NotificationSoundComboBox.Items[i] is Microsoft.UI.Xaml.Controls.ComboBoxItem item &&
+ if (NotificationSoundComboBox.Items[i] is ComboBoxItem item &&
item.Tag?.ToString() == _settings.NotificationSound)
{
NotificationSoundComboBox.SelectedIndex = i;
@@ -62,9 +63,10 @@ private void LoadSettings()
}
}
if (NotificationSoundComboBox.SelectedIndex < 0)
+ {
NotificationSoundComboBox.SelectedIndex = 0;
+ }
- // Notification filters
NotifyHealthCb.IsChecked = _settings.NotifyHealth;
NotifyUrgentCb.IsChecked = _settings.NotifyUrgent;
NotifyReminderCb.IsChecked = _settings.NotifyReminder;
@@ -73,12 +75,11 @@ private void LoadSettings()
NotifyBuildCb.IsChecked = _settings.NotifyBuild;
NotifyStockCb.IsChecked = _settings.NotifyStock;
NotifyInfoCb.IsChecked = _settings.NotifyInfo;
-
- // Advanced
+
NodeModeToggle.IsOn = _settings.EnableNodeMode;
}
- private void SaveSettings()
+    private async Task<bool> SaveSettingsAsync()
{
_settings.UseSshTunnel = UseSshTunnelToggle.IsOn;
_settings.SshTunnelUser = SshTunnelUserTextBox.Text.Trim();
@@ -94,8 +95,8 @@ private void SaveSettings()
_settings.AutoStart = AutoStartToggle.IsOn;
_settings.GlobalHotkeyEnabled = GlobalHotkeyToggle.IsOn;
_settings.ShowNotifications = NotificationsToggle.IsOn;
-
- if (NotificationSoundComboBox.SelectedItem is Microsoft.UI.Xaml.Controls.ComboBoxItem item)
+
+ if (NotificationSoundComboBox.SelectedItem is ComboBoxItem item)
{
_settings.NotificationSound = item.Tag?.ToString() ?? "Default";
}
@@ -108,12 +109,22 @@ private void SaveSettings()
_settings.NotifyBuild = NotifyBuildCb.IsChecked ?? true;
_settings.NotifyStock = NotifyStockCb.IsChecked ?? true;
_settings.NotifyInfo = NotifyInfoCb.IsChecked ?? true;
-
- // Advanced
_settings.EnableNodeMode = NodeModeToggle.IsOn;
+ try
+ {
+ await VoiceSettingsPanel.ApplyAsync(_settings);
+ }
+ catch (Exception ex)
+ {
+ Logger.Error($"[Settings] Failed to apply voice settings: {ex.Message}");
+ StatusLabel.Text = $"❌ Failed to apply voice settings: {ex.Message}";
+ return false;
+ }
+
_settings.Save();
AutoStartManager.SetAutoStart(_settings.AutoStart);
+ return true;
}
private async void OnTestConnection(object sender, RoutedEventArgs e)
@@ -159,7 +170,7 @@ private async void OnTestConnection(object sender, RoutedEventArgs e)
var connected = false;
var tcs = new TaskCompletionSource();
-
+
client.StatusChanged += (s, status) =>
{
if (status == ConnectionStatus.Connected)
@@ -174,8 +185,7 @@ private async void OnTestConnection(object sender, RoutedEventArgs e)
};
_ = client.ConnectAsync();
-
- // Wait up to 5 seconds for connection
+
var completedTask = await Task.WhenAny(tcs.Task, Task.Delay(5000));
if (completedTask != tcs.Task)
{
@@ -224,13 +234,13 @@ private void OnTestNotification(object sender, RoutedEventArgs e)
}
}
- private void OnSave(object sender, RoutedEventArgs e)
+ private async void OnSave(object sender, RoutedEventArgs e)
{
var useSshTunnel = UseSshTunnelToggle.IsOn;
var gatewayUrl = GatewayUrlTextBox.Text.Trim();
if (!useSshTunnel && !GatewayUrlHelper.IsValidGatewayUrl(gatewayUrl))
{
- Logger.Warn($"[Settings] Save blocked — invalid gateway URL");
+ Logger.Warn("[Settings] Save blocked — invalid gateway URL");
StatusLabel.Text = $"❌ {GatewayUrlHelper.ValidationMessage}";
return;
}
@@ -246,14 +256,23 @@ private void OnSave(object sender, RoutedEventArgs e)
var oldGateway = _settings.GatewayUrl;
var oldAutoStart = _settings.AutoStart;
var oldNodeMode = _settings.EnableNodeMode;
- SaveSettings();
+ if (!await SaveSettingsAsync())
+ {
+ return;
+ }
if (!string.Equals(oldGateway, _settings.GatewayUrl, StringComparison.Ordinal))
- Logger.Info($"[Settings] GatewayUrl changed");
+ {
+ Logger.Info("[Settings] GatewayUrl changed");
+ }
if (oldAutoStart != _settings.AutoStart)
+ {
Logger.Info($"[Settings] AutoStart changed to {_settings.AutoStart}");
+ }
if (oldNodeMode != _settings.EnableNodeMode)
+ {
Logger.Info($"[Settings] NodeMode changed to {_settings.EnableNodeMode}");
+ }
Logger.Info("[Settings] Settings saved");
SettingsSaved?.Invoke(this, EventArgs.Empty);
diff --git a/src/OpenClaw.Tray.WinUI/Windows/StatusDetailWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/StatusDetailWindow.xaml.cs
index 7b499c3..967d88c 100644
--- a/src/OpenClaw.Tray.WinUI/Windows/StatusDetailWindow.xaml.cs
+++ b/src/OpenClaw.Tray.WinUI/Windows/StatusDetailWindow.xaml.cs
@@ -31,7 +31,7 @@ public StatusDetailWindow(
// Window configuration
this.SetWindowSize(420, 550);
this.CenterOnScreen();
- this.SetIcon(IconHelper.GetStatusIconPath(status));
+ this.SetIcon(AppIconHelper.GetStatusIconPath(status));
Closed += (s, e) => IsClosed = true;
diff --git a/src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml b/src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml
new file mode 100644
index 0000000..90ed291
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml
@@ -0,0 +1,120 @@
+<!-- Markup lost in extraction. From the code-behind, this window hosts three
+     labeled detail lists (StatusItemsControl, ConfigurationItemsControl,
+     RecentItemsControl), a TroubleshootingPanel containing TroubleshootingTextBlock
+     with OpenSpeechSettingsButton and OpenMicrophoneSettingsButton, and
+     Refresh / Open Settings / Close buttons wired to OnRefresh, OnOpenSettings,
+     and OnClose. -->
diff --git a/src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml.cs
new file mode 100644
index 0000000..3685707
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Windows/VoiceModeWindow.xaml.cs
@@ -0,0 +1,171 @@
+using Microsoft.UI.Xaml;
+using OpenClaw.Shared;
+using OpenClawTray.Helpers;
+using OpenClawTray.Services;
+using OpenClawTray.Services.Voice;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using WinUIEx;
+
+namespace OpenClawTray.Windows;
+
+public sealed partial class VoiceModeWindow : WindowEx
+{
+ private readonly SettingsManager _settings;
+ private readonly IVoiceRuntimeControlApi _voiceRuntimeControlApi;
+ private readonly IVoiceConfigurationApi _voiceConfigurationApi;
+
+ public bool IsClosed { get; private set; }
+
+ public event EventHandler? OpenSettingsRequested;
+
+ public VoiceModeWindow(
+ SettingsManager settings,
+ IVoiceRuntimeControlApi voiceRuntimeControlApi,
+ IVoiceConfigurationApi voiceConfigurationApi)
+ {
+ _settings = settings;
+ _voiceRuntimeControlApi = voiceRuntimeControlApi;
+ _voiceConfigurationApi = voiceConfigurationApi;
+
+ InitializeComponent();
+
+ Title = "Voice Mode";
+ this.SetWindowSize(520, 620);
+ this.CenterOnScreen();
+ this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
+
+ Closed += (s, e) => IsClosed = true;
+
+ RefreshStatus();
+ }
+
+ public void RefreshStatus()
+ {
+ var running = _voiceRuntimeControlApi.CurrentStatus;
+ var catalog = _voiceConfigurationApi.GetProviderCatalog();
+
+        StatusItemsControl.ItemsSource = new List<DetailRow>
+ {
+ new("Mode", VoiceDisplayHelper.GetModeLabel(_settings.Voice.Mode)),
+ new("Runtime", VoiceDisplayHelper.GetRuntimeLabel(running)),
+ new("Node Mode", _settings.EnableNodeMode ? "Enabled" : "Disabled"),
+ new("Session", string.IsNullOrWhiteSpace(running.SessionKey) ? "main" : running.SessionKey!),
+ new("State", VoiceDisplayHelper.GetStateLabel(running.State)),
+ new("Queued replies", running.PendingReplyCount.ToString())
+ };
+
+        ConfigurationItemsControl.ItemsSource = new List<DetailRow>
+ {
+ new("Speech to text", ResolveProviderName(catalog.SpeechToTextProviders, _settings.Voice.SpeechToTextProviderId, "Windows Speech Recognition")),
+ new("Text to speech", ResolveProviderName(catalog.TextToSpeechProviders, _settings.Voice.TextToSpeechProviderId, "Windows Speech Synthesis")),
+ new("Listen device", DescribeDevice(_settings.Voice.InputDeviceId, "System default microphone")),
+ new("Talk device", DescribeDevice(_settings.Voice.OutputDeviceId, "System default speaker")),
+ new("Voice toasts", _settings.Voice.ShowConversationToasts ? "Enabled" : "Disabled")
+ };
+
+        RecentItemsControl.ItemsSource = new List<DetailRow>
+ {
+ new("Last utterance", FormatTimestamp(running.LastUtteranceUtc)),
+ new("Last wake", FormatTimestamp(running.LastVoiceWakeUtc)),
+ new("Last issue", string.IsNullOrWhiteSpace(running.LastError) ? "None" : running.LastError!)
+ };
+
+ UpdateTroubleshooting(running.LastError);
+ }
+
+ private static string ResolveProviderName(
+        IReadOnlyList<VoiceProviderOption> providers,
+ string? providerId,
+ string fallback)
+ {
+ foreach (var provider in providers)
+ {
+ if (string.Equals(provider.Id, providerId, StringComparison.OrdinalIgnoreCase))
+ {
+ return provider.Name;
+ }
+ }
+
+ return fallback;
+ }
+
+ private static string DescribeDevice(string? deviceId, string defaultLabel)
+ {
+ return string.IsNullOrWhiteSpace(deviceId) ? defaultLabel : "Selected device";
+ }
+
+ private static string FormatTimestamp(DateTime? value)
+ {
+ return value?.ToLocalTime().ToString("HH:mm:ss") ?? "None";
+ }
+
+ private void UpdateTroubleshooting(string? error)
+ {
+ TroubleshootingPanel.Visibility = Visibility.Collapsed;
+ OpenSpeechSettingsButton.Visibility = Visibility.Collapsed;
+ OpenMicrophoneSettingsButton.Visibility = Visibility.Collapsed;
+ TroubleshootingTextBlock.Text = string.Empty;
+
+ if (string.IsNullOrWhiteSpace(error))
+ {
+ return;
+ }
+
+ if (error.Contains("online speech recognition is disabled", StringComparison.OrdinalIgnoreCase))
+ {
+ TroubleshootingPanel.Visibility = Visibility.Visible;
+ OpenSpeechSettingsButton.Visibility = Visibility.Visible;
+ TroubleshootingTextBlock.Text =
+ "To fix this: open Windows Settings, go to Privacy & security > Speech, turn on Online speech recognition, then restart voice mode.";
+ return;
+ }
+
+ if (error.Contains("microphone access is blocked", StringComparison.OrdinalIgnoreCase))
+ {
+ TroubleshootingPanel.Visibility = Visibility.Visible;
+ OpenMicrophoneSettingsButton.Visibility = Visibility.Visible;
+ TroubleshootingTextBlock.Text =
+ "To fix this: open Windows Settings, go to Privacy & security > Microphone, allow microphone access and enable desktop app access, then restart voice mode.";
+ }
+ }
+
+ private void OnOpenSpeechSettings(object sender, RoutedEventArgs e)
+ {
+ OpenSettingsUri("ms-settings:privacy-speech");
+ }
+
+ private void OnOpenMicrophoneSettings(object sender, RoutedEventArgs e)
+ {
+ OpenSettingsUri("ms-settings:privacy-microphone");
+ }
+
+ private void OnRefresh(object sender, RoutedEventArgs e)
+ {
+ RefreshStatus();
+ }
+
+ private void OnOpenSettings(object sender, RoutedEventArgs e)
+ {
+ OpenSettingsRequested?.Invoke(this, EventArgs.Empty);
+ }
+
+ private void OnClose(object sender, RoutedEventArgs e)
+ {
+ Close();
+ }
+
+ private static void OpenSettingsUri(string uri)
+ {
+ try
+ {
+ Process.Start(new ProcessStartInfo(uri) { UseShellExecute = true });
+ }
+ catch
+ {
+ }
+ }
+
+ private sealed record DetailRow(string Label, string Value);
+}
diff --git a/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml b/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml
new file mode 100644
index 0000000..c34f030
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml
@@ -0,0 +1,160 @@
+<!-- Markup lost in extraction. From the code-behind, this compact floating window
+     contains ConversationScrollViewer wrapping ConversationItemsControl and
+     EmptyConversationTextBlock, a DraftPanel with DraftCaptionTextBlock and
+     DraftTextBlock, TroubleshootingTextBlock, PauseResumeButton (PauseResumeIcon)
+     and SkipReplyButton, plus AutoScrollCheckBox, TextSizeComboBox,
+     FloatingEnabledCheckBox, and a button wired to OnOpenVoiceStatusClick. -->
diff --git a/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml.cs
new file mode 100644
index 0000000..67dde46
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Windows/VoiceRepeaterWindow.xaml.cs
@@ -0,0 +1,563 @@
+using Microsoft.UI.Windowing;
+using Microsoft.UI.Dispatching;
+using Microsoft.UI.Xaml;
+using Microsoft.UI.Xaml.Controls;
+using OpenClaw.Shared;
+using OpenClawTray.Helpers;
+using OpenClawTray.Services;
+using OpenClawTray.Services.Voice;
+using System;
+using System.Collections.ObjectModel;
+using System.ComponentModel;
+using System.Runtime.CompilerServices;
+using System.Threading.Tasks;
+using Windows.Graphics;
+using WinUIEx;
+
+namespace OpenClawTray.Windows;
+
+public sealed partial class VoiceRepeaterWindow : WindowEx, IVoiceChatWindow
+{
+ private const int MaxConversationItems = 24;
+ private const int DefaultWidth = 360;
+ private const int DefaultHeight = 170;
+ private const int DefaultMargin = 12;
+ private const double DefaultTextSize = 13;
+ private const double DefaultCaptionSize = 10;
+
+ private readonly SettingsManager _settings;
+ private readonly IVoiceRuntimeControlApi _voiceRuntimeControlApi;
+    private readonly ObservableCollection<ConversationItem> _conversationItems = [];
+ private readonly DispatcherQueueTimer? _refreshTimer;
+ private readonly DispatcherQueueTimer? _layoutSaveTimer;
+
+ private bool _controlActionInFlight;
+ private bool _suppressSettingsEvents;
+ private bool _suppressPlacementSave = true;
+ private bool _initialPlacementPending = true;
+ private bool _placementDirty;
+ private bool _autoScrollEnabled;
+ private double _messageFontSize = DefaultTextSize;
+ private double _captionFontSize = DefaultCaptionSize;
+
+ public bool IsClosed { get; private set; }
+
+ public event EventHandler? OpenVoiceStatusRequested;
+
+ public VoiceRepeaterWindow(
+ SettingsManager settings,
+ IVoiceRuntimeControlApi voiceRuntimeControlApi)
+ {
+ _settings = settings;
+ _voiceRuntimeControlApi = voiceRuntimeControlApi;
+ _autoScrollEnabled = _settings.VoiceRepeaterWindow.AutoScroll;
+
+ InitializeComponent();
+
+ Title = "Voice Mode";
+ ApplyStoredWindowPlacement();
+ this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
+
+ ConversationItemsControl.ItemsSource = _conversationItems;
+
+ Closed += OnWindowClosed;
+ Activated += OnWindowActivated;
+
+ var dispatcherQueue = DispatcherQueue.GetForCurrentThread();
+ if (dispatcherQueue != null)
+ {
+ _refreshTimer = dispatcherQueue.CreateTimer();
+ _refreshTimer.Interval = TimeSpan.FromMilliseconds(400);
+ _refreshTimer.Tick += (_, _) => RefreshStatus();
+ _refreshTimer.Start();
+
+ _layoutSaveTimer = dispatcherQueue.CreateTimer();
+ _layoutSaveTimer.Interval = TimeSpan.FromMilliseconds(600);
+ _layoutSaveTimer.IsRepeating = false;
+ _layoutSaveTimer.Tick += (_, _) =>
+ {
+ _layoutSaveTimer.Stop();
+ SaveWindowPlacement();
+ };
+ }
+
+ if (AppWindow is not null)
+ {
+ AppWindow.Changed += OnAppWindowChanged;
+ }
+
+ ApplyViewSettings();
+ RefreshStatus();
+ UpdateConversationPlaceholder();
+ }
+
+ public void RefreshStatus()
+ {
+ var status = _voiceRuntimeControlApi.CurrentStatus;
+ ApplyStatus(status);
+ }
+
+ public Task UpdateVoiceTranscriptDraftAsync(string text, bool clear)
+ {
+ var draftText = clear ? string.Empty : (text ?? string.Empty);
+ DraftTextBlock.Text = draftText;
+ DraftPanel.Visibility = string.IsNullOrWhiteSpace(draftText)
+ ? Visibility.Collapsed
+ : Visibility.Visible;
+
+ UpdateConversationPlaceholder();
+ ScrollConversationToEnd();
+ return Task.CompletedTask;
+ }
+
+ public Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args)
+ {
+ if (args == null || string.IsNullOrWhiteSpace(args.Message))
+ {
+ return Task.CompletedTask;
+ }
+
+ var item = new ConversationItem(
+ args.Direction == VoiceConversationDirection.Outgoing ? "You" : "Assistant",
+ DateTime.Now.ToString("HH:mm:ss"),
+ args.Message,
+ _messageFontSize,
+ _captionFontSize);
+
+ _conversationItems.Add(item);
+ while (_conversationItems.Count > MaxConversationItems)
+ {
+ _conversationItems.RemoveAt(0);
+ }
+
+ UpdateConversationPlaceholder();
+ ScrollConversationToEnd();
+ return Task.CompletedTask;
+ }
+
+ private async void OnPauseResumeClick(object sender, RoutedEventArgs e)
+ {
+ if (_controlActionInFlight)
+ {
+ return;
+ }
+
+ _controlActionInFlight = true;
+ ApplyStatus(_voiceRuntimeControlApi.CurrentStatus);
+
+ try
+ {
+ var status = _voiceRuntimeControlApi.CurrentStatus;
+ if (status.State == VoiceRuntimeState.Paused)
+ {
+ await _voiceRuntimeControlApi.ResumeAsync(new VoiceResumeArgs { Reason = "Voice repeater resume button" });
+ }
+ else
+ {
+ await _voiceRuntimeControlApi.PauseAsync(new VoicePauseArgs { Reason = "Voice repeater pause button" });
+ }
+ }
+ finally
+ {
+ _controlActionInFlight = false;
+ RefreshStatus();
+ }
+ }
+
+ private async void OnSkipReplyClick(object sender, RoutedEventArgs e)
+ {
+ if (_controlActionInFlight || !_voiceRuntimeControlApi.CurrentStatus.CanSkipReply)
+ {
+ return;
+ }
+
+ _controlActionInFlight = true;
+ ApplyStatus(_voiceRuntimeControlApi.CurrentStatus);
+
+ try
+ {
+ await _voiceRuntimeControlApi.SkipCurrentReplyAsync(new VoiceSkipArgs
+ {
+ Reason = "Voice repeater skip button"
+ });
+ }
+ finally
+ {
+ _controlActionInFlight = false;
+ RefreshStatus();
+ }
+ }
+
+ private void OnAutoScrollChanged(object sender, RoutedEventArgs e)
+ {
+ if (_suppressSettingsEvents)
+ {
+ return;
+ }
+
+ _autoScrollEnabled = AutoScrollCheckBox.IsChecked == true;
+ _settings.VoiceRepeaterWindow.AutoScroll = _autoScrollEnabled;
+ _settings.Save(logSuccess: false);
+
+ if (_autoScrollEnabled)
+ {
+ ScrollConversationToEnd();
+ }
+ }
+
+ private void OnTextSizeSelectionChanged(object sender, SelectionChangedEventArgs e)
+ {
+ if (_suppressSettingsEvents || TextSizeComboBox.SelectedItem is not ComboBoxItem item)
+ {
+ return;
+ }
+
+ if (!double.TryParse(item.Tag?.ToString(), out var size))
+ {
+ return;
+ }
+
+ _settings.VoiceRepeaterWindow.TextSize = size;
+ ApplyViewSettings();
+ _settings.Save(logSuccess: false);
+ }
+
+ private void OnFloatingEnabledChanged(object sender, RoutedEventArgs e)
+ {
+ if (_suppressSettingsEvents)
+ {
+ return;
+ }
+
+ var enabled = FloatingEnabledCheckBox.IsChecked == true;
+ _settings.VoiceRepeaterWindow.FloatingEnabled = enabled;
+ IsAlwaysOnTop = enabled;
+ _settings.Save(logSuccess: false);
+ }
+
+ private void OnOpenVoiceStatusClick(object sender, RoutedEventArgs e)
+ {
+ OpenVoiceStatusRequested?.Invoke(this, EventArgs.Empty);
+ }
+
+ private void OnWindowClosed(object sender, WindowEventArgs e)
+ {
+ if (_refreshTimer != null)
+ {
+ _refreshTimer.Stop();
+ }
+
+ if (_layoutSaveTimer != null)
+ {
+ _layoutSaveTimer.Stop();
+ }
+
+ if (AppWindow is not null)
+ {
+ AppWindow.Changed -= OnAppWindowChanged;
+ }
+
+ Activated -= OnWindowActivated;
+ FlushWindowPlacement();
+ IsClosed = true;
+ }
+
+ private void OnWindowActivated(object sender, WindowActivatedEventArgs args)
+ {
+ if (!_initialPlacementPending)
+ {
+ return;
+ }
+
+ _initialPlacementPending = false;
+ ApplyStoredWindowPlacement();
+
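+        // Re-enable placement persistence one dispatcher pass later so the restore
+        // itself is not written back as a user-initiated move/resize.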
+ var dispatcherQueue = DispatcherQueue.GetForCurrentThread();
+ _ = dispatcherQueue?.TryEnqueue(() => _suppressPlacementSave = false);
+ }
+
+ private void OnAppWindowChanged(AppWindow sender, AppWindowChangedEventArgs args)
+ {
+ if (_suppressPlacementSave)
+ {
+ return;
+ }
+
+ if (args.DidPositionChange || args.DidSizeChange)
+ {
+ _placementDirty = true;
+ _layoutSaveTimer?.Stop();
+ _layoutSaveTimer?.Start();
+ }
+ }
+
+ private void ApplyStatus(VoiceStatusInfo status)
+ {
+ Title = $"Voice Mode ({GetWindowStateLabel(status)})";
+ DraftCaptionTextBlock.Text = status.State == VoiceRuntimeState.RecordingUtterance
+ ? "You (speaking)"
+ : "You (draft)";
+
+ if (string.IsNullOrWhiteSpace(status.LastError))
+ {
+ TroubleshootingTextBlock.Visibility = Visibility.Collapsed;
+ TroubleshootingTextBlock.Text = string.Empty;
+ }
+ else
+ {
+ TroubleshootingTextBlock.Visibility = Visibility.Visible;
+ TroubleshootingTextBlock.Text = status.LastError;
+ }
+
+ var paused = status.State == VoiceRuntimeState.Paused;
+ PauseResumeButton.IsEnabled = !_controlActionInFlight && status.Mode != VoiceActivationMode.Off;
+ PauseResumeIcon.Symbol = paused ? Symbol.Play : Symbol.Pause;
+ ToolTipService.SetToolTip(
+ PauseResumeButton,
+ paused ? "Resume voice mode" : "Pause voice mode");
+
+ SkipReplyButton.IsEnabled = !_controlActionInFlight && status.CanSkipReply;
+ }
+
+ private void ApplyStoredWindowPlacement()
+ {
+ if (AppWindow is null)
+ {
+ return;
+ }
+
+ var prefs = _settings.VoiceRepeaterWindow;
+ var width = prefs.HasSavedPlacement
+ ? prefs.Width.GetValueOrDefault(DefaultWidth)
+ : DefaultWidth;
+ var height = prefs.HasSavedPlacement
+ ? prefs.Height.GetValueOrDefault(DefaultHeight)
+ : DefaultHeight;
+ var clampedWidth = Math.Max(width, 320);
+ var clampedHeight = Math.Max(height, 150);
+
+ IsAlwaysOnTop = prefs.FloatingEnabled;
+
+ var targetRect = prefs.HasSavedPlacement && prefs.X.HasValue && prefs.Y.HasValue
+ ? new RectInt32(prefs.X.Value, prefs.Y.Value, clampedWidth, clampedHeight)
+ : GetDefaultAnchorRect(clampedWidth, clampedHeight);
+
+ if (!IsPlacementVisible(targetRect))
+ {
+ targetRect = GetDefaultAnchorRect(clampedWidth, clampedHeight);
+ }
+
+ try
+ {
+ AppWindow.MoveAndResize(targetRect);
+ }
+ catch
+ {
+ this.SetWindowSize(targetRect.Width, targetRect.Height);
+ AppWindow.Move(new PointInt32(targetRect.X, targetRect.Y));
+ }
+ }
+
+ private void ApplyViewSettings()
+ {
+ _suppressSettingsEvents = true;
+ try
+ {
+ _autoScrollEnabled = _settings.VoiceRepeaterWindow.AutoScroll;
+ _messageFontSize = Math.Clamp(
+ _settings.VoiceRepeaterWindow.TextSize > 0 ? _settings.VoiceRepeaterWindow.TextSize : DefaultTextSize,
+ 11,
+ 15);
+ _captionFontSize = Math.Max(9, _messageFontSize - 3);
+
+ DraftTextBlock.FontSize = _messageFontSize;
+ DraftCaptionTextBlock.FontSize = _captionFontSize;
+ TroubleshootingTextBlock.FontSize = _captionFontSize;
+
+ foreach (var item in _conversationItems)
+ {
+ item.MessageFontSize = _messageFontSize;
+ item.CaptionFontSize = _captionFontSize;
+ }
+
+ AutoScrollCheckBox.IsChecked = _autoScrollEnabled;
+ FloatingEnabledCheckBox.IsChecked = _settings.VoiceRepeaterWindow.FloatingEnabled;
+ SelectTextSizeItem(_messageFontSize);
+ }
+ finally
+ {
+ _suppressSettingsEvents = false;
+ }
+ }
+
+ private void SaveWindowPlacement()
+ {
+ if (IsClosed || AppWindow is null || _suppressPlacementSave)
+ {
+ return;
+ }
+
+ var size = AppWindow.Size;
+ var position = AppWindow.Position;
+ _settings.VoiceRepeaterWindow.Width = size.Width;
+ _settings.VoiceRepeaterWindow.Height = size.Height;
+ _settings.VoiceRepeaterWindow.X = position.X;
+ _settings.VoiceRepeaterWindow.Y = position.Y;
+ _settings.VoiceRepeaterWindow.HasSavedPlacement = true;
+ _settings.Save(logSuccess: false);
+ _placementDirty = false;
+ }
+
+ private void FlushWindowPlacement()
+ {
+ if (_placementDirty || !IsClosed)
+ {
+ SaveWindowPlacement();
+ }
+ }
+
+ private RectInt32 GetDefaultAnchorRect(int width, int height)
+ {
+ var displayArea = DisplayArea.Primary;
+ var x = displayArea.WorkArea.X + DefaultMargin;
+ var y = displayArea.WorkArea.Y + Math.Max(DefaultMargin, displayArea.WorkArea.Height - height - DefaultMargin);
+ return new RectInt32(x, y, width, height);
+ }
+
+ private static bool IsPlacementVisible(RectInt32 rect)
+ {
+ try
+ {
+ var displayArea = DisplayArea.GetFromRect(rect, DisplayAreaFallback.Nearest);
+ var workArea = displayArea.WorkArea;
+ return rect.Width > 0 &&
+ rect.Height > 0 &&
+ rect.X < workArea.X + workArea.Width &&
+ rect.X + rect.Width > workArea.X &&
+ rect.Y < workArea.Y + workArea.Height &&
+ rect.Y + rect.Height > workArea.Y;
+ }
+ catch
+ {
+ return false;
+ }
+ }
+
+ private void SelectTextSizeItem(double size)
+ {
+ var sizeTag = ((int)Math.Round(size)).ToString();
+ foreach (var entry in TextSizeComboBox.Items)
+ {
+ if (entry is ComboBoxItem item && string.Equals(item.Tag?.ToString(), sizeTag, StringComparison.Ordinal))
+ {
+ TextSizeComboBox.SelectedItem = item;
+ return;
+ }
+ }
+
+ TextSizeComboBox.SelectedIndex = 2;
+ }
+
+ private void UpdateConversationPlaceholder()
+ {
+ EmptyConversationTextBlock.Visibility = _conversationItems.Count == 0 && DraftPanel.Visibility != Visibility.Visible
+ ? Visibility.Visible
+ : Visibility.Collapsed;
+ }
+
+ private void ScrollConversationToEnd()
+ {
+ if (!_autoScrollEnabled)
+ {
+ return;
+ }
+
+ var dispatcherQueue = DispatcherQueue.GetForCurrentThread();
+ _ = dispatcherQueue?.TryEnqueue(() =>
+ {
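+            // Scroll twice: once now and once on the next dispatcher pass, because
+            // ScrollableHeight can still change after the pending layout settles.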
+ ConversationScrollViewer.UpdateLayout();
+ ConversationScrollViewer.ChangeView(null, ConversationScrollViewer.ScrollableHeight, null, true);
+ _ = dispatcherQueue.TryEnqueue(() =>
+ ConversationScrollViewer.ChangeView(null, ConversationScrollViewer.ScrollableHeight, null, true));
+ });
+ }
+
+ private static string GetWindowStateLabel(VoiceStatusInfo status)
+ {
+ return status.State switch
+ {
+ VoiceRuntimeState.ListeningForVoiceWake => "listening",
+ VoiceRuntimeState.ListeningContinuously => "listening",
+ VoiceRuntimeState.RecordingUtterance => "hearing you",
+ VoiceRuntimeState.AwaitingResponse => "waiting",
+ VoiceRuntimeState.PlayingResponse => "speaking",
+ VoiceRuntimeState.Paused => "paused",
+ VoiceRuntimeState.Arming => "starting",
+ VoiceRuntimeState.Error => "error",
+ _ when status.Mode == VoiceActivationMode.Off => "off",
+ _ => "idle"
+ };
+ }
+
+ private sealed class ConversationItem : INotifyPropertyChanged
+ {
+ private double _messageFontSize;
+ private double _captionFontSize;
+
+ public ConversationItem(
+ string speaker,
+ string timestamp,
+ string message,
+ double messageFontSize,
+ double captionFontSize)
+ {
+ Speaker = speaker;
+ Timestamp = timestamp;
+ Message = message;
+ _messageFontSize = messageFontSize;
+ _captionFontSize = captionFontSize;
+ }
+
+ public string Speaker { get; }
+ public string Timestamp { get; }
+ public string Message { get; }
+ public string Caption => $"{Speaker} · {Timestamp}";
+
+ public double MessageFontSize
+ {
+ get => _messageFontSize;
+ set
+ {
+ if (Math.Abs(_messageFontSize - value) < 0.01)
+ {
+ return;
+ }
+
+ _messageFontSize = value;
+ OnPropertyChanged();
+ }
+ }
+
+ public double CaptionFontSize
+ {
+ get => _captionFontSize;
+ set
+ {
+ if (Math.Abs(_captionFontSize - value) < 0.01)
+ {
+ return;
+ }
+
+ _captionFontSize = value;
+ OnPropertyChanged();
+ }
+ }
+
+ public event PropertyChangedEventHandler? PropertyChanged;
+
+ private void OnPropertyChanged([CallerMemberName] string? propertyName = null)
+ {
+ PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(propertyName));
+ }
+ }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomState.cs b/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomState.cs
new file mode 100644
index 0000000..59cdcfe
--- /dev/null
+++ b/src/OpenClaw.Tray.WinUI/Windows/WebChatVoiceDomState.cs
@@ -0,0 +1,15 @@
+namespace OpenClawTray.Windows;
+
+internal sealed class WebChatVoiceDomState
+{
+ public WebChatVoiceDomState()
+ {
+ }
+
+ public string PendingDraft { get; private set; } = string.Empty;
+
+ public void SetDraft(string? text, bool clear)
+ {
+ PendingDraft = clear ? string.Empty : (text ?? string.Empty);
+ }
+}
diff --git a/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs
index 8a6bc4b..dca6739 100644
--- a/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs
+++ b/src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs
@@ -3,6 +3,7 @@
using OpenClaw.Shared;
using OpenClawTray.Helpers;
using OpenClawTray.Services;
+using OpenClawTray.Services.Voice;
using System;
using System.Diagnostics;
using System.IO;
@@ -14,14 +15,16 @@
namespace OpenClawTray.Windows;
public sealed partial class WebChatWindow : WindowEx
+ , IVoiceChatWindow
{
private readonly string _gatewayUrl;
private readonly string _token;
-
- // Store event handlers for cleanup
+ private readonly WebChatVoiceDomState _voiceDomState;
+ private bool _voiceDomReady;
+
private TypedEventHandler<CoreWebView2, CoreWebView2NavigationCompletedEventArgs>? _navigationCompletedHandler;
private TypedEventHandler<CoreWebView2, CoreWebView2NavigationStartingEventArgs>? _navigationStartingHandler;
-
+
public bool IsClosed { get; private set; }
public WebChatWindow(string gatewayUrl, string token)
@@ -29,18 +32,18 @@ public WebChatWindow(string gatewayUrl, string token)
Logger.Info($"WebChatWindow: Constructor called, gateway={gatewayUrl}");
_gatewayUrl = gatewayUrl;
_token = token;
-
+ _voiceDomState = new WebChatVoiceDomState();
+
InitializeComponent();
-
- // Window configuration
+
this.SetWindowSize(520, 750);
this.MinWidth = 380;
this.MinHeight = 450;
this.CenterOnScreen();
- this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected));
-
+ this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
+
Closed += OnWindowClosed;
-
+
Logger.Info("WebChatWindow: Starting InitializeWebViewAsync");
_ = InitializeWebViewAsync();
}
@@ -48,8 +51,8 @@ public WebChatWindow(string gatewayUrl, string token)
private void OnWindowClosed(object sender, WindowEventArgs e)
{
IsClosed = true;
-
- // Cleanup WebView2 event handlers
+ _voiceDomReady = false;
+
if (WebView.CoreWebView2 != null)
{
if (_navigationCompletedHandler != null)
@@ -64,35 +67,39 @@ private async Task InitializeWebViewAsync()
try
{
Logger.Info("WebChatWindow: Initializing WebView2...");
-
- // Set up user data folder for WebView2
+
var userDataFolder = Path.Combine(
Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData),
"OpenClawTray", "WebView2");
-
+
Directory.CreateDirectory(userDataFolder);
Logger.Info($"WebChatWindow: User data folder: {userDataFolder}");
- // Set environment variable for user data folder
Environment.SetEnvironmentVariable("WEBVIEW2_USER_DATA_FOLDER", userDataFolder);
-
+
Logger.Info("WebChatWindow: Calling EnsureCoreWebView2Async...");
await WebView.EnsureCoreWebView2Async();
Logger.Info("WebChatWindow: CoreWebView2 initialized successfully");
-
- // Configure WebView2
+
WebView.CoreWebView2.Settings.IsStatusBarEnabled = false;
WebView.CoreWebView2.Settings.AreDefaultContextMenusEnabled = true;
WebView.CoreWebView2.Settings.IsZoomControlEnabled = true;
+ await WebView.CoreWebView2.AddScriptToExecuteOnDocumentCreatedAsync(WebChatVoiceDomBridge.DocumentCreatedScript);
+
+ _voiceDomReady = false;
- // Handle navigation events (store for cleanup)
_navigationCompletedHandler = (s, e) =>
{
Logger.Info($"WebChatWindow: Navigation completed, success={e.IsSuccess}, status={e.WebErrorStatus}");
LoadingRing.IsActive = false;
LoadingRing.Visibility = Visibility.Collapsed;
-
- // Show friendly error if connection failed
+ _voiceDomReady = e.IsSuccess;
+
+ if (e.IsSuccess)
+ {
+ _ = RefreshTrayVoiceDomStateAsync();
+ }
+
if (!e.IsSuccess && (e.WebErrorStatus == CoreWebView2WebErrorStatus.ConnectionAborted ||
e.WebErrorStatus == CoreWebView2WebErrorStatus.CannotConnect ||
e.WebErrorStatus == CoreWebView2WebErrorStatus.ConnectionReset ||
@@ -115,15 +122,14 @@ private async Task InitializeWebViewAsync()
_navigationStartingHandler = (s, e) =>
{
- // Strip query params to avoid logging tokens
var safeUri = e.Uri?.Split('?')[0] ?? "unknown";
Logger.Info($"WebChatWindow: Navigation starting to {safeUri}");
+ _voiceDomReady = false;
LoadingRing.IsActive = true;
LoadingRing.Visibility = Visibility.Visible;
};
WebView.CoreWebView2.NavigationStarting += _navigationStartingHandler;
- // Navigate to chat
NavigateToChat();
}
catch (Exception ex)
@@ -135,13 +141,12 @@ private async Task InitializeWebViewAsync()
Logger.Error($"WebView2 inner exception: {ex.InnerException.GetType().FullName}: {ex.InnerException.Message}");
}
Logger.Error($"WebView2 stack trace: {ex.StackTrace}");
-
- // Show error in the dialog instead of falling back to browser
+
LoadingRing.IsActive = false;
LoadingRing.Visibility = Visibility.Collapsed;
WebView.Visibility = Visibility.Collapsed;
ErrorPanel.Visibility = Visibility.Visible;
-
+
var errorDetails = $"Exception: {ex.GetType().FullName}\n" +
$"HResult: 0x{ex.HResult:X8}\n" +
$"Message: {ex.Message}\n\n" +
@@ -149,17 +154,16 @@ private async Task InitializeWebViewAsync()
$"Architecture: {RuntimeInformation.ProcessArchitecture}\n" +
$"OS: {RuntimeInformation.OSDescription}\n\n" +
$"Stack Trace:\n{ex.StackTrace}";
-
+
if (ex.InnerException != null)
{
errorDetails += $"\n\nInner Exception: {ex.InnerException.GetType().FullName}\n{ex.InnerException.Message}";
}
-
+
ErrorText.Text = errorDetails;
}
}
- // Set to a test URL to bypass gateway (e.g., "https://www.bing.com"), or null for normal operation
private const string? DEBUG_TEST_URL = null;
private static bool IsLocalHost(Uri uri)
@@ -208,12 +212,11 @@ private void ShowErrorMessage(string message)
ErrorPanel.Visibility = Visibility.Visible;
ErrorText.Text = message;
}
-
+
private void NavigateToChat()
{
if (WebView.CoreWebView2 == null) return;
- // If debug URL is set, use it instead of gateway
if (!string.IsNullOrEmpty(DEBUG_TEST_URL))
{
Logger.Info($"WebChatWindow: DEBUG MODE - Navigating to test URL: {DEBUG_TEST_URL}");
@@ -251,7 +254,7 @@ private void OnPopout(object sender, RoutedEventArgs e)
ShowErrorMessage(errorMessage);
return;
}
-
+
try
{
Process.Start(new ProcessStartInfo(url) { UseShellExecute = true });
@@ -266,4 +269,34 @@ private void OnDevTools(object sender, RoutedEventArgs e)
{
WebView.CoreWebView2?.OpenDevToolsWindow();
}
+
+ public async Task UpdateVoiceTranscriptDraftAsync(string text, bool clear)
+ {
+ _voiceDomState.SetDraft(text, clear);
+ await RefreshTrayVoiceDomStateAsync();
+ }
+
+ public async Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args)
+ {
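+        // Intentionally a no-op for now: turns are no longer injected into the page DOM (see ClearLegacyTurnsScript), so args is unused.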
+ await Task.CompletedTask;
+ }
+
+ private async Task RefreshTrayVoiceDomStateAsync()
+ {
+ if (WebView.CoreWebView2 == null || !_voiceDomReady || IsClosed)
+ {
+ return;
+ }
+
+ try
+ {
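+            // Re-apply the pending draft, then strip any turns injected by older builds.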
+ await WebView.CoreWebView2.ExecuteScriptAsync(
+ WebChatVoiceDomBridge.BuildSetDraftScript(_voiceDomState.PendingDraft));
+ await WebView.CoreWebView2.ExecuteScriptAsync(WebChatVoiceDomBridge.ClearLegacyTurnsScript);
+ }
+ catch (Exception ex)
+ {
+ Logger.Warn($"WebChatWindow: Failed to apply voice DOM state: {ex.Message}");
+ }
+ }
}
diff --git a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs
index 67de774..f1de44e 100644
--- a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs
+++ b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs
@@ -965,3 +965,234 @@ public async Task Snap_ReturnsError_WhenHandlerThrows()
Assert.Contains("Camera access blocked", res.Error);
}
}
+
+public class VoiceCapabilityTests
+{
+ private static JsonElement Parse(string json)
+ {
+ using var doc = JsonDocument.Parse(json);
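+        // Clone detaches the element from the document so it stays valid after disposal.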
+ return doc.RootElement.Clone();
+ }
+
+ [Fact]
+ public void CanHandle_VoiceCommands()
+ {
+ var cap = new VoiceCapability(NullLogger.Instance);
+ Assert.True(cap.CanHandle(VoiceCommands.ListDevices));
+ Assert.True(cap.CanHandle(VoiceCommands.GetSettings));
+ Assert.True(cap.CanHandle(VoiceCommands.SetSettings));
+ Assert.True(cap.CanHandle(VoiceCommands.GetStatus));
+ Assert.True(cap.CanHandle(VoiceCommands.Start));
+ Assert.True(cap.CanHandle(VoiceCommands.Stop));
+ Assert.False(cap.CanHandle("voice.unknown"));
+ Assert.Equal("voice", cap.Category);
+ }
+
+ [Fact]
+ public async Task ListDevices_ReturnsArrayFromHandler()
+ {
+ var cap = new VoiceCapability(NullLogger.Instance);
+        cap.ListDevicesRequested += () => Task.FromResult<VoiceAudioDeviceInfo[]>(
+        [
+ new VoiceAudioDeviceInfo
+ {
+ DeviceId = "default-input",
+ Name = "System default microphone",
+ IsDefault = true,
+ IsInput = true
+ }
+ ]);
+
+ var res = await cap.ExecuteAsync(new NodeInvokeRequest
+ {
+ Id = "voice1",
+ Command = VoiceCommands.ListDevices,
+ Args = Parse("""{}""")
+ });
+
+ Assert.True(res.Ok);
+ var json = JsonSerializer.Serialize(res.Payload);
+ using var doc = JsonDocument.Parse(json);
+ Assert.Equal(JsonValueKind.Array, doc.RootElement.ValueKind);
+ Assert.Equal("default-input", doc.RootElement[0].GetProperty("DeviceId").GetString());
+ }
+
+ [Fact]
+ public async Task GetSettings_ReturnsSettingsFromHandler()
+ {
+ var cap = new VoiceCapability(NullLogger.Instance);
+ cap.SettingsRequested += () => Task.FromResult(new VoiceSettings
+ {
+ Enabled = true,
+ Mode = VoiceActivationMode.VoiceWake
+ });
+
+ var res = await cap.ExecuteAsync(new NodeInvokeRequest
+ {
+ Id = "voice2",
+ Command = VoiceCommands.GetSettings,
+ Args = Parse("""{}""")
+ });
+
+ Assert.True(res.Ok);
+ var json = JsonSerializer.Serialize(res.Payload);
+ using var doc = JsonDocument.Parse(json);
+ Assert.True(doc.RootElement.GetProperty("Enabled").GetBoolean());
+ Assert.Equal("VoiceWake", doc.RootElement.GetProperty("Mode").GetString());
+ }
+
+ [Fact]
+ public async Task SetSettings_UsesUpdateEnvelope_WhenPresent()
+ {
+ var cap = new VoiceCapability(NullLogger.Instance);
+ VoiceSettingsUpdateArgs? received = null;
+ cap.SettingsUpdateRequested += update =>
+ {
+ received = update;
+ return Task.FromResult(update.Settings);
+ };
+
+ var res = await cap.ExecuteAsync(new NodeInvokeRequest
+ {
+ Id = "voice3",
+ Command = VoiceCommands.SetSettings,
+ Args = Parse("""{"update":{"persist":false,"settings":{"enabled":true,"mode":"TalkMode"}}}""")
+ });
+
+ Assert.True(res.Ok);
+ Assert.NotNull(received);
+ Assert.False(received!.Persist);
+ Assert.Equal(VoiceActivationMode.TalkMode, received.Settings.Mode);
+ }
+
+ [Fact]
+ public async Task GetStatus_ReturnsStatusFromHandler()
+ {
+ var cap = new VoiceCapability(NullLogger.Instance);
+ cap.StatusRequested += () => Task.FromResult(new VoiceStatusInfo
+ {
+ Available = true,
+ Running = true,
+ Mode = VoiceActivationMode.TalkMode,
+ State = VoiceRuntimeState.ListeningContinuously
+ });
+
+ var res = await cap.ExecuteAsync(new NodeInvokeRequest
+ {
+ Id = "voice4",
+ Command = VoiceCommands.GetStatus,
+ Args = Parse("""{}""")
+ });
+
+ Assert.True(res.Ok);
+ var json = JsonSerializer.Serialize(res.Payload);
+ using var doc = JsonDocument.Parse(json);
+ Assert.True(doc.RootElement.GetProperty("Running").GetBoolean());
+ Assert.Equal("ListeningContinuously", doc.RootElement.GetProperty("State").GetString());
+ }
+
+ [Fact]
+ public async Task Start_PassesArgsToHandler()
+ {
+ var cap = new VoiceCapability(NullLogger.Instance);
+ VoiceStartArgs? received = null;
+ cap.StartRequested += args =>
+ {
+ received = args;
+ return Task.FromResult(new VoiceStatusInfo
+ {
+ Available = true,
+ Running = true,
+ Mode = args.Mode ?? VoiceActivationMode.Off,
+ State = VoiceRuntimeState.ListeningForVoiceWake,
+ SessionKey = args.SessionKey
+ });
+ };
+
+ var res = await cap.ExecuteAsync(new NodeInvokeRequest
+ {
+ Id = "voice5",
+ Command = VoiceCommands.Start,
+ Args = Parse("""{"mode":"VoiceWake","sessionKey":"session-123"}""")
+ });
+
+ Assert.True(res.Ok);
+ Assert.NotNull(received);
+ Assert.Equal(VoiceActivationMode.VoiceWake, received!.Mode);
+ Assert.Equal("session-123", received.SessionKey);
+ }
+
+ [Fact]
+ public async Task Stop_PassesReasonToHandler()
+ {
+ var cap = new VoiceCapability(NullLogger.Instance);
+ VoiceStopArgs? received = null;
+ cap.StopRequested += args =>
+ {
+ received = args;
+ return Task.FromResult(new VoiceStatusInfo
+ {
+ Available = true,
+ Running = false,
+ Mode = VoiceActivationMode.Off,
+ State = VoiceRuntimeState.Stopped,
+ LastError = args.Reason
+ });
+ };
+
+ var res = await cap.ExecuteAsync(new NodeInvokeRequest
+ {
+ Id = "voice6",
+ Command = VoiceCommands.Stop,
+ Args = Parse("""{"reason":"user requested"}""")
+ });
+
+ Assert.True(res.Ok);
+ Assert.NotNull(received);
+ Assert.Equal("user requested", received!.Reason);
+ }
+
+ [Fact]
+ public async Task Start_ReturnsError_WhenHandlerMissing()
+ {
+ var cap = new VoiceCapability(NullLogger.Instance);
+ var res = await cap.ExecuteAsync(new NodeInvokeRequest
+ {
+ Id = "voice7",
+ Command = VoiceCommands.Start,
+ Args = Parse("""{}""")
+ });
+
+ Assert.False(res.Ok);
+ Assert.Contains("not available", res.Error, StringComparison.OrdinalIgnoreCase);
+ }
+
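+    // Pins the raw "voice.skip" command string so legacy callers remain supported.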
+ [Fact]
+ public async Task LegacyVoiceSkipCommand_RemainsAccepted()
+ {
+ var cap = new VoiceCapability(NullLogger.Instance);
+ VoiceSkipArgs? received = null;
+ cap.SkipRequested += args =>
+ {
+ received = args;
+ return Task.FromResult(new VoiceStatusInfo
+ {
+ Available = true,
+ Running = true,
+ Mode = VoiceActivationMode.TalkMode,
+ State = VoiceRuntimeState.PlayingResponse
+ });
+ };
+
+ var res = await cap.ExecuteAsync(new NodeInvokeRequest
+ {
+ Id = "voice8",
+ Command = "voice.skip",
+ Args = Parse("""{"reason":"legacy caller"}""")
+ });
+
+ Assert.True(res.Ok);
+ Assert.NotNull(received);
+ Assert.Equal("legacy caller", received!.Reason);
+ }
+}
diff --git a/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs b/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs
index d44f3fd..9cc95a6 100644
--- a/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs
+++ b/tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs
@@ -1,6 +1,8 @@
using System;
+using System.Collections;
using System.Collections.Generic;
using System.Linq;
+using System.Reflection;
using System.Text.Json;
using Xunit;
using OpenClaw.Shared;
@@ -78,6 +80,54 @@ public SessionInfo[] GetSessionList()
return _client.GetSessionList();
}
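+
+    // The helpers below use reflection to reach private members so wire serialization can be tested without widening the public API.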
+ public string GetDefaultChatSessionKey()
+ {
+ return GetPrivateField("_defaultChatSessionKey");
+ }
+
+ public void UpdateDefaultChatSessionKeyFromHello(string payloadJson)
+ {
+ using var doc = JsonDocument.Parse(payloadJson);
+ var method = typeof(OpenClawGatewayClient).GetMethod(
+ "UpdateDefaultChatSessionKeyFromHello",
+            BindingFlags.NonPublic | BindingFlags.Instance);
+ method!.Invoke(_client, new object[] { doc.RootElement.Clone() });
+ }
+
+ public string SerializeChatSendRequest(string message, string sessionKey, string idempotencyKey)
+ {
+ var parametersMethod = typeof(OpenClawGatewayClient).GetMethod(
+ "BuildChatSendParameters",
+            BindingFlags.NonPublic | BindingFlags.Instance);
+ var parameters = parametersMethod!.Invoke(_client, new object[] { message, sessionKey, idempotencyKey });
+
+ var serializeMethod = typeof(OpenClawGatewayClient).GetMethod(
+ "SerializeRequest",
+            BindingFlags.NonPublic | BindingFlags.Static);
+ return (string)serializeMethod!.Invoke(null, new object[] { "request-123", "chat.send", parameters! })!;
+ }
+
+ public string SerializeConnectRequest()
+ {
+ var parametersMethod = typeof(OpenClawGatewayClient).GetMethod(
+ "BuildConnectParameters",
+            BindingFlags.NonPublic | BindingFlags.Instance);
+ var parameters = parametersMethod!.Invoke(_client, Array.Empty