microsoft · foxmsft · Jun 11, 2026 · Apr 16, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/.github/actions/spell-check/allow/zoomit.txt b/.github/actions/spell-check/allow/zoomit.txt
diff --git a/.github/actions/spell-check/expect.txt b/.github/actions/spell-check/expect.txt
@@ -113,7 +113,6 @@ azman
 azureaiinference
 azureinference
 azureopenai
-Backlight
 backticks
 Badflags
 Badmode
@@ -472,7 +471,6 @@ DWMWINDOWATTRIBUTE
 DWMWINDOWMAXIMIZEDCHANGE
 DWORDLONG
 dworigin
-DWRITE
 dxgi
 Dxva
 eab
@@ -998,7 +996,6 @@ luid
 lusrmgr
 LVDS
 LWA
-LWIN
 LZero
 MAGTRANSFORM
 makeappx
@@ -1208,7 +1205,6 @@ nonclient
 NONCLIENTMETRICSW
 NONELEVATED
 nonspace
-nonstd
 NOOWNERZORDER
 NOPARENTNOTIFY
 NOPREFIX
@@ -1422,7 +1418,6 @@ Prefixer
 Premul
 prependpath
 prepopulate
-Prereq
 prevhost
 previewer
 PREVIEWHANDLERFRAMEINFO
@@ -2000,7 +1995,6 @@ valuegenerator
 VARTYPE
 vbcscompiler
 vcamp
-VCENTER
 vcgtq
 VCINSTALLDIR
 vcp
@@ -2038,7 +2032,6 @@ vorrq
 VOS
 vpaddlq
 vqsubq
-VREDRAW
 vreinterpretq
 VSC
 VSCBD

diff --git a/.github/actions/spell-check/patterns.txt b/.github/actions/spell-check/patterns.txt
@@ -312,3 +312,9 @@ ms-windows-store://\S+
 
 # ANSI color codes
 (?:\\(?:u00|x)1[Bb]|\\03[1-7]|\x1b|\\u\{1[Bb]\})\[\d+(?:;\d+)*m
+
+# Special licenses text from RNNoise (BSD-style disclaimer: ``AS IS'')
+``AS IS''
+
+# Old school moniker for macOS from RNNoise
+MacOS
diff --git a/.gitignore b/.gitignore
@@ -19,6 +19,9 @@
 [Rr]eleases/
 x64/
 x86/
+!**/rnnoise/
+!**/rnnoise/x86/
+!**/rnnoise/x86/**
 ARM64/
 bld/
 [Bb]in/

diff --git a/src/modules/ZoomIt/ZoomIt/AudioSampleGenerator.cpp b/src/modules/ZoomIt/ZoomIt/AudioSampleGenerator.cpp
@@ -34,15 +34,17 @@ namespace winrt
     using namespace Windows::Devices::Enumeration;
 }
 
-AudioSampleGenerator::AudioSampleGenerator(bool captureMicrophone, bool captureSystemAudio, bool micMonoMix)
+AudioSampleGenerator::AudioSampleGenerator(bool captureMicrophone, bool captureSystemAudio, bool mixMicrophoneMono, bool useNoiseCancellation)
     : m_captureMicrophone(captureMicrophone)
     , m_captureSystemAudio(captureSystemAudio)
-    , m_micMonoMix(micMonoMix)
+    , m_mixMicrophoneMono(mixMicrophoneMono)
+    , m_useNoiseCancellation(useNoiseCancellation)
 {
     OutputDebugStringA(("AudioSampleGenerator created, captureMicrophone=" +
         std::string(captureMicrophone ? "true" : "false") +
         ", captureSystemAudio=" + std::string(captureSystemAudio ? "true" : "false") +
-        ", micMonoMix=" + std::string(micMonoMix ? "true" : "false") + "\n").c_str());
+        ", mixMicrophoneMono=" + std::string(mixMicrophoneMono ? "true" : "false") +
+        ", useNoiseCancellation=" + std::string(useNoiseCancellation ? "true" : "false") + "\n").c_str());
     m_audioEvent.create(wil::EventOptions::ManualReset);
     m_endEvent.create(wil::EventOptions::ManualReset);
     m_startEvent.create(wil::EventOptions::ManualReset);
@@ -158,8 +160,24 @@ winrt::IAsyncAction AudioSampleGenerator::InitializeAsync()
             throw winrt::hresult_error(E_FAIL, L"Failed to initialize loopback audio capture!");
         }
 
+        // Initialize noise suppressor for microphone audio if enabled
+        if (m_useNoiseCancellation && m_captureMicrophone)
+        {
+            m_noiseSuppressor = std::make_unique<NoiseSuppressor>();
+            OutputDebugStringA("Noise cancellation enabled for microphone\n");
+        }
+
         m_audioGraph.QuantumStarted({ this, &AudioSampleGenerator::OnAudioQuantumStarted });
 
+        // Start the AudioGraph now so the microphone device begins warming up
+        // during the remaining recording initialization (transcoder setup, etc.).
+        // OnAudioQuantumStarted returns early while m_started is false, so audio
+        // samples are discarded until Start() is called.  The side-effect of
+        // starting the graph early is that the system mic-active icon appears
+        // sooner, which also triggers a desktop-content change that helps
+        // unblock the WGC frame pool wait in OnMediaStreamSourceStarting.
+        m_audioGraph.Start();
+
         m_asyncInitialized.SetEvent();
     }
 }
@@ -205,67 +223,65 @@ std::optional<winrt::MediaStreamSample> AudioSampleGenerator::TryGetNextSample()
         }
     }
 
+    // Wait for audio samples to become available, retrying on spurious wakes
+    // (e.g. when OnAudioQuantumStarted signals m_audioEvent but the quantum
+    // produced an empty buffer so m_samples is still empty).
+    for (;;)
     {
-        auto lock = m_lock.lock_exclusive();
-        if (m_samples.empty() && m_endEvent.is_signaled())
-        {
-            return std::nullopt;
-        }
-        else if (!m_samples.empty())
         {
-            std::optional result(m_samples.front());
-            m_samples.pop_front();
-            return result;
+            auto lock = m_lock.lock_exclusive();
+            if (m_samples.empty() && m_endEvent.is_signaled())
+            {
+                return std::nullopt;
+            }
+            else if (!m_samples.empty())
+            {
+                std::optional result(m_samples.front());
+                m_samples.pop_front();
+                return result;
+            }
         }
-    }
-
-    m_audioEvent.ResetEvent();
-    std::vector<HANDLE> events = { m_endEvent.get(), m_audioEvent.get() };
-    auto waitResult = WaitForMultipleObjectsEx(static_cast<DWORD>(events.size()), events.data(), false, INFINITE, false);
-    auto eventIndex = -1;
-    switch (waitResult)
-    {
-    case WAIT_OBJECT_0:
-    case WAIT_OBJECT_0 + 1:
-        eventIndex = waitResult - WAIT_OBJECT_0;
-        break;
-    }
-    WINRT_VERIFY(eventIndex >= 0);
 
-    auto signaledEvent = events[eventIndex];
-    if (signaledEvent == m_endEvent.get())
-    {
-        // End was signaled, but check for any remaining samples before returning nullopt
-        auto lock = m_lock.lock_exclusive();
-        if (!m_samples.empty())
+        m_audioEvent.ResetEvent();
+        std::vector<HANDLE> events = { m_endEvent.get(), m_audioEvent.get() };
+        auto waitResult = WaitForMultipleObjectsEx(static_cast<DWORD>(events.size()), events.data(), false, INFINITE, false);
+        auto eventIndex = -1;
+        switch (waitResult)
         {
-            std::optional result(m_samples.front());
-            m_samples.pop_front();
-            return result;
+        case WAIT_OBJECT_0:
+        case WAIT_OBJECT_0 + 1:
+            eventIndex = waitResult - WAIT_OBJECT_0;
+            break;
         }
-        return std::nullopt;
-    }
-    else
-    {
-        auto lock = m_lock.lock_exclusive();
-        if (m_samples.empty())
+        WINRT_VERIFY(eventIndex >= 0);
+
+        auto signaledEvent = events[eventIndex];
+        if (signaledEvent == m_endEvent.get())
         {
-            // Spurious wake or race - no samples available
-            // If end is signaled, return nullopt
-            return m_endEvent.is_signaled() ? std::nullopt : std::optional<winrt::MediaStreamSample>{};
+            // End was signaled, but check for any remaining samples before returning nullopt
+            auto lock = m_lock.lock_exclusive();
+            if (!m_samples.empty())
+            {
+                std::optional result(m_samples.front());
+                m_samples.pop_front();
+                return result;
+            }
+            return std::nullopt;
         }
-        std::optional result(m_samples.front());
-        m_samples.pop_front();
-        return result;
+        // m_audioEvent was signaled — loop back to check m_samples again.
+        // If the quantum produced an empty buffer, m_samples will still be
+        // empty and we'll wait for the next quantum.
     }
 }
 
-void AudioSampleGenerator::Start()
+void AudioSampleGenerator::Start(int64_t videoStartTimestamp)
 {
     CheckInitialized();
+    m_videoStartTimestamp = videoStartTimestamp;
     auto expected = false;
     if (m_started.compare_exchange_strong(expected, true))
     {
+        OutputDebugStringW( L"[AudioGen] Start(): m_started set to true, setting m_startEvent\n" );
         m_endEvent.ResetEvent();
         m_startEvent.SetEvent();
 
@@ -284,7 +300,7 @@ void AudioSampleGenerator::Start()
             m_loopbackCapture->Start();
         }
 
-        m_audioGraph.Start();
+        // AudioGraph was already started in InitializeAsync for mic warmup.
     }
 }
 
@@ -611,12 +627,21 @@ void AudioSampleGenerator::CombineQueuedSamples()
 
 void AudioSampleGenerator::OnAudioQuantumStarted(winrt::AudioGraph const& sender, winrt::IInspectable const& args)
 {
-    // Don't process if we're not actively recording
+    // Don't process if we're not actively recording, but DO drain the
+    // output node so stale audio doesn't accumulate during mic warmup.
+    // Without this, the first GetFrame() after m_started becomes true
+    // would return several seconds of buffered audio, confusing the
+    // transcoder's A/V interleaving.
     if (!m_started.load())
     {
+        auto frame = m_audioOutputNode.GetFrame();
+        (void)frame;  // discard
         return;
     }
 
+    static int s_quantumCount = 0;
+    s_quantumCount++;
+
     {
         auto lock = m_lock.lock_exclusive();
 
@@ -628,6 +653,14 @@ void AudioSampleGenerator::OnAudioQuantumStarted(winrt::AudioGraph const& sender
         auto sampleBuffer = winrt::Buffer::CreateCopyFromMemoryBuffer(audioBuffer);
         sampleBuffer.Length(audioBuffer.Length());
 
+        if( s_quantumCount <= 5 )
+        {
+            wchar_t dbg[256];
+            swprintf_s( dbg, L"[AudioGen] quantum #%d: audioBuffer.Length=%u sampleBuffer.Length=%u started=%d\n",
+                         s_quantumCount, audioBuffer.Length(), sampleBuffer.Length(), m_started.load() ? 1 : 0 );
+            OutputDebugStringW( dbg );
+        }
+
         // Calculate expected samples per quantum (~10ms at graph sample rate)
         // AudioGraph uses 10ms quantums by default
         uint32_t expectedSamplesPerQuantum = (m_graphSampleRate / 100) * m_graphChannels;
@@ -636,7 +669,7 @@ void AudioSampleGenerator::OnAudioQuantumStarted(winrt::AudioGraph const& sender
         // Apply mono mixing to microphone audio if enabled
         // This converts stereo mic input (with same signal on both channels) to true mono
         // by averaging the channels and writing the result to both channels
-        if (m_micMonoMix && m_captureMicrophone && numMicSamples > 0 && m_graphChannels >= 2)
+        if (m_mixMicrophoneMono && m_captureMicrophone && numMicSamples > 0 && m_graphChannels >= 2)
         {
             float* micData = reinterpret_cast<float*>(sampleBuffer.data());
             uint32_t numFrames = numMicSamples / m_graphChannels;
@@ -656,6 +689,12 @@ void AudioSampleGenerator::OnAudioQuantumStarted(winrt::AudioGraph const& sender
                 }
             }
         }
+        // Apply noise suppression to microphone audio before mixing with loopback
+        if (m_noiseSuppressor && m_captureMicrophone && numMicSamples > 0)
+        {
+            float* micData = reinterpret_cast<float*>(sampleBuffer.data());
+            m_noiseSuppressor->Process(micData, numMicSamples, m_graphChannels);
+        }
 
         // Drain loopback samples regardless of whether we have mic audio
         if (m_loopbackCapture)
@@ -733,13 +772,25 @@ void AudioSampleGenerator::OnAudioQuantumStarted(winrt::AudioGraph const& sender
 
         if (sampleBuffer.Length() > 0)
         {
-            auto sample = winrt::MediaStreamSample::CreateFromBuffer(sampleBuffer, timestamp.value());
+            // Rebase audio timestamps to the video's SystemRelativeTime domain.
+            // AudioGraph RelativeTime starts near 0 (or a few hundred ms after
+            // warmup draining), while video uses absolute SRT (~hours since boot).
+            // Without rebasing, the transcoder sees audio far behind video and
+            // starves video while trying to fill the gap with audio.
+            if (!m_hasTimestampOffset && timestamp.has_value())
+            {
+                m_timestampOffset = m_videoStartTimestamp - timestamp.value().count();
+                m_hasTimestampOffset = true;
+            }
+            auto adjustedTs = winrt::TimeSpan{ timestamp.value().count() + m_timestampOffset };
+
+            auto sample = winrt::MediaStreamSample::CreateFromBuffer(sampleBuffer, adjustedTs);
             m_samples.push_back(sample);
 
             const uint32_t sampleCount = sampleBuffer.Length() / sizeof(float);
             const uint32_t frames = (m_graphChannels > 0) ? (sampleCount / m_graphChannels) : 0;
             const int64_t durationTicks = (m_graphSampleRate > 0) ? (static_cast<int64_t>(frames) * 10000000LL / m_graphSampleRate) : 0;
-            m_lastSampleTimestamp = timestamp.value();
+            m_lastSampleTimestamp = adjustedTs;
             m_lastSampleDuration = winrt::TimeSpan{ durationTicks };
             m_hasLastSampleTimestamp = true;
         }

diff --git a/src/modules/ZoomIt/ZoomIt/AudioSampleGenerator.h b/src/modules/ZoomIt/ZoomIt/AudioSampleGenerator.h
@@ -1,18 +1,23 @@
 #pragma once
 
 #include "LoopbackCapture.h"
+#include "NoiseSuppressor.h"
+
+#include <deque>
+#include <optional>
+#include <memory>
 
 class AudioSampleGenerator
 {
 public:
-    AudioSampleGenerator(bool captureMicrophone = true, bool captureSystemAudio = true, bool micMonoMix = false);
+    AudioSampleGenerator(bool captureMicrophone = true, bool captureSystemAudio = true, bool mixMicrophoneMono = false, bool useNoiseCancellation = false);
     ~AudioSampleGenerator();
 
     winrt::Windows::Foundation::IAsyncAction InitializeAsync();
     winrt::Windows::Media::MediaProperties::AudioEncodingProperties GetEncodingProperties();
 
     std::optional<winrt::Windows::Media::Core::MediaStreamSample> TryGetNextSample();
-    void Start();
+    void Start(int64_t videoStartTimestamp = 0);
     void Stop();
 
 private:
@@ -70,5 +75,14 @@ class AudioSampleGenerator
     std::atomic<bool> m_started = false;
     bool m_captureMicrophone = true;
     bool m_captureSystemAudio = true;
-    bool m_micMonoMix = false;
-};
+    bool m_mixMicrophoneMono = false;
+    bool m_useNoiseCancellation = false;
+    std::unique_ptr<NoiseSuppressor> m_noiseSuppressor;
+
+    // Timestamp rebasing: audio RelativeTime → video SystemRelativeTime domain.
+    // Without this, the transcoder sees audio timestamps (~350ms) far below video
+    // timestamps (~111 billion ticks) and starves video while trying to fill the gap.
+    int64_t m_videoStartTimestamp = 0;
+    int64_t m_timestampOffset = 0;
+    bool m_hasTimestampOffset = false;
+};