PowerToys/src/modules/ZoomIt/ZoomIt/AudioSampleGenerator.cpp at 947b0273886c0bf3b3810683396352cf83c71e97 · foxmsft/PowerToys · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
#include "pch.h"
#include "AudioSampleGenerator.h"
#include "CaptureFrameWait.h"
#include "LoopbackCapture.h"
#include <wrl/client.h>

extern TCHAR g_MicrophoneDeviceId[];

namespace
{
    // Declare the IMemoryBufferByteAccess interface for accessing raw buffer data.
    //
    // The declaration is file-local (anonymous namespace). GetBuffer takes two
    // out-parameters: a byte pointer and a capacity; presumably these receive the
    // start of the buffer's storage and its size in bytes — confirm against the
    // platform documentation for this interface.
    //
    // NOTE(review): nothing in the visible part of this file references this
    // interface (buffers are copied via Buffer::CreateCopyFromMemoryBuffer
    // instead) — confirm whether the declaration is still needed.
    MIDL_INTERFACE("5b0d3235-4dba-4d44-8657-1f1d0f83e9a3")
    IMemoryBufferByteAccess : public IUnknown
    {
    public:
        virtual HRESULT STDMETHODCALLTYPE GetBuffer(
            BYTE** value,
            UINT32* capacity) = 0;
    };
}

// Pull the WinRT namespaces used by this file into `winrt` so that types can be
// written as `winrt::AudioGraph`, `winrt::TimeSpan`, `winrt::Buffer`, and so on.
namespace winrt
{
    // Core / storage
    using namespace Windows::Foundation;
    using namespace Windows::Storage;
    using namespace Windows::Storage::Streams;

    // Device discovery
    using namespace Windows::Devices::Enumeration;

    // Media
    using namespace Windows::Media;
    using namespace Windows::Media::Audio;
    using namespace Windows::Media::Capture;
    using namespace Windows::Media::Core;
    using namespace Windows::Media::Devices;
    using namespace Windows::Media::MediaProperties;
    using namespace Windows::Media::Render;
}

// Stores the capture options, emits a debug trace describing them, and creates
// the four manual-reset events used by the generator.
//
// captureMicrophone    - stored in m_captureMicrophone
// captureSystemAudio   - stored in m_captureSystemAudio
// mixMicrophoneMono    - stored in m_mixMicrophoneMono
// useNoiseCancellation - stored in m_useNoiseCancellation
AudioSampleGenerator::AudioSampleGenerator(bool captureMicrophone, bool captureSystemAudio, bool mixMicrophoneMono, bool useNoiseCancellation)
    : m_captureMicrophone(captureMicrophone)
    , m_captureSystemAudio(captureSystemAudio)
    , m_mixMicrophoneMono(mixMicrophoneMono)
    , m_useNoiseCancellation(useNoiseCancellation)
{
    const auto flagText = [](bool flag) -> std::string
    {
        return flag ? "true" : "false";
    };

    // Build the trace line piece by piece.
    std::string trace = "AudioSampleGenerator created, captureMicrophone=";
    trace += flagText(captureMicrophone);
    trace += ", captureSystemAudio=";
    trace += flagText(captureSystemAudio);
    trace += ", mixMicrophoneMono=";
    trace += flagText(mixMicrophoneMono);
    trace += ", useNoiseCancellation=";
    trace += flagText(useNoiseCancellation);
    trace += "\n";
    OutputDebugStringA(trace.c_str());

    // All events are manual-reset: they stay signaled until explicitly reset.
    constexpr auto manualReset = wil::EventOptions::ManualReset;
    m_audioEvent.create(manualReset);
    m_endEvent.create(manualReset);
    m_startEvent.create(manualReset);
    m_asyncInitialized.create(manualReset);
}

// Stops capture and closes the audio graph.
//
// Destructors are implicitly noexcept, so an exception escaping from Stop() or
// from AudioGraph::Close() would terminate the process. Each teardown step is
// therefore guarded separately: a failure in one is logged and does not skip
// the other.
AudioSampleGenerator::~AudioSampleGenerator()
{
    try
    {
        Stop();
    }
    catch (...)
    {
        OutputDebugStringA("AudioSampleGenerator: Stop() failed during destruction (ignored)\n");
    }

    try
    {
        if (m_audioGraph)
        {
            m_audioGraph.Close();
        }
    }
    catch (...)
    {
        OutputDebugStringA("AudioSampleGenerator: AudioGraph::Close() failed during destruction (ignored)\n");
    }
}

// Asynchronously builds the audio pipeline:
//   1. creates the AudioGraph, a submix node and a frame output node,
//   2. optionally initializes WASAPI loopback capture for system audio,
//   3. creates a microphone input node (muted when mic capture is disabled),
//   4. optionally creates the noise suppressor,
//   5. registers the quantum callback and starts the graph.
//
// Only the caller that flips m_initialized from false to true does the work;
// any other call returns immediately.
//
// Throws winrt::hresult_error when the graph cannot be created or when system
// audio capture was requested but loopback capture failed to initialize;
// exceptions from the awaited WinRT calls propagate as well. On ANY failure the
// m_initialized flag is rolled back and m_asyncInitialized is signaled, so that
// Stop() (which waits on m_asyncInitialized whenever m_initialized is set, and
// is invoked by the destructor) can never block forever on an initialization
// that did not complete.
winrt::IAsyncAction AudioSampleGenerator::InitializeAsync()
{
    auto expected = false;
    if (!m_initialized.compare_exchange_strong(expected, true))
    {
        co_return;
    }

    // Reset state in case this instance is reused (including a retry after a
    // previously failed initialization, which leaves m_asyncInitialized set).
    m_asyncInitialized.ResetEvent();
    m_endEvent.ResetEvent();
    m_startEvent.ResetEvent();

    // Runs if the coroutine body exits by exception; disarmed on success below.
    auto rollbackOnFailure = wil::scope_exit([this]
    {
        m_initialized.store(false);
        m_asyncInitialized.SetEvent();
    });

    // Initialize the audio graph
    auto audioGraphSettings = winrt::AudioGraphSettings(winrt::AudioRenderCategory::Media);
    auto audioGraphResult = co_await winrt::AudioGraph::CreateAsync(audioGraphSettings);
    if (audioGraphResult.Status() != winrt::AudioGraphCreationStatus::Success)
    {
        throw winrt::hresult_error(E_FAIL, L"Failed to initialize AudioGraph!");
    }
    m_audioGraph = audioGraphResult.Graph();

    // Get AudioGraph encoding properties for resampling
    auto graphProps = m_audioGraph.EncodingProperties();
    m_graphSampleRate = graphProps.SampleRate();
    m_graphChannels = graphProps.ChannelCount();

    OutputDebugStringA(("AudioGraph initialized: " + std::to_string(m_graphSampleRate) +
        " Hz, " + std::to_string(m_graphChannels) + " ch\n").c_str());

    // Create submix node to mix microphone and loopback audio
    m_submixNode = m_audioGraph.CreateSubmixNode();
    m_audioOutputNode = m_audioGraph.CreateFrameOutputNode();
    m_submixNode.AddOutgoingConnection(m_audioOutputNode);

    // Initialize WASAPI loopback capture for system audio (if enabled)
    if (m_captureSystemAudio)
    {
        m_loopbackCapture = std::make_unique<LoopbackCapture>();
    }
    if (m_loopbackCapture && SUCCEEDED(m_loopbackCapture->Initialize()))
    {
        auto loopbackFormat = m_loopbackCapture->GetFormat();
        if (loopbackFormat)
        {
            m_loopbackChannels = loopbackFormat->nChannels;
            m_loopbackSampleRate = loopbackFormat->nSamplesPerSec;
            // Input frames consumed per output frame when converting loopback
            // audio to the graph's sample rate.
            m_resampleRatio = static_cast<double>(m_loopbackSampleRate) / static_cast<double>(m_graphSampleRate);

            OutputDebugStringA(("Loopback initialized: " + std::to_string(m_loopbackSampleRate) +
                " Hz, " + std::to_string(m_loopbackChannels) + " ch, resample ratio=" +
                std::to_string(m_resampleRatio) + "\n").c_str());
        }
    }
    else if (m_captureSystemAudio)
    {
        OutputDebugStringA("WARNING: Failed to initialize loopback capture\n");
        m_loopbackCapture.reset();
    }

    // Always initialize a microphone input node to keep the AudioGraph running at real-time pace.
    // When mic capture is disabled, we mute it so only loopback audio is captured.
    {
        auto defaultMicrophoneId = winrt::MediaDevice::GetDefaultAudioCaptureId(winrt::AudioDeviceRole::Default);
        auto microphoneId = (m_captureMicrophone && g_MicrophoneDeviceId[0] != 0)
            ? winrt::to_hstring(g_MicrophoneDeviceId)
            : defaultMicrophoneId;
        if (!microphoneId.empty())
        {
            auto microphone = co_await winrt::DeviceInformation::CreateFromIdAsync(microphoneId);

            // Initialize audio input node
            auto inputNodeResult = co_await m_audioGraph.CreateDeviceInputNodeAsync(winrt::MediaCategory::Media, m_audioGraph.EncodingProperties(), microphone);

            // If the selected microphone failed, try again with the default.
            // Skip the retry when there is no default device: an empty id is
            // not a valid argument for CreateFromIdAsync.
            const bool canFallBackToDefault = !defaultMicrophoneId.empty() && microphoneId != defaultMicrophoneId;
            if (inputNodeResult.Status() != winrt::AudioDeviceNodeCreationStatus::Success && canFallBackToDefault)
            {
                microphone = co_await winrt::DeviceInformation::CreateFromIdAsync(defaultMicrophoneId);
                inputNodeResult = co_await m_audioGraph.CreateDeviceInputNodeAsync(winrt::MediaCategory::Media, m_audioGraph.EncodingProperties(), microphone);
            }
            if (inputNodeResult.Status() == winrt::AudioDeviceNodeCreationStatus::Success)
            {
                m_audioInputNode = inputNodeResult.DeviceInputNode();
                m_audioInputNode.AddOutgoingConnection(m_submixNode);

                // If mic capture is disabled, mute the input so only loopback is captured
                if (!m_captureMicrophone)
                {
                    m_audioInputNode.OutgoingGain(0.0);
                    OutputDebugStringA("Mic input created but muted (loopback-only mode)\n");
                }
                else
                {
                    OutputDebugStringA("Mic input created and active\n");
                }
            }
        }
    }

    // Loopback capture is only required when system audio capture is enabled
    if (m_captureSystemAudio && !m_loopbackCapture)
    {
        throw winrt::hresult_error(E_FAIL, L"Failed to initialize loopback audio capture!");
    }

    // Initialize noise suppressor for microphone audio if enabled
    if (m_useNoiseCancellation && m_captureMicrophone)
    {
        m_noiseSuppressor = std::make_unique<NoiseSuppressor>();
        OutputDebugStringA("Noise cancellation enabled for microphone\n");
    }

    m_audioGraph.QuantumStarted({ this, &AudioSampleGenerator::OnAudioQuantumStarted });

    // Start the AudioGraph now so the microphone device begins warming up
    // during the remaining recording initialization (transcoder setup, etc.).
    // OnAudioQuantumStarted returns early while m_started is false, so audio
    // samples are discarded until Start() is called.  The side-effect of
    // starting the graph early is that the system mic-active icon appears
    // sooner, which also triggers a desktop-content change that helps
    // unblock the WGC frame pool wait in OnMediaStreamSourceStarting.
    m_audioGraph.Start();

    // Success: keep m_initialized set and publish completion.
    rollbackOnFailure.release();
    m_asyncInitialized.SetEvent();
}

// Returns the encoding properties reported by the graph's frame output node.
// CheckInitialized() (defined elsewhere) runs first; presumably it rejects use
// before initialization — confirm against its definition.
winrt::AudioEncodingProperties AudioSampleGenerator::GetEncodingProperties()
{
    CheckInitialized();

    auto outputNodeProperties = m_audioOutputNode.EncodingProperties();
    return outputNodeProperties;
}

std::optional<winrt::MediaStreamSample> AudioSampleGenerator::TryGetNextSample()
{
    CheckInitialized();

    // The MediaStreamSource can request audio samples before we've started the audio graph.
    // Instead of throwing (which crashes the app), wait until either Start() is called
    // or Stop() signals end-of-stream.
    if (!m_started.load())
    {
        std::vector<HANDLE> events = { m_endEvent.get(), m_startEvent.get() };
        auto waitResult = WaitForMultipleObjectsEx(static_cast<DWORD>(events.size()), events.data(), false, INFINITE, false);
        auto eventIndex = -1;
        switch (waitResult)
        {
        case WAIT_OBJECT_0:
        case WAIT_OBJECT_0 + 1:
            eventIndex = waitResult - WAIT_OBJECT_0;
            break;
        }
        WINRT_VERIFY(eventIndex >= 0);

        if (events[eventIndex] == m_endEvent.get())
        {
            // End event signaled, but check if there are any remaining samples in the queue
            auto lock = m_lock.lock_exclusive();
            if (!m_samples.empty())
            {
                std::optional result(m_samples.front());
                m_samples.pop_front();
                return result;
            }
            return std::nullopt;
        }
    }

    // Wait for audio samples to become available, retrying on spurious wakes
    // (e.g. when OnAudioQuantumStarted signals m_audioEvent but the quantum
    // produced an empty buffer so m_samples is still empty).
    for (;;)
    {
        {
            auto lock = m_lock.lock_exclusive();
            if (m_samples.empty() && m_endEvent.is_signaled())
            {
                return std::nullopt;
            }
            else if (!m_samples.empty())
            {
                std::optional result(m_samples.front());
                m_samples.pop_front();
                return result;
            }
        }

        m_audioEvent.ResetEvent();
        std::vector<HANDLE> events = { m_endEvent.get(), m_audioEvent.get() };
        auto waitResult = WaitForMultipleObjectsEx(static_cast<DWORD>(events.size()), events.data(), false, INFINITE, false);
        auto eventIndex = -1;
        switch (waitResult)
        {
        case WAIT_OBJECT_0:
        case WAIT_OBJECT_0 + 1:
            eventIndex = waitResult - WAIT_OBJECT_0;
            break;
        }
        WINRT_VERIFY(eventIndex >= 0);

        auto signaledEvent = events[eventIndex];
        if (signaledEvent == m_endEvent.get())
        {
            // End was signaled, but check for any remaining samples before returning nullopt
            auto lock = m_lock.lock_exclusive();
            if (!m_samples.empty())
            {
                std::optional result(m_samples.front());
                m_samples.pop_front();
                return result;
            }
            return std::nullopt;
        }
        // m_audioEvent was signaled — loop back to check m_samples again.
        // If the quantum produced an empty buffer, m_samples will still be
        // empty and we'll wait for the next quantum.
    }
}

// Begins producing samples.
//
// videoStartTimestamp is stored in m_videoStartTimestamp on every call; the
// quantum callback uses it to rebase audio timestamps. The remaining work runs
// only for the call that flips m_started from false to true: the end event is
// reset, the start event is signaled, and — when loopback capture exists — its
// stale buffers and resampler state are cleared before capture starts.
void AudioSampleGenerator::Start(int64_t videoStartTimestamp)
{
    CheckInitialized();
    m_videoStartTimestamp = videoStartTimestamp;

    bool notStarted = false;
    if (!m_started.compare_exchange_strong(notStarted, true))
    {
        // Already started; nothing more to do.
        return;
    }

    OutputDebugStringW( L"[AudioGen] Start(): m_started set to true, setting m_startEvent\n" );
    m_endEvent.ResetEvent();
    m_startEvent.SetEvent();

    // AudioGraph was already started in InitializeAsync for mic warmup, so only
    // the loopback capture (if any) needs to be started here.
    if (!m_loopbackCapture)
    {
        return;
    }

    // Clear any stale samples
    {
        auto bufferGuard = m_loopbackBufferLock.lock_exclusive();
        m_loopbackBuffer.clear();
    }

    // Restart the resampler from a clean state.
    m_resampleInputBuffer.clear();
    m_resampleInputPos = 0.0;

    m_loopbackCapture->Start();
}

void AudioSampleGenerator::Stop()
{
    // Stop may be called during teardown even if initialization hasn't completed.
    // It must never throw.

    if (!m_initialized.load())
    {
        m_endEvent.SetEvent();
        return;
    }

    m_asyncInitialized.wait();

    // Stop loopback capture first
    if (m_loopbackCapture)
    {
        m_loopbackCapture->Stop();
    }

    // Flush any remaining samples from the loopback capture before stopping the audio graph
    FlushRemainingAudio();

    // Stop the audio graph - no more quantum callbacks will run
    m_audioGraph.Stop();

    // Close the microphone input node to release the device so Windows no longer
    // reports the microphone as in use by ZoomIt.
    if (m_audioInputNode)
    {
        m_audioInputNode.Close();
        m_audioInputNode = nullptr;
    }

    // Mark as stopped
    m_started.store(false);

    // Combine all remaining queued samples into one final sample so it can be
    // returned immediately without waiting for additional TryGetNextSample calls
    CombineQueuedSamples();

    // NOW signal end event - this allows TryGetNextSample to return remaining
    // queued samples and then return nullopt
    m_endEvent.SetEvent();
    m_audioEvent.SetEvent(); // Also wake any waiting TryGetNextSample

    // DO NOT clear m_loopbackBuffer or m_samples here - allow MediaTranscoder to
    // consume remaining queued audio samples to avoid audio cutoff at end of recording.
    // TryGetNextSample() will return nullopt once m_samples is empty and
    // m_endEvent is signaled. Buffers will be cleaned up on destruction.
}

void AudioSampleGenerator::AppendResampledLoopbackSamples(std::vector<float> const& rawLoopbackSamples, bool flushRemaining)
{
    if (rawLoopbackSamples.empty())
    {
        return;
    }

    m_resampleInputBuffer.insert(m_resampleInputBuffer.end(), rawLoopbackSamples.begin(), rawLoopbackSamples.end());

    if (m_loopbackChannels == 0 || m_graphChannels == 0 || m_resampleRatio <= 0.0)
    {
        return;
    }

    std::vector<float> resampledSamples;
    while (true)
    {
        const uint32_t inputFrames = static_cast<uint32_t>(m_resampleInputBuffer.size() / m_loopbackChannels);
        if (inputFrames == 0)
        {
            break;
        }

        if (!flushRemaining)
        {
            if (inputFrames < 2 || (m_resampleInputPos + 1.0) >= inputFrames)
            {
                break;
            }
        }
        else
        {
            if (m_resampleInputPos >= inputFrames)
            {
                break;
            }
        }

        uint32_t inputFrame = static_cast<uint32_t>(m_resampleInputPos);
        double frac = m_resampleInputPos - inputFrame;
        uint32_t nextFrame = (inputFrame + 1 < inputFrames) ? (inputFrame + 1) : inputFrame;

        for (uint32_t outCh = 0; outCh < m_graphChannels; outCh++)
        {
            float sample = 0.0f;

            if (m_loopbackChannels == m_graphChannels)
            {
                uint32_t idx1 = inputFrame * m_loopbackChannels + outCh;
                uint32_t idx2 = nextFrame * m_loopbackChannels + outCh;
                float s1 = m_resampleInputBuffer[idx1];
                float s2 = m_resampleInputBuffer[idx2];
                sample = static_cast<float>(s1 * (1.0 - frac) + s2 * frac);
            }
            else if (m_loopbackChannels > m_graphChannels)
            {
                float sum = 0.0f;
                for (uint32_t inCh = 0; inCh < m_loopbackChannels; inCh++)
                {
                    uint32_t idx1 = inputFrame * m_loopbackChannels + inCh;
                    uint32_t idx2 = nextFrame * m_loopbackChannels + inCh;
                    float s1 = m_resampleInputBuffer[idx1];
                    float s2 = m_resampleInputBuffer[idx2];
                    sum += static_cast<float>(s1 * (1.0 - frac) + s2 * frac);
                }
                sample = sum / m_loopbackChannels;
            }
            else
            {
                uint32_t idx1 = inputFrame * m_loopbackChannels;
                uint32_t idx2 = nextFrame * m_loopbackChannels;
                float s1 = m_resampleInputBuffer[idx1];
                float s2 = m_resampleInputBuffer[idx2];
                sample = static_cast<float>(s1 * (1.0 - frac) + s2 * frac);
            }

            resampledSamples.push_back(sample);
        }

        m_resampleInputPos += m_resampleRatio;
    }

    uint32_t consumedFrames = static_cast<uint32_t>(m_resampleInputPos);
    if (consumedFrames > 0)
    {
        size_t samplesToErase = static_cast<size_t>(consumedFrames) * m_loopbackChannels;
        if (samplesToErase >= m_resampleInputBuffer.size())
        {
            m_resampleInputBuffer.clear();
            m_resampleInputPos = 0.0;
        }
        else
        {
            m_resampleInputBuffer.erase(m_resampleInputBuffer.begin(), m_resampleInputBuffer.begin() + samplesToErase);
            m_resampleInputPos -= consumedFrames;
        }
    }

    if (flushRemaining)
    {
        m_resampleInputBuffer.clear();
        m_resampleInputPos = 0.0;
    }

    if (!resampledSamples.empty())
    {
        auto loopbackLock = m_loopbackBufferLock.lock_exclusive();
        const size_t maxBufferSize = static_cast<size_t>(m_graphSampleRate) * m_graphChannels;

        if (m_loopbackBuffer.size() + resampledSamples.size() > maxBufferSize)
        {
            size_t overflow = (m_loopbackBuffer.size() + resampledSamples.size()) - maxBufferSize;
            if (overflow >= m_loopbackBuffer.size())
            {
                m_loopbackBuffer.clear();
            }
            else
            {
                m_loopbackBuffer.erase(m_loopbackBuffer.begin(), m_loopbackBuffer.begin() + overflow);
            }
        }

        m_loopbackBuffer.insert(m_loopbackBuffer.end(), resampledSamples.begin(), resampledSamples.end());
    }
}

void AudioSampleGenerator::FlushRemainingAudio()
{
    // Called during stop to drain any remaining samples from loopback capture
    // and convert them to MediaStreamSamples before the audio graph stops.

    if (!m_loopbackCapture)
    {
        return;
    }

    auto lock = m_lock.lock_exclusive();

    // Drain all remaining samples from the loopback capture client
    std::vector<float> rawLoopbackSamples;
    {
        std::vector<float> tempSamples;
        while (m_loopbackCapture->TryGetSamples(tempSamples))
        {
            rawLoopbackSamples.insert(rawLoopbackSamples.end(), tempSamples.begin(), tempSamples.end());
        }
    }

    // Resample and channel-convert the loopback audio to match AudioGraph format
    if (!rawLoopbackSamples.empty())
    {
        AppendResampledLoopbackSamples(rawLoopbackSamples, true);
    }

    // Now convert everything in m_loopbackBuffer to MediaStreamSamples
    auto loopbackLock = m_loopbackBufferLock.lock_exclusive();

    if (!m_loopbackBuffer.empty())
    {
        uint32_t outputSampleCount = static_cast<uint32_t>(m_loopbackBuffer.size());
        std::vector<uint8_t> outputData(outputSampleCount * sizeof(float), 0);
        float* outputFloats = reinterpret_cast<float*>(outputData.data());

        for (uint32_t i = 0; i < outputSampleCount; i++)
        {
            float sample = m_loopbackBuffer[i];
            if (sample > 1.0f) sample = 1.0f;
            else if (sample < -1.0f) sample = -1.0f;
            outputFloats[i] = sample;
        }

        m_loopbackBuffer.clear();

        // Create buffer and sample
        winrt::Buffer sampleBuffer(outputSampleCount * sizeof(float));
        memcpy(sampleBuffer.data(), outputData.data(), outputData.size());
        sampleBuffer.Length(static_cast<uint32_t>(outputData.size()));

        if (sampleBuffer.Length() > 0)
        {
            const uint32_t sampleCount = sampleBuffer.Length() / sizeof(float);
            const uint32_t frames = (m_graphChannels > 0) ? (sampleCount / m_graphChannels) : 0;
            const int64_t durationTicks = (m_graphSampleRate > 0) ? (static_cast<int64_t>(frames) * 10000000LL / m_graphSampleRate) : 0;
            const winrt::TimeSpan duration{ durationTicks };

            winrt::TimeSpan timestamp{ 0 };
            if (m_hasLastSampleTimestamp)
            {
                timestamp = winrt::TimeSpan{ m_lastSampleTimestamp.count() + m_lastSampleDuration.count() };
            }

            auto sample = winrt::MediaStreamSample::CreateFromBuffer(sampleBuffer, timestamp);
            m_samples.push_back(sample);
            m_audioEvent.SetEvent();

            m_lastSampleTimestamp = timestamp;
            m_lastSampleDuration = duration;
            m_hasLastSampleTimestamp = true;
        }
    }
}

void AudioSampleGenerator::CombineQueuedSamples()
{
    // Combine all queued samples into a single sample so it can be returned
    // immediately in the next TryGetNextSample call. This is critical because
    // once video ends, the MediaTranscoder may only request one more audio sample.

    auto lock = m_lock.lock_exclusive();

    if (m_samples.size() <= 1)
    {
        return;
    }

    // Calculate total size and collect all sample data
    size_t totalBytes = 0;
    std::vector<std::pair<winrt::Windows::Storage::Streams::IBuffer, winrt::Windows::Foundation::TimeSpan>> buffers;
    winrt::Windows::Foundation::TimeSpan firstTimestamp{ 0 };
    bool hasFirstTimestamp = false;

    for (auto& sample : m_samples)
    {
        auto buffer = sample.Buffer();
        if (buffer)
        {
            totalBytes += buffer.Length();
            if (!hasFirstTimestamp)
            {
                firstTimestamp = sample.Timestamp();
                hasFirstTimestamp = true;
            }
            buffers.push_back({ buffer, sample.Timestamp() });
        }
    }

    if (totalBytes == 0)
    {
        return;
    }

    // Create combined buffer
    winrt::Buffer combinedBuffer(static_cast<uint32_t>(totalBytes));
    uint8_t* dest = combinedBuffer.data();
    uint32_t offset = 0;

    for (auto& [buffer, ts] : buffers)
    {
        uint32_t len = buffer.Length();
        memcpy(dest + offset, buffer.data(), len);
        offset += len;
    }
    combinedBuffer.Length(static_cast<uint32_t>(totalBytes));

    // Create combined sample with first timestamp
    auto combinedSample = winrt::Windows::Media::Core::MediaStreamSample::CreateFromBuffer(combinedBuffer, firstTimestamp);

    // Clear queue and add combined sample
    m_samples.clear();
    m_samples.push_back(combinedSample);

    // Update timestamp tracking
    const uint32_t sampleCount = static_cast<uint32_t>(totalBytes) / sizeof(float);
    const uint32_t frames = (m_graphChannels > 0) ? (sampleCount / m_graphChannels) : 0;
    const int64_t durationTicks = (m_graphSampleRate > 0) ? (static_cast<int64_t>(frames) * 10000000LL / m_graphSampleRate) : 0;
    m_lastSampleTimestamp = firstTimestamp;
    m_lastSampleDuration = winrt::Windows::Foundation::TimeSpan{ durationTicks };
    m_hasLastSampleTimestamp = true;
}

// AudioGraph QuantumStarted handler: turns one graph quantum into one queued
// MediaStreamSample.
//
// While not started, the output node's frame is fetched and discarded.
// Once started, under m_lock:
//   1. the frame from the output node is copied (microphone path),
//   2. optional mono mixing and noise suppression are applied to that copy,
//   3. pending loopback audio is drained, resampled and buffered,
//   4. the output is either loopback-only (mic disabled) or mic + loopback mixed,
//      with samples clamped to [-1, 1],
//   5. the sample is stamped on the video timeline and queued.
// m_audioEvent is signaled at the end to wake TryGetNextSample.
//
// sender / args are unused.
void AudioSampleGenerator::OnAudioQuantumStarted(winrt::AudioGraph const& sender, winrt::IInspectable const& args)
{
    // Don't process if we're not actively recording, but DO drain the
    // output node so stale audio doesn't accumulate during mic warmup.
    // Without this, the first GetFrame() after m_started becomes true
    // would return several seconds of buffered audio, confusing the
    // transcoder's A/V interleaving.
    if (!m_started.load())
    {
        auto frame = m_audioOutputNode.GetFrame();
        (void)frame;  // discard
        return;
    }

    // Counts processed quantums; only used to limit the debug trace below.
    static int s_quantumCount = 0;
    s_quantumCount++;

    {
        auto lock = m_lock.lock_exclusive();

        auto frame = m_audioOutputNode.GetFrame();
        std::optional<winrt::TimeSpan> timestamp = frame.RelativeTime();
        auto audioBuffer = frame.LockBuffer(winrt::AudioBufferAccessMode::Read);

        // Get mic audio as a buffer (may be empty if no microphone)
        auto sampleBuffer = winrt::Buffer::CreateCopyFromMemoryBuffer(audioBuffer);
        sampleBuffer.Length(audioBuffer.Length());

        if( s_quantumCount <= 5 )
        {
            wchar_t dbg[256];
            swprintf_s( dbg, L"[AudioGen] quantum #%d: audioBuffer.Length=%u sampleBuffer.Length=%u started=%d\n",
                         s_quantumCount, audioBuffer.Length(), sampleBuffer.Length(), m_started.load() ? 1 : 0 );
            OutputDebugStringW( dbg );
        }

        // Calculate expected samples per quantum (~10ms at graph sample rate)
        // AudioGraph uses 10ms quantums by default
        uint32_t expectedSamplesPerQuantum = (m_graphSampleRate / 100) * m_graphChannels;
        uint32_t numMicSamples = audioBuffer.Length() / sizeof(float);

        // Apply mono mixing to microphone audio if enabled
        // This converts stereo mic input (with same signal on both channels) to true mono
        // by averaging the channels and writing the result to both channels
        if (m_mixMicrophoneMono && m_captureMicrophone && numMicSamples > 0 && m_graphChannels >= 2)
        {
            float* micData = reinterpret_cast<float*>(sampleBuffer.data());
            uint32_t numFrames = numMicSamples / m_graphChannels;
            for (uint32_t i = 0; i < numFrames; i++)
            {
                // Sum all channels for this frame
                float sum = 0.0f;
                for (uint32_t ch = 0; ch < m_graphChannels; ch++)
                {
                    sum += micData[i * m_graphChannels + ch];
                }
                // Power-preserving mix: divide by sqrt(N) to maintain perceived loudness
                float mono = sum / std::sqrt(static_cast<float>(m_graphChannels));
                for (uint32_t ch = 0; ch < m_graphChannels; ch++)
                {
                    micData[i * m_graphChannels + ch] = mono;
                }
            }
        }
        // Apply noise suppression to microphone audio before mixing with loopback
        if (m_noiseSuppressor && m_captureMicrophone && numMicSamples > 0)
        {
            float* micData = reinterpret_cast<float*>(sampleBuffer.data());
            m_noiseSuppressor->Process(micData, numMicSamples, m_graphChannels);
        }

        // Drain loopback samples regardless of whether we have mic audio
        if (m_loopbackCapture)
        {
            std::vector<float> rawLoopbackSamples;
            {
                std::vector<float> tempSamples;
                while (m_loopbackCapture->TryGetSamples(tempSamples))
                {
                    rawLoopbackSamples.insert(rawLoopbackSamples.end(), tempSamples.begin(), tempSamples.end());
                }
            }

            // Resample and channel-convert the loopback audio to match AudioGraph format
            if (!rawLoopbackSamples.empty())
            {
                AppendResampledLoopbackSamples(rawLoopbackSamples);
            }
        }

        // Determine the actual number of samples we'll output
        // Use mic sample count if mic is enabled
        uint32_t outputSampleCount = m_captureMicrophone ? numMicSamples : expectedSamplesPerQuantum;

        // If microphone is disabled, create a buffer with only loopback audio
        if (!m_captureMicrophone && outputSampleCount > 0)
        {
            // Create a buffer filled with loopback audio or silence
            std::vector<uint8_t> outputData(outputSampleCount * sizeof(float), 0);
            float* outputFloats = reinterpret_cast<float*>(outputData.data());

            {
                auto loopbackLock = m_loopbackBufferLock.lock_exclusive();
                uint32_t samplesToUse = min(outputSampleCount, static_cast<uint32_t>(m_loopbackBuffer.size()));

                for (uint32_t i = 0; i < samplesToUse; i++)
                {
                    float sample = m_loopbackBuffer[i];
                    if (sample > 1.0f) sample = 1.0f;
                    else if (sample < -1.0f) sample = -1.0f;
                    outputFloats[i] = sample;
                }

                if (samplesToUse > 0)
                {
                    m_loopbackBuffer.erase(m_loopbackBuffer.begin(), m_loopbackBuffer.begin() + samplesToUse);
                }
            }

            // Create a new buffer with our loopback data
            sampleBuffer = winrt::Buffer(outputSampleCount * sizeof(float));
            memcpy(sampleBuffer.data(), outputData.data(), outputData.size());
            sampleBuffer.Length(static_cast<uint32_t>(outputData.size()));
        }
        else if (m_captureMicrophone && numMicSamples > 0)
        {
            // Mix loopback into mic samples
            auto loopbackLock = m_loopbackBufferLock.lock_exclusive();
            float* bufferData = reinterpret_cast<float*>(sampleBuffer.data());
            uint32_t samplesToMix = min(numMicSamples, static_cast<uint32_t>(m_loopbackBuffer.size()));

            for (uint32_t i = 0; i < samplesToMix; i++)
            {
                float mixed = bufferData[i] + m_loopbackBuffer[i];
                if (mixed > 1.0f) mixed = 1.0f;
                else if (mixed < -1.0f) mixed = -1.0f;
                bufferData[i] = mixed;
            }

            if (samplesToMix > 0)
            {
                m_loopbackBuffer.erase(m_loopbackBuffer.begin(), m_loopbackBuffer.begin() + samplesToMix);
            }
        }

        if (sampleBuffer.Length() > 0)
        {
            // Rebase audio timestamps to the video's SystemRelativeTime domain.
            // AudioGraph RelativeTime starts near 0 (or a few hundred ms after
            // warmup draining), while video uses absolute SRT (~hours since boot).
            // Without rebasing, the transcoder sees audio far behind video and
            // starves video while trying to fill the gap with audio.
            int64_t adjustedTicks = 0;
            if (timestamp.has_value())
            {
                const int64_t relativeTicks = timestamp.value().count();
                if (!m_hasTimestampOffset)
                {
                    m_timestampOffset = m_videoStartTimestamp - relativeTicks;
                    m_hasTimestampOffset = true;
                }
                adjustedTicks = relativeTicks + m_timestampOffset;
            }
            else if (m_hasLastSampleTimestamp)
            {
                // The frame carries no relative time. Calling value() on the
                // empty optional would throw out of this event handler, so
                // continue directly after the previous sample instead.
                adjustedTicks = m_lastSampleTimestamp.count() + m_lastSampleDuration.count();
            }
            else
            {
                // No relative time and no previous sample: start at the video
                // start, which is where a rebased first sample would land.
                adjustedTicks = m_videoStartTimestamp;
            }
            auto adjustedTs = winrt::TimeSpan{ adjustedTicks };

            auto sample = winrt::MediaStreamSample::CreateFromBuffer(sampleBuffer, adjustedTs);
            m_samples.push_back(sample);

            // Track where this sample ends (duration in 100 ns ticks).
            const uint32_t sampleCount = sampleBuffer.Length() / sizeof(float);
            const uint32_t frames = (m_graphChannels > 0) ? (sampleCount / m_graphChannels) : 0;
            const int64_t durationTicks = (m_graphSampleRate > 0) ? (static_cast<int64_t>(frames) * 10000000LL / m_graphSampleRate) : 0;
            m_lastSampleTimestamp = adjustedTs;
            m_lastSampleDuration = winrt::TimeSpan{ durationTicks };
            m_hasLastSampleTimestamp = true;
        }
    }
    // Signaled after releasing m_lock, even when no sample was queued.
    m_audioEvent.SetEvent();
}