[*] refactoring: switch from encoding RGB images to encoding YUV images into H.264 format

heng30 · heng30 · commit 2eaf493cb620 · 2025-10-27T15:26:37.000+08:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -82,6 +82,7 @@ console_log = "1.0"
 crypto-hash = "0.3"
 platform-dirs = "0.3"
 
+yuv = "0.8"
 mp4 = "0.14"
 x264 = "0.5"
 cpal = "0.16"
@@ -97,6 +98,7 @@ thiserror = "2.0"
 crossbeam = "0.8"
 spin_sleep = "1.3"
 nnnoiseless = "0.5"
+h264-reader = "0.8"
 ffmpeg-sidecar = "2.2"
 derive_builder = "0.20"
 wayland-client = "0.31"
diff --git a/lib/mp4m/Cargo.toml b/lib/mp4m/Cargo.toml
@@ -19,6 +19,7 @@ rubato.workspace = true
 fdk-aac.workspace = true
 thiserror.workspace = true
 crossbeam.workspace = true
+h264-reader.workspace = true
 derive_builder.workspace = true
 
 [dev-dependencies]
diff --git a/lib/mp4m/examples/mp4_processor_demo.rs b/lib/mp4m/examples/mp4_processor_demo.rs
@@ -58,13 +58,6 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         spec: *spec,
     })?;
 
-    // Start processing in a separate thread
-    let processor_thread = thread::spawn(move || {
-        if let Err(e) = processor.run_processing_loop() {
-            log::warn!("MP4 processing error: {}", e);
-        }
-    });
-
     // Process audio samples if available
     if let Some(spec) = audio_spec {
         log::debug!(
@@ -75,8 +68,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         );
 
         // Use AAC-friendly frame size (1024 samples per channel)
-        // let aac_frame_size = 1024; // AAC typically uses 1024 samples per frame
-        let aac_frame_size = 1124 * 3; // AAC typically uses 1024 samples per frame
+        let aac_frame_size = 1024; // AAC typically uses 1024 samples per frame
+        // let aac_frame_size = 1124 * 3; // AAC typically uses 1024 samples per frame
         let samples_per_frame = aac_frame_size * spec.channels as usize;
 
         log::debug!(
@@ -108,9 +101,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // Generate and send video frames
     let mut h264_encoder = VideoEncoder::new(width, height, fps)?;
     let headers_data = h264_encoder.headers()?.entirety().to_vec();
-    if let Err(e) = video_sender.send(VideoFrameType::Frame(headers_data)) {
-        panic!("video sender h264 header failed: {e}");
-    }
+
+    // Start processing in a separate thread with headers data
+    let processor_thread = thread::spawn(move || {
+        if let Err(e) = processor.run_processing_loop(Some(headers_data)) {
+            log::warn!("MP4 processing error: {}", e);
+        }
+    });
 
     for frame_num in 0..total_frames {
         let img = match (frame_num / fps.to_u32()) % 3 {
diff --git a/lib/mp4m/src/mp4_processor.rs b/lib/mp4m/src/mp4_processor.rs
@@ -1,14 +1,29 @@
 use crossbeam::channel::{Receiver, Sender, bounded};
 use derive_builder::Builder;
 use fdk_aac::enc::{BitRate, ChannelMode, Encoder, EncoderParams, Transport};
+use h264_reader::{
+    annexb::AnnexBReader,
+    nal::{Nal, RefNal, UnitType},
+    push::NalInterest,
+};
 use hound::WavSpec;
 use mp4::{
     AacConfig, AvcConfig, ChannelConfig, Mp4Config, Mp4Sample, Mp4Writer, SampleFreqIndex,
     TrackConfig, TrackType,
 };
-use std::{fs::File, io::BufWriter, path::PathBuf};
+use std::{
+    fs::File,
+    io::{BufWriter, Read},
+    path::PathBuf,
+};
 use thiserror::Error;
 
+const DEFAULT_PPS: [u8; 6] = [0x68, 0xeb, 0xe3, 0xcb, 0x22, 0xc0];
+const DEFAULT_SPS: [u8; 25] = [
+    0x67, 0x64, 0x00, 0x1e, 0xac, 0xd9, 0x40, 0xa0, 0x2f, 0xf9, 0x70, 0x11, 0x00, 0x00, 0x03, 0x03,
+    0xe9, 0x00, 0x00, 0xea, 0x60, 0x0f, 0x16, 0x2d, 0x96,
+];
+
 pub enum VideoFrameType {
     Frame(Vec<u8>),
     End,
@@ -221,18 +236,72 @@ impl Mp4Processor {
             .map_err(|e| Mp4ProcessorError::Mp4(e.to_string()))
     }
 
+    fn extract_sps_pps_from_headers(
+        &self,
+        headers_data: &[u8],
+    ) -> Result<(Vec<u8>, Vec<u8>), Mp4ProcessorError> {
+        let mut sps = None;
+        let mut pps = None;
+
+        let mut reader = AnnexBReader::accumulate(|nal: RefNal<'_>| {
+            let nal_unit_type = nal.header().unwrap().nal_unit_type();
+
+            // Read all data from the NAL unit
+            let mut reader = nal.reader();
+            let mut data = Vec::new();
+            if let Ok(_) = reader.read_to_end(&mut data) {
+                match nal_unit_type {
+                    UnitType::SeqParameterSet => {
+                        sps = Some(data);
+                    }
+                    UnitType::PicParameterSet => {
+                        pps = Some(data);
+                    }
+                    _ => {}
+                }
+            }
+
+            NalInterest::Buffer
+        });
+
+        reader.push(headers_data);
+        reader.reset();
+
+        match (sps, pps) {
+            (Some(sps_data), Some(pps_data)) => {
+                log::info!(
+                    "Successfully extracted SPS ({} bytes) and PPS ({} bytes) from headers",
+                    sps_data.len(),
+                    pps_data.len()
+                );
+                log::debug!(
+                    "SPS first 10 bytes: {:02x?}",
+                    &sps_data[..sps_data.len().min(10)]
+                );
+                log::debug!(
+                    "PPS first 10 bytes: {:02x?}",
+                    &pps_data[..pps_data.len().min(10)]
+                );
+                Ok((sps_data, pps_data))
+            }
+            _ => {
+                log::warn!("Failed to extract SPS/PPS from headers, using fallback");
+                Ok((DEFAULT_SPS.to_vec(), DEFAULT_PPS.to_vec()))
+            }
+        }
+    }
+
     fn setup_video_track(
         &self,
         mp4_writer: &mut Mp4Writer<BufWriter<File>>,
         video_config: &VideoConfig,
+        headers_data: Option<&[u8]>,
     ) -> Result<(), Mp4ProcessorError> {
-        // Setup video track with minimal SPS/PPS for H.264
-        // These are basic parameters that should work for most cases
-        let sps = vec![
-            0x67, 0x64, 0x00, 0x1e, 0xac, 0xd9, 0x40, 0xa0, 0x2f, 0xf9, 0x70, 0x11, 0x00, 0x00,
-            0x03, 0x03, 0xe9, 0x00, 0x00, 0xea, 0x60, 0x0f, 0x16, 0x2d, 0x96,
-        ];
-        let pps = vec![0x68, 0xeb, 0xe3, 0xcb, 0x22, 0xc0];
+        let (sps, pps) = if let Some(headers) = headers_data {
+            self.extract_sps_pps_from_headers(headers)?
+        } else {
+            (DEFAULT_SPS.to_vec(), DEFAULT_PPS.to_vec())
+        };
 
         let video_track_config = TrackConfig {
             track_type: TrackType::Video,
@@ -318,9 +387,16 @@ impl Mp4Processor {
         Ok(audio_track_ids)
     }
 
-    pub fn run_processing_loop(&mut self) -> Result<(), Mp4ProcessorError> {
+    pub fn run_processing_loop(
+        &mut self,
+        headers_data: Option<Vec<u8>>,
+    ) -> Result<(), Mp4ProcessorError> {
         let mut mp4_writer = self.setup_mp4_writer()?;
-        self.setup_video_track(&mut mp4_writer, &self.config.video_config)?;
+        self.setup_video_track(
+            &mut mp4_writer,
+            &self.config.video_config,
+            headers_data.as_deref(),
+        )?;
         let audio_track_ids = self.setup_audio_tracks(&mut mp4_writer)?;
 
         let mut video_timestamp = 0u64;
@@ -398,7 +474,7 @@ impl Mp4Processor {
 
             match self.encode_samples_to_aac(track_index, chunk) {
                 Ok(aac_data) => {
-                    log::info!("aac_data len: {} bytes", aac_data.len());
+                    // log::info!("aac_data len: {} bytes", aac_data.len());
 
                     let samples_per_channel = chunk.len() / channels;
 
@@ -459,11 +535,11 @@ impl Mp4Processor {
     ) {
         for track_index in 0..self.audio_buffer_cache.len() {
             if !self.audio_buffer_cache[track_index].is_empty() {
-                log::info!(
-                    "Flushing cached audio data for track {}: {} samples",
-                    track_index,
-                    self.audio_buffer_cache[track_index].len()
-                );
+                // log::info!(
+                //     "Flushing cached audio data for track {}: {} samples",
+                //     track_index,
+                //     self.audio_buffer_cache[track_index].len()
+                // );
 
                 // Process the remaining cached data
                 let cached_data = std::mem::take(&mut self.audio_buffer_cache[track_index]);
diff --git a/lib/recorder/Cargo.toml b/lib/recorder/Cargo.toml
@@ -26,6 +26,7 @@ once_cell.workspace = true
 spin_sleep.workspace = true
 nnnoiseless.workspace = true
 derive_setters.workspace = true
+yuv = { workspace = true, features = ["rayon"] }
 fast_image_resize = { workspace = true, features = ["rayon"] }
 
 # ffmpeg-sidecar.workspace = true
diff --git a/lib/recorder/examples/recording_10m_demo.rs b/lib/recorder/examples/recording_10m_demo.rs
@@ -8,7 +8,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     log::debug!("Recording for exactly 5 seconds...");
 
-    let audio_recorder = AudioRecorder::new(None)?;
+    let audio_recorder = AudioRecorder::new();
     let Some(default_input) = audio_recorder.get_default_input_device()? else {
         panic!("No default input device found");
     };
@@ -43,7 +43,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     let mut session = RecordingSession::new(config);
 
-    let stop_sig = session.stop_sig().clone();
+    let stop_sig = session.get_stop_sig().clone();
 
     // Start a timer thread that stops recording after 5 seconds
     thread::spawn(move || {
@@ -53,10 +53,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     });
 
     session.start()?;
-    session.wait(None::<Box<dyn FnMut(f32)>>, move |v| {
-        let v = (v * 100.0) as u32;
-        log::debug!("combine tracks progress: {v}%");
-    })?;
+    session.wait()?;
 
     log::debug!("Recording completed successfully!");
 
diff --git a/lib/recorder/examples/recording_5s_demo.rs b/lib/recorder/examples/recording_5s_demo.rs
@@ -31,10 +31,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     )
     // .with_enable_audio_channel_user(true)
     // .with_enable_speaker_channel_user(true)
-    // .with_enable_denoise(true)
+    .with_enable_denoise(true)
     .with_audio_device_name(Some(default_input.name))
-    // .with_enable_recording_speaker(true)
-    // .with_convert_mono(true)
+    .with_enable_recording_speaker(true)
+    .with_convert_to_mono(true)
     .with_resolution(recorder::Resolution::Original((
         screen_infos[0].logical_size.width as u32,
         screen_infos[0].logical_size.height as u32,
diff --git a/lib/recorder/src/recorder.rs b/lib/recorder/src/recorder.rs
diff --git a/lib/recorder/src/video_encoder.rs b/lib/recorder/src/video_encoder.rs