From 1397c4bfd9d73123ae16797d0e5f901b502d5f58 Mon Sep 17 00:00:00 2001 From: Sam Zackrisson Date: Fri, 8 Apr 2022 17:24:45 +0200 Subject: [PATCH] AEC3: Handle temporary stereo content more robustly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During temporary stereo content when the AEC3 uses a mono reference signal, the signal is downmixed by averaging instead of using only the left channel. Additionally, temporary stereo content is flagged as an echo path change. Tested: Modified local build: Verified stereo mode entered / left in accordance with hysteresis and timeout thresholds. Verified temporary stereo detected during temporary stereo playout. Made an aecdump and inspected content. Bug: chromium:1295710 Change-Id: I6bd53e615dfb3ec39bc1c73275b7d6d599ac7c57 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/258481 Reviewed-by: Per Åhgren Commit-Queue: Sam Zackrisson Cr-Commit-Position: refs/heads/main@{#36504} --- .../audio_processing/aec3/echo_canceller3.cc | 93 +++++++++++++------ .../aec3/multi_channel_content_detector.h | 2 +- ...multi_channel_content_detector_unittest.cc | 24 ++--- 3 files changed, 78 insertions(+), 41 deletions(-) diff --git a/modules/audio_processing/aec3/echo_canceller3.cc b/modules/audio_processing/aec3/echo_canceller3.cc index 992e295dfb..36bf8769f4 100644 --- a/modules/audio_processing/aec3/echo_canceller3.cc +++ b/modules/audio_processing/aec3/echo_canceller3.cc @@ -96,22 +96,43 @@ void FillSubFrameView( } void FillSubFrameView( + bool proper_downmix_needed, std::vector<std::vector<std::vector<float>>>* frame, size_t sub_frame_index, std::vector<std::vector<rtc::ArrayView<float>>>* sub_frame_view) { RTC_DCHECK_GE(1, sub_frame_index); RTC_DCHECK_EQ(frame->size(), sub_frame_view->size()); - if ((*frame)[0].size() > (*sub_frame_view)[0].size()) { - RTC_DCHECK_EQ((*sub_frame_view)[0].size(), 1); - // Downmix the audio to mono (should only be done when the audio contains - // fake-stereo or fake-multichannel). 
+ const size_t frame_num_channels = (*frame)[0].size(); + const size_t sub_frame_num_channels = (*sub_frame_view)[0].size(); + if (frame_num_channels > sub_frame_num_channels) { + RTC_DCHECK_EQ(sub_frame_num_channels, 1u); + if (proper_downmix_needed) { + // When a proper downmix is needed (which is the case when proper stereo + // is present in the echo reference signal but the echo canceller does the + // processing in mono) downmix the echo reference by averaging the channel + // content (otherwise downmixing is done by selecting channel 0). + for (size_t band = 0; band < frame->size(); ++band) { + for (size_t ch = 1; ch < frame_num_channels; ++ch) { + for (size_t k = 0; k < kSubFrameLength; ++k) { + (*frame)[band][/*channel=*/0] + [sub_frame_index * kSubFrameLength + k] += + (*frame)[band][ch][sub_frame_index * kSubFrameLength + k]; + } + } + const float one_by_num_channels = 1.0f / frame_num_channels; + for (size_t k = 0; k < kSubFrameLength; ++k) { + (*frame)[band][/*channel=*/0][sub_frame_index * kSubFrameLength + + k] *= one_by_num_channels; + } + } + } for (size_t band = 0; band < frame->size(); ++band) { (*sub_frame_view)[band][/*channel=*/0] = rtc::ArrayView<float>( &(*frame)[band][/*channel=*/0][sub_frame_index * kSubFrameLength], kSubFrameLength); } } else { - RTC_DCHECK_EQ((*frame)[0].size(), (*sub_frame_view)[0].size()); + RTC_DCHECK_EQ(frame_num_channels, sub_frame_num_channels); for (size_t band = 0; band < frame->size(); ++band) { for (size_t channel = 0; channel < (*frame)[band].size(); ++channel) { (*sub_frame_view)[band][channel] = rtc::ArrayView<float>( @@ -126,6 +147,7 @@ void ProcessCaptureFrameContent( AudioBuffer* linear_output, AudioBuffer* capture, bool level_change, + bool aec_reference_is_downmixed_stereo, bool saturated_microphone_signal, size_t sub_frame_index, FrameBlocker* capture_blocker, @@ -149,7 +171,9 @@ void ProcessCaptureFrameContent( capture_blocker->InsertSubFrameAndExtractBlock(*capture_sub_frame_view, capture_block); - 
block_processor->ProcessCapture(level_change, saturated_microphone_signal, + block_processor->ProcessCapture(/*echo_path_gain_change=*/level_change || + aec_reference_is_downmixed_stereo, + saturated_microphone_signal, linear_output_block, capture_block); output_framer->InsertBlockAndExtractSubFrame(*capture_block, capture_sub_frame_view); @@ -163,6 +187,7 @@ void ProcessCaptureFrameContent( void ProcessRemainingCaptureFrameContent( bool level_change, + bool aec_reference_is_downmixed_stereo, bool saturated_microphone_signal, FrameBlocker* capture_blocker, BlockFramer* linear_output_framer, @@ -175,8 +200,10 @@ void ProcessRemainingCaptureFrameContent( } capture_blocker->ExtractBlock(block); - block_processor->ProcessCapture(level_change, saturated_microphone_signal, - linear_output_block, block); + block_processor->ProcessCapture( + /*echo_path_gain_change=*/level_change || + aec_reference_is_downmixed_stereo, + saturated_microphone_signal, linear_output_block, block); output_framer->InsertBlock(*block); if (linear_output_framer) { @@ -186,13 +213,15 @@ void ProcessRemainingCaptureFrameContent( } void BufferRenderFrameContent( + bool proper_downmix_needed, std::vector<std::vector<std::vector<float>>>* render_frame, size_t sub_frame_index, FrameBlocker* render_blocker, BlockProcessor* block_processor, std::vector<std::vector<std::vector<float>>>* block, std::vector<std::vector<rtc::ArrayView<float>>>* sub_frame_view) { - FillSubFrameView(render_frame, sub_frame_index, sub_frame_view); + FillSubFrameView(proper_downmix_needed, render_frame, sub_frame_index, + sub_frame_view); render_blocker->InsertSubFrameAndExtractBlock(*sub_frame_view, block); block_processor->BufferRender(*block); } @@ -863,22 +892,26 @@ void EchoCanceller3::ProcessCapture(AudioBuffer* capture, EmptyRenderQueue(); - ProcessCaptureFrameContent(linear_output, capture, level_change, - saturated_microphone_signal_, 0, &capture_blocker_, - linear_output_framer_.get(), &output_framer_, - block_processor_.get(), linear_output_block_.get(), - &linear_output_sub_frame_view_, &capture_block_, - 
&capture_sub_frame_view_); + ProcessCaptureFrameContent( + linear_output, capture, level_change, + multichannel_content_detector_.IsTemporaryMultiChannelContentDetected(), + saturated_microphone_signal_, 0, &capture_blocker_, + linear_output_framer_.get(), &output_framer_, block_processor_.get(), + linear_output_block_.get(), &linear_output_sub_frame_view_, + &capture_block_, &capture_sub_frame_view_); - ProcessCaptureFrameContent(linear_output, capture, level_change, - saturated_microphone_signal_, 1, &capture_blocker_, - linear_output_framer_.get(), &output_framer_, - block_processor_.get(), linear_output_block_.get(), - &linear_output_sub_frame_view_, &capture_block_, - &capture_sub_frame_view_); + ProcessCaptureFrameContent( + linear_output, capture, level_change, + multichannel_content_detector_.IsTemporaryMultiChannelContentDetected(), + saturated_microphone_signal_, 1, &capture_blocker_, + linear_output_framer_.get(), &output_framer_, block_processor_.get(), + linear_output_block_.get(), &linear_output_sub_frame_view_, + &capture_block_, &capture_sub_frame_view_); ProcessRemainingCaptureFrameContent( - level_change, saturated_microphone_signal_, &capture_blocker_, + level_change, + multichannel_content_detector_.IsTemporaryMultiChannelContentDetected(), + saturated_microphone_signal_, &capture_blocker_, linear_output_framer_.get(), &output_framer_, block_processor_.get(), linear_output_block_.get(), &capture_block_); @@ -944,13 +977,17 @@ void EchoCanceller3::EmptyRenderQueue() { } // Buffer frame content. 
- BufferRenderFrameContent(&render_queue_output_frame_, 0, - render_blocker_.get(), block_processor_.get(), - &render_block_, &render_sub_frame_view_); + BufferRenderFrameContent( + /*proper_downmix_needed=*/multichannel_content_detector_ + .IsTemporaryMultiChannelContentDetected(), + &render_queue_output_frame_, 0, render_blocker_.get(), + block_processor_.get(), &render_block_, &render_sub_frame_view_); - BufferRenderFrameContent(&render_queue_output_frame_, 1, - render_blocker_.get(), block_processor_.get(), - &render_block_, &render_sub_frame_view_); + BufferRenderFrameContent( + /*proper_downmix_needed=*/multichannel_content_detector_ + .IsTemporaryMultiChannelContentDetected(), + &render_queue_output_frame_, 1, render_blocker_.get(), + block_processor_.get(), &render_block_, &render_sub_frame_view_); BufferRemainingRenderFrameContent(render_blocker_.get(), block_processor_.get(), &render_block_); diff --git a/modules/audio_processing/aec3/multi_channel_content_detector.h b/modules/audio_processing/aec3/multi_channel_content_detector.h index 3120502258..ad5f4f2886 100644 --- a/modules/audio_processing/aec3/multi_channel_content_detector.h +++ b/modules/audio_processing/aec3/multi_channel_content_detector.h @@ -45,7 +45,7 @@ class MultiChannelContentDetector { return persistent_multichannel_content_detected_; } - bool IsTemporaryMultiChannelContentDetectedForTesting() const { + bool IsTemporaryMultiChannelContentDetected() const { return temporary_multichannel_content_detected_; } diff --git a/modules/audio_processing/aec3/multi_channel_content_detector_unittest.cc b/modules/audio_processing/aec3/multi_channel_content_detector_unittest.cc index 0857bee4bd..86817da017 100644 --- a/modules/audio_processing/aec3/multi_channel_content_detector_unittest.cc +++ b/modules/audio_processing/aec3/multi_channel_content_detector_unittest.cc @@ -274,7 +274,7 @@ TEST_P(MultiChannelContentDetectorHysteresisBehavior, } else { 
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); } - EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected()); } // Pass a two true stereo frames and verify that they are properly detected. @@ -289,16 +289,16 @@ TEST_P(MultiChannelContentDetectorHysteresisBehavior, EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); } EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); - EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected()); } else { EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_FALSE(mc.IsProperMultiChannelContentDetected()); - EXPECT_TRUE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_TRUE(mc.IsTemporaryMultiChannelContentDetected()); } } else { EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); - EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected()); } } @@ -311,11 +311,11 @@ TEST_P(MultiChannelContentDetectorHysteresisBehavior, if (detect_stereo_content) { EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_FALSE(mc.IsProperMultiChannelContentDetected()); - EXPECT_TRUE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_TRUE(mc.IsTemporaryMultiChannelContentDetected()); } else { EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); - EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected()); } } @@ -323,11 +323,11 @@ TEST_P(MultiChannelContentDetectorHysteresisBehavior, if (detect_stereo_content) { EXPECT_TRUE(mc.UpdateDetection(true_stereo_frame)); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); - EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + 
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected()); } else { EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); - EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected()); } // Pass an additional true stereo frame and verify that it is properly @@ -335,22 +335,22 @@ TEST_P(MultiChannelContentDetectorHysteresisBehavior, if (detect_stereo_content) { EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); - EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected()); } else { EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); - EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected()); } // Pass a fake stereo frame and verify that it is properly detected. if (detect_stereo_content) { EXPECT_FALSE(mc.UpdateDetection(fake_stereo_frame)); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); - EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected()); } else { EXPECT_FALSE(mc.UpdateDetection(fake_stereo_frame)); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); - EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); + EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected()); } }