AEC3: Handle temporary stereo content more robustly

During temporary stereo content when the AEC3 uses a mono reference signal, the signal is downmixed by averaging instead of using only the left channel.

Additionally, temporary stereo content is flagged as an echo path change.

Tested: Modified local build: Verified stereo mode entered / left in accordance with hysteresis and timeout thresholds. Verified temporary stereo detected during temporary stereo playout. Made an aecdump and inspected content.
Bug: chromium:1295710
Change-Id: I6bd53e615dfb3ec39bc1c73275b7d6d599ac7c57
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/258481
Reviewed-by: Per Åhgren <peah@webrtc.org>
Commit-Queue: Sam Zackrisson <saza@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#36504}
This commit is contained in:
Sam Zackrisson 2022-04-08 17:24:45 +02:00 committed by WebRTC LUCI CQ
parent cf7f7f9fa0
commit 1397c4bfd9
3 changed files with 78 additions and 41 deletions

View File

@ -96,22 +96,43 @@ void FillSubFrameView(
} }
void FillSubFrameView( void FillSubFrameView(
bool proper_downmix_needed,
std::vector<std::vector<std::vector<float>>>* frame, std::vector<std::vector<std::vector<float>>>* frame,
size_t sub_frame_index, size_t sub_frame_index,
std::vector<std::vector<rtc::ArrayView<float>>>* sub_frame_view) { std::vector<std::vector<rtc::ArrayView<float>>>* sub_frame_view) {
RTC_DCHECK_GE(1, sub_frame_index); RTC_DCHECK_GE(1, sub_frame_index);
RTC_DCHECK_EQ(frame->size(), sub_frame_view->size()); RTC_DCHECK_EQ(frame->size(), sub_frame_view->size());
if ((*frame)[0].size() > (*sub_frame_view)[0].size()) { const size_t frame_num_channels = (*frame)[0].size();
RTC_DCHECK_EQ((*sub_frame_view)[0].size(), 1); const size_t sub_frame_num_channels = (*sub_frame_view)[0].size();
// Downmix the audio to mono (should only be done when the audio contains if (frame_num_channels > sub_frame_num_channels) {
// fake-stereo or fake-multichannel). RTC_DCHECK_EQ(sub_frame_num_channels, 1u);
if (proper_downmix_needed) {
// When a proper downmix is needed (which is the case when proper stereo
// is present in the echo reference signal but the echo canceller does the
// processing in mono) downmix the echo reference by averaging the channel
// content (otherwise downmixing is done by selecting channel 0).
for (size_t band = 0; band < frame->size(); ++band) {
for (size_t ch = 1; ch < frame_num_channels; ++ch) {
for (size_t k = 0; k < kSubFrameLength; ++k) {
(*frame)[band][/*channel=*/0]
[sub_frame_index * kSubFrameLength + k] +=
(*frame)[band][ch][sub_frame_index * kSubFrameLength + k];
}
}
const float one_by_num_channels = 1.0f / frame_num_channels;
for (size_t k = 0; k < kSubFrameLength; ++k) {
(*frame)[band][/*channel=*/0][sub_frame_index * kSubFrameLength +
k] *= one_by_num_channels;
}
}
}
for (size_t band = 0; band < frame->size(); ++band) { for (size_t band = 0; band < frame->size(); ++band) {
(*sub_frame_view)[band][/*channel=*/0] = rtc::ArrayView<float>( (*sub_frame_view)[band][/*channel=*/0] = rtc::ArrayView<float>(
&(*frame)[band][/*channel=*/0][sub_frame_index * kSubFrameLength], &(*frame)[band][/*channel=*/0][sub_frame_index * kSubFrameLength],
kSubFrameLength); kSubFrameLength);
} }
} else { } else {
RTC_DCHECK_EQ((*frame)[0].size(), (*sub_frame_view)[0].size()); RTC_DCHECK_EQ(frame_num_channels, sub_frame_num_channels);
for (size_t band = 0; band < frame->size(); ++band) { for (size_t band = 0; band < frame->size(); ++band) {
for (size_t channel = 0; channel < (*frame)[band].size(); ++channel) { for (size_t channel = 0; channel < (*frame)[band].size(); ++channel) {
(*sub_frame_view)[band][channel] = rtc::ArrayView<float>( (*sub_frame_view)[band][channel] = rtc::ArrayView<float>(
@ -126,6 +147,7 @@ void ProcessCaptureFrameContent(
AudioBuffer* linear_output, AudioBuffer* linear_output,
AudioBuffer* capture, AudioBuffer* capture,
bool level_change, bool level_change,
bool aec_reference_is_downmixed_stereo,
bool saturated_microphone_signal, bool saturated_microphone_signal,
size_t sub_frame_index, size_t sub_frame_index,
FrameBlocker* capture_blocker, FrameBlocker* capture_blocker,
@ -149,7 +171,9 @@ void ProcessCaptureFrameContent(
capture_blocker->InsertSubFrameAndExtractBlock(*capture_sub_frame_view, capture_blocker->InsertSubFrameAndExtractBlock(*capture_sub_frame_view,
capture_block); capture_block);
block_processor->ProcessCapture(level_change, saturated_microphone_signal, block_processor->ProcessCapture(/*echo_path_gain_change=*/level_change ||
aec_reference_is_downmixed_stereo,
saturated_microphone_signal,
linear_output_block, capture_block); linear_output_block, capture_block);
output_framer->InsertBlockAndExtractSubFrame(*capture_block, output_framer->InsertBlockAndExtractSubFrame(*capture_block,
capture_sub_frame_view); capture_sub_frame_view);
@ -163,6 +187,7 @@ void ProcessCaptureFrameContent(
void ProcessRemainingCaptureFrameContent( void ProcessRemainingCaptureFrameContent(
bool level_change, bool level_change,
bool aec_reference_is_downmixed_stereo,
bool saturated_microphone_signal, bool saturated_microphone_signal,
FrameBlocker* capture_blocker, FrameBlocker* capture_blocker,
BlockFramer* linear_output_framer, BlockFramer* linear_output_framer,
@ -175,8 +200,10 @@ void ProcessRemainingCaptureFrameContent(
} }
capture_blocker->ExtractBlock(block); capture_blocker->ExtractBlock(block);
block_processor->ProcessCapture(level_change, saturated_microphone_signal, block_processor->ProcessCapture(
linear_output_block, block); /*echo_path_gain_change=*/level_change ||
aec_reference_is_downmixed_stereo,
saturated_microphone_signal, linear_output_block, block);
output_framer->InsertBlock(*block); output_framer->InsertBlock(*block);
if (linear_output_framer) { if (linear_output_framer) {
@ -186,13 +213,15 @@ void ProcessRemainingCaptureFrameContent(
} }
void BufferRenderFrameContent( void BufferRenderFrameContent(
bool proper_downmix_needed,
std::vector<std::vector<std::vector<float>>>* render_frame, std::vector<std::vector<std::vector<float>>>* render_frame,
size_t sub_frame_index, size_t sub_frame_index,
FrameBlocker* render_blocker, FrameBlocker* render_blocker,
BlockProcessor* block_processor, BlockProcessor* block_processor,
std::vector<std::vector<std::vector<float>>>* block, std::vector<std::vector<std::vector<float>>>* block,
std::vector<std::vector<rtc::ArrayView<float>>>* sub_frame_view) { std::vector<std::vector<rtc::ArrayView<float>>>* sub_frame_view) {
FillSubFrameView(render_frame, sub_frame_index, sub_frame_view); FillSubFrameView(proper_downmix_needed, render_frame, sub_frame_index,
sub_frame_view);
render_blocker->InsertSubFrameAndExtractBlock(*sub_frame_view, block); render_blocker->InsertSubFrameAndExtractBlock(*sub_frame_view, block);
block_processor->BufferRender(*block); block_processor->BufferRender(*block);
} }
@ -863,22 +892,26 @@ void EchoCanceller3::ProcessCapture(AudioBuffer* capture,
EmptyRenderQueue(); EmptyRenderQueue();
ProcessCaptureFrameContent(linear_output, capture, level_change, ProcessCaptureFrameContent(
saturated_microphone_signal_, 0, &capture_blocker_, linear_output, capture, level_change,
linear_output_framer_.get(), &output_framer_, multichannel_content_detector_.IsTemporaryMultiChannelContentDetected(),
block_processor_.get(), linear_output_block_.get(), saturated_microphone_signal_, 0, &capture_blocker_,
&linear_output_sub_frame_view_, &capture_block_, linear_output_framer_.get(), &output_framer_, block_processor_.get(),
&capture_sub_frame_view_); linear_output_block_.get(), &linear_output_sub_frame_view_,
&capture_block_, &capture_sub_frame_view_);
ProcessCaptureFrameContent(linear_output, capture, level_change, ProcessCaptureFrameContent(
saturated_microphone_signal_, 1, &capture_blocker_, linear_output, capture, level_change,
linear_output_framer_.get(), &output_framer_, multichannel_content_detector_.IsTemporaryMultiChannelContentDetected(),
block_processor_.get(), linear_output_block_.get(), saturated_microphone_signal_, 1, &capture_blocker_,
&linear_output_sub_frame_view_, &capture_block_, linear_output_framer_.get(), &output_framer_, block_processor_.get(),
&capture_sub_frame_view_); linear_output_block_.get(), &linear_output_sub_frame_view_,
&capture_block_, &capture_sub_frame_view_);
ProcessRemainingCaptureFrameContent( ProcessRemainingCaptureFrameContent(
level_change, saturated_microphone_signal_, &capture_blocker_, level_change,
multichannel_content_detector_.IsTemporaryMultiChannelContentDetected(),
saturated_microphone_signal_, &capture_blocker_,
linear_output_framer_.get(), &output_framer_, block_processor_.get(), linear_output_framer_.get(), &output_framer_, block_processor_.get(),
linear_output_block_.get(), &capture_block_); linear_output_block_.get(), &capture_block_);
@ -944,13 +977,17 @@ void EchoCanceller3::EmptyRenderQueue() {
} }
// Buffer frame content. // Buffer frame content.
BufferRenderFrameContent(&render_queue_output_frame_, 0, BufferRenderFrameContent(
render_blocker_.get(), block_processor_.get(), /*proper_downmix_needed=*/multichannel_content_detector_
&render_block_, &render_sub_frame_view_); .IsTemporaryMultiChannelContentDetected(),
&render_queue_output_frame_, 0, render_blocker_.get(),
block_processor_.get(), &render_block_, &render_sub_frame_view_);
BufferRenderFrameContent(&render_queue_output_frame_, 1, BufferRenderFrameContent(
render_blocker_.get(), block_processor_.get(), /*proper_downmix_needed=*/multichannel_content_detector_
&render_block_, &render_sub_frame_view_); .IsTemporaryMultiChannelContentDetected(),
&render_queue_output_frame_, 1, render_blocker_.get(),
block_processor_.get(), &render_block_, &render_sub_frame_view_);
BufferRemainingRenderFrameContent(render_blocker_.get(), BufferRemainingRenderFrameContent(render_blocker_.get(),
block_processor_.get(), &render_block_); block_processor_.get(), &render_block_);

View File

@ -45,7 +45,7 @@ class MultiChannelContentDetector {
return persistent_multichannel_content_detected_; return persistent_multichannel_content_detected_;
} }
bool IsTemporaryMultiChannelContentDetectedForTesting() const { bool IsTemporaryMultiChannelContentDetected() const {
return temporary_multichannel_content_detected_; return temporary_multichannel_content_detected_;
} }

View File

@ -274,7 +274,7 @@ TEST_P(MultiChannelContentDetectorHysteresisBehavior,
} else { } else {
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected());
} }
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected());
} }
// Pass two true stereo frames and verify that they are properly detected. // Pass two true stereo frames and verify that they are properly detected.
@ -289,16 +289,16 @@ TEST_P(MultiChannelContentDetectorHysteresisBehavior,
EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame));
} }
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected());
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected());
} else { } else {
EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame));
EXPECT_FALSE(mc.IsProperMultiChannelContentDetected()); EXPECT_FALSE(mc.IsProperMultiChannelContentDetected());
EXPECT_TRUE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_TRUE(mc.IsTemporaryMultiChannelContentDetected());
} }
} else { } else {
EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame));
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected());
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected());
} }
} }
@ -311,11 +311,11 @@ TEST_P(MultiChannelContentDetectorHysteresisBehavior,
if (detect_stereo_content) { if (detect_stereo_content) {
EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame));
EXPECT_FALSE(mc.IsProperMultiChannelContentDetected()); EXPECT_FALSE(mc.IsProperMultiChannelContentDetected());
EXPECT_TRUE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_TRUE(mc.IsTemporaryMultiChannelContentDetected());
} else { } else {
EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame));
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected());
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected());
} }
} }
@ -323,11 +323,11 @@ TEST_P(MultiChannelContentDetectorHysteresisBehavior,
if (detect_stereo_content) { if (detect_stereo_content) {
EXPECT_TRUE(mc.UpdateDetection(true_stereo_frame)); EXPECT_TRUE(mc.UpdateDetection(true_stereo_frame));
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected());
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected());
} else { } else {
EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame));
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected());
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected());
} }
// Pass an additional true stereo frame and verify that it is properly // Pass an additional true stereo frame and verify that it is properly
@ -335,22 +335,22 @@ TEST_P(MultiChannelContentDetectorHysteresisBehavior,
if (detect_stereo_content) { if (detect_stereo_content) {
EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame));
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected());
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected());
} else { } else {
EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame)); EXPECT_FALSE(mc.UpdateDetection(true_stereo_frame));
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected());
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected());
} }
// Pass a fake stereo frame and verify that it is properly detected. // Pass a fake stereo frame and verify that it is properly detected.
if (detect_stereo_content) { if (detect_stereo_content) {
EXPECT_FALSE(mc.UpdateDetection(fake_stereo_frame)); EXPECT_FALSE(mc.UpdateDetection(fake_stereo_frame));
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected());
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected());
} else { } else {
EXPECT_FALSE(mc.UpdateDetection(fake_stereo_frame)); EXPECT_FALSE(mc.UpdateDetection(fake_stereo_frame));
EXPECT_TRUE(mc.IsProperMultiChannelContentDetected()); EXPECT_TRUE(mc.IsProperMultiChannelContentDetected());
EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetectedForTesting()); EXPECT_FALSE(mc.IsTemporaryMultiChannelContentDetected());
} }
} }