From 5ea5749a861ebc49935faf82a694806b57ae878a Mon Sep 17 00:00:00 2001 From: Gustaf Ullberg Date: Tue, 5 Nov 2019 15:19:02 +0100 Subject: [PATCH] AEC3: Multichannel suppressor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change adds multichannel support to the AEC3 suppressor. Processing of mono capture is bit-exact to the previous code. Bug: webrtc:10913 Change-Id: I89affe3e066021bc34e4b525edf44dd3bea68365 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/158882 Commit-Queue: Gustaf Ullberg Reviewed-by: Per Åhgren Cr-Commit-Position: refs/heads/master@{#29692} --- modules/audio_processing/aec3/BUILD.gn | 2 + .../aec3/dominant_nearend_detector.cc | 76 ++++++ .../aec3/dominant_nearend_detector.h | 56 ++++ modules/audio_processing/aec3/echo_remover.cc | 76 ++---- .../audio_processing/aec3/suppression_gain.cc | 239 ++++++++---------- .../audio_processing/aec3/suppression_gain.h | 78 ++---- .../aec3/suppression_gain_unittest.cc | 74 +++--- 7 files changed, 323 insertions(+), 278 deletions(-) create mode 100644 modules/audio_processing/aec3/dominant_nearend_detector.cc create mode 100644 modules/audio_processing/aec3/dominant_nearend_detector.h diff --git a/modules/audio_processing/aec3/BUILD.gn b/modules/audio_processing/aec3/BUILD.gn index 0379c39e16..ce76bd18bc 100644 --- a/modules/audio_processing/aec3/BUILD.gn +++ b/modules/audio_processing/aec3/BUILD.gn @@ -41,6 +41,8 @@ rtc_library("aec3") { "decimator.cc", "decimator.h", "delay_estimate.h", + "dominant_nearend_detector.cc", + "dominant_nearend_detector.h", "downsampled_render_buffer.cc", "downsampled_render_buffer.h", "echo_audibility.cc", diff --git a/modules/audio_processing/aec3/dominant_nearend_detector.cc b/modules/audio_processing/aec3/dominant_nearend_detector.cc new file mode 100644 index 0000000000..64d8b09616 --- /dev/null +++ b/modules/audio_processing/aec3/dominant_nearend_detector.cc @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019 The WebRTC
project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/aec3/dominant_nearend_detector.h" + +#include +#include + +namespace webrtc { +DominantNearendDetector::DominantNearendDetector( + const EchoCanceller3Config::Suppressor::DominantNearendDetection config, + size_t num_capture_channels) + : enr_threshold_(config.enr_threshold), + enr_exit_threshold_(config.enr_exit_threshold), + snr_threshold_(config.snr_threshold), + hold_duration_(config.hold_duration), + trigger_threshold_(config.trigger_threshold), + use_during_initial_phase_(config.use_during_initial_phase), + num_capture_channels_(num_capture_channels), + trigger_counters_(num_capture_channels_), + hold_counters_(num_capture_channels_) {} + +void DominantNearendDetector::Update( + rtc::ArrayView> + nearend_spectrum, + rtc::ArrayView> + residual_echo_spectrum, + rtc::ArrayView> + comfort_noise_spectrum, + bool initial_state) { + nearend_state_ = false; + + auto low_frequency_energy = [](rtc::ArrayView spectrum) { + RTC_DCHECK_LE(16, spectrum.size()); + return std::accumulate(spectrum.begin() + 1, spectrum.begin() + 16, 0.f); + }; + + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { + const float ne_sum = low_frequency_energy(nearend_spectrum[ch]); + const float echo_sum = low_frequency_energy(residual_echo_spectrum[ch]); + const float noise_sum = low_frequency_energy(comfort_noise_spectrum[ch]); + + // Detect strong active nearend if the nearend is sufficiently stronger than + // the echo and the nearend noise. 
+ if ((!initial_state || use_during_initial_phase_) && + echo_sum < enr_threshold_ * ne_sum && + ne_sum > snr_threshold_ * noise_sum) { + if (++trigger_counters_[ch] >= trigger_threshold_) { + // After a period of strong active nearend activity, flag nearend mode. + hold_counters_[ch] = hold_duration_; + trigger_counters_[ch] = trigger_threshold_; + } + } else { + // Forget previously detected strong active nearend activity. + trigger_counters_[ch] = std::max(0, trigger_counters_[ch] - 1); + } + + // Exit nearend-state early at strong echo. + if (echo_sum > enr_exit_threshold_ * ne_sum && + echo_sum > snr_threshold_ * noise_sum) { + hold_counters_[ch] = 0; + } + + // Remain in any nearend mode for a certain duration. + hold_counters_[ch] = std::max(0, hold_counters_[ch] - 1); + nearend_state_ = nearend_state_ || hold_counters_[ch] > 0; + } +} +} // namespace webrtc diff --git a/modules/audio_processing/aec3/dominant_nearend_detector.h b/modules/audio_processing/aec3/dominant_nearend_detector.h new file mode 100644 index 0000000000..dea9fe5d46 --- /dev/null +++ b/modules/audio_processing/aec3/dominant_nearend_detector.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_PROCESSING_AEC3_DOMINANT_NEAREND_DETECTOR_H_ +#define MODULES_AUDIO_PROCESSING_AEC3_DOMINANT_NEAREND_DETECTOR_H_ + +#include + +#include "api/array_view.h" +#include "api/audio/echo_canceller3_config.h" +#include "modules/audio_processing/aec3/aec3_common.h" + +namespace webrtc { +// Class for selecting whether the suppressor is in the nearend or echo state. 
+class DominantNearendDetector { + public: + DominantNearendDetector( + const EchoCanceller3Config::Suppressor::DominantNearendDetection config, + size_t num_capture_channels); + + // Returns whether the current state is the nearend state. + bool IsNearendState() const { return nearend_state_; } + + // Updates the state selection based on latest spectral estimates. + void Update(rtc::ArrayView> + nearend_spectrum, + rtc::ArrayView> + residual_echo_spectrum, + rtc::ArrayView> + comfort_noise_spectrum, + bool initial_state); + + private: + const float enr_threshold_; + const float enr_exit_threshold_; + const float snr_threshold_; + const int hold_duration_; + const int trigger_threshold_; + const bool use_during_initial_phase_; + const size_t num_capture_channels_; + + bool nearend_state_ = false; + std::vector trigger_counters_; + std::vector hold_counters_; +}; + +} // namespace webrtc + +#endif // MODULES_AUDIO_PROCESSING_AEC3_DOMINANT_NEAREND_DETECTOR_H_ diff --git a/modules/audio_processing/aec3/echo_remover.cc b/modules/audio_processing/aec3/echo_remover.cc index 5f48e225db..bf68f36e63 100644 --- a/modules/audio_processing/aec3/echo_remover.cc +++ b/modules/audio_processing/aec3/echo_remover.cc @@ -148,7 +148,7 @@ class EchoRemoverImpl final : public EchoRemover { const size_t num_capture_channels_; const bool use_shadow_filter_output_; Subtractor subtractor_; - std::vector> suppression_gains_; + SuppressionGain suppression_gain_; ComfortNoiseGenerator cng_; SuppressionFilter suppression_filter_; RenderSignalAnalyzer render_signal_analyzer_; @@ -195,7 +195,10 @@ EchoRemoverImpl::EchoRemoverImpl(const EchoCanceller3Config& config, num_capture_channels_, data_dumper_.get(), optimization_), - suppression_gains_(num_capture_channels_), + suppression_gain_(config_, + optimization_, + sample_rate_hz, + num_capture_channels), cng_(optimization_, num_capture_channels_), suppression_filter_(optimization_, sample_rate_hz_, @@ -203,9 +206,9 @@ 
EchoRemoverImpl::EchoRemoverImpl(const EchoCanceller3Config& config, render_signal_analyzer_(config_), residual_echo_estimator_(config_, num_render_channels), aec_state_(config_, num_capture_channels_), - e_old_(num_capture_channels_), - y_old_(num_capture_channels_), - e_heap_(NumChannelsOnHeap(num_capture_channels_)), + e_old_(num_capture_channels_, {0.f}), + y_old_(num_capture_channels_, {0.f}), + e_heap_(NumChannelsOnHeap(num_capture_channels_), {0.f}), Y2_heap_(NumChannelsOnHeap(num_capture_channels_)), E2_heap_(NumChannelsOnHeap(num_capture_channels_)), R2_heap_(NumChannelsOnHeap(num_capture_channels_)), @@ -216,16 +219,6 @@ EchoRemoverImpl::EchoRemoverImpl(const EchoCanceller3Config& config, high_band_comfort_noise_heap_(NumChannelsOnHeap(num_capture_channels_)), subtractor_output_heap_(NumChannelsOnHeap(num_capture_channels_)) { RTC_DCHECK(ValidFullBandRate(sample_rate_hz)); - for (auto& e_k : e_heap_) { - e_k.fill(0.f); - } - - for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - suppression_gains_[ch] = std::make_unique( - config_, optimization_, sample_rate_hz); - e_old_[ch].fill(0.f); - y_old_[ch].fill(0.f); - } } EchoRemoverImpl::~EchoRemoverImpl() = default; @@ -343,9 +336,7 @@ void EchoRemoverImpl::ProcessCapture( if (echo_path_variability.delay_change != EchoPathVariability::DelayAdjustment::kNone) { - for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - suppression_gains_[ch]->SetInitialState(true); - } + suppression_gain_.SetInitialState(true); } } if (gain_change_hangover_ > 0) { @@ -359,9 +350,7 @@ void EchoRemoverImpl::ProcessCapture( // State transition. if (aec_state_.TransitionTriggered()) { subtractor_.ExitInitialState(); - for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - suppression_gains_[ch]->SetInitialState(false); - } + suppression_gain_.SetInitialState(false); } // Perform linear echo cancellation. 
@@ -390,10 +379,6 @@ void EchoRemoverImpl::ProcessCapture( 1); data_dumper_->DumpWav("aec3_output_linear2", kBlockSize, &e[0][0], 16000, 1); - float high_bands_gain = 1.f; - std::array G; - G.fill(1.f); - // Estimate the residual echo power. residual_echo_estimator_.Estimate(aec_state_, *render_buffer, S2_linear, Y2, R2); @@ -402,34 +387,27 @@ void EchoRemoverImpl::ProcessCapture( cng_.Compute(aec_state_.SaturatedCapture(), Y2, comfort_noise, high_band_comfort_noise); - for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - // Suppressor echo estimate. - const auto& echo_spectrum = - aec_state_.UsableLinearEstimate() ? S2_linear[ch] : R2[ch]; - - // Suppressor nearend estimate. - std::array nearend_spectrum_bounded; - if (aec_state_.UsableLinearEstimate()) { + // Suppressor nearend estimate. + if (aec_state_.UsableLinearEstimate()) { + // E2 is bound by Y2. + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { std::transform(E2[ch].begin(), E2[ch].end(), Y2[ch].begin(), - nearend_spectrum_bounded.begin(), + E2[ch].begin(), [](float a, float b) { return std::min(a, b); }); } - const auto& nearend_spectrum = - aec_state_.UsableLinearEstimate() ? nearend_spectrum_bounded : Y2[ch]; - - // Compute preferred gains for each channel. The minimum gain determines the - // final gain. - float high_bands_gain_channel; - std::array G_channel; - suppression_gains_[ch]->GetGain(nearend_spectrum, echo_spectrum, R2[ch], - cng_.NoiseSpectrum()[ch], - render_signal_analyzer_, aec_state_, x, - &high_bands_gain_channel, &G_channel); - - high_bands_gain = std::min(high_bands_gain, high_bands_gain_channel); - std::transform(G.begin(), G.end(), G_channel.begin(), G.begin(), - [](float a, float b) { return std::min(a, b); }); } + const auto& nearend_spectrum = aec_state_.UsableLinearEstimate() ? E2 : Y2; + + // Suppressor echo estimate. + const auto& echo_spectrum = + aec_state_.UsableLinearEstimate() ? S2_linear : R2; + + // Compute preferred gains. 
+ float high_bands_gain; + std::array G; + suppression_gain_.GetGain(nearend_spectrum, echo_spectrum, R2, + cng_.NoiseSpectrum(), render_signal_analyzer_, + aec_state_, x, &high_bands_gain, &G); suppression_filter_.ApplyGain(comfort_noise, high_band_comfort_noise, G, high_bands_gain, Y_fft, y); diff --git a/modules/audio_processing/aec3/suppression_gain.cc b/modules/audio_processing/aec3/suppression_gain.cc index 6ec70bfade..d1ef326dfa 100644 --- a/modules/audio_processing/aec3/suppression_gain.cc +++ b/modules/audio_processing/aec3/suppression_gain.cc @@ -25,8 +25,10 @@ namespace webrtc { namespace { -// Adjust the gains according to the presence of known external filters. -void AdjustForExternalFilters(std::array* gain) { +void PostprocessGains(std::array* gain) { + // TODO(gustaf): Investigate if this can be relaxed to achieve higher + // transparency above 2 kHz. + // Limit the low frequency gains to avoid the impact of the high-pass filter // on the lower-frequency gain influencing the overall achieved gain. (*gain)[0] = (*gain)[1] = std::min((*gain)[1], (*gain)[2]); @@ -41,6 +43,21 @@ void AdjustForExternalFilters(std::array* gain) { gain->begin() + kAntiAliasingImpactLimit, gain->end() - 1, [min_upper_gain](float& a) { a = std::min(a, min_upper_gain); }); (*gain)[kFftLengthBy2] = (*gain)[kFftLengthBy2Minus1]; + + // Limits the gain in the frequencies for which the adaptive filter has not + // converged. + // TODO(peah): Make adaptive to take the actual filter error into account. + constexpr size_t kUpperAccurateBandPlus1 = 29; + + constexpr float oneByBandsInSum = + 1 / static_cast(kUpperAccurateBandPlus1 - 20); + const float hf_gain_bound = + std::accumulate(gain->begin() + 20, + gain->begin() + kUpperAccurateBandPlus1, 0.f) * + oneByBandsInSum; + + std::for_each(gain->begin() + kUpperAccurateBandPlus1, gain->end(), + [hf_gain_bound](float& a) { a = std::min(a, hf_gain_bound); }); } // Scales the echo according to assessed audibility at the other end. 
@@ -79,33 +96,14 @@ void WeightEchoForAudibility(const EchoCanceller3Config& config, weigh(threshold, normalizer, 7, kFftLengthBy2Plus1, echo, weighted_echo); } -// TODO(peah): Make adaptive to take the actual filter error into account. -constexpr size_t kUpperAccurateBandPlus1 = 29; - -// Limits the gain in the frequencies for which the adaptive filter has not -// converged. Currently, these frequencies are not hardcoded to the frequencies -// which are typically not excited by speech. -// TODO(peah): Make adaptive to take the actual filter error into account. -void AdjustNonConvergedFrequencies( - std::array* gain) { - constexpr float oneByBandsInSum = - 1 / static_cast(kUpperAccurateBandPlus1 - 20); - const float hf_gain_bound = - std::accumulate(gain->begin() + 20, - gain->begin() + kUpperAccurateBandPlus1, 0.f) * - oneByBandsInSum; - - std::for_each(gain->begin() + kUpperAccurateBandPlus1, gain->end(), - [hf_gain_bound](float& a) { a = std::min(a, hf_gain_bound); }); -} - } // namespace int SuppressionGain::instance_count_ = 0; float SuppressionGain::UpperBandsGain( - const std::array& echo_spectrum, - const std::array& comfort_noise_spectrum, + rtc::ArrayView> echo_spectrum, + rtc::ArrayView> + comfort_noise_spectrum, const absl::optional& narrow_peak_band, bool saturated_echo, const std::vector>>& render, @@ -161,18 +159,22 @@ float SuppressionGain::UpperBandsGain( anti_howling_gain = 0.01f * sqrtf(low_band_energy / high_band_energy); } - // Bound the upper gain during significant echo activity. 
- auto low_frequency_energy = [](rtc::ArrayView spectrum) { - RTC_DCHECK_LE(16, spectrum.size()); - return std::accumulate(spectrum.begin() + 1, spectrum.begin() + 16, 0.f); - }; - const float echo_sum = low_frequency_energy(echo_spectrum); - const float noise_sum = low_frequency_energy(comfort_noise_spectrum); - const auto& cfg = config_.suppressor.high_bands_suppression; float gain_bound = 1.f; - if (echo_sum > cfg.enr_threshold * noise_sum && - !dominant_nearend_detector_.IsNearendState()) { - gain_bound = cfg.max_gain_during_echo; + if (!dominant_nearend_detector_.IsNearendState()) { + // Bound the upper gain during significant echo activity. + const auto& cfg = config_.suppressor.high_bands_suppression; + auto low_frequency_energy = [](rtc::ArrayView spectrum) { + RTC_DCHECK_LE(16, spectrum.size()); + return std::accumulate(spectrum.begin() + 1, spectrum.begin() + 16, 0.f); + }; + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { + const float echo_sum = low_frequency_energy(echo_spectrum[ch]); + const float noise_sum = low_frequency_energy(comfort_noise_spectrum[ch]); + if (echo_sum > cfg.enr_threshold * noise_sum) { + gain_bound = cfg.max_gain_during_echo; + break; + } + } } // Choose the gain as the minimum of the lower and upper gains. @@ -184,8 +186,6 @@ void SuppressionGain::GainToNoAudibleEcho( const std::array& nearend, const std::array& echo, const std::array& masker, - const std::array& min_gain, - const std::array& max_gain, std::array* gain) const { const auto& p = dominant_nearend_detector_.IsNearendState() ? nearend_params_ : normal_params_; @@ -198,7 +198,7 @@ void SuppressionGain::GainToNoAudibleEcho( (p.enr_suppress_[k] - p.enr_transparent_[k]); g = std::max(g, p.emr_transparent_[k] / emr); } - (*gain)[k] = std::max(std::min(g, max_gain[k]), min_gain[k]); + (*gain)[k] = g; } } @@ -206,6 +206,8 @@ void SuppressionGain::GainToNoAudibleEcho( // above the zero sample values. 
void SuppressionGain::GetMinGain( rtc::ArrayView weighted_residual_echo, + rtc::ArrayView last_nearend, + rtc::ArrayView last_echo, bool low_noise_render, bool saturated_echo, rtc::ArrayView min_gain) const { @@ -227,7 +229,7 @@ void SuppressionGain::GetMinGain( // Make sure the gains of the low frequencies do not decrease too // quickly after strong nearend. - if (last_nearend_[k] > last_echo_[k]) { + if (last_nearend[k] > last_echo[k]) { min_gain[k] = std::max(min_gain[k], last_gain_[k] * dec); min_gain[k] = std::min(min_gain[k], 1.f); } @@ -249,79 +251,91 @@ void SuppressionGain::GetMaxGain(rtc::ArrayView max_gain) const { } } -// TODO(peah): Add further optimizations, in particular for the divisions. void SuppressionGain::LowerBandGain( bool low_noise_render, const AecState& aec_state, - const std::array& suppressor_input, - const std::array& nearend, - const std::array& residual_echo, - const std::array& comfort_noise, + rtc::ArrayView> + suppressor_input, + rtc::ArrayView> residual_echo, + rtc::ArrayView> comfort_noise, std::array* gain) { + gain->fill(1.f); const bool saturated_echo = aec_state.SaturatedEcho(); - - // Weight echo power in terms of audibility. // Precompute 1/weighted echo - // (note that when the echo is zero, the precomputed value is never used). - std::array weighted_residual_echo; - WeightEchoForAudibility(config_, residual_echo, weighted_residual_echo); - - std::array min_gain; - GetMinGain(weighted_residual_echo, low_noise_render, saturated_echo, - min_gain); - std::array max_gain; GetMaxGain(max_gain); - GainToNoAudibleEcho(nearend, weighted_residual_echo, comfort_noise, min_gain, - max_gain, gain); - AdjustForExternalFilters(gain); + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { + std::array G; + std::array nearend; + nearend_smoothers_[ch].Average(suppressor_input[ch], nearend); - // Adjust the gain for frequencies which have not yet converged. 
- AdjustNonConvergedFrequencies(gain); + // Weight echo power in terms of audibility. + std::array weighted_residual_echo; + WeightEchoForAudibility(config_, residual_echo[ch], weighted_residual_echo); - // Store data required for the gain computation of the next block. - std::copy(nearend.begin(), nearend.end(), last_nearend_.begin()); - std::copy(weighted_residual_echo.begin(), weighted_residual_echo.end(), - last_echo_.begin()); + std::array min_gain; + GetMinGain(weighted_residual_echo, last_nearend_[ch], last_echo_[ch], + low_noise_render, saturated_echo, min_gain); + + GainToNoAudibleEcho(nearend, weighted_residual_echo, comfort_noise[0], &G); + + // Clamp gains. + for (size_t k = 0; k < gain->size(); ++k) { + G[k] = std::max(std::min(G[k], max_gain[k]), min_gain[k]); + (*gain)[k] = std::min((*gain)[k], G[k]); + } + + // Store data required for the gain computation of the next block. + std::copy(nearend.begin(), nearend.end(), last_nearend_[ch].begin()); + std::copy(weighted_residual_echo.begin(), weighted_residual_echo.end(), + last_echo_[ch].begin()); + } + + // Limit high-frequency gains. + PostprocessGains(gain); + + // Store computed gains. std::copy(gain->begin(), gain->end(), last_gain_.begin()); - aec3::VectorMath(optimization_).Sqrt(*gain); - // Debug outputs for the purpose of development and analysis. - data_dumper_->DumpRaw("aec3_suppressor_min_gain", min_gain); - data_dumper_->DumpRaw("aec3_suppressor_max_gain", max_gain); - data_dumper_->DumpRaw("aec3_dominant_nearend", - dominant_nearend_detector_.IsNearendState()); + // Transform gains to amplitude domain. 
+ aec3::VectorMath(optimization_).Sqrt(*gain); } SuppressionGain::SuppressionGain(const EchoCanceller3Config& config, Aec3Optimization optimization, - int sample_rate_hz) + int sample_rate_hz, + size_t num_capture_channels) : data_dumper_( new ApmDataDumper(rtc::AtomicOps::Increment(&instance_count_))), optimization_(optimization), config_(config), + num_capture_channels_(num_capture_channels), state_change_duration_blocks_( static_cast(config_.filter.config_change_duration_blocks)), - moving_average_(kFftLengthBy2Plus1, - config.suppressor.nearend_average_blocks), + last_nearend_(num_capture_channels_, {0}), + last_echo_(num_capture_channels_, {0}), + nearend_smoothers_( + num_capture_channels_, + aec3::MovingAverage(kFftLengthBy2Plus1, + config.suppressor.nearend_average_blocks)), nearend_params_(config_.suppressor.nearend_tuning), normal_params_(config_.suppressor.normal_tuning), - dominant_nearend_detector_( - config_.suppressor.dominant_nearend_detection) { + dominant_nearend_detector_(config_.suppressor.dominant_nearend_detection, + num_capture_channels_) { RTC_DCHECK_LT(0, state_change_duration_blocks_); - one_by_state_change_duration_blocks_ = 1.f / state_change_duration_blocks_; last_gain_.fill(1.f); - last_nearend_.fill(0.f); - last_echo_.fill(0.f); } SuppressionGain::~SuppressionGain() = default; void SuppressionGain::GetGain( - const std::array& nearend_spectrum, - const std::array& echo_spectrum, - const std::array& residual_echo_spectrum, - const std::array& comfort_noise_spectrum, + rtc::ArrayView> + nearend_spectrum, + rtc::ArrayView> echo_spectrum, + rtc::ArrayView> + residual_echo_spectrum, + rtc::ArrayView> + comfort_noise_spectrum, const RenderSignalAnalyzer& render_signal_analyzer, const AecState& aec_state, const std::vector>>& render, @@ -337,18 +351,20 @@ void SuppressionGain::GetGain( return; } - std::array nearend_average; - moving_average_.Average(nearend_spectrum, nearend_average); - - // Update the state selection. 
+ // Update the nearend state selection. dominant_nearend_detector_.Update(nearend_spectrum, residual_echo_spectrum, comfort_noise_spectrum, initial_state_); // Compute gain for the lower band. bool low_noise_render = low_render_detector_.Detect(render); - LowerBandGain(low_noise_render, aec_state, nearend_spectrum, nearend_average, + LowerBandGain(low_noise_render, aec_state, nearend_spectrum, residual_echo_spectrum, comfort_noise_spectrum, low_band_gain); + if (cfg.enforce_empty_higher_bands) { + *high_bands_gain = 0.f; + return; + } + // Compute the gain for the upper bands. const absl::optional narrow_peak_band = render_signal_analyzer.NarrowPeakBand(); @@ -356,9 +372,6 @@ void SuppressionGain::GetGain( *high_bands_gain = UpperBandsGain(echo_spectrum, comfort_noise_spectrum, narrow_peak_band, aec_state.SaturatedEcho(), render, *low_band_gain); - if (cfg.enforce_empty_higher_bands) { - *high_bands_gain = 0.f; - } } void SuppressionGain::SetInitialState(bool state) { @@ -394,54 +407,6 @@ bool SuppressionGain::LowNoiseRenderDetector::Detect( return low_noise_render; } -SuppressionGain::DominantNearendDetector::DominantNearendDetector( - const EchoCanceller3Config::Suppressor::DominantNearendDetection config) - : enr_threshold_(config.enr_threshold), - enr_exit_threshold_(config.enr_exit_threshold), - snr_threshold_(config.snr_threshold), - hold_duration_(config.hold_duration), - trigger_threshold_(config.trigger_threshold), - use_during_initial_phase_(config.use_during_initial_phase) {} - -void SuppressionGain::DominantNearendDetector::Update( - rtc::ArrayView nearend_spectrum, - rtc::ArrayView residual_echo_spectrum, - rtc::ArrayView comfort_noise_spectrum, - bool initial_state) { - auto low_frequency_energy = [](rtc::ArrayView spectrum) { - RTC_DCHECK_LE(16, spectrum.size()); - return std::accumulate(spectrum.begin() + 1, spectrum.begin() + 16, 0.f); - }; - const float ne_sum = low_frequency_energy(nearend_spectrum); - const float echo_sum = 
low_frequency_energy(residual_echo_spectrum); - const float noise_sum = low_frequency_energy(comfort_noise_spectrum); - - // Detect strong active nearend if the nearend is sufficiently stronger than - // the echo and the nearend noise. - if ((!initial_state || use_during_initial_phase_) && - echo_sum < enr_threshold_ * ne_sum && - ne_sum > snr_threshold_ * noise_sum) { - if (++trigger_counter_ >= trigger_threshold_) { - // After a period of strong active nearend activity, flag nearend mode. - hold_counter_ = hold_duration_; - trigger_counter_ = trigger_threshold_; - } - } else { - // Forget previously detected strong active nearend activity. - trigger_counter_ = std::max(0, trigger_counter_ - 1); - } - - // Exit nearend-state early at strong echo. - if (echo_sum > enr_exit_threshold_ * ne_sum && - echo_sum > snr_threshold_ * noise_sum) { - hold_counter_ = 0; - } - - // Remain in any nearend mode for a certain duration. - hold_counter_ = std::max(0, hold_counter_ - 1); - nearend_state_ = hold_counter_ > 0; -} - SuppressionGain::GainParameters::GainParameters( const EchoCanceller3Config::Suppressor::Tuning& tuning) : max_inc_factor(tuning.max_inc_factor), diff --git a/modules/audio_processing/aec3/suppression_gain.h b/modules/audio_processing/aec3/suppression_gain.h index a583ef01a3..fe42c8f742 100644 --- a/modules/audio_processing/aec3/suppression_gain.h +++ b/modules/audio_processing/aec3/suppression_gain.h @@ -20,6 +20,7 @@ #include "api/audio/echo_canceller3_config.h" #include "modules/audio_processing/aec3/aec3_common.h" #include "modules/audio_processing/aec3/aec_state.h" +#include "modules/audio_processing/aec3/dominant_nearend_detector.h" #include "modules/audio_processing/aec3/fft_data.h" #include "modules/audio_processing/aec3/moving_average.h" #include "modules/audio_processing/aec3/render_signal_analyzer.h" @@ -32,13 +33,17 @@ class SuppressionGain { public: SuppressionGain(const EchoCanceller3Config& config, Aec3Optimization optimization, - int 
sample_rate_hz); + int sample_rate_hz, + size_t num_capture_channels); ~SuppressionGain(); void GetGain( - const std::array& nearend_spectrum, - const std::array& echo_spectrum, - const std::array& residual_echo_spectrum, - const std::array& comfort_noise_spectrum, + rtc::ArrayView> + nearend_spectrum, + rtc::ArrayView> echo_spectrum, + rtc::ArrayView> + residual_echo_spectrum, + rtc::ArrayView> + comfort_noise_spectrum, const RenderSignalAnalyzer& render_signal_analyzer, const AecState& aec_state, const std::vector>>& render, @@ -51,31 +56,31 @@ class SuppressionGain { private: // Computes the gain to apply for the bands beyond the first band. float UpperBandsGain( - const std::array& echo_spectrum, - const std::array& comfort_noise_spectrum, + rtc::ArrayView> echo_spectrum, + rtc::ArrayView> + comfort_noise_spectrum, const absl::optional& narrow_peak_band, bool saturated_echo, const std::vector>>& render, const std::array& low_band_gain) const; - void GainToNoAudibleEcho( - const std::array& nearend, - const std::array& echo, - const std::array& masker, - const std::array& min_gain, - const std::array& max_gain, - std::array* gain) const; + void GainToNoAudibleEcho(const std::array& nearend, + const std::array& echo, + const std::array& masker, + std::array* gain) const; void LowerBandGain( bool stationary_with_low_power, const AecState& aec_state, - const std::array& suppressor_input, - const std::array& nearend, - const std::array& residual_echo, - const std::array& comfort_noise, + rtc::ArrayView> + suppressor_input, + rtc::ArrayView> residual_echo, + rtc::ArrayView> comfort_noise, std::array* gain); void GetMinGain(rtc::ArrayView weighted_residual_echo, + rtc::ArrayView last_nearend, + rtc::ArrayView last_echo, bool low_noise_render, bool saturated_echo, rtc::ArrayView min_gain) const; @@ -90,35 +95,6 @@ class SuppressionGain { float average_power_ = 32768.f * 32768.f; }; - // Class for selecting whether the suppressor is in the nearend or echo state. 
- class DominantNearendDetector { - public: - explicit DominantNearendDetector( - const EchoCanceller3Config::Suppressor::DominantNearendDetection - config); - - // Returns whether the current state is the nearend state. - bool IsNearendState() const { return nearend_state_; } - - // Updates the state selection based on latest spectral estimates. - void Update(rtc::ArrayView nearend_spectrum, - rtc::ArrayView residual_echo_spectrum, - rtc::ArrayView comfort_noise_spectrum, - bool initial_state); - - private: - const float enr_threshold_; - const float enr_exit_threshold_; - const float snr_threshold_; - const int hold_duration_; - const int trigger_threshold_; - const bool use_during_initial_phase_; - - bool nearend_state_ = false; - int trigger_counter_ = 0; - int hold_counter_ = 0; - }; - struct GainParameters { explicit GainParameters( const EchoCanceller3Config::Suppressor::Tuning& tuning); @@ -133,15 +109,15 @@ class SuppressionGain { std::unique_ptr data_dumper_; const Aec3Optimization optimization_; const EchoCanceller3Config config_; + const size_t num_capture_channels_; const int state_change_duration_blocks_; - float one_by_state_change_duration_blocks_; std::array last_gain_; - std::array last_nearend_; - std::array last_echo_; + std::vector> last_nearend_; + std::vector> last_echo_; LowNoiseRenderDetector low_render_detector_; bool initial_state_ = true; int initial_state_change_counter_ = 0; - aec3::MovingAverage moving_average_; + std::vector nearend_smoothers_; const GainParameters nearend_params_; const GainParameters normal_params_; DominantNearendDetector dominant_nearend_detector_; diff --git a/modules/audio_processing/aec3/suppression_gain_unittest.cc b/modules/audio_processing/aec3/suppression_gain_unittest.cc index 6396af8e3a..0452f2e1fb 100644 --- a/modules/audio_processing/aec3/suppression_gain_unittest.cc +++ b/modules/audio_processing/aec3/suppression_gain_unittest.cc @@ -26,16 +26,15 @@ namespace aec3 { // Verifies that the check for 
non-null output gains works. TEST(SuppressionGain, NullOutputGains) { - std::array E2; - std::array R2; - std::array S2; - std::array N2; + std::vector> E2(1, {0.f}); + std::vector> R2(1, {0.f}); + std::vector> S2(1); + std::vector> N2(1, {0.f}); + for (auto& S2_k : S2) { + S2_k.fill(.1f); + } FftData E; FftData Y; - E2.fill(0.f); - R2.fill(0.f); - S2.fill(0.1f); - N2.fill(0.f); E.re.fill(0.f); E.im.fill(0.f); Y.re.fill(0.f); @@ -44,7 +43,7 @@ TEST(SuppressionGain, NullOutputGains) { float high_bands_gain; AecState aec_state(EchoCanceller3Config{}, 1); EXPECT_DEATH( - SuppressionGain(EchoCanceller3Config{}, DetectOptimization(), 16000) + SuppressionGain(EchoCanceller3Config{}, DetectOptimization(), 16000, 1) .GetGain(E2, S2, R2, N2, RenderSignalAnalyzer((EchoCanceller3Config{})), aec_state, std::vector>>( @@ -59,46 +58,43 @@ TEST(SuppressionGain, NullOutputGains) { // Does a sanity check that the gains are correctly computed. TEST(SuppressionGain, BasicGainComputation) { constexpr size_t kNumRenderChannels = 1; - constexpr size_t kNumCaptureChannels = 1; + constexpr size_t kNumCaptureChannels = 2; constexpr int kSampleRateHz = 16000; constexpr size_t kNumBands = NumBandsForRate(kSampleRateHz); SuppressionGain suppression_gain(EchoCanceller3Config(), DetectOptimization(), - kSampleRateHz); + kSampleRateHz, kNumCaptureChannels); RenderSignalAnalyzer analyzer(EchoCanceller3Config{}); float high_bands_gain; std::vector> E2(kNumCaptureChannels); - std::array S2; + std::vector> S2(kNumCaptureChannels, + {0.f}); std::vector> Y2(kNumCaptureChannels); - std::array R2; - std::array N2; + std::vector> R2(kNumCaptureChannels); + std::vector> N2(kNumCaptureChannels); std::array g; std::vector output(kNumCaptureChannels); - std::array y; std::vector>> x( kNumBands, std::vector>( kNumRenderChannels, std::vector(kBlockSize, 0.f))); EchoCanceller3Config config; AecState aec_state(config, kNumCaptureChannels); ApmDataDumper data_dumper(42); - Subtractor subtractor(config, 1, 1, 
&data_dumper, DetectOptimization()); + Subtractor subtractor(config, kNumRenderChannels, kNumCaptureChannels, + &data_dumper, DetectOptimization()); std::unique_ptr render_delay_buffer( RenderDelayBuffer::Create(config, kSampleRateHz, kNumRenderChannels)); absl::optional delay_estimate; // Ensure that a strong noise is detected to mask any echoes. - for (auto& E2_k : E2) { - E2_k.fill(10.f); + for (size_t ch = 0; ch < kNumCaptureChannels; ++ch) { + E2[ch].fill(10.f); + Y2[ch].fill(10.f); + R2[ch].fill(.1f); + N2[ch].fill(100.f); } - for (auto& Y2_k : Y2) { - Y2_k.fill(10.f); - } - R2.fill(0.1f); - S2.fill(0.1f); - N2.fill(100.f); for (auto& subtractor_output : output) { subtractor_output.Reset(); } - y.fill(0.f); // Ensure that the gain is no longer forced to zero. for (int k = 0; k <= kNumBlocksPerSecond / 5 + 1; ++k) { @@ -111,41 +107,37 @@ TEST(SuppressionGain, BasicGainComputation) { aec_state.Update(delay_estimate, subtractor.FilterFrequencyResponses(), subtractor.FilterImpulseResponses(), *render_delay_buffer->GetRenderBuffer(), E2, Y2, output); - suppression_gain.GetGain(E2[0], S2, R2, N2, analyzer, aec_state, x, + suppression_gain.GetGain(E2, S2, R2, N2, analyzer, aec_state, x, &high_bands_gain, &g); } std::for_each(g.begin(), g.end(), [](float a) { EXPECT_NEAR(1.f, a, 0.001); }); // Ensure that a strong nearend is detected to mask any echoes. 
- for (auto& E2_k : E2) { - E2_k.fill(100.f); + for (size_t ch = 0; ch < kNumCaptureChannels; ++ch) { + E2[ch].fill(100.f); + Y2[ch].fill(100.f); + R2[ch].fill(0.1f); + S2[ch].fill(0.1f); + N2[ch].fill(0.f); } - for (auto& Y2_k : Y2) { - Y2_k.fill(100.f); - } - R2.fill(0.1f); - S2.fill(0.1f); - N2.fill(0.f); for (int k = 0; k < 100; ++k) { aec_state.Update(delay_estimate, subtractor.FilterFrequencyResponses(), subtractor.FilterImpulseResponses(), *render_delay_buffer->GetRenderBuffer(), E2, Y2, output); - suppression_gain.GetGain(E2[0], S2, R2, N2, analyzer, aec_state, x, + suppression_gain.GetGain(E2, S2, R2, N2, analyzer, aec_state, x, &high_bands_gain, &g); } std::for_each(g.begin(), g.end(), [](float a) { EXPECT_NEAR(1.f, a, 0.001); }); - // Ensure that a strong echo is suppressed. - for (auto& E2_k : E2) { - E2_k.fill(1000000000.f); - } - R2.fill(10000000000000.f); + // Add a strong echo to one of the channels and ensure that it is suppressed. + E2[1].fill(1000000000.f); + R2[1].fill(10000000000000.f); for (int k = 0; k < 10; ++k) { - suppression_gain.GetGain(E2[0], S2, R2, N2, analyzer, aec_state, x, + suppression_gain.GetGain(E2, S2, R2, N2, analyzer, aec_state, x, &high_bands_gain, &g); } std::for_each(g.begin(), g.end(),