diff --git a/webrtc/modules/audio_processing/aec3/aec_state.cc b/webrtc/modules/audio_processing/aec3/aec_state.cc index 01c3c440fa..de4a0c16eb 100644 --- a/webrtc/modules/audio_processing/aec3/aec_state.cc +++ b/webrtc/modules/audio_processing/aec3/aec_state.cc @@ -22,7 +22,7 @@ namespace webrtc { namespace { -constexpr size_t kEchoPathChangeConvergenceBlocks = 4 * kNumBlocksPerSecond; +constexpr size_t kEchoPathChangeConvergenceBlocks = 2 * kNumBlocksPerSecond; constexpr size_t kSaturationLeakageBlocks = 20; // Computes delay of the adaptive filter. @@ -89,7 +89,6 @@ void AecState::HandleEchoPathChange( const EchoPathVariability& echo_path_variability) { if (echo_path_variability.AudioPathChanged()) { blocks_since_last_saturation_ = 0; - active_render_blocks_ = 0; usable_linear_estimate_ = false; echo_leakage_detected_ = false; capture_signal_saturation_ = false; @@ -98,6 +97,8 @@ void AecState::HandleEchoPathChange( if (echo_path_variability.delay_change) { force_zero_gain_counter_ = 0; + blocks_with_filter_adaptation_ = 0; + render_received_ = false; force_zero_gain_ = true; echo_path_change_counter_ = kEchoPathChangeCounterMax; } @@ -121,7 +122,11 @@ void AecState::Update(const std::vector>& // Update counters. const float x_energy = std::inner_product(x.begin(), x.end(), x.begin(), 0.f); const bool active_render_block = x_energy > 10000.f * kFftLengthBy2; - active_render_blocks_ += active_render_block ? 1 : 0; + if (active_render_block) { + render_received_ = true; + } + blocks_with_filter_adaptation_ += + (active_render_block && (!SaturatedCapture()) ? 1 : 0); --echo_path_change_counter_; // Force zero echo suppression gain after an echo path change to allow at @@ -145,6 +150,8 @@ void AecState::Update(const std::vector>& } // Detect and flag echo saturation. + // TODO(peah): Add the delay in this computation to ensure that the render and + // capture signals are properly aligned. RTC_DCHECK_LT(0, x.size()); const float max_sample = fabs(*std::max_element( x.begin(), x.end(), [](float a, float b) { return a * a < b * b; })); @@ -160,14 +167,17 @@ void AecState::Update(const std::vector>& // Flag whether the linear filter estimate is usable. usable_linear_estimate_ = (!echo_saturation_) && - active_render_blocks_ > kEchoPathChangeConvergenceBlocks && + (!render_received_ || + blocks_with_filter_adaptation_ > kEchoPathChangeConvergenceBlocks) && filter_delay_ && echo_path_change_counter_ <= 0; // After an amount of active render samples for which an echo should have been // detected in the capture signal if the ERL was not infinite, flag that a // headset is used. - headset_detected_ = !external_delay_ && !filter_delay_ && - active_render_blocks_ >= kEchoPathChangeConvergenceBlocks; + headset_detected_ = + !external_delay_ && !filter_delay_ && + (!render_received_ || + blocks_with_filter_adaptation_ >= kEchoPathChangeConvergenceBlocks); } } // namespace webrtc diff --git a/webrtc/modules/audio_processing/aec3/aec_state.h b/webrtc/modules/audio_processing/aec3/aec_state.h index 387c6ea42d..519665f3a2 100644 --- a/webrtc/modules/audio_processing/aec3/aec_state.h +++ b/webrtc/modules/audio_processing/aec3/aec_state.h @@ -41,7 +41,8 @@ class AecState { bool EchoLeakageDetected() const { return echo_leakage_detected_; } // Returns whether the render signal is currently active. - bool ActiveRender() const { return active_render_blocks_ > 200; } + // TODO(peah): Deprecate this in an upcoming CL. + bool ActiveRender() const { return blocks_with_filter_adaptation_ > 200; } // Returns the ERLE. const std::array& Erle() const { @@ -99,7 +100,7 @@ class AecState { ErlEstimator erl_estimator_; ErleEstimator erle_estimator_; int echo_path_change_counter_; - size_t active_render_blocks_ = 0; + size_t blocks_with_filter_adaptation_ = 0; bool usable_linear_estimate_ = false; bool echo_leakage_detected_ = false; bool capture_signal_saturation_ = false; @@ -107,6 +108,7 @@ class AecState { bool headset_detected_ = false; float previous_max_sample_ = 0.f; bool force_zero_gain_ = false; + bool render_received_ = false; size_t force_zero_gain_counter_ = 0; rtc::Optional filter_delay_; rtc::Optional external_delay_; diff --git a/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc b/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc index 0a9ecac283..d36720f651 100644 --- a/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc +++ b/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc @@ -40,6 +40,43 @@ void EchoGeneratingPower(const RenderBuffer& render_buffer, }); } +constexpr int kNoiseFloorCounterMax = 50; +constexpr float kNoiseFloorMin = 10.f * 10.f * 128.f * 128.f; + +// Updates estimate for the power of the stationary noise component in the +// render signal. +void RenderNoisePower( + const RenderBuffer& render_buffer, + std::array* X2_noise_floor, + std::array* X2_noise_floor_counter) { + RTC_DCHECK(X2_noise_floor); + RTC_DCHECK(X2_noise_floor_counter); + + const auto render_power = render_buffer.Spectrum(0); + RTC_DCHECK_EQ(X2_noise_floor->size(), render_power.size()); + RTC_DCHECK_EQ(X2_noise_floor_counter->size(), render_power.size()); + + // Estimate the stationary noise power in a minimum statistics manner. + for (size_t k = 0; k < render_power.size(); ++k) { + // Decrease rapidly. + if (render_power[k] < (*X2_noise_floor)[k]) { + (*X2_noise_floor)[k] = render_power[k]; + (*X2_noise_floor_counter)[k] = 0; + } else { + // Increase in a delayed, leaky manner. + if ((*X2_noise_floor_counter)[k] >= kNoiseFloorCounterMax) { + (*X2_noise_floor)[k] = + std::max((*X2_noise_floor)[k] * 1.1f, kNoiseFloorMin); + } else { + ++(*X2_noise_floor_counter)[k]; + } + } + } +} + +// Assume a minimum echo path gain of -33 dB for headsets. +constexpr float kHeadsetEchoPathGain = 0.0005f; + } // namespace ResidualEchoEstimator::ResidualEchoEstimator() { @@ -57,28 +94,19 @@ void ResidualEchoEstimator::Estimate( std::array* R2) { RTC_DCHECK(R2); - // Return zero residual echo power when a headset is detected. - if (aec_state.HeadsetDetected()) { - if (!headset_detected_cached_) { - Reset(); - headset_detected_cached_ = true; - } - R2->fill(0.f); - return; - } else { - headset_detected_cached_ = false; - } - const rtc::Optional delay = aec_state.FilterDelay() ? aec_state.FilterDelay() : (aec_state.ExternalDelay() ? aec_state.ExternalDelay() : rtc::Optional()); + // Estimate the power of the stationary noise in the render signal. + RenderNoisePower(render_buffer, &X2_noise_floor_, &X2_noise_floor_counter_); + // Estimate the residual echo power. const bool use_linear_echo_power = aec_state.UsableLinearEstimate() && using_subtractor_output; - if (use_linear_echo_power) { + if (use_linear_echo_power && !aec_state.HeadsetDetected()) { RTC_DCHECK(aec_state.FilterDelay()); const int filter_delay = *aec_state.FilterDelay(); LinearEstimate(S2_linear, aec_state.Erle(), filter_delay, R2); @@ -102,7 +130,15 @@ void ResidualEchoEstimator::Estimate( kResidualEchoPowerRenderWindowSize - 1, &X2); } - NonLinearEstimate(X2, Y2, R2); + // Subtract the stationary noise power to avoid stationary noise causing + // excessive echo suppression. + std::transform( + X2.begin(), X2.end(), X2_noise_floor_.begin(), X2.begin(), + [](float a, float b) { return std::max(0.f, a - 10.f * b); }); + + NonLinearEstimate( + aec_state.HeadsetDetected() ? kHeadsetEchoPathGain : kFixedEchoPathGain, + X2, Y2, R2); AddEchoReverb(*R2, aec_state.SaturatedEcho(), std::min(static_cast(kAdaptiveFilterLength), delay.value_or(kAdaptiveFilterLength)), @@ -119,6 +155,8 @@ void ResidualEchoEstimator::Estimate( } void ResidualEchoEstimator::Reset() { + X2_noise_floor_counter_.fill(kNoiseFloorCounterMax); + X2_noise_floor_.fill(kNoiseFloorMin); R2_reverb_.fill(0.f); R2_old_.fill(0.f); R2_hold_counter_.fill(0.f); @@ -141,14 +179,13 @@ void ResidualEchoEstimator::LinearEstimate( } void ResidualEchoEstimator::NonLinearEstimate( + float echo_path_gain, const std::array& X2, const std::array& Y2, std::array* R2) { // Compute preliminary residual echo. - // TODO(peah): Try to make this adaptive. Currently the gain is hardcoded to - // 20 dB. std::transform(X2.begin(), X2.end(), R2->begin(), - [](float a) { return a * kFixedEchoPathGain; }); + [echo_path_gain](float a) { return a * echo_path_gain; }); for (size_t k = 0; k < R2->size(); ++k) { // Update hold counter. diff --git a/webrtc/modules/audio_processing/aec3/residual_echo_estimator.h b/webrtc/modules/audio_processing/aec3/residual_echo_estimator.h index 6c8a7b26e4..c8e6a28ea7 100644 --- a/webrtc/modules/audio_processing/aec3/residual_echo_estimator.h +++ b/webrtc/modules/audio_processing/aec3/residual_echo_estimator.h @@ -48,7 +48,8 @@ class ResidualEchoEstimator { // Estimates the residual echo power based on the estimate of the echo path // gain. - void NonLinearEstimate(const std::array& X2, + void NonLinearEstimate(float echo_path_gain, + const std::array& X2, const std::array& Y2, std::array* R2); @@ -66,7 +67,8 @@ class ResidualEchoEstimator { int S2_old_index_ = 0; std::array, kAdaptiveFilterLength> S2_old_; - bool headset_detected_cached_ = false; + std::array X2_noise_floor_; + std::array X2_noise_floor_counter_; RTC_DISALLOW_COPY_AND_ASSIGN(ResidualEchoEstimator); };