From 63b494dff72d8e8638b2bfc60e55b351713bf6cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Per=20=C3=85hgren?= Date: Wed, 6 Dec 2017 11:32:38 +0100 Subject: [PATCH] Reverted the new handling of saturated echoes in AEC3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CL reverts the changes introduced that handle echoes in AEC3. The revert is done to match the behavior which is in M63. Bug: webrtc:8615,chromium:792346 Change-Id: I128ccb17dc359c7889a701a2faaaf06be40f86dd Reviewed-on: https://webrtc-review.googlesource.com/30140 Commit-Queue: Per Åhgren Reviewed-by: Gustaf Ullberg Cr-Commit-Position: refs/heads/master@{#21117} --- modules/audio_processing/aec3/aec3_common.h | 2 +- modules/audio_processing/aec3/aec_state.cc | 135 +++++------------- modules/audio_processing/aec3/aec_state.h | 3 +- .../audio_processing/aec3/matched_filter.cc | 2 +- .../aec3/residual_echo_estimator.cc | 51 +++++-- modules/audio_processing/aec3/subtractor.cc | 32 ++--- modules/audio_processing/aec3/subtractor.h | 5 - .../audio_processing/aec3/suppression_gain.cc | 53 +++---- .../include/audio_processing.h | 2 +- 9 files changed, 106 insertions(+), 179 deletions(-) diff --git a/modules/audio_processing/aec3/aec3_common.h b/modules/audio_processing/aec3/aec3_common.h index aa6ffd9bd6..13842f9854 100644 --- a/modules/audio_processing/aec3/aec3_common.h +++ b/modules/audio_processing/aec3/aec3_common.h @@ -39,7 +39,7 @@ constexpr size_t kFftLengthBy2Minus1 = kFftLengthBy2 - 1; constexpr size_t kFftLength = 2 * kFftLengthBy2; constexpr int kAdaptiveFilterLength = 12; -constexpr int kUnknownDelayRenderWindowSize = 30; +constexpr int kUnknownDelayRenderWindowSize = 12; constexpr int kAdaptiveFilterTimeDomainLength = kAdaptiveFilterLength * kFftLengthBy2; constexpr int kRenderTransferQueueSizeFrames = 100; diff --git a/modules/audio_processing/aec3/aec_state.cc b/modules/audio_processing/aec3/aec_state.cc index 58859d837b..a9d901d2c0 100644 --- 
a/modules/audio_processing/aec3/aec_state.cc +++ b/modules/audio_processing/aec3/aec_state.cc @@ -65,11 +65,12 @@ AecState::~AecState() = default; void AecState::HandleEchoPathChange( const EchoPathVariability& echo_path_variability) { const auto full_reset = [&]() { - blocks_since_last_saturation_ = kUnknownDelayRenderWindowSize + 1; + blocks_since_last_saturation_ = 0; usable_linear_estimate_ = false; echo_leakage_detected_ = false; capture_signal_saturation_ = false; echo_saturation_ = false; + previous_max_sample_ = 0.f; max_render_.fill(0.f); force_zero_gain_counter_ = 0; blocks_with_filter_adaptation_ = 0; @@ -146,77 +147,50 @@ void AecState::Update(const std::vector>& // Update the echo audibility evaluator. echo_audibility_.Update(x, s, converged_filter); + // Detect and flag echo saturation. + // TODO(peah): Add the delay in this computation to ensure that the render and + // capture signals are properly aligned. + RTC_DCHECK_LT(0, x.size()); + const float max_sample = fabs(*std::max_element( + x.begin(), x.end(), [](float a, float b) { return a * a < b * b; })); if (config_.ep_strength.echo_can_saturate) { - // Detect and flag echo saturation. - RTC_DCHECK_LT(0, x.size()); - // Store the render values in a circular buffer. - max_render_index_ = (max_render_index_ + 1) % max_render_.size(); - auto x_max_result = std::minmax_element(x.begin(), x.end()); - max_render_[max_render_index_] = - std::max(fabs(*x_max_result.first), fabs(*x_max_result.second)); + const bool saturated_echo = + (previous_max_sample_ > 200.f) && SaturatedCapture(); - bool saturated_echo = false; - // Check for whether a saturated frame potentially could consist of - // saturated echo. 
- if (SaturatedCapture()) { - if (converged_filter) { - RTC_DCHECK(filter_delay_); - const size_t index = - (max_render_index_ + max_render_.size() - *filter_delay_) % - max_render_.size(); - saturated_echo = max_render_[index] > 200.f; - } else { - saturated_echo = - *std::max_element(max_render_.begin(), max_render_.end()) > 200.f; - } - } + // Counts the blocks since saturation. + constexpr size_t kSaturationLeakageBlocks = 20; // Set flag for potential presence of saturated echo blocks_since_last_saturation_ = saturated_echo ? 0 : blocks_since_last_saturation_ + 1; - if (converged_filter) { - echo_saturation_ = - blocks_since_last_saturation_ < kAdaptiveFilterLength + 1; - } else { - echo_saturation_ = - blocks_since_last_saturation_ < kUnknownDelayRenderWindowSize + 1; - } - // Set flag for whether the echo path is generally strong enough to saturate - // the echo. - if (converged_filter) { - // Base detection on predicted echo sample. - auto s_max_result = std::minmax_element(s.begin(), s.end()); - const float s_max_abs = - std::max(fabs(*s_max_result.first), fabs(*s_max_result.second)); - - const bool saturated_echo_sample = - s_max_abs >= 10000.f && SaturatedCapture(); - saturating_echo_path_counter_ = saturated_echo_sample - ? 10 * kNumBlocksPerSecond - : saturating_echo_path_counter_ - 1; - } else { - // Base detection on detected potentially echo. - saturating_echo_path_counter_ = saturated_echo - ? 10 * kNumBlocksPerSecond - : saturating_echo_path_counter_ - 1; - } - saturating_echo_path_counter_ = std::max(0, saturating_echo_path_counter_); - saturating_echo_path_ = saturating_echo_path_counter_ > 0; + echo_saturation_ = blocks_since_last_saturation_ < kSaturationLeakageBlocks; } else { echo_saturation_ = false; - saturating_echo_path_ = false; - saturating_echo_path_counter_ = 0; } + previous_max_sample_ = max_sample; - // Compute render energies. + // TODO(peah): Move? 
+ sufficient_filter_updates_ = + blocks_with_filter_adaptation_ >= kEchoPathChangeConvergenceBlocks; + initial_state_ = capture_block_counter_ < 3 * kNumBlocksPerSecond; + + // Flag whether the linear filter estimate is usable. + usable_linear_estimate_ = + (!echo_saturation_) && (converged_filter || SufficientFilterUpdates()) && + capture_block_counter_ >= 2 * kNumBlocksPerSecond && external_delay_; + + linear_echo_estimate_ = UsableLinearEstimate() && !TransparentMode(); + + // After an amount of active render samples for which an echo should have been + // detected in the capture signal if the ERL was not infinite, flag that a + // transparent mode should be entered. const float x_energy = std::inner_product(x.begin(), x.end(), x.begin(), 0.f); const bool active_render_block = x_energy > (config_.render_levels.active_render_limit * config_.render_levels.active_render_limit) * kFftLengthBy2; - const bool strong_render_block = x_energy > 1000 * 1000 * kFftLengthBy2; if (active_render_block) { render_received_ = true; @@ -226,54 +200,9 @@ void AecState::Update(const std::vector>& blocks_with_filter_adaptation_ += (active_render_block && (!SaturatedCapture()) ? 1 : 0); - blocks_with_strong_render_ += - (strong_render_block && (!SaturatedCapture()) ? 1 : 0); - - // After an amount of active render samples for which an echo should have been - // detected in the capture signal if the ERL was not infinite, flag that a - // transparent mode should be entered. - if (SaturatingEchoPath()) { - transparent_mode_ = !converged_filter && - (!render_received_ || blocks_with_strong_render_ >= - 15 * kNumBlocksPerSecond); - } else { - transparent_mode_ = !converged_filter && - (!render_received_ || - blocks_with_strong_render_ >= 5 * kNumBlocksPerSecond); - } - - // Update flag for whether the adaptation is in the initial state. 
- if (SaturatingEchoPath()) { - initial_state_ = capture_block_counter_ < 6 * kNumBlocksPerSecond; - } else { - initial_state_ = capture_block_counter_ < 3 * kNumBlocksPerSecond; - } - - // Detect whether the linear filter is usable. - if (SaturatingEchoPath()) { - usable_linear_estimate_ = - (!echo_saturation_) && - (converged_filter && SufficientFilterUpdates()) && - capture_block_counter_ >= 5 * kNumBlocksPerSecond && external_delay_; - } else { - usable_linear_estimate_ = - (!echo_saturation_) && - (converged_filter || SufficientFilterUpdates()) && - capture_block_counter_ >= 2 * kNumBlocksPerSecond && external_delay_; - } - - // Flag whether the linear echo estimate should be used. - linear_echo_estimate_ = usable_linear_estimate_ && !TransparentMode(); - - // Flag whether a sufficient number of filter updates has been done for the - // filter to perform well. - if (SaturatingEchoPath()) { - sufficient_filter_updates_ = - blocks_with_filter_adaptation_ >= 2 * kEchoPathChangeConvergenceBlocks; - } else { - sufficient_filter_updates_ = - blocks_with_filter_adaptation_ >= kEchoPathChangeConvergenceBlocks; - } + transparent_mode_ = !converged_filter && + (!render_received_ || blocks_with_filter_adaptation_ >= + 5 * kNumBlocksPerSecond); // Update the room reverb estimate. 
UpdateReverb(adaptive_filter_impulse_response); diff --git a/modules/audio_processing/aec3/aec_state.h b/modules/audio_processing/aec3/aec_state.h index e9a46b4477..b8c1523a00 100644 --- a/modules/audio_processing/aec3/aec_state.h +++ b/modules/audio_processing/aec3/aec_state.h @@ -156,8 +156,8 @@ class AecState { bool capture_signal_saturation_ = false; bool echo_saturation_ = false; bool transparent_mode_ = false; + float previous_max_sample_ = 0.f; std::array max_render_; - size_t max_render_index_ = 0; bool force_zero_gain_ = false; bool render_received_ = false; size_t force_zero_gain_counter_ = 0; @@ -171,7 +171,6 @@ class AecState { const EchoCanceller3Config config_; float reverb_decay_; bool saturating_echo_path_ = false; - int saturating_echo_path_counter_ = 0; bool initial_state_ = true; bool linear_echo_estimate_ = false; bool sufficient_filter_updates_ = false; diff --git a/modules/audio_processing/aec3/matched_filter.cc b/modules/audio_processing/aec3/matched_filter.cc index 52bb3711fd..25001bc3ba 100644 --- a/modules/audio_processing/aec3/matched_filter.cc +++ b/modules/audio_processing/aec3/matched_filter.cc @@ -376,7 +376,7 @@ void MatchedFilter::Update(const DownsampledRenderBuffer& render_buffer, [](float a, float b) -> bool { return a * a < b * b; })); // Update the lag estimates for the matched filter. 
- const float kMatchingFilterThreshold = 0.05f; + const float kMatchingFilterThreshold = 0.2f; lag_estimates_[n] = LagEstimate( error_sum_anchor - error_sum, (lag_estimate > 2 && lag_estimate < (filters_[n].size() - 10) && diff --git a/modules/audio_processing/aec3/residual_echo_estimator.cc b/modules/audio_processing/aec3/residual_echo_estimator.cc index 013892ee50..ba65684fad 100644 --- a/modules/audio_processing/aec3/residual_echo_estimator.cc +++ b/modules/audio_processing/aec3/residual_echo_estimator.cc @@ -108,29 +108,56 @@ void ResidualEchoEstimator::Estimate( R2->fill((*std::max_element(R2->begin(), R2->end())) * 100.f); } } else { + const rtc::Optional delay = + aec_state.ExternalDelay() + ? (aec_state.FilterDelay() ? aec_state.FilterDelay() + : aec_state.ExternalDelay()) + : rtc::Optional(); + // Estimate the echo generating signal power. std::array X2; - EchoGeneratingPower(render_buffer, 0, kUnknownDelayRenderWindowSize - 1, - &X2); + if (aec_state.ExternalDelay() && aec_state.FilterDelay()) { + RTC_DCHECK(delay); + const int delay_use = static_cast(*delay); + + // Computes the spectral power over the blocks surrounding the delay. + constexpr int kKnownDelayRenderWindowSize = 5; + // TODO(peah): Add lookahead since that was what was there initially. + static_assert( + kUnknownDelayRenderWindowSize >= kKnownDelayRenderWindowSize, + "Requirement to ensure that the render buffer is overrun"); + EchoGeneratingPower( + render_buffer, std::max(0, delay_use - 1), + std::min(kKnownDelayRenderWindowSize - 1, delay_use + 1), &X2); + } else { + // Computes the spectral power over the latest blocks. + // TODO(peah): Add lookahead since that was what was there initially. + EchoGeneratingPower(render_buffer, 0, kUnknownDelayRenderWindowSize - 1, + &X2); + } // Subtract the stationary noise power to avoid stationary noise causing // excessive echo suppression. 
- if (!(aec_state.SaturatedEcho() || aec_state.SaturatingEchoPath())) { - std::transform( - X2.begin(), X2.end(), X2_noise_floor_.begin(), X2.begin(), - [](float a, float b) { return std::max(0.f, a - 10.f * b); }); - } + std::transform( + X2.begin(), X2.end(), X2_noise_floor_.begin(), X2.begin(), + [](float a, float b) { return std::max(0.f, a - 10.f * b); }); NonLinearEstimate( - aec_state.SufficientFilterUpdates(), - aec_state.SaturatedEcho() && aec_state.SaturatingEchoPath(), + aec_state.SufficientFilterUpdates(), aec_state.SaturatedEcho(), config_.ep_strength.bounded_erl, aec_state.TransparentMode(), aec_state.InitialState(), X2, Y2, R2); + + if (aec_state.ExternalDelay() && aec_state.FilterDelay() && + aec_state.SaturatedEcho()) { + AddEchoReverb(*R2, aec_state.SaturatedEcho(), + std::min(static_cast(kAdaptiveFilterLength), + delay.value_or(kAdaptiveFilterLength)), + aec_state.ReverbDecay(), R2); + } } // If the echo is deemed inaudible, set the residual echo to zero. - if (aec_state.InaudibleEcho() && - (!(aec_state.SaturatedEcho() || aec_state.SaturatingEchoPath()))) { + if (aec_state.InaudibleEcho()) { R2->fill(0.f); R2_old_.fill(0.f); R2_hold_counter_.fill(0.f); @@ -179,7 +206,7 @@ void ResidualEchoEstimator::NonLinearEstimate( // Set echo path gains. if (saturated_echo) { // If the echo could be saturated, use a very conservative gain. - echo_path_gain_lf = echo_path_gain_mf = echo_path_gain_hf = 1000.f; + echo_path_gain_lf = echo_path_gain_mf = echo_path_gain_hf = 10000.f; } else if (sufficient_filter_updates && !bounded_erl) { // If the filter should have been able to converge, and no assumption is // possible on the ERL, use a low gain. 
diff --git a/modules/audio_processing/aec3/subtractor.cc b/modules/audio_processing/aec3/subtractor.cc index 3c99b989a4..8d91bd03d1 100644 --- a/modules/audio_processing/aec3/subtractor.cc +++ b/modules/audio_processing/aec3/subtractor.cc @@ -60,13 +60,11 @@ Subtractor::~Subtractor() = default; void Subtractor::HandleEchoPathChange( const EchoPathVariability& echo_path_variability) { const auto full_reset = [&]() { - use_shadow_filter_frequency_response_ = false; main_filter_.HandleEchoPathChange(); shadow_filter_.HandleEchoPathChange(); G_main_.HandleEchoPathChange(echo_path_variability); G_shadow_.HandleEchoPathChange(); converged_filter_ = false; - converged_filter_counter_ = 0; }; // TODO(peah): Add delay-change specific reset behavior. @@ -107,29 +105,17 @@ void Subtractor::Process(const RenderBuffer& render_buffer, shadow_filter_.Filter(render_buffer, &S); PredictionError(fft_, S, y, &e_shadow, &E_shadow, nullptr); - // Determine which frequency response should be used. - const auto sum_of_squares = [](float a, float b) { return a + b * b; }; - const float e2_main = - std::accumulate(e_main.begin(), e_main.end(), 0.f, sum_of_squares); - const float e2_shadow = - std::accumulate(e_shadow.begin(), e_shadow.end(), 0.f, sum_of_squares); - const float y2 = std::accumulate(y.begin(), y.end(), 0.f, sum_of_squares); - if (e2_main < e2_shadow && e2_main < 0.1 * y2) { - use_shadow_filter_frequency_response_ = false; - } else if (e2_shadow < e2_main && e2_shadow < 0.01 * y2) { - use_shadow_filter_frequency_response_ = true; - } - - // Flag whether the filter has at some point converged. - // TODO(peah): Consider using a timeout for this. 
if (!converged_filter_) { - if (y2 > kBlockSize * 100.f * 100.f) { - if (e2_main < 0.3 * y2) { - converged_filter_ = (++converged_filter_counter_) > 10; - } else { - converged_filter_counter_ = 0; - } + const auto sum_of_squares = [](float a, float b) { return a + b * b; }; + const float e2_main = + std::accumulate(e_main.begin(), e_main.end(), 0.f, sum_of_squares); + const float e2_shadow = + std::accumulate(e_shadow.begin(), e_shadow.end(), 0.f, sum_of_squares); + const float y2 = std::accumulate(y.begin(), y.end(), 0.f, sum_of_squares); + + if (y2 > kBlockSize * 50.f * 50.f) { + converged_filter_ = (e2_main > 0.3 * y2 || e2_shadow > 0.1 * y2); } } diff --git a/modules/audio_processing/aec3/subtractor.h b/modules/audio_processing/aec3/subtractor.h index 11c090f7e8..fe7928ea8a 100644 --- a/modules/audio_processing/aec3/subtractor.h +++ b/modules/audio_processing/aec3/subtractor.h @@ -48,9 +48,6 @@ class Subtractor { // Returns the block-wise frequency response for the main adaptive filter. 
const std::vector>& FilterFrequencyResponse() const { - if (use_shadow_filter_frequency_response_) { - return shadow_filter_.FilterFrequencyResponse(); - } return main_filter_.FilterFrequencyResponse(); } @@ -71,8 +68,6 @@ class Subtractor { MainFilterUpdateGain G_main_; ShadowFilterUpdateGain G_shadow_; bool converged_filter_ = false; - size_t converged_filter_counter_ = 0; - bool use_shadow_filter_frequency_response_ = false; RTC_DISALLOW_IMPLICIT_CONSTRUCTORS(Subtractor); }; diff --git a/modules/audio_processing/aec3/suppression_gain.cc b/modules/audio_processing/aec3/suppression_gain.cc index e420558685..ae4d4ace23 100644 --- a/modules/audio_processing/aec3/suppression_gain.cc +++ b/modules/audio_processing/aec3/suppression_gain.cc @@ -126,14 +126,7 @@ void UpdateMaxGainIncrease( float min_decreasing; auto& param = config.gain_updates; - if (no_saturation_counter <= 10) { - max_increasing = param.saturation.max_inc; - max_decreasing = param.saturation.max_dec; - rate_increasing = param.saturation.rate_inc; - rate_decreasing = param.saturation.rate_dec; - min_increasing = param.saturation.min_inc; - min_decreasing = param.saturation.min_dec; - } else if (!linear_echo_estimate) { + if (linear_echo_estimate) { max_increasing = param.nonlinear.max_inc; max_decreasing = param.nonlinear.max_dec; rate_increasing = param.nonlinear.rate_inc; @@ -147,13 +140,20 @@ void UpdateMaxGainIncrease( rate_decreasing = param.low_noise.rate_dec; min_increasing = param.low_noise.min_inc; min_decreasing = param.low_noise.min_dec; - } else { + } else if (no_saturation_counter > 10) { max_increasing = param.normal.max_inc; max_decreasing = param.normal.max_dec; rate_increasing = param.normal.rate_inc; rate_decreasing = param.normal.rate_dec; min_increasing = param.normal.min_inc; min_decreasing = param.normal.min_dec; + } else { + max_increasing = param.saturation.max_inc; + max_decreasing = param.saturation.max_dec; + rate_increasing = param.saturation.rate_inc; + rate_decreasing = 
param.saturation.rate_dec; + min_increasing = param.saturation.min_inc; + min_decreasing = param.saturation.min_dec; } for (size_t k = 0; k < new_gain.size(); ++k) { @@ -186,15 +186,13 @@ void GainToNoAudibleEcho( const std::array& one_by_echo, std::array* gain) { float nearend_masking_margin = 0.f; - if (saturated_echo) { - nearend_masking_margin = config.gain_mask.m2; + if (linear_echo_estimate) { + nearend_masking_margin = + low_noise_render + ? config.gain_mask.m9 + : (saturated_echo ? config.gain_mask.m2 : config.gain_mask.m3); } else { - if (linear_echo_estimate) { - nearend_masking_margin = - low_noise_render ? config.gain_mask.m9 : config.gain_mask.m3; - } else { - nearend_masking_margin = config.gain_mask.m7; - } + nearend_masking_margin = config.gain_mask.m7; } RTC_DCHECK_LE(0.f, nearend_masking_margin); @@ -202,13 +200,8 @@ void GainToNoAudibleEcho( const float one_by_one_minus_nearend_masking_margin = 1.f / (1.0f - nearend_masking_margin); - float masker_margin; - if (saturated_echo || saturating_echo_path) { - masker_margin = 0.0001f; - } else { - masker_margin = - linear_echo_estimate ? config.gain_mask.m1 : config.gain_mask.m8; - } + const float masker_margin = + linear_echo_estimate ? config.gain_mask.m1 : config.gain_mask.m8; for (size_t k = 0; k < gain->size(); ++k) { const float unity_gain_masker = std::max(nearend[k], masker[k]); @@ -306,7 +299,7 @@ void SuppressionGain::LowerBandGain( const float min_echo_power = low_noise_render ? config_.echo_audibility.low_render_limit : config_.echo_audibility.normal_render_limit; - if (!saturating_echo_path) { + if (no_saturation_counter_ > 10) { for (size_t k = 0; k < nearend.size(); ++k) { const float denom = std::min(nearend[k], echo[k]); min_gain[k] = denom > 0.f ? min_echo_power / denom : 1.f; @@ -319,12 +312,10 @@ void SuppressionGain::LowerBandGain( // Compute the maximum gain by limiting the gain increase from the previous // gain. 
std::array max_gain; - const float first_increase = saturated_echo || saturating_echo_path - ? 0.00001f - : config_.gain_updates.floor_first_increase; for (size_t k = 0; k < gain->size(); ++k) { - max_gain[k] = std::min( - std::max(last_gain_[k] * gain_increase_[k], first_increase), 1.f); + max_gain[k] = std::min(std::max(last_gain_[k] * gain_increase_[k], + config_.gain_updates.floor_first_increase), + 1.f); } // Iteratively compute the gain required to attenuate the echo to a non @@ -333,7 +324,7 @@ void SuppressionGain::LowerBandGain( for (int k = 0; k < 2; ++k) { std::array masker; MaskingPower(config_, nearend, comfort_noise, last_masker_, *gain, &masker); - GainToNoAudibleEcho(config_, low_noise_render, no_saturation_counter_ > 10, + GainToNoAudibleEcho(config_, low_noise_render, saturated_echo, saturating_echo_path, linear_echo_estimate, nearend, echo, masker, min_gain, max_gain, one_by_echo, gain); AdjustForExternalFilters(gain); diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h index a639851d27..c353be9893 100644 --- a/modules/audio_processing/include/audio_processing.h +++ b/modules/audio_processing/include/audio_processing.h @@ -1207,7 +1207,7 @@ struct EchoCanceller3Config { GainChanges low_noise = {3.f, 3.f, 1.5f, 1.5f, 1.5f, 1.5f}; GainChanges normal = {2.f, 2.f, 1.5f, 1.5f, 1.2f, 1.2f}; - GainChanges saturation = {1.5f, 1.5f, 1.2f, 1.2f, 1.1f, 1.1f}; + GainChanges saturation = {1.2f, 1.2f, 1.5f, 1.5f, 1.f, 1.f}; GainChanges nonlinear = {1.5f, 1.5f, 1.2f, 1.2f, 1.1f, 1.1f}; float floor_first_increase = 0.0001f;