Echo canceller 3 improvements for setups with headsets.

This CL improves the echo cancellation performance on setups where headsets are used (systems with such low echo path gain that no correlation between the render and capture signals can be found) in 4 ways:
1) The echo path gain for systems with headsets is assumed to be nonzero.
2) The stationary component of the render power is not included in the nonlinear echo power estimate.
3) The behavior after echo path gain changes is made less cautious.
4) The detection of systems with headsets is made more rapid.

BUG=chromium:712651, webrtc:6018
Review-Url: https://codereview.webrtc.org/2823903003
Cr-Commit-Position: refs/heads/master@{#17768}
2017-04-19 09:03:40 -07:00 · 2017-04-19 09:03:40 -07:00 · e52a203a56
commit e52a203a56
parent d5c77abbaa
4 changed files with 78 additions and 27 deletions
--- a/webrtc/modules/audio_processing/aec3/aec_state.cc
+++ b/webrtc/modules/audio_processing/aec3/aec_state.cc
@ -22,7 +22,7 @@
 namespace webrtc {
 namespace {

-constexpr size_t kEchoPathChangeConvergenceBlocks = 4 * kNumBlocksPerSecond;
+constexpr size_t kEchoPathChangeConvergenceBlocks = 2 * kNumBlocksPerSecond;
 constexpr size_t kSaturationLeakageBlocks = 20;

 // Computes delay of the adaptive filter.
@ -89,7 +89,6 @@ void AecState::HandleEchoPathChange(
    const EchoPathVariability& echo_path_variability) {
  if (echo_path_variability.AudioPathChanged()) {
    blocks_since_last_saturation_ = 0;
-    active_render_blocks_ = 0;
    usable_linear_estimate_ = false;
    echo_leakage_detected_ = false;
    capture_signal_saturation_ = false;
@ -98,6 +97,8 @@ void AecState::HandleEchoPathChange(

    if (echo_path_variability.delay_change) {
      force_zero_gain_counter_ = 0;
+      blocks_with_filter_adaptation_ = 0;
+      render_received_ = false;
      force_zero_gain_ = true;
      echo_path_change_counter_ = kEchoPathChangeCounterMax;
    }
@ -121,7 +122,11 @@ void AecState::Update(const std::vector<std::array<float, kFftLengthBy2Plus1>>&
  // Update counters.
  const float x_energy = std::inner_product(x.begin(), x.end(), x.begin(), 0.f);
  const bool active_render_block = x_energy > 10000.f * kFftLengthBy2;
-  active_render_blocks_ += active_render_block ? 1 : 0;
+  if (active_render_block) {
+    render_received_ = true;
+  }
+  blocks_with_filter_adaptation_ +=
+      (active_render_block && (!SaturatedCapture()) ? 1 : 0);
  --echo_path_change_counter_;

  // Force zero echo suppression gain after an echo path change to allow at
@ -145,6 +150,8 @@ void AecState::Update(const std::vector<std::array<float, kFftLengthBy2Plus1>>&
  }

  // Detect and flag echo saturation.
+  // TODO(peah): Add the delay in this computation to ensure that the render and
+  // capture signals are properly aligned.
  RTC_DCHECK_LT(0, x.size());
  const float max_sample = fabs(*std::max_element(
      x.begin(), x.end(), [](float a, float b) { return a * a < b * b; }));
@ -160,14 +167,17 @@ void AecState::Update(const std::vector<std::array<float, kFftLengthBy2Plus1>>&
  // Flag whether the linear filter estimate is usable.
  usable_linear_estimate_ =
      (!echo_saturation_) &&
-      active_render_blocks_ > kEchoPathChangeConvergenceBlocks &&
+      (!render_received_ ||
+       blocks_with_filter_adaptation_ > kEchoPathChangeConvergenceBlocks) &&
      filter_delay_ && echo_path_change_counter_ <= 0;

  // After an amount of active render samples for which an echo should have been
  // detected in the capture signal if the ERL was not infinite, flag that a
  // headset is used.
-  headset_detected_ = !external_delay_ && !filter_delay_ &&
-                      active_render_blocks_ >= kEchoPathChangeConvergenceBlocks;
+  headset_detected_ =
+      !external_delay_ && !filter_delay_ &&
+      (!render_received_ ||
+       blocks_with_filter_adaptation_ >= kEchoPathChangeConvergenceBlocks);
 }

 }  // namespace webrtc
--- a/webrtc/modules/audio_processing/aec3/aec_state.h
+++ b/webrtc/modules/audio_processing/aec3/aec_state.h
@ -41,7 +41,8 @@ class AecState {
  bool EchoLeakageDetected() const { return echo_leakage_detected_; }

  // Returns whether the render signal is currently active.
-  bool ActiveRender() const { return active_render_blocks_ > 200; }
+  // TODO(peah): Deprecate this in an upcoming CL.
+  bool ActiveRender() const { return blocks_with_filter_adaptation_ > 200; }

  // Returns the ERLE.
  const std::array<float, kFftLengthBy2Plus1>& Erle() const {
@ -99,7 +100,7 @@ class AecState {
  ErlEstimator erl_estimator_;
  ErleEstimator erle_estimator_;
  int echo_path_change_counter_;
-  size_t active_render_blocks_ = 0;
+  size_t blocks_with_filter_adaptation_ = 0;
  bool usable_linear_estimate_ = false;
  bool echo_leakage_detected_ = false;
  bool capture_signal_saturation_ = false;
@ -107,6 +108,7 @@ class AecState {
  bool headset_detected_ = false;
  float previous_max_sample_ = 0.f;
  bool force_zero_gain_ = false;
+  bool render_received_ = false;
  size_t force_zero_gain_counter_ = 0;
  rtc::Optional<size_t> filter_delay_;
  rtc::Optional<size_t> external_delay_;
--- a/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc
+++ b/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc
@ -40,6 +40,43 @@ void EchoGeneratingPower(const RenderBuffer& render_buffer,
  });
 }

+constexpr int kNoiseFloorCounterMax = 50;
+constexpr float kNoiseFloorMin = 10.f * 10.f * 128.f * 128.f;
+
+// Updates estimate for the power of the stationary noise component in the
+// render signal.
+void RenderNoisePower(
+    const RenderBuffer& render_buffer,
+    std::array<float, kFftLengthBy2Plus1>* X2_noise_floor,
+    std::array<int, kFftLengthBy2Plus1>* X2_noise_floor_counter) {
+  RTC_DCHECK(X2_noise_floor);
+  RTC_DCHECK(X2_noise_floor_counter);
+
+  const auto render_power = render_buffer.Spectrum(0);
+  RTC_DCHECK_EQ(X2_noise_floor->size(), render_power.size());
+  RTC_DCHECK_EQ(X2_noise_floor_counter->size(), render_power.size());
+
+  // Estimate the stationary noise power in a minimum statistics manner.
+  for (size_t k = 0; k < render_power.size(); ++k) {
+    // Decrease rapidly.
+    if (render_power[k] < (*X2_noise_floor)[k]) {
+      (*X2_noise_floor)[k] = render_power[k];
+      (*X2_noise_floor_counter)[k] = 0;
+    } else {
+      // Increase in a delayed, leaky manner.
+      if ((*X2_noise_floor_counter)[k] >= kNoiseFloorCounterMax) {
+        (*X2_noise_floor)[k] =
+            std::max((*X2_noise_floor)[k] * 1.1f, kNoiseFloorMin);
+      } else {
+        ++(*X2_noise_floor_counter)[k];
+      }
+    }
+  }
+}
+
+// Assume a minimum echo path gain of -33 dB for headsets.
+constexpr float kHeadsetEchoPathGain = 0.0005f;
+
 }  // namespace

 ResidualEchoEstimator::ResidualEchoEstimator() {
@ -57,28 +94,19 @@ void ResidualEchoEstimator::Estimate(
    std::array<float, kFftLengthBy2Plus1>* R2) {
  RTC_DCHECK(R2);

-  // Return zero residual echo power when a headset is detected.
-  if (aec_state.HeadsetDetected()) {
-    if (!headset_detected_cached_) {
-      Reset();
-      headset_detected_cached_ = true;
-    }
-    R2->fill(0.f);
-    return;
-  } else {
-    headset_detected_cached_ = false;
-  }
-
  const rtc::Optional<size_t> delay =
      aec_state.FilterDelay()
          ? aec_state.FilterDelay()
          : (aec_state.ExternalDelay() ? aec_state.ExternalDelay()
                                       : rtc::Optional<size_t>());

+  // Estimate the power of the stationary noise in the render signal.
+  RenderNoisePower(render_buffer, &X2_noise_floor_, &X2_noise_floor_counter_);
+
  // Estimate the residual echo power.
  const bool use_linear_echo_power =
      aec_state.UsableLinearEstimate() && using_subtractor_output;
-  if (use_linear_echo_power) {
+  if (use_linear_echo_power && !aec_state.HeadsetDetected()) {
    RTC_DCHECK(aec_state.FilterDelay());
    const int filter_delay = *aec_state.FilterDelay();
    LinearEstimate(S2_linear, aec_state.Erle(), filter_delay, R2);
@ -102,7 +130,15 @@ void ResidualEchoEstimator::Estimate(
                          kResidualEchoPowerRenderWindowSize - 1, &X2);
    }

-    NonLinearEstimate(X2, Y2, R2);
+    // Subtract the stationary noise power to avoid stationary noise causing
+    // excessive echo suppression.
+    std::transform(
+        X2.begin(), X2.end(), X2_noise_floor_.begin(), X2.begin(),
+        [](float a, float b) { return std::max(0.f, a - 10.f * b); });
+
+    NonLinearEstimate(
+        aec_state.HeadsetDetected() ? kHeadsetEchoPathGain : kFixedEchoPathGain,
+        X2, Y2, R2);
    AddEchoReverb(*R2, aec_state.SaturatedEcho(),
                  std::min(static_cast<size_t>(kAdaptiveFilterLength),
                           delay.value_or(kAdaptiveFilterLength)),
@ -119,6 +155,8 @@ void ResidualEchoEstimator::Estimate(
 }

 void ResidualEchoEstimator::Reset() {
+  X2_noise_floor_counter_.fill(kNoiseFloorCounterMax);
+  X2_noise_floor_.fill(kNoiseFloorMin);
  R2_reverb_.fill(0.f);
  R2_old_.fill(0.f);
  R2_hold_counter_.fill(0.f);
@ -141,14 +179,13 @@ void ResidualEchoEstimator::LinearEstimate(
 }

 void ResidualEchoEstimator::NonLinearEstimate(
+    float echo_path_gain,
    const std::array<float, kFftLengthBy2Plus1>& X2,
    const std::array<float, kFftLengthBy2Plus1>& Y2,
    std::array<float, kFftLengthBy2Plus1>* R2) {
  // Compute preliminary residual echo.
-  // TODO(peah): Try to make this adaptive. Currently the gain is hardcoded to
-  // 20 dB.
  std::transform(X2.begin(), X2.end(), R2->begin(),
-                 [](float a) { return a * kFixedEchoPathGain; });
+                 [echo_path_gain](float a) { return a * echo_path_gain; });

  for (size_t k = 0; k < R2->size(); ++k) {
    // Update hold counter.
--- a/webrtc/modules/audio_processing/aec3/residual_echo_estimator.h
+++ b/webrtc/modules/audio_processing/aec3/residual_echo_estimator.h
@ -48,7 +48,8 @@ class ResidualEchoEstimator {

  // Estimates the residual echo power based on the estimate of the echo path
  // gain.
-  void NonLinearEstimate(const std::array<float, kFftLengthBy2Plus1>& X2,
+  void NonLinearEstimate(float echo_path_gain,
+                         const std::array<float, kFftLengthBy2Plus1>& X2,
                         const std::array<float, kFftLengthBy2Plus1>& Y2,
                         std::array<float, kFftLengthBy2Plus1>* R2);

@ -66,7 +67,8 @@ class ResidualEchoEstimator {
  int S2_old_index_ = 0;
  std::array<std::array<float, kFftLengthBy2Plus1>, kAdaptiveFilterLength>
      S2_old_;
-  bool headset_detected_cached_ = false;
+  std::array<float, kFftLengthBy2Plus1> X2_noise_floor_;
+  std::array<int, kFftLengthBy2Plus1> X2_noise_floor_counter_;

  RTC_DISALLOW_COPY_AND_ASSIGN(ResidualEchoEstimator);
 };