From 09fa4b04dd3ca92be4bf2ba2b73f352d8fa07153 Mon Sep 17 00:00:00 2001
From: Ivo Creusen <ivoc@webrtc.org>
Date: Thu, 11 Jan 2018 16:08:54 +0100
Subject: [PATCH] Make the echo detector injectable.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds a generic interface for an echo detector and makes it possible to
inject one into the audio processing module. If no echo detector is injected,
the ResidualEchoDetector is used as before.

Bug: webrtc:8732
Change-Id: I30d97aeb829307b2ae9c4dbeb9a3e15ab7ec0912
Reviewed-on: https://webrtc-review.googlesource.com/38900
Commit-Queue: Ivo Creusen <ivoc@webrtc.org>
Reviewed-by: Per Åhgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#21588}
---
 .../audio_processing/audio_processing_impl.cc | 67 +++++++++++++------
 .../audio_processing/audio_processing_impl.h  |  1 +
 .../include/audio_processing.h                | 33 +++++++++
 .../residual_echo_detector.cc                 | 14 ++--
 .../audio_processing/residual_echo_detector.h | 21 ++----
 .../residual_echo_detector_unittest.cc        | 12 ++--
 6 files changed, 104 insertions(+), 44 deletions(-)
diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc
index 4629ebfbc2..accd03b255 100644
--- a/modules/audio_processing/audio_processing_impl.cc
+++ b/modules/audio_processing/audio_processing_impl.cc
@@ -302,8 +302,10 @@ struct AudioProcessingImpl::ApmPublicSubmodules {
 struct AudioProcessingImpl::ApmPrivateSubmodules {
   ApmPrivateSubmodules(NonlinearBeamformer* beamformer,
                        std::unique_ptr<CustomProcessing> capture_post_processor,
-                       std::unique_ptr<CustomProcessing> render_pre_processor)
+                       std::unique_ptr<CustomProcessing> render_pre_processor,
+                       std::unique_ptr<EchoDetector> echo_detector)
       : beamformer(beamformer),
+        echo_detector(std::move(echo_detector)),
         capture_post_processor(std::move(capture_post_processor)),
         render_pre_processor(std::move(render_pre_processor)) {}
   // Accessed internally from capture or during initialization
@@ -312,7 +314,7 @@ struct AudioProcessingImpl::ApmPrivateSubmodules {
   std::unique_ptr<GainController2> gain_controller2;
   std::unique_ptr<LowCutFilter> low_cut_filter;
   std::unique_ptr<LevelController> level_controller;
-  std::unique_ptr<ResidualEchoDetector> residual_echo_detector;
+  std::unique_ptr<EchoDetector> echo_detector;
   std::unique_ptr<EchoControl> echo_controller;
   std::unique_ptr<CustomProcessing> capture_post_processor;
   std::unique_ptr<CustomProcessing> render_pre_processor;
@@ -345,16 +347,27 @@ AudioProcessingBuilder& AudioProcessingBuilder::SetNonlinearBeamformer(
   return *this;
 }
 
+AudioProcessingBuilder& AudioProcessingBuilder::SetEchoDetector(
+    std::unique_ptr<EchoDetector> echo_detector) {
+  echo_detector_ = std::move(echo_detector);
+  return *this;
+}
+
 AudioProcessing* AudioProcessingBuilder::Create() {
   webrtc::Config config;
   return Create(config);
 }
 
 AudioProcessing* AudioProcessingBuilder::Create(const webrtc::Config& config) {
-  return AudioProcessing::Create(config, std::move(capture_post_processing_),
-                                 std::move(render_pre_processing_),
-                                 std::move(echo_control_factory_),
-                                 nonlinear_beamformer_.release());
+  AudioProcessingImpl* apm = new rtc::RefCountedObject<AudioProcessingImpl>(
+      config, std::move(capture_post_processing_),
+      std::move(render_pre_processing_), std::move(echo_control_factory_),
+      std::move(echo_detector_), nonlinear_beamformer_.release());
+  if (apm->Initialize() != AudioProcessing::kNoError) {
+    delete apm;
+    apm = nullptr;
+  }
+  return apm;
 }
 
 AudioProcessing* AudioProcessing::Create() {
@@ -388,7 +401,7 @@ AudioProcessing* AudioProcessing::Create(
     NonlinearBeamformer* beamformer) {
   AudioProcessingImpl* apm = new rtc::RefCountedObject<AudioProcessingImpl>(
       config, std::move(capture_post_processor),
-      std::move(render_pre_processor), std::move(echo_control_factory),
+      std::move(render_pre_processor), std::move(echo_control_factory), nullptr,
       beamformer);
   if (apm->Initialize() != kNoError) {
     delete apm;
@@ -399,13 +412,15 @@ AudioProcessing* AudioProcessing::Create(
 }
 
 AudioProcessingImpl::AudioProcessingImpl(const webrtc::Config& config)
-    : AudioProcessingImpl(config, nullptr, nullptr, nullptr, nullptr) {}
+    : AudioProcessingImpl(config, nullptr, nullptr, nullptr, nullptr, nullptr) {
+}
 
 AudioProcessingImpl::AudioProcessingImpl(
     const webrtc::Config& config,
     std::unique_ptr<CustomProcessing> capture_post_processor,
     std::unique_ptr<CustomProcessing> render_pre_processor,
     std::unique_ptr<EchoControlFactory> echo_control_factory,
+    std::unique_ptr<EchoDetector> echo_detector,
     NonlinearBeamformer* beamformer)
     : high_pass_filter_impl_(new HighPassFilterImpl(this)),
       echo_control_factory_(std::move(echo_control_factory)),
@@ -414,7 +429,8 @@ AudioProcessingImpl::AudioProcessingImpl(
       private_submodules_(
           new ApmPrivateSubmodules(beamformer,
                                    std::move(capture_post_processor),
-                                   std::move(render_pre_processor))),
+                                   std::move(render_pre_processor),
+                                   std::move(echo_detector))),
       constants_(config.Get<ExperimentalAgc>().startup_min_volume,
                  config.Get<ExperimentalAgc>().clipped_level_min,
 #if defined(WEBRTC_ANDROID) || defined(WEBRTC_IOS)
@@ -454,8 +470,11 @@ AudioProcessingImpl::AudioProcessingImpl(
     public_submodules_->gain_control_for_experimental_agc.reset(
         new GainControlForExperimentalAgc(
             public_submodules_->gain_control.get(), &crit_capture_));
-    private_submodules_->residual_echo_detector.reset(
-        new ResidualEchoDetector());
+
+    // If no echo detector is injected, use the ResidualEchoDetector.
+    if (!private_submodules_->echo_detector) {
+      private_submodules_->echo_detector.reset(new ResidualEchoDetector());
+    }
 
     // TODO(peah): Move this creation to happen only when the level controller
     // is enabled.
@@ -1121,7 +1140,8 @@ void AudioProcessingImpl::EmptyQueuedRenderAudio() {
   }
 
   while (red_render_signal_queue_->Remove(&red_capture_queue_buffer_)) {
-    private_submodules_->residual_echo_detector->AnalyzeRenderAudio(
+    RTC_DCHECK(private_submodules_->echo_detector);
+    private_submodules_->echo_detector->AnalyzeRenderAudio(
         red_capture_queue_buffer_);
   }
 }
@@ -1337,7 +1357,8 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
   }
 
   if (config_.residual_echo_detector.enabled) {
-    private_submodules_->residual_echo_detector->AnalyzeCaptureAudio(
+    RTC_DCHECK(private_submodules_->echo_detector);
+    private_submodules_->echo_detector->AnalyzeCaptureAudio(
         rtc::ArrayView<const float>(capture_buffer->channels_f()[0],
                                     capture_buffer->num_frames()));
   }
@@ -1664,11 +1685,11 @@ AudioProcessing::AudioProcessingStatistics AudioProcessingImpl::GetStatistics()
   }
   {
     rtc::CritScope cs_capture(&crit_capture_);
-    stats.residual_echo_likelihood =
-        private_submodules_->residual_echo_detector->echo_likelihood();
+    RTC_DCHECK(private_submodules_->echo_detector);
+    auto ed_metrics = private_submodules_->echo_detector->GetMetrics();
+    stats.residual_echo_likelihood = ed_metrics.echo_likelihood;
     stats.residual_echo_likelihood_recent_max =
-        private_submodules_->residual_echo_detector
-            ->echo_likelihood_recent_max();
+        ed_metrics.echo_likelihood_recent_max;
   }
   public_submodules_->echo_cancellation->GetDelayMetrics(
       &stats.delay_median, &stats.delay_standard_deviation,
@@ -1705,11 +1726,11 @@ AudioProcessingStats AudioProcessingImpl::GetStatistics(
     }
     if (config_.residual_echo_detector.enabled) {
       rtc::CritScope cs_capture(&crit_capture_);
-      stats.residual_echo_likelihood = rtc::Optional<double>(
-          private_submodules_->residual_echo_detector->echo_likelihood());
+      RTC_DCHECK(private_submodules_->echo_detector);
+      auto ed_metrics = private_submodules_->echo_detector->GetMetrics();
+      stats.residual_echo_likelihood = ed_metrics.echo_likelihood;
       stats.residual_echo_likelihood_recent_max =
-          rtc::Optional<double>(private_submodules_->residual_echo_detector
-                                    ->echo_likelihood_recent_max());
+          ed_metrics.echo_likelihood_recent_max;
     }
     int delay_median, delay_std;
     float fraction_poor_delays;
@@ -1854,7 +1875,9 @@ void AudioProcessingImpl::InitializeLevelController() {
 }
 
 void AudioProcessingImpl::InitializeResidualEchoDetector() {
-  private_submodules_->residual_echo_detector->Initialize();
+  RTC_DCHECK(private_submodules_->echo_detector);
+  private_submodules_->echo_detector->Initialize(proc_sample_rate_hz(),
+                                                 num_proc_channels());
 }
 
 void AudioProcessingImpl::InitializePostProcessor() {
diff --git a/modules/audio_processing/audio_processing_impl.h b/modules/audio_processing/audio_processing_impl.h
index c05d23838a..8ece029723 100644
--- a/modules/audio_processing/audio_processing_impl.h
+++ b/modules/audio_processing/audio_processing_impl.h
@@ -45,6 +45,7 @@ class AudioProcessingImpl : public AudioProcessing {
                       std::unique_ptr<CustomProcessing> capture_post_processor,
                       std::unique_ptr<CustomProcessing> render_pre_processor,
                       std::unique_ptr<EchoControlFactory> echo_control_factory,
+                      std::unique_ptr<EchoDetector> echo_detector,
                       NonlinearBeamformer* beamformer);
   ~AudioProcessingImpl() override;
   int Initialize() override;
diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h
index 60bf0c7ec6..8951b8cda8 100644
--- a/modules/audio_processing/include/audio_processing.h
+++ b/modules/audio_processing/include/audio_processing.h
@@ -49,6 +49,7 @@ class ProcessingConfig;
 class EchoCancellation;
 class EchoControlMobile;
 class EchoControlFactory;
+class EchoDetector;
 class GainControl;
 class HighPassFilter;
 class LevelEstimator;
@@ -665,6 +666,9 @@ class AudioProcessingBuilder {
   // The AudioProcessingBuilder takes ownership of the nonlinear beamformer.
   AudioProcessingBuilder& SetNonlinearBeamformer(
       std::unique_ptr<NonlinearBeamformer> nonlinear_beamformer);
+  // The AudioProcessingBuilder takes ownership of the echo_detector.
+  AudioProcessingBuilder& SetEchoDetector(
+      std::unique_ptr<EchoDetector> echo_detector);
   // This creates an APM instance using the previously set components. Calling
   // the Create function resets the AudioProcessingBuilder to its initial state.
   AudioProcessing* Create();
@@ -675,6 +679,7 @@ class AudioProcessingBuilder {
   std::unique_ptr<CustomProcessing> capture_post_processing_;
   std::unique_ptr<CustomProcessing> render_pre_processing_;
   std::unique_ptr<NonlinearBeamformer> nonlinear_beamformer_;
+  std::unique_ptr<EchoDetector> echo_detector_;
   RTC_DISALLOW_COPY_AND_ASSIGN(AudioProcessingBuilder);
 };
 
@@ -1147,6 +1152,34 @@ class CustomProcessing {
   virtual ~CustomProcessing() {}
 };
 
+// Interface for an echo detector submodule.
+class EchoDetector {
+ public:
+  // (Re-)Initializes the submodule.
+  virtual void Initialize(int sample_rate_hz, int num_channels) = 0;
+
+  // Analyzes the render signal without modifying it.
+  virtual void AnalyzeRenderAudio(rtc::ArrayView<const float> render_audio) = 0;
+
+  // Analyzes the capture signal without modifying it.
+  virtual void AnalyzeCaptureAudio(
+      rtc::ArrayView<const float> capture_audio) = 0;
+
+  // Packs the first channel of an AudioBuffer into a vector<float>.
+  static void PackRenderAudioBuffer(AudioBuffer* audio,
+                                    std::vector<float>* packed_buffer);
+
+  struct Metrics {
+    double echo_likelihood;
+    double echo_likelihood_recent_max;
+  };
+
+  // Collect current metrics from the echo detector.
+  virtual Metrics GetMetrics() const = 0;
+
+  virtual ~EchoDetector() {}
+};
+
 // The voice activity detection (VAD) component analyzes the stream to
 // determine if voice is present. A facility is also provided to pass in an
 // external VAD decision.
diff --git a/modules/audio_processing/residual_echo_detector.cc b/modules/audio_processing/residual_echo_detector.cc
index b35c1558c7..ef325a032b 100644
--- a/modules/audio_processing/residual_echo_detector.cc
+++ b/modules/audio_processing/residual_echo_detector.cc
@@ -177,7 +177,8 @@ void ResidualEchoDetector::AnalyzeCaptureAudio(
                               : 0;
 }
 
-void ResidualEchoDetector::Initialize() {
+void ResidualEchoDetector::Initialize(int /*sample_rate_hz*/,
+                                      int /*num_channels*/) {
   render_buffer_.Clear();
   std::fill(render_power_.begin(), render_power_.end(), 0.f);
   std::fill(render_power_mean_.begin(), render_power_mean_.end(), 0.f);
@@ -193,12 +194,17 @@ void ResidualEchoDetector::Initialize() {
   reliability_ = 0.f;
 }
 
-void ResidualEchoDetector::PackRenderAudioBuffer(
-    AudioBuffer* audio,
-    std::vector<float>* packed_buffer) {
+void EchoDetector::PackRenderAudioBuffer(AudioBuffer* audio,
+                                         std::vector<float>* packed_buffer) {
   packed_buffer->clear();
   packed_buffer->insert(packed_buffer->end(), audio->channels_f()[0],
                         audio->channels_f()[0] + audio->num_frames());
 }
 
+EchoDetector::Metrics ResidualEchoDetector::GetMetrics() const {
+  EchoDetector::Metrics metrics;
+  metrics.echo_likelihood = echo_likelihood_;
+  metrics.echo_likelihood_recent_max = recent_likelihood_max_.max();
+  return metrics;
+}
 }  // namespace webrtc
diff --git a/modules/audio_processing/residual_echo_detector.h b/modules/audio_processing/residual_echo_detector.h
index de1b989110..e8ae552d6c 100644
--- a/modules/audio_processing/residual_echo_detector.h
+++ b/modules/audio_processing/residual_echo_detector.h
@@ -18,39 +18,32 @@
 #include "modules/audio_processing/echo_detector/mean_variance_estimator.h"
 #include "modules/audio_processing/echo_detector/moving_max.h"
 #include "modules/audio_processing/echo_detector/normalized_covariance_estimator.h"
+#include "modules/audio_processing/include/audio_processing.h"
 
 namespace webrtc {
 
 class ApmDataDumper;
 class AudioBuffer;
-class EchoDetector;
 
-class ResidualEchoDetector {
+class ResidualEchoDetector : public EchoDetector {
  public:
   ResidualEchoDetector();
-  ~ResidualEchoDetector();
+  ~ResidualEchoDetector() override;
 
   // This function should be called while holding the render lock.
-  void AnalyzeRenderAudio(rtc::ArrayView<const float> render_audio);
+  void AnalyzeRenderAudio(rtc::ArrayView<const float> render_audio) override;
 
   // This function should be called while holding the capture lock.
-  void AnalyzeCaptureAudio(rtc::ArrayView<const float> capture_audio);
+  void AnalyzeCaptureAudio(rtc::ArrayView<const float> capture_audio) override;
 
   // This function should be called while holding the capture lock.
-  void Initialize();
+  void Initialize(int sample_rate_hz, int num_channels) override;
 
   // This function is for testing purposes only.
   void SetReliabilityForTest(float value) { reliability_ = value; }
 
-  static void PackRenderAudioBuffer(AudioBuffer* audio,
-                                    std::vector<float>* packed_buffer);
-
   // This function should be called while holding the capture lock.
-  float echo_likelihood() const { return echo_likelihood_; }
-
-  float echo_likelihood_recent_max() const {
-    return recent_likelihood_max_.max();
-  }
+  EchoDetector::Metrics GetMetrics() const override;
 
  private:
   static int instance_count_;
diff --git a/modules/audio_processing/residual_echo_detector_unittest.cc b/modules/audio_processing/residual_echo_detector_unittest.cc
index baf83ba4aa..7bfa0d2eec 100644
--- a/modules/audio_processing/residual_echo_detector_unittest.cc
+++ b/modules/audio_processing/residual_echo_detector_unittest.cc
@@ -37,7 +37,8 @@ TEST(ResidualEchoDetectorTests, Echo) {
     }
   }
   // We expect to detect echo with near certain likelihood.
-  EXPECT_NEAR(1.f, echo_detector.echo_likelihood(), 0.01f);
+  auto ed_metrics = echo_detector.GetMetrics();
+  EXPECT_NEAR(1.f, ed_metrics.echo_likelihood, 0.01f);
 }
 
 TEST(ResidualEchoDetectorTests, NoEcho) {
@@ -57,7 +58,8 @@ TEST(ResidualEchoDetectorTests, NoEcho) {
     echo_detector.AnalyzeCaptureAudio(zeros);
   }
   // We expect to not detect any echo.
-  EXPECT_NEAR(0.f, echo_detector.echo_likelihood(), 0.01f);
+  auto ed_metrics = echo_detector.GetMetrics();
+  EXPECT_NEAR(0.f, ed_metrics.echo_likelihood, 0.01f);
 }
 
 TEST(ResidualEchoDetectorTests, EchoWithRenderClockDrift) {
@@ -92,7 +94,8 @@ TEST(ResidualEchoDetectorTests, EchoWithRenderClockDrift) {
   // A growing buffer can be caused by jitter or clock drift and it's not
   // possible to make this decision right away. For this reason we only expect
   // an echo likelihood of 75% in this test.
-  EXPECT_GT(echo_detector.echo_likelihood(), 0.75f);
+  auto ed_metrics = echo_detector.GetMetrics();
+  EXPECT_GT(ed_metrics.echo_likelihood, 0.75f);
 }
 
 TEST(ResidualEchoDetectorTests, EchoWithCaptureClockDrift) {
@@ -122,7 +125,8 @@ TEST(ResidualEchoDetectorTests, EchoWithCaptureClockDrift) {
     }
   }
   // We expect to detect echo with near certain likelihood.
-  EXPECT_NEAR(1.f, echo_detector.echo_likelihood(), 0.01f);
+  auto ed_metrics = echo_detector.GetMetrics();
+  EXPECT_NEAR(1.f, ed_metrics.echo_likelihood, 0.01f);
 }
 
 }  // namespace webrtc