From 45daf7b26f49793c30e395f7ba7be30aa51936bb Mon Sep 17 00:00:00 2001
From: aluebs <aluebs@webrtc.org>
Date: Fri, 16 Oct 2015 17:04:09 -0700
Subject: [PATCH] Implement new version of the NonlinearBeamformer

Sounds better according to a MUSHRA listening test.
The computational complexity is unaffected.
An empirically estimated gain was added to compensate for the attenuation introduced by the algorithm.
There are some TODOs, which I will address in follow up CLs.

It was tested in Hangouts without headphones and highest volume, to make sure it doesn't affect the AEC.

Review URL: https://codereview.webrtc.org/1378973003

Cr-Commit-Position: refs/heads/master@{#10308}
---
 .../beamformer/covariance_matrix_generator.cc |  16 +-
 .../covariance_matrix_generator_unittest.cc   |  42 +++---
 .../beamformer/nonlinear_beamformer.cc        | 139 +++++++++---------
 .../beamformer/nonlinear_beamformer.h         |  31 ++--
 4 files changed, 122 insertions(+), 106 deletions(-)
diff --git a/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.cc b/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.cc
index efc5b0f71a..d0728325fc 100644
--- a/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.cc
+++ b/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.cc
@@ -14,6 +14,7 @@
 
 #include <cmath>
 
+namespace webrtc {
 namespace {
 
 float BesselJ0(float x) {
@@ -24,9 +25,19 @@ float BesselJ0(float x) {
 #endif
 }
 
-}  // namespace
+// Calculates the Euclidean norm for a row vector.
+float Norm(const ComplexMatrix<float>& x) {
+  RTC_CHECK_EQ(1, x.num_rows());
+  const size_t length = x.num_columns();
+  const complex<float>* elems = x.elements()[0];
+  float result = 0.f;
+  for (size_t i = 0u; i < length; ++i) {
+    result += std::norm(elems[i]);
+  }
+  return std::sqrt(result);
+}
 
-namespace webrtc {
+}  // namespace
 
 void CovarianceMatrixGenerator::UniformCovarianceMatrix(
     float wave_number,
@@ -69,6 +80,7 @@ void CovarianceMatrixGenerator::AngledCovarianceMatrix(
                       geometry,
                       angle,
                       &interf_cov_vector);
+  interf_cov_vector.Scale(1.f / Norm(interf_cov_vector));
   interf_cov_vector_transposed.Transpose(interf_cov_vector);
   interf_cov_vector.PointwiseConjugate();
   mat->Multiply(interf_cov_vector_transposed, interf_cov_vector);
diff --git a/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator_unittest.cc b/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator_unittest.cc
index 4ea341d84c..23d672fb4e 100644
--- a/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator_unittest.cc
+++ b/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator_unittest.cc
@@ -165,14 +165,14 @@ TEST(CovarianceMatrixGeneratorTest, TestAngledCovarianceMatrix2Mics) {
 
   complex<float>* const* actual_els = actual_covariance_matrix.elements();
 
-  EXPECT_NEAR(actual_els[0][0].real(), 1.f, kTolerance);
-  EXPECT_NEAR(actual_els[0][1].real(), 0.9952f, kTolerance);
-  EXPECT_NEAR(actual_els[1][0].real(), 0.9952f, kTolerance);
-  EXPECT_NEAR(actual_els[1][1].real(), 1.f, kTolerance);
+  EXPECT_NEAR(actual_els[0][0].real(), 0.5f, kTolerance);
+  EXPECT_NEAR(actual_els[0][1].real(), 0.4976f, kTolerance);
+  EXPECT_NEAR(actual_els[1][0].real(), 0.4976f, kTolerance);
+  EXPECT_NEAR(actual_els[1][1].real(), 0.5f, kTolerance);
 
   EXPECT_NEAR(actual_els[0][0].imag(), 0.f, kTolerance);
-  EXPECT_NEAR(actual_els[0][1].imag(), 0.0978f, kTolerance);
-  EXPECT_NEAR(actual_els[1][0].imag(), -0.0978f, kTolerance);
+  EXPECT_NEAR(actual_els[0][1].imag(), 0.0489f, kTolerance);
+  EXPECT_NEAR(actual_els[1][0].imag(), -0.0489f, kTolerance);
   EXPECT_NEAR(actual_els[1][1].imag(), 0.f, kTolerance);
 }
 
@@ -203,24 +203,24 @@ TEST(CovarianceMatrixGeneratorTest, TestAngledCovarianceMatrix3Mics) {
 
   complex<float>* const* actual_els = actual_covariance_matrix.elements();
 
-  EXPECT_NEAR(actual_els[0][0].real(), 1.f, kTolerance);
-  EXPECT_NEAR(actual_els[0][1].real(), 0.8859f, kTolerance);
-  EXPECT_NEAR(actual_els[0][2].real(), 0.5696f, kTolerance);
-  EXPECT_NEAR(actual_els[1][0].real(), 0.8859f, kTolerance);
-  EXPECT_NEAR(actual_els[1][1].real(), 1.f, kTolerance);
-  EXPECT_NEAR(actual_els[1][2].real(), 0.8859f, kTolerance);
-  EXPECT_NEAR(actual_els[2][0].real(), 0.5696f, kTolerance);
-  EXPECT_NEAR(actual_els[2][1].real(), 0.8859f, kTolerance);
-  EXPECT_NEAR(actual_els[2][2].real(), 1.f, kTolerance);
+  EXPECT_NEAR(actual_els[0][0].real(), 0.3333f, kTolerance);
+  EXPECT_NEAR(actual_els[0][1].real(), 0.2953f, kTolerance);
+  EXPECT_NEAR(actual_els[0][2].real(), 0.1899f, kTolerance);
+  EXPECT_NEAR(actual_els[1][0].real(), 0.2953f, kTolerance);
+  EXPECT_NEAR(actual_els[1][1].real(), 0.3333f, kTolerance);
+  EXPECT_NEAR(actual_els[1][2].real(), 0.2953f, kTolerance);
+  EXPECT_NEAR(actual_els[2][0].real(), 0.1899f, kTolerance);
+  EXPECT_NEAR(actual_els[2][1].real(), 0.2953f, kTolerance);
+  EXPECT_NEAR(actual_els[2][2].real(), 0.3333f, kTolerance);
 
   EXPECT_NEAR(actual_els[0][0].imag(), 0.f, kTolerance);
-  EXPECT_NEAR(actual_els[0][1].imag(), 0.4639f, kTolerance);
-  EXPECT_NEAR(actual_els[0][2].imag(), 0.8219f, kTolerance);
-  EXPECT_NEAR(actual_els[1][0].imag(), -0.4639f, kTolerance);
+  EXPECT_NEAR(actual_els[0][1].imag(), 0.1546f, kTolerance);
+  EXPECT_NEAR(actual_els[0][2].imag(), 0.274f, kTolerance);
+  EXPECT_NEAR(actual_els[1][0].imag(), -0.1546f, kTolerance);
   EXPECT_NEAR(actual_els[1][1].imag(), 0.f, kTolerance);
-  EXPECT_NEAR(actual_els[1][2].imag(), 0.4639f, kTolerance);
-  EXPECT_NEAR(actual_els[2][0].imag(), -0.8219f, kTolerance);
-  EXPECT_NEAR(actual_els[2][1].imag(), -0.4639f, kTolerance);
+  EXPECT_NEAR(actual_els[1][2].imag(), 0.1546f, kTolerance);
+  EXPECT_NEAR(actual_els[2][0].imag(), -0.274f, kTolerance);
+  EXPECT_NEAR(actual_els[2][1].imag(), -0.1546f, kTolerance);
   EXPECT_NEAR(actual_els[2][2].imag(), 0.f, kTolerance);
 }
 
diff --git a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc
index da7ad0da59..40e738c6d1 100644
--- a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc
+++ b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc
@@ -27,34 +27,23 @@ namespace {
 // Alpha for the Kaiser Bessel Derived window.
 const float kKbdAlpha = 1.5f;
 
-// The minimum value a post-processing mask can take.
-const float kMaskMinimum = 0.01f;
-
 const float kSpeedOfSoundMeterSeconds = 343;
 
 // For both target and interference angles, PI / 2 is perpendicular to the
 // microphone array, facing forwards. The positive direction goes
 // counterclockwise.
 // The angle at which we amplify sound.
+// TODO(aluebs): Make the target angle dynamically settable.
 const float kTargetAngleRadians = static_cast<float>(M_PI) / 2.f;
 
-// The angle at which we suppress sound. Suppression is symmetric around PI / 2
-// radians, so sound is suppressed at both +|kInterfAngleRadians| and
-// PI - |kInterfAngleRadians|. Since the beamformer is robust, this should
-// suppress sound coming from close angles as well.
-const float kInterfAngleRadians = static_cast<float>(M_PI) / 4.f;
-
 // When calculating the interference covariance matrix, this is the weight for
 // the weighted average between the uniform covariance matrix and the angled
 // covariance matrix.
 // Rpsi = Rpsi_angled * kBalance + Rpsi_uniform * (1 - kBalance)
-const float kBalance = 0.4f;
+const float kBalance = 0.95f;
 
 const float kHalfBeamWidthRadians = static_cast<float>(M_PI) * 20.f / 180.f;
 
-// TODO(claguna): need comment here.
-const float kBeamwidthConstant = 0.00002f;
-
 // Alpha coefficients for mask smoothing.
 const float kMaskTimeSmoothAlpha = 0.2f;
 const float kMaskFrequencySmoothAlpha = 0.6f;
@@ -64,17 +53,33 @@ const float kMaskFrequencySmoothAlpha = 0.6f;
 const int kLowMeanStartHz = 200;
 const int kLowMeanEndHz = 400;
 
+// TODO(aluebs): Make the high frequency correction range depend on the target
+// angle.
 const int kHighMeanStartHz = 3000;
 const int kHighMeanEndHz = 5000;
 
+// Range limiter for subtractive terms in the nominator and denominator of the
+// postfilter expression. It handles the scenario mismatch between the true and
+// model sources (target and interference).
+const float kCutOffConstant = 0.9999f;
+
 // Quantile of mask values which is used to estimate target presence.
 const float kMaskQuantile = 0.7f;
 // Mask threshold over which the data is considered signal and not interference.
-const float kMaskTargetThreshold = 0.3f;
+// It has to be updated every time the postfilter calculation is changed
+// significantly.
+// TODO(aluebs): Write a tool to tune the target threshold automatically based
+// on files annotated with target and interference ground truth.
+const float kMaskTargetThreshold = 0.01f;
 // Time in seconds after which the data is considered interference if the mask
 // does not pass |kMaskTargetThreshold|.
 const float kHoldTargetSeconds = 0.25f;
 
+// To compensate for the attenuation this algorithm introduces to the target
+// signal. It was estimated empirically from a low-noise low-reverberation
+// recording from broadside.
+const float kCompensationGain = 2.f;
+
 // Does conjugate(|norm_mat|) * |mat| * transpose(|norm_mat|). No extra space is
 // used; to accomplish this, we compute both multiplications in the same loop.
 // The returned norm is clamped to be non-negative.
@@ -218,7 +223,6 @@ void NonlinearBeamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
   hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize;
   interference_blocks_count_ = hold_target_blocks_;
 
-
   lapped_transform_.reset(new LappedTransform(num_input_channels_,
                                               1,
                                               chunk_length_,
@@ -231,24 +235,34 @@ void NonlinearBeamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
     final_mask_[i] = 1.f;
     float freq_hz = (static_cast<float>(i) / kFftSize) * sample_rate_hz_;
     wave_numbers_[i] = 2 * M_PI * freq_hz / kSpeedOfSoundMeterSeconds;
-    mask_thresholds_[i] = num_input_channels_ * num_input_channels_ *
-                          kBeamwidthConstant * wave_numbers_[i] *
-                          wave_numbers_[i];
   }
 
   // Initialize all nonadaptive values before looping through the frames.
+  InitInterfAngles();
   InitDelaySumMasks();
   InitTargetCovMats();
   InitInterfCovMats();
 
   for (size_t i = 0; i < kNumFreqBins; ++i) {
     rxiws_[i] = Norm(target_cov_mats_[i], delay_sum_masks_[i]);
-    rpsiws_[i] = Norm(interf_cov_mats_[i], delay_sum_masks_[i]);
-    reflected_rpsiws_[i] =
-        Norm(reflected_interf_cov_mats_[i], delay_sum_masks_[i]);
+    rpsiws_[i].clear();
+    for (size_t j = 0; j < interf_angles_radians_.size(); ++j) {
+      rpsiws_[i].push_back(Norm(*interf_cov_mats_[i][j], delay_sum_masks_[i]));
+    }
   }
 }
 
+void NonlinearBeamformer::InitInterfAngles() {
+  // TODO(aluebs): Make kAwayRadians dependent on the mic spacing.
+  const float kAwayRadians = 0.5;
+
+  interf_angles_radians_.clear();
+  // TODO(aluebs): When the target angle is settable, make sure the interferer
+  // scenarios aren't reflected over the target one for linear geometries.
+  interf_angles_radians_.push_back(kTargetAngleRadians - kAwayRadians);
+  interf_angles_radians_.push_back(kTargetAngleRadians + kAwayRadians);
+}
+
 void NonlinearBeamformer::InitDelaySumMasks() {
   for (size_t f_ix = 0; f_ix < kNumFreqBins; ++f_ix) {
     delay_sum_masks_[f_ix].Resize(1, num_input_channels_);
@@ -273,40 +287,39 @@ void NonlinearBeamformer::InitTargetCovMats() {
   for (size_t i = 0; i < kNumFreqBins; ++i) {
     target_cov_mats_[i].Resize(num_input_channels_, num_input_channels_);
     TransposedConjugatedProduct(delay_sum_masks_[i], &target_cov_mats_[i]);
-    complex_f normalization_factor = target_cov_mats_[i].Trace();
-    target_cov_mats_[i].Scale(1.f / normalization_factor);
   }
 }
 
 void NonlinearBeamformer::InitInterfCovMats() {
   for (size_t i = 0; i < kNumFreqBins; ++i) {
-    interf_cov_mats_[i].Resize(num_input_channels_, num_input_channels_);
     ComplexMatrixF uniform_cov_mat(num_input_channels_, num_input_channels_);
-    ComplexMatrixF angled_cov_mat(num_input_channels_, num_input_channels_);
-
     CovarianceMatrixGenerator::UniformCovarianceMatrix(wave_numbers_[i],
                                                        array_geometry_,
                                                        &uniform_cov_mat);
-
-    CovarianceMatrixGenerator::AngledCovarianceMatrix(kSpeedOfSoundMeterSeconds,
-                                                      kInterfAngleRadians,
-                                                      i,
-                                                      kFftSize,
-                                                      kNumFreqBins,
-                                                      sample_rate_hz_,
-                                                      array_geometry_,
-                                                      &angled_cov_mat);
-    // Normalize matrices before averaging them.
-    complex_f normalization_factor = uniform_cov_mat.Trace();
+    complex_f normalization_factor = uniform_cov_mat.elements()[0][0];
     uniform_cov_mat.Scale(1.f / normalization_factor);
-    normalization_factor = angled_cov_mat.Trace();
-    angled_cov_mat.Scale(1.f / normalization_factor);
-
-    // Average matrices.
     uniform_cov_mat.Scale(1 - kBalance);
-    angled_cov_mat.Scale(kBalance);
-    interf_cov_mats_[i].Add(uniform_cov_mat, angled_cov_mat);
-    reflected_interf_cov_mats_[i].PointwiseConjugate(interf_cov_mats_[i]);
+    interf_cov_mats_[i].clear();
+    for (size_t j = 0; j < interf_angles_radians_.size(); ++j) {
+      interf_cov_mats_[i].push_back(new ComplexMatrixF(num_input_channels_,
+                                                       num_input_channels_));
+      ComplexMatrixF angled_cov_mat(num_input_channels_, num_input_channels_);
+      CovarianceMatrixGenerator::AngledCovarianceMatrix(
+          kSpeedOfSoundMeterSeconds,
+          interf_angles_radians_[j],
+          i,
+          kFftSize,
+          kNumFreqBins,
+          sample_rate_hz_,
+          array_geometry_,
+          &angled_cov_mat);
+      // Normalize matrices before averaging them.
+      normalization_factor = angled_cov_mat.elements()[0][0];
+      angled_cov_mat.Scale(1.f / normalization_factor);
+      // Weighted average of matrices.
+      angled_cov_mat.Scale(kBalance);
+      interf_cov_mats_[i][j]->Add(uniform_cov_mat, angled_cov_mat);
+    }
   }
 }
 
@@ -376,17 +389,19 @@ void NonlinearBeamformer::ProcessAudioBlock(const complex_f* const* input,
     rmw *= rmw;
     float rmw_r = rmw.real();
 
-    new_mask_[i] = CalculatePostfilterMask(interf_cov_mats_[i],
-                                           rpsiws_[i],
+    new_mask_[i] = CalculatePostfilterMask(*interf_cov_mats_[i][0],
+                                           rpsiws_[i][0],
                                            ratio_rxiw_rxim,
-                                           rmw_r,
-                                           mask_thresholds_[i]);
-
-    new_mask_[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i],
-                                            reflected_rpsiws_[i],
-                                            ratio_rxiw_rxim,
-                                            rmw_r,
-                                            mask_thresholds_[i]);
+                                           rmw_r);
+    for (size_t j = 1; j < interf_angles_radians_.size(); ++j) {
+      float tmp_mask = CalculatePostfilterMask(*interf_cov_mats_[i][j],
+                                               rpsiws_[i][j],
+                                               ratio_rxiw_rxim,
+                                               rmw_r);
+      if (tmp_mask < new_mask_[i]) {
+        new_mask_[i] = tmp_mask;
+      }
+    }
   }
 
   ApplyMaskTimeSmoothing();
@@ -401,24 +416,16 @@ float NonlinearBeamformer::CalculatePostfilterMask(
     const ComplexMatrixF& interf_cov_mat,
     float rpsiw,
     float ratio_rxiw_rxim,
-    float rmw_r,
-    float mask_threshold) {
+    float rmw_r) {
   float rpsim = Norm(interf_cov_mat, eig_m_);
 
-  // Find lambda.
   float ratio = 0.f;
   if (rpsim > 0.f) {
     ratio = rpsiw / rpsim;
   }
-  float numerator = rmw_r - ratio;
-  float denominator = ratio_rxiw_rxim - ratio;
 
-  float mask = 1.f;
-  if (denominator > mask_threshold) {
-    float lambda = numerator / denominator;
-    mask = std::max(lambda * ratio_rxiw_rxim / rmw_r, kMaskMinimum);
-  }
-  return mask;
+  return (1.f - std::min(kCutOffConstant, ratio / rmw_r)) /
+         (1.f - std::min(kCutOffConstant, ratio / ratio_rxiw_rxim));
 }
 
 void NonlinearBeamformer::ApplyMasks(const complex_f* const* input,
@@ -433,7 +440,7 @@ void NonlinearBeamformer::ApplyMasks(const complex_f* const* input,
       output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix];
     }
 
-    output_channel[f_ix] *= final_mask_[f_ix];
+    output_channel[f_ix] *= kCompensationGain * final_mask_[f_ix];
   }
 }
 
diff --git a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.h b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.h
index 46c68bf808..4ff0b48093 100644
--- a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.h
+++ b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.h
@@ -17,6 +17,7 @@
 #include "webrtc/common_audio/channel_buffer.h"
 #include "webrtc/modules/audio_processing/beamformer/beamformer.h"
 #include "webrtc/modules/audio_processing/beamformer/complex_matrix.h"
+#include "webrtc/system_wrappers/interface/scoped_vector.h"
 
 namespace webrtc {
 
@@ -26,14 +27,10 @@ namespace webrtc {
 //
 // The implemented nonlinear postfilter algorithm taken from "A Robust Nonlinear
 // Beamforming Postprocessor" by Bastiaan Kleijn.
-//
-// TODO(aluebs): Target angle assumed to be 0. Parameterize target angle.
 class NonlinearBeamformer
   : public Beamformer<float>,
     public LappedTransform::Callback {
  public:
-  // At the moment it only accepts uniform linear microphone arrays. Using the
-  // first microphone as a reference position [0, 0, 0] is a natural choice.
   explicit NonlinearBeamformer(const std::vector<Point>& array_geometry);
 
   // Sample rate corresponds to the lower band.
@@ -69,19 +66,17 @@ class NonlinearBeamformer
   typedef ComplexMatrix<float> ComplexMatrixF;
   typedef complex<float> complex_f;
 
+  void InitInterfAngles();
   void InitDelaySumMasks();
-  void InitTargetCovMats();  // TODO(aluebs): Make this depend on target angle.
+  void InitTargetCovMats();
   void InitInterfCovMats();
 
-  // An implementation of equation 18, which calculates postfilter masks that,
-  // when applied, minimize the mean-square error of our estimation of the
-  // desired signal. A sub-task is to calculate lambda, which is solved via
-  // equation 13.
+  // Calculates postfilter masks that minimize the mean squared error of our
+  // estimation of the desired signal.
   float CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
                                 float rpsiw,
                                 float ratio_rxiw_rxim,
-                                float rmxi_r,
-                                float mask_threshold);
+                                float rmxi_r);
 
   // Prevents the postfilter masks from degenerating too quickly (a cause of
   // musical noise).
@@ -134,6 +129,9 @@ class NonlinearBeamformer
   // Time and frequency smoothed mask.
   float final_mask_[kNumFreqBins];
 
+  // Angles of the interferer scenarios.
+  std::vector<float> interf_angles_radians_;
+
   // Array of length |kNumFreqBins|, Matrix of size |1| x |num_channels_|.
   ComplexMatrixF delay_sum_masks_[kNumFreqBins];
   ComplexMatrixF normalized_delay_sum_masks_[kNumFreqBins];
@@ -143,19 +141,18 @@ class NonlinearBeamformer
   ComplexMatrixF target_cov_mats_[kNumFreqBins];
 
   // Array of length |kNumFreqBins|, Matrix of size |num_input_channels_| x
-  // |num_input_channels_|.
-  ComplexMatrixF interf_cov_mats_[kNumFreqBins];
-  ComplexMatrixF reflected_interf_cov_mats_[kNumFreqBins];
+  // |num_input_channels_|. ScopedVector has a size equal to the number of
+  // interferer scenarios.
+  ScopedVector<ComplexMatrixF> interf_cov_mats_[kNumFreqBins];
 
   // Of length |kNumFreqBins|.
-  float mask_thresholds_[kNumFreqBins];
   float wave_numbers_[kNumFreqBins];
 
   // Preallocated for ProcessAudioBlock()
   // Of length |kNumFreqBins|.
   float rxiws_[kNumFreqBins];
-  float rpsiws_[kNumFreqBins];
-  float reflected_rpsiws_[kNumFreqBins];
+  // The vector has a size equal to the number of interferer scenarios.
+  std::vector<float> rpsiws_[kNumFreqBins];
 
   // The microphone normalization factor.
   ComplexMatrixF eig_m_;