From 45daf7b26f49793c30e395f7ba7be30aa51936bb Mon Sep 17 00:00:00 2001 From: aluebs Date: Fri, 16 Oct 2015 17:04:09 -0700 Subject: [PATCH] Implement new version of the NonlinearBeamformer Sounds better according to a MUSHRA listening test. The computational complexity is unaffected. An empirically estimated gain was added to compensate for the attenuation introduced by the algorithm. There are some TODOs, which I will address in follow up CLs. It was tested in Hangouts without headphones and highest volume, to make sure it doesn't affect the AEC. Review URL: https://codereview.webrtc.org/1378973003 Cr-Commit-Position: refs/heads/master@{#10308} --- .../beamformer/covariance_matrix_generator.cc | 16 +- .../covariance_matrix_generator_unittest.cc | 42 +++--- .../beamformer/nonlinear_beamformer.cc | 139 +++++++++--------- .../beamformer/nonlinear_beamformer.h | 31 ++-- 4 files changed, 122 insertions(+), 106 deletions(-) diff --git a/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.cc b/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.cc index efc5b0f71a..d0728325fc 100644 --- a/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.cc +++ b/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator.cc @@ -14,6 +14,7 @@ #include +namespace webrtc { namespace { float BesselJ0(float x) { @@ -24,9 +25,19 @@ float BesselJ0(float x) { #endif } -} // namespace +// Calculates the Euclidean norm for a row vector. +float Norm(const ComplexMatrix& x) { + RTC_CHECK_EQ(1, x.num_rows()); + const size_t length = x.num_columns(); + const complex* elems = x.elements()[0]; + float result = 0.f; + for (size_t i = 0u; i < length; ++i) { + result += std::norm(elems[i]); + } + return std::sqrt(result); +} -namespace webrtc { +} // namespace void CovarianceMatrixGenerator::UniformCovarianceMatrix( float wave_number, @@ -69,6 +80,7 @@ void CovarianceMatrixGenerator::AngledCovarianceMatrix( geometry, angle, &interf_cov_vector); + interf_cov_vector.Scale(1.f / Norm(interf_cov_vector)); interf_cov_vector_transposed.Transpose(interf_cov_vector); interf_cov_vector.PointwiseConjugate(); mat->Multiply(interf_cov_vector_transposed, interf_cov_vector); diff --git a/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator_unittest.cc b/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator_unittest.cc index 4ea341d84c..23d672fb4e 100644 --- a/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator_unittest.cc +++ b/webrtc/modules/audio_processing/beamformer/covariance_matrix_generator_unittest.cc @@ -165,14 +165,14 @@ TEST(CovarianceMatrixGeneratorTest, TestAngledCovarianceMatrix2Mics) { complex* const* actual_els = actual_covariance_matrix.elements(); - EXPECT_NEAR(actual_els[0][0].real(), 1.f, kTolerance); - EXPECT_NEAR(actual_els[0][1].real(), 0.9952f, kTolerance); - EXPECT_NEAR(actual_els[1][0].real(), 0.9952f, kTolerance); - EXPECT_NEAR(actual_els[1][1].real(), 1.f, kTolerance); + EXPECT_NEAR(actual_els[0][0].real(), 0.5f, kTolerance); + EXPECT_NEAR(actual_els[0][1].real(), 0.4976f, kTolerance); + EXPECT_NEAR(actual_els[1][0].real(), 0.4976f, kTolerance); + EXPECT_NEAR(actual_els[1][1].real(), 0.5f, kTolerance); EXPECT_NEAR(actual_els[0][0].imag(), 0.f, kTolerance); - EXPECT_NEAR(actual_els[0][1].imag(), 0.0978f, kTolerance); - EXPECT_NEAR(actual_els[1][0].imag(), -0.0978f, kTolerance); + EXPECT_NEAR(actual_els[0][1].imag(), 0.0489f, kTolerance); + EXPECT_NEAR(actual_els[1][0].imag(), -0.0489f, kTolerance); EXPECT_NEAR(actual_els[1][1].imag(), 0.f, kTolerance); } @@ -203,24 +203,24 @@ TEST(CovarianceMatrixGeneratorTest, TestAngledCovarianceMatrix3Mics) { complex* const* actual_els = actual_covariance_matrix.elements(); - EXPECT_NEAR(actual_els[0][0].real(), 1.f, kTolerance); - EXPECT_NEAR(actual_els[0][1].real(), 0.8859f, kTolerance); - EXPECT_NEAR(actual_els[0][2].real(), 0.5696f, kTolerance); - EXPECT_NEAR(actual_els[1][0].real(), 0.8859f, kTolerance); - EXPECT_NEAR(actual_els[1][1].real(), 1.f, kTolerance); - EXPECT_NEAR(actual_els[1][2].real(), 0.8859f, kTolerance); - EXPECT_NEAR(actual_els[2][0].real(), 0.5696f, kTolerance); - EXPECT_NEAR(actual_els[2][1].real(), 0.8859f, kTolerance); - EXPECT_NEAR(actual_els[2][2].real(), 1.f, kTolerance); + EXPECT_NEAR(actual_els[0][0].real(), 0.3333f, kTolerance); + EXPECT_NEAR(actual_els[0][1].real(), 0.2953f, kTolerance); + EXPECT_NEAR(actual_els[0][2].real(), 0.1899f, kTolerance); + EXPECT_NEAR(actual_els[1][0].real(), 0.2953f, kTolerance); + EXPECT_NEAR(actual_els[1][1].real(), 0.3333f, kTolerance); + EXPECT_NEAR(actual_els[1][2].real(), 0.2953f, kTolerance); + EXPECT_NEAR(actual_els[2][0].real(), 0.1899f, kTolerance); + EXPECT_NEAR(actual_els[2][1].real(), 0.2953f, kTolerance); + EXPECT_NEAR(actual_els[2][2].real(), 0.3333f, kTolerance); EXPECT_NEAR(actual_els[0][0].imag(), 0.f, kTolerance); - EXPECT_NEAR(actual_els[0][1].imag(), 0.4639f, kTolerance); - EXPECT_NEAR(actual_els[0][2].imag(), 0.8219f, kTolerance); - EXPECT_NEAR(actual_els[1][0].imag(), -0.4639f, kTolerance); + EXPECT_NEAR(actual_els[0][1].imag(), 0.1546f, kTolerance); + EXPECT_NEAR(actual_els[0][2].imag(), 0.274f, kTolerance); + EXPECT_NEAR(actual_els[1][0].imag(), -0.1546f, kTolerance); EXPECT_NEAR(actual_els[1][1].imag(), 0.f, kTolerance); - EXPECT_NEAR(actual_els[1][2].imag(), 0.4639f, kTolerance); - EXPECT_NEAR(actual_els[2][0].imag(), -0.8219f, kTolerance); - EXPECT_NEAR(actual_els[2][1].imag(), -0.4639f, kTolerance); + EXPECT_NEAR(actual_els[1][2].imag(), 0.1546f, kTolerance); + EXPECT_NEAR(actual_els[2][0].imag(), -0.274f, kTolerance); + EXPECT_NEAR(actual_els[2][1].imag(), -0.1546f, kTolerance); EXPECT_NEAR(actual_els[2][2].imag(), 0.f, kTolerance); } diff --git a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc index da7ad0da59..40e738c6d1 100644 --- a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc +++ b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.cc @@ -27,34 +27,23 @@ namespace { // Alpha for the Kaiser Bessel Derived window. const float kKbdAlpha = 1.5f; -// The minimum value a post-processing mask can take. -const float kMaskMinimum = 0.01f; - const float kSpeedOfSoundMeterSeconds = 343; // For both target and interference angles, PI / 2 is perpendicular to the // microphone array, facing forwards. The positive direction goes // counterclockwise. // The angle at which we amplify sound. +// TODO(aluebs): Make the target angle dynamically settable. const float kTargetAngleRadians = static_cast(M_PI) / 2.f; -// The angle at which we suppress sound. Suppression is symmetric around PI / 2 -// radians, so sound is suppressed at both +|kInterfAngleRadians| and -// PI - |kInterfAngleRadians|. Since the beamformer is robust, this should -// suppress sound coming from close angles as well. -const float kInterfAngleRadians = static_cast(M_PI) / 4.f; - // When calculating the interference covariance matrix, this is the weight for // the weighted average between the uniform covariance matrix and the angled // covariance matrix. // Rpsi = Rpsi_angled * kBalance + Rpsi_uniform * (1 - kBalance) -const float kBalance = 0.4f; +const float kBalance = 0.95f; const float kHalfBeamWidthRadians = static_cast(M_PI) * 20.f / 180.f; -// TODO(claguna): need comment here. -const float kBeamwidthConstant = 0.00002f; - // Alpha coefficients for mask smoothing. const float kMaskTimeSmoothAlpha = 0.2f; const float kMaskFrequencySmoothAlpha = 0.6f; @@ -64,17 +53,33 @@ const float kMaskFrequencySmoothAlpha = 0.6f; const int kLowMeanStartHz = 200; const int kLowMeanEndHz = 400; +// TODO(aluebs): Make the high frequency correction range depend on the target +// angle. const int kHighMeanStartHz = 3000; const int kHighMeanEndHz = 5000; +// Range limiter for subtractive terms in the nominator and denominator of the +// postfilter expression. It handles the scenario mismatch between the true and +// model sources (target and interference). +const float kCutOffConstant = 0.9999f; + // Quantile of mask values which is used to estimate target presence. const float kMaskQuantile = 0.7f; // Mask threshold over which the data is considered signal and not interference. -const float kMaskTargetThreshold = 0.3f; +// It has to be updated every time the postfilter calculation is changed +// significantly. +// TODO(aluebs): Write a tool to tune the target threshold automatically based +// on files annotated with target and interference ground truth. +const float kMaskTargetThreshold = 0.01f; // Time in seconds after which the data is considered interference if the mask // does not pass |kMaskTargetThreshold|. const float kHoldTargetSeconds = 0.25f; +// To compensate for the attenuation this algorithm introduces to the target +// signal. It was estimated empirically from a low-noise low-reverberation +// recording from broadside. +const float kCompensationGain = 2.f; + // Does conjugate(|norm_mat|) * |mat| * transpose(|norm_mat|). No extra space is // used; to accomplish this, we compute both multiplications in the same loop. // The returned norm is clamped to be non-negative. @@ -218,7 +223,6 @@ void NonlinearBeamformer::Initialize(int chunk_size_ms, int sample_rate_hz) { hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize; interference_blocks_count_ = hold_target_blocks_; - lapped_transform_.reset(new LappedTransform(num_input_channels_, 1, chunk_length_, @@ -231,24 +235,34 @@ void NonlinearBeamformer::Initialize(int chunk_size_ms, int sample_rate_hz) { final_mask_[i] = 1.f; float freq_hz = (static_cast(i) / kFftSize) * sample_rate_hz_; wave_numbers_[i] = 2 * M_PI * freq_hz / kSpeedOfSoundMeterSeconds; - mask_thresholds_[i] = num_input_channels_ * num_input_channels_ * - kBeamwidthConstant * wave_numbers_[i] * - wave_numbers_[i]; } // Initialize all nonadaptive values before looping through the frames. + InitInterfAngles(); InitDelaySumMasks(); InitTargetCovMats(); InitInterfCovMats(); for (size_t i = 0; i < kNumFreqBins; ++i) { rxiws_[i] = Norm(target_cov_mats_[i], delay_sum_masks_[i]); - rpsiws_[i] = Norm(interf_cov_mats_[i], delay_sum_masks_[i]); - reflected_rpsiws_[i] = - Norm(reflected_interf_cov_mats_[i], delay_sum_masks_[i]); + rpsiws_[i].clear(); + for (size_t j = 0; j < interf_angles_radians_.size(); ++j) { + rpsiws_[i].push_back(Norm(*interf_cov_mats_[i][j], delay_sum_masks_[i])); + } } } +void NonlinearBeamformer::InitInterfAngles() { + // TODO(aluebs): Make kAwayRadians dependent on the mic spacing. + const float kAwayRadians = 0.5; + + interf_angles_radians_.clear(); + // TODO(aluebs): When the target angle is settable, make sure the interferer + // scenarios aren't reflected over the target one for linear geometries. + interf_angles_radians_.push_back(kTargetAngleRadians - kAwayRadians); + interf_angles_radians_.push_back(kTargetAngleRadians + kAwayRadians); +} + void NonlinearBeamformer::InitDelaySumMasks() { for (size_t f_ix = 0; f_ix < kNumFreqBins; ++f_ix) { delay_sum_masks_[f_ix].Resize(1, num_input_channels_); @@ -273,40 +287,39 @@ void NonlinearBeamformer::InitTargetCovMats() { for (size_t i = 0; i < kNumFreqBins; ++i) { target_cov_mats_[i].Resize(num_input_channels_, num_input_channels_); TransposedConjugatedProduct(delay_sum_masks_[i], &target_cov_mats_[i]); - complex_f normalization_factor = target_cov_mats_[i].Trace(); - target_cov_mats_[i].Scale(1.f / normalization_factor); } } void NonlinearBeamformer::InitInterfCovMats() { for (size_t i = 0; i < kNumFreqBins; ++i) { - interf_cov_mats_[i].Resize(num_input_channels_, num_input_channels_); ComplexMatrixF uniform_cov_mat(num_input_channels_, num_input_channels_); - ComplexMatrixF angled_cov_mat(num_input_channels_, num_input_channels_); - CovarianceMatrixGenerator::UniformCovarianceMatrix(wave_numbers_[i], array_geometry_, &uniform_cov_mat); - - CovarianceMatrixGenerator::AngledCovarianceMatrix(kSpeedOfSoundMeterSeconds, - kInterfAngleRadians, - i, - kFftSize, - kNumFreqBins, - sample_rate_hz_, - array_geometry_, - &angled_cov_mat); - // Normalize matrices before averaging them. - complex_f normalization_factor = uniform_cov_mat.Trace(); + complex_f normalization_factor = uniform_cov_mat.elements()[0][0]; uniform_cov_mat.Scale(1.f / normalization_factor); - normalization_factor = angled_cov_mat.Trace(); - angled_cov_mat.Scale(1.f / normalization_factor); - - // Average matrices. uniform_cov_mat.Scale(1 - kBalance); - angled_cov_mat.Scale(kBalance); - interf_cov_mats_[i].Add(uniform_cov_mat, angled_cov_mat); - reflected_interf_cov_mats_[i].PointwiseConjugate(interf_cov_mats_[i]); + interf_cov_mats_[i].clear(); + for (size_t j = 0; j < interf_angles_radians_.size(); ++j) { + interf_cov_mats_[i].push_back(new ComplexMatrixF(num_input_channels_, + num_input_channels_)); + ComplexMatrixF angled_cov_mat(num_input_channels_, num_input_channels_); + CovarianceMatrixGenerator::AngledCovarianceMatrix( + kSpeedOfSoundMeterSeconds, + interf_angles_radians_[j], + i, + kFftSize, + kNumFreqBins, + sample_rate_hz_, + array_geometry_, + &angled_cov_mat); + // Normalize matrices before averaging them. + normalization_factor = angled_cov_mat.elements()[0][0]; + angled_cov_mat.Scale(1.f / normalization_factor); + // Weighted average of matrices. + angled_cov_mat.Scale(kBalance); + interf_cov_mats_[i][j]->Add(uniform_cov_mat, angled_cov_mat); + } } } @@ -376,17 +389,19 @@ void NonlinearBeamformer::ProcessAudioBlock(const complex_f* const* input, rmw *= rmw; float rmw_r = rmw.real(); - new_mask_[i] = CalculatePostfilterMask(interf_cov_mats_[i], - rpsiws_[i], + new_mask_[i] = CalculatePostfilterMask(*interf_cov_mats_[i][0], + rpsiws_[i][0], ratio_rxiw_rxim, - rmw_r, - mask_thresholds_[i]); - - new_mask_[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i], - reflected_rpsiws_[i], - ratio_rxiw_rxim, - rmw_r, - mask_thresholds_[i]); + rmw_r); + for (size_t j = 1; j < interf_angles_radians_.size(); ++j) { + float tmp_mask = CalculatePostfilterMask(*interf_cov_mats_[i][j], + rpsiws_[i][j], + ratio_rxiw_rxim, + rmw_r); + if (tmp_mask < new_mask_[i]) { + new_mask_[i] = tmp_mask; + } + } } ApplyMaskTimeSmoothing(); @@ -401,24 +416,16 @@ float NonlinearBeamformer::CalculatePostfilterMask( const ComplexMatrixF& interf_cov_mat, float rpsiw, float ratio_rxiw_rxim, - float rmw_r, - float mask_threshold) { + float rmw_r) { float rpsim = Norm(interf_cov_mat, eig_m_); - // Find lambda. float ratio = 0.f; if (rpsim > 0.f) { ratio = rpsiw / rpsim; } - float numerator = rmw_r - ratio; - float denominator = ratio_rxiw_rxim - ratio; - float mask = 1.f; - if (denominator > mask_threshold) { - float lambda = numerator / denominator; - mask = std::max(lambda * ratio_rxiw_rxim / rmw_r, kMaskMinimum); - } - return mask; + return (1.f - std::min(kCutOffConstant, ratio / rmw_r)) / + (1.f - std::min(kCutOffConstant, ratio / ratio_rxiw_rxim)); } void NonlinearBeamformer::ApplyMasks(const complex_f* const* input, @@ -433,7 +440,7 @@ void NonlinearBeamformer::ApplyMasks(const complex_f* const* input, output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix]; } - output_channel[f_ix] *= final_mask_[f_ix]; + output_channel[f_ix] *= kCompensationGain * final_mask_[f_ix]; } } diff --git a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.h b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.h index 46c68bf808..4ff0b48093 100644 --- a/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.h +++ b/webrtc/modules/audio_processing/beamformer/nonlinear_beamformer.h @@ -17,6 +17,7 @@ #include "webrtc/common_audio/channel_buffer.h" #include "webrtc/modules/audio_processing/beamformer/beamformer.h" #include "webrtc/modules/audio_processing/beamformer/complex_matrix.h" +#include "webrtc/system_wrappers/interface/scoped_vector.h" namespace webrtc { @@ -26,14 +27,10 @@ namespace webrtc { // // The implemented nonlinear postfilter algorithm taken from "A Robust Nonlinear // Beamforming Postprocessor" by Bastiaan Kleijn. -// -// TODO(aluebs): Target angle assumed to be 0. Parameterize target angle. class NonlinearBeamformer : public Beamformer, public LappedTransform::Callback { public: - // At the moment it only accepts uniform linear microphone arrays. Using the - // first microphone as a reference position [0, 0, 0] is a natural choice. explicit NonlinearBeamformer(const std::vector& array_geometry); // Sample rate corresponds to the lower band. @@ -69,19 +66,17 @@ class NonlinearBeamformer typedef ComplexMatrix ComplexMatrixF; typedef complex complex_f; + void InitInterfAngles(); void InitDelaySumMasks(); - void InitTargetCovMats(); // TODO(aluebs): Make this depend on target angle. + void InitTargetCovMats(); void InitInterfCovMats(); - // An implementation of equation 18, which calculates postfilter masks that, - // when applied, minimize the mean-square error of our estimation of the - // desired signal. A sub-task is to calculate lambda, which is solved via - // equation 13. + // Calculates postfilter masks that minimize the mean squared error of our + // estimation of the desired signal. float CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat, float rpsiw, float ratio_rxiw_rxim, - float rmxi_r, - float mask_threshold); + float rmxi_r); // Prevents the postfilter masks from degenerating too quickly (a cause of // musical noise). @@ -134,6 +129,9 @@ class NonlinearBeamformer // Time and frequency smoothed mask. float final_mask_[kNumFreqBins]; + // Angles of the interferer scenarios. + std::vector interf_angles_radians_; + // Array of length |kNumFreqBins|, Matrix of size |1| x |num_channels_|. ComplexMatrixF delay_sum_masks_[kNumFreqBins]; ComplexMatrixF normalized_delay_sum_masks_[kNumFreqBins]; @@ -143,19 +141,18 @@ class NonlinearBeamformer ComplexMatrixF target_cov_mats_[kNumFreqBins]; // Array of length |kNumFreqBins|, Matrix of size |num_input_channels_| x - // |num_input_channels_|. - ComplexMatrixF interf_cov_mats_[kNumFreqBins]; - ComplexMatrixF reflected_interf_cov_mats_[kNumFreqBins]; + // |num_input_channels_|. ScopedVector has a size equal to the number of + // interferer scenarios. + ScopedVector interf_cov_mats_[kNumFreqBins]; // Of length |kNumFreqBins|. - float mask_thresholds_[kNumFreqBins]; float wave_numbers_[kNumFreqBins]; // Preallocated for ProcessAudioBlock() // Of length |kNumFreqBins|. float rxiws_[kNumFreqBins]; - float rpsiws_[kNumFreqBins]; - float reflected_rpsiws_[kNumFreqBins]; + // The vector has a size equal to the number of interferer scenarios. + std::vector rpsiws_[kNumFreqBins]; // The microphone normalization factor. ComplexMatrixF eig_m_;