Implement new version of the NonlinearBeamformer

Sounds better according to a MUSHRA listening test.
The computational complexity is unaffected.
An empirically estimated gain was added to compensate for the attenuation introduced by the algorithm.
There are some TODOs, which I will address in follow up CLs.

It was tested in Hangouts without headphones and highest volume, to make sure it doesn't affect the AEC.

Review URL: https://codereview.webrtc.org/1378973003

Cr-Commit-Position: refs/heads/master@{#10308}
This commit is contained in:
aluebs 2015-10-16 17:04:09 -07:00 committed by Commit bot
parent c7a8b08a7c
commit 45daf7b26f
4 changed files with 122 additions and 106 deletions

View File

@ -14,6 +14,7 @@
#include <cmath>
namespace webrtc {
namespace {
float BesselJ0(float x) {
@ -24,9 +25,19 @@ float BesselJ0(float x) {
#endif
}
} // namespace
// Calculates the Euclidean norm for a row vector.
float Norm(const ComplexMatrix<float>& x) {
RTC_CHECK_EQ(1, x.num_rows());
const size_t length = x.num_columns();
const complex<float>* elems = x.elements()[0];
float result = 0.f;
for (size_t i = 0u; i < length; ++i) {
result += std::norm(elems[i]);
}
return std::sqrt(result);
}
namespace webrtc {
} // namespace
void CovarianceMatrixGenerator::UniformCovarianceMatrix(
float wave_number,
@ -69,6 +80,7 @@ void CovarianceMatrixGenerator::AngledCovarianceMatrix(
geometry,
angle,
&interf_cov_vector);
interf_cov_vector.Scale(1.f / Norm(interf_cov_vector));
interf_cov_vector_transposed.Transpose(interf_cov_vector);
interf_cov_vector.PointwiseConjugate();
mat->Multiply(interf_cov_vector_transposed, interf_cov_vector);

View File

@ -165,14 +165,14 @@ TEST(CovarianceMatrixGeneratorTest, TestAngledCovarianceMatrix2Mics) {
complex<float>* const* actual_els = actual_covariance_matrix.elements();
EXPECT_NEAR(actual_els[0][0].real(), 1.f, kTolerance);
EXPECT_NEAR(actual_els[0][1].real(), 0.9952f, kTolerance);
EXPECT_NEAR(actual_els[1][0].real(), 0.9952f, kTolerance);
EXPECT_NEAR(actual_els[1][1].real(), 1.f, kTolerance);
EXPECT_NEAR(actual_els[0][0].real(), 0.5f, kTolerance);
EXPECT_NEAR(actual_els[0][1].real(), 0.4976f, kTolerance);
EXPECT_NEAR(actual_els[1][0].real(), 0.4976f, kTolerance);
EXPECT_NEAR(actual_els[1][1].real(), 0.5f, kTolerance);
EXPECT_NEAR(actual_els[0][0].imag(), 0.f, kTolerance);
EXPECT_NEAR(actual_els[0][1].imag(), 0.0978f, kTolerance);
EXPECT_NEAR(actual_els[1][0].imag(), -0.0978f, kTolerance);
EXPECT_NEAR(actual_els[0][1].imag(), 0.0489f, kTolerance);
EXPECT_NEAR(actual_els[1][0].imag(), -0.0489f, kTolerance);
EXPECT_NEAR(actual_els[1][1].imag(), 0.f, kTolerance);
}
@ -203,24 +203,24 @@ TEST(CovarianceMatrixGeneratorTest, TestAngledCovarianceMatrix3Mics) {
complex<float>* const* actual_els = actual_covariance_matrix.elements();
EXPECT_NEAR(actual_els[0][0].real(), 1.f, kTolerance);
EXPECT_NEAR(actual_els[0][1].real(), 0.8859f, kTolerance);
EXPECT_NEAR(actual_els[0][2].real(), 0.5696f, kTolerance);
EXPECT_NEAR(actual_els[1][0].real(), 0.8859f, kTolerance);
EXPECT_NEAR(actual_els[1][1].real(), 1.f, kTolerance);
EXPECT_NEAR(actual_els[1][2].real(), 0.8859f, kTolerance);
EXPECT_NEAR(actual_els[2][0].real(), 0.5696f, kTolerance);
EXPECT_NEAR(actual_els[2][1].real(), 0.8859f, kTolerance);
EXPECT_NEAR(actual_els[2][2].real(), 1.f, kTolerance);
EXPECT_NEAR(actual_els[0][0].real(), 0.3333f, kTolerance);
EXPECT_NEAR(actual_els[0][1].real(), 0.2953f, kTolerance);
EXPECT_NEAR(actual_els[0][2].real(), 0.1899f, kTolerance);
EXPECT_NEAR(actual_els[1][0].real(), 0.2953f, kTolerance);
EXPECT_NEAR(actual_els[1][1].real(), 0.3333f, kTolerance);
EXPECT_NEAR(actual_els[1][2].real(), 0.2953f, kTolerance);
EXPECT_NEAR(actual_els[2][0].real(), 0.1899f, kTolerance);
EXPECT_NEAR(actual_els[2][1].real(), 0.2953f, kTolerance);
EXPECT_NEAR(actual_els[2][2].real(), 0.3333f, kTolerance);
EXPECT_NEAR(actual_els[0][0].imag(), 0.f, kTolerance);
EXPECT_NEAR(actual_els[0][1].imag(), 0.4639f, kTolerance);
EXPECT_NEAR(actual_els[0][2].imag(), 0.8219f, kTolerance);
EXPECT_NEAR(actual_els[1][0].imag(), -0.4639f, kTolerance);
EXPECT_NEAR(actual_els[0][1].imag(), 0.1546f, kTolerance);
EXPECT_NEAR(actual_els[0][2].imag(), 0.274f, kTolerance);
EXPECT_NEAR(actual_els[1][0].imag(), -0.1546f, kTolerance);
EXPECT_NEAR(actual_els[1][1].imag(), 0.f, kTolerance);
EXPECT_NEAR(actual_els[1][2].imag(), 0.4639f, kTolerance);
EXPECT_NEAR(actual_els[2][0].imag(), -0.8219f, kTolerance);
EXPECT_NEAR(actual_els[2][1].imag(), -0.4639f, kTolerance);
EXPECT_NEAR(actual_els[1][2].imag(), 0.1546f, kTolerance);
EXPECT_NEAR(actual_els[2][0].imag(), -0.274f, kTolerance);
EXPECT_NEAR(actual_els[2][1].imag(), -0.1546f, kTolerance);
EXPECT_NEAR(actual_els[2][2].imag(), 0.f, kTolerance);
}

View File

@ -27,34 +27,23 @@ namespace {
// Alpha for the Kaiser Bessel Derived window.
const float kKbdAlpha = 1.5f;
// The minimum value a post-processing mask can take.
const float kMaskMinimum = 0.01f;
const float kSpeedOfSoundMeterSeconds = 343;
// For both target and interference angles, PI / 2 is perpendicular to the
// microphone array, facing forwards. The positive direction goes
// counterclockwise.
// The angle at which we amplify sound.
// TODO(aluebs): Make the target angle dynamically settable.
const float kTargetAngleRadians = static_cast<float>(M_PI) / 2.f;
// The angle at which we suppress sound. Suppression is symmetric around PI / 2
// radians, so sound is suppressed at both +|kInterfAngleRadians| and
// PI - |kInterfAngleRadians|. Since the beamformer is robust, this should
// suppress sound coming from close angles as well.
const float kInterfAngleRadians = static_cast<float>(M_PI) / 4.f;
// When calculating the interference covariance matrix, this is the weight for
// the weighted average between the uniform covariance matrix and the angled
// covariance matrix.
// Rpsi = Rpsi_angled * kBalance + Rpsi_uniform * (1 - kBalance)
const float kBalance = 0.4f;
const float kBalance = 0.95f;
const float kHalfBeamWidthRadians = static_cast<float>(M_PI) * 20.f / 180.f;
// TODO(claguna): need comment here.
const float kBeamwidthConstant = 0.00002f;
// Alpha coefficients for mask smoothing.
const float kMaskTimeSmoothAlpha = 0.2f;
const float kMaskFrequencySmoothAlpha = 0.6f;
@ -64,17 +53,33 @@ const float kMaskFrequencySmoothAlpha = 0.6f;
const int kLowMeanStartHz = 200;
const int kLowMeanEndHz = 400;
// TODO(aluebs): Make the high frequency correction range depend on the target
// angle.
const int kHighMeanStartHz = 3000;
const int kHighMeanEndHz = 5000;
// Range limiter for subtractive terms in the nominator and denominator of the
// postfilter expression. It handles the scenario mismatch between the true and
// model sources (target and interference).
const float kCutOffConstant = 0.9999f;
// Quantile of mask values which is used to estimate target presence.
const float kMaskQuantile = 0.7f;
// Mask threshold over which the data is considered signal and not interference.
const float kMaskTargetThreshold = 0.3f;
// It has to be updated every time the postfilter calculation is changed
// significantly.
// TODO(aluebs): Write a tool to tune the target threshold automatically based
// on files annotated with target and interference ground truth.
const float kMaskTargetThreshold = 0.01f;
// Time in seconds after which the data is considered interference if the mask
// does not pass |kMaskTargetThreshold|.
const float kHoldTargetSeconds = 0.25f;
// To compensate for the attenuation this algorithm introduces to the target
// signal. It was estimated empirically from a low-noise low-reverberation
// recording from broadside.
const float kCompensationGain = 2.f;
// Does conjugate(|norm_mat|) * |mat| * transpose(|norm_mat|). No extra space is
// used; to accomplish this, we compute both multiplications in the same loop.
// The returned norm is clamped to be non-negative.
@ -218,7 +223,6 @@ void NonlinearBeamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
hold_target_blocks_ = kHoldTargetSeconds * 2 * sample_rate_hz / kFftSize;
interference_blocks_count_ = hold_target_blocks_;
lapped_transform_.reset(new LappedTransform(num_input_channels_,
1,
chunk_length_,
@ -231,24 +235,34 @@ void NonlinearBeamformer::Initialize(int chunk_size_ms, int sample_rate_hz) {
final_mask_[i] = 1.f;
float freq_hz = (static_cast<float>(i) / kFftSize) * sample_rate_hz_;
wave_numbers_[i] = 2 * M_PI * freq_hz / kSpeedOfSoundMeterSeconds;
mask_thresholds_[i] = num_input_channels_ * num_input_channels_ *
kBeamwidthConstant * wave_numbers_[i] *
wave_numbers_[i];
}
// Initialize all nonadaptive values before looping through the frames.
InitInterfAngles();
InitDelaySumMasks();
InitTargetCovMats();
InitInterfCovMats();
for (size_t i = 0; i < kNumFreqBins; ++i) {
rxiws_[i] = Norm(target_cov_mats_[i], delay_sum_masks_[i]);
rpsiws_[i] = Norm(interf_cov_mats_[i], delay_sum_masks_[i]);
reflected_rpsiws_[i] =
Norm(reflected_interf_cov_mats_[i], delay_sum_masks_[i]);
rpsiws_[i].clear();
for (size_t j = 0; j < interf_angles_radians_.size(); ++j) {
rpsiws_[i].push_back(Norm(*interf_cov_mats_[i][j], delay_sum_masks_[i]));
}
}
}
void NonlinearBeamformer::InitInterfAngles() {
// TODO(aluebs): Make kAwayRadians dependent on the mic spacing.
const float kAwayRadians = 0.5;
interf_angles_radians_.clear();
// TODO(aluebs): When the target angle is settable, make sure the interferer
// scenarios aren't reflected over the target one for linear geometries.
interf_angles_radians_.push_back(kTargetAngleRadians - kAwayRadians);
interf_angles_radians_.push_back(kTargetAngleRadians + kAwayRadians);
}
void NonlinearBeamformer::InitDelaySumMasks() {
for (size_t f_ix = 0; f_ix < kNumFreqBins; ++f_ix) {
delay_sum_masks_[f_ix].Resize(1, num_input_channels_);
@ -273,40 +287,39 @@ void NonlinearBeamformer::InitTargetCovMats() {
for (size_t i = 0; i < kNumFreqBins; ++i) {
target_cov_mats_[i].Resize(num_input_channels_, num_input_channels_);
TransposedConjugatedProduct(delay_sum_masks_[i], &target_cov_mats_[i]);
complex_f normalization_factor = target_cov_mats_[i].Trace();
target_cov_mats_[i].Scale(1.f / normalization_factor);
}
}
void NonlinearBeamformer::InitInterfCovMats() {
for (size_t i = 0; i < kNumFreqBins; ++i) {
interf_cov_mats_[i].Resize(num_input_channels_, num_input_channels_);
ComplexMatrixF uniform_cov_mat(num_input_channels_, num_input_channels_);
ComplexMatrixF angled_cov_mat(num_input_channels_, num_input_channels_);
CovarianceMatrixGenerator::UniformCovarianceMatrix(wave_numbers_[i],
array_geometry_,
&uniform_cov_mat);
CovarianceMatrixGenerator::AngledCovarianceMatrix(kSpeedOfSoundMeterSeconds,
kInterfAngleRadians,
i,
kFftSize,
kNumFreqBins,
sample_rate_hz_,
array_geometry_,
&angled_cov_mat);
// Normalize matrices before averaging them.
complex_f normalization_factor = uniform_cov_mat.Trace();
complex_f normalization_factor = uniform_cov_mat.elements()[0][0];
uniform_cov_mat.Scale(1.f / normalization_factor);
normalization_factor = angled_cov_mat.Trace();
angled_cov_mat.Scale(1.f / normalization_factor);
// Average matrices.
uniform_cov_mat.Scale(1 - kBalance);
angled_cov_mat.Scale(kBalance);
interf_cov_mats_[i].Add(uniform_cov_mat, angled_cov_mat);
reflected_interf_cov_mats_[i].PointwiseConjugate(interf_cov_mats_[i]);
interf_cov_mats_[i].clear();
for (size_t j = 0; j < interf_angles_radians_.size(); ++j) {
interf_cov_mats_[i].push_back(new ComplexMatrixF(num_input_channels_,
num_input_channels_));
ComplexMatrixF angled_cov_mat(num_input_channels_, num_input_channels_);
CovarianceMatrixGenerator::AngledCovarianceMatrix(
kSpeedOfSoundMeterSeconds,
interf_angles_radians_[j],
i,
kFftSize,
kNumFreqBins,
sample_rate_hz_,
array_geometry_,
&angled_cov_mat);
// Normalize matrices before averaging them.
normalization_factor = angled_cov_mat.elements()[0][0];
angled_cov_mat.Scale(1.f / normalization_factor);
// Weighted average of matrices.
angled_cov_mat.Scale(kBalance);
interf_cov_mats_[i][j]->Add(uniform_cov_mat, angled_cov_mat);
}
}
}
@ -376,17 +389,19 @@ void NonlinearBeamformer::ProcessAudioBlock(const complex_f* const* input,
rmw *= rmw;
float rmw_r = rmw.real();
new_mask_[i] = CalculatePostfilterMask(interf_cov_mats_[i],
rpsiws_[i],
new_mask_[i] = CalculatePostfilterMask(*interf_cov_mats_[i][0],
rpsiws_[i][0],
ratio_rxiw_rxim,
rmw_r,
mask_thresholds_[i]);
new_mask_[i] *= CalculatePostfilterMask(reflected_interf_cov_mats_[i],
reflected_rpsiws_[i],
ratio_rxiw_rxim,
rmw_r,
mask_thresholds_[i]);
rmw_r);
for (size_t j = 1; j < interf_angles_radians_.size(); ++j) {
float tmp_mask = CalculatePostfilterMask(*interf_cov_mats_[i][j],
rpsiws_[i][j],
ratio_rxiw_rxim,
rmw_r);
if (tmp_mask < new_mask_[i]) {
new_mask_[i] = tmp_mask;
}
}
}
ApplyMaskTimeSmoothing();
@ -401,24 +416,16 @@ float NonlinearBeamformer::CalculatePostfilterMask(
const ComplexMatrixF& interf_cov_mat,
float rpsiw,
float ratio_rxiw_rxim,
float rmw_r,
float mask_threshold) {
float rmw_r) {
float rpsim = Norm(interf_cov_mat, eig_m_);
// Find lambda.
float ratio = 0.f;
if (rpsim > 0.f) {
ratio = rpsiw / rpsim;
}
float numerator = rmw_r - ratio;
float denominator = ratio_rxiw_rxim - ratio;
float mask = 1.f;
if (denominator > mask_threshold) {
float lambda = numerator / denominator;
mask = std::max(lambda * ratio_rxiw_rxim / rmw_r, kMaskMinimum);
}
return mask;
return (1.f - std::min(kCutOffConstant, ratio / rmw_r)) /
(1.f - std::min(kCutOffConstant, ratio / ratio_rxiw_rxim));
}
void NonlinearBeamformer::ApplyMasks(const complex_f* const* input,
@ -433,7 +440,7 @@ void NonlinearBeamformer::ApplyMasks(const complex_f* const* input,
output_channel[f_ix] += input[c_ix][f_ix] * delay_sum_mask_els[c_ix];
}
output_channel[f_ix] *= final_mask_[f_ix];
output_channel[f_ix] *= kCompensationGain * final_mask_[f_ix];
}
}

View File

@ -17,6 +17,7 @@
#include "webrtc/common_audio/channel_buffer.h"
#include "webrtc/modules/audio_processing/beamformer/beamformer.h"
#include "webrtc/modules/audio_processing/beamformer/complex_matrix.h"
#include "webrtc/system_wrappers/interface/scoped_vector.h"
namespace webrtc {
@ -26,14 +27,10 @@ namespace webrtc {
//
// The implemented nonlinear postfilter algorithm taken from "A Robust Nonlinear
// Beamforming Postprocessor" by Bastiaan Kleijn.
//
// TODO(aluebs): Target angle assumed to be 0. Parameterize target angle.
class NonlinearBeamformer
: public Beamformer<float>,
public LappedTransform::Callback {
public:
// At the moment it only accepts uniform linear microphone arrays. Using the
// first microphone as a reference position [0, 0, 0] is a natural choice.
explicit NonlinearBeamformer(const std::vector<Point>& array_geometry);
// Sample rate corresponds to the lower band.
@ -69,19 +66,17 @@ class NonlinearBeamformer
typedef ComplexMatrix<float> ComplexMatrixF;
typedef complex<float> complex_f;
void InitInterfAngles();
void InitDelaySumMasks();
void InitTargetCovMats(); // TODO(aluebs): Make this depend on target angle.
void InitTargetCovMats();
void InitInterfCovMats();
// An implementation of equation 18, which calculates postfilter masks that,
// when applied, minimize the mean-square error of our estimation of the
// desired signal. A sub-task is to calculate lambda, which is solved via
// equation 13.
// Calculates postfilter masks that minimize the mean squared error of our
// estimation of the desired signal.
float CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
float rpsiw,
float ratio_rxiw_rxim,
float rmxi_r,
float mask_threshold);
float rmxi_r);
// Prevents the postfilter masks from degenerating too quickly (a cause of
// musical noise).
@ -134,6 +129,9 @@ class NonlinearBeamformer
// Time and frequency smoothed mask.
float final_mask_[kNumFreqBins];
// Angles of the interferer scenarios.
std::vector<float> interf_angles_radians_;
// Array of length |kNumFreqBins|, Matrix of size |1| x |num_channels_|.
ComplexMatrixF delay_sum_masks_[kNumFreqBins];
ComplexMatrixF normalized_delay_sum_masks_[kNumFreqBins];
@ -143,19 +141,18 @@ class NonlinearBeamformer
ComplexMatrixF target_cov_mats_[kNumFreqBins];
// Array of length |kNumFreqBins|, Matrix of size |num_input_channels_| x
// |num_input_channels_|.
ComplexMatrixF interf_cov_mats_[kNumFreqBins];
ComplexMatrixF reflected_interf_cov_mats_[kNumFreqBins];
// |num_input_channels_|. ScopedVector has a size equal to the number of
// interferer scenarios.
ScopedVector<ComplexMatrixF> interf_cov_mats_[kNumFreqBins];
// Of length |kNumFreqBins|.
float mask_thresholds_[kNumFreqBins];
float wave_numbers_[kNumFreqBins];
// Preallocated for ProcessAudioBlock()
// Of length |kNumFreqBins|.
float rxiws_[kNumFreqBins];
float rpsiws_[kNumFreqBins];
float reflected_rpsiws_[kNumFreqBins];
// The vector has a size equal to the number of interferer scenarios.
std::vector<float> rpsiws_[kNumFreqBins];
// The microphone normalization factor.
ComplexMatrixF eig_m_;