From 4a53766c841c03647211e02519a1dbaf94239a75 Mon Sep 17 00:00:00 2001 From: Alessio Bazzica Date: Wed, 10 Apr 2019 09:36:21 +0200 Subject: [PATCH] RNN VAD: Opus band spectral analysis refactoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CL refactors the computation of band energy and spectral cross-correlation coefficients by moving and optimizing the code from ComputeBandCoefficients, ComputeBandEnergies and ComputeSpectralCrossCorrelation into a single class (named BandFeaturesExtractor). This change will also help replacing FFT library in the RNN VAD. Bug: webrtc:10480 Change-Id: I6cefa23e8f3bc8de6eb09d3ea434699d5e19124e Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/129726 Commit-Queue: Alessio Bazzica Reviewed-by: Per Åhgren Cr-Commit-Position: refs/heads/master@{#27535} --- .../audio_processing/agc2/rnn_vad/BUILD.gn | 1 - .../audio_processing/agc2/rnn_vad/common.h | 12 +- .../agc2/rnn_vad/features_extraction.cc | 12 +- .../audio_processing/agc2/rnn_vad/fft_util.cc | 10 +- .../audio_processing/agc2/rnn_vad/fft_util.h | 21 +- .../agc2/rnn_vad/fft_util_unittest.cc | 6 +- .../agc2/rnn_vad/spectral_features.cc | 190 ++++++++--------- .../agc2/rnn_vad/spectral_features.h | 54 ++--- .../rnn_vad/spectral_features_internal.cc | 196 +++++++++++------- .../agc2/rnn_vad/spectral_features_internal.h | 86 +++++--- .../spectral_features_internal_unittest.cc | 119 ++++++----- .../rnn_vad/spectral_features_unittest.cc | 82 +++++--- .../agc2/rnn_vad/test_utils.cc | 8 - .../agc2/rnn_vad/test_utils.h | 3 - 14 files changed, 439 insertions(+), 361 deletions(-) diff --git a/modules/audio_processing/agc2/rnn_vad/BUILD.gn b/modules/audio_processing/agc2/rnn_vad/BUILD.gn index 237c80972d..cd9a7a24a2 100644 --- a/modules/audio_processing/agc2/rnn_vad/BUILD.gn +++ b/modules/audio_processing/agc2/rnn_vad/BUILD.gn @@ -38,7 +38,6 @@ rtc_source_set("rnn_vad") { deps = [ "..:biquad_filter", "../../../../api:array_view", - 
"../../../../api:function_view", "../../../../rtc_base:checks", "../../../../rtc_base:rtc_base_approved", "../../utility:pffft_wrapper", diff --git a/modules/audio_processing/agc2/rnn_vad/common.h b/modules/audio_processing/agc2/rnn_vad/common.h index 2f16cd41e9..4fef3ab56c 100644 --- a/modules/audio_processing/agc2/rnn_vad/common.h +++ b/modules/audio_processing/agc2/rnn_vad/common.h @@ -52,17 +52,13 @@ constexpr size_t kNumInvertedLags12kHz = kMaxPitch12kHz - kInitialMinPitch12kHz; constexpr size_t kMinPitch48kHz = kMinPitch24kHz * 2; constexpr size_t kMaxPitch48kHz = kMaxPitch24kHz * 2; -// Sub-band frequency boundaries. +// Spectral features. +constexpr size_t kFftSizeBy2Plus1 = kFrameSize20ms24kHz / 2 + 1; constexpr size_t kNumBands = 22; -constexpr int kBandFrequencyBoundaries[kNumBands] = { - 0, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 2000, 2400, - 2800, 3200, 4000, 4800, 5600, 6800, 8000, 9600, 12000, 15600, 20000}; - -// Feature extraction parameters. constexpr size_t kNumLowerBands = 6; static_assert((0 < kNumLowerBands) && (kNumLowerBands < kNumBands), ""); -constexpr size_t kSpectralCoeffsHistorySize = 8; -static_assert(kSpectralCoeffsHistorySize > 2, +constexpr size_t kCepstralCoeffsHistorySize = 8; +static_assert(kCepstralCoeffsHistorySize > 2, "The history size must at least be 3 to compute first and second " "derivatives."); diff --git a/modules/audio_processing/agc2/rnn_vad/features_extraction.cc b/modules/audio_processing/agc2/rnn_vad/features_extraction.cc index 8f472a55ed..e9351797f5 100644 --- a/modules/audio_processing/agc2/rnn_vad/features_extraction.cc +++ b/modules/audio_processing/agc2/rnn_vad/features_extraction.cc @@ -78,12 +78,12 @@ bool FeaturesExtractor::CheckSilenceComputeFeatures( // and write the feature vector. 
return spectral_features_extractor_.CheckSilenceComputeFeatures( reference_frame_view_, {lagged_frame.data(), kFrameSize20ms24kHz}, - {{feature_vector.data() + kNumLowerBands, kNumBands - kNumLowerBands}, - {feature_vector.data(), kNumLowerBands}, - {feature_vector.data() + kNumBands, kNumLowerBands}, - {feature_vector.data() + kNumBands + kNumLowerBands, kNumLowerBands}, - {feature_vector.data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands}, - &feature_vector[kFeatureVectorSize - 1]}); + {feature_vector.data() + kNumLowerBands, kNumBands - kNumLowerBands}, + {feature_vector.data(), kNumLowerBands}, + {feature_vector.data() + kNumBands, kNumLowerBands}, + {feature_vector.data() + kNumBands + kNumLowerBands, kNumLowerBands}, + {feature_vector.data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands}, + &feature_vector[kFeatureVectorSize - 1]); } } // namespace rnn_vad diff --git a/modules/audio_processing/agc2/rnn_vad/fft_util.cc b/modules/audio_processing/agc2/rnn_vad/fft_util.cc index 4825e2befe..4cc3ed9545 100644 --- a/modules/audio_processing/agc2/rnn_vad/fft_util.cc +++ b/modules/audio_processing/agc2/rnn_vad/fft_util.cc @@ -35,16 +35,16 @@ std::array ComputeHalfVorbisWindow() { } // namespace -BandAnalysisFft::BandAnalysisFft() +FftUtil::FftUtil() : half_window_(ComputeHalfVorbisWindow()), fft_(static_cast(input_buf_.size())) {} -BandAnalysisFft::~BandAnalysisFft() = default; +FftUtil::~FftUtil() = default; -void BandAnalysisFft::ForwardFft(rtc::ArrayView samples, - rtc::ArrayView> dst) { +void FftUtil::WindowedFft(rtc::ArrayView samples, + rtc::ArrayView> dst) { RTC_DCHECK_EQ(samples.size(), kFrameSize20ms24kHz); - RTC_DCHECK_EQ(dst.size(), kFrameSize20ms24kHz / 2 + 1); + RTC_DCHECK_EQ(dst.size(), kFftSizeBy2Plus1); // Apply windowing. 
RTC_DCHECK_EQ(input_buf_.size(), 2 * half_window_.size()); for (size_t i = 0; i < input_buf_.size() / 2; ++i) { diff --git a/modules/audio_processing/agc2/rnn_vad/fft_util.h b/modules/audio_processing/agc2/rnn_vad/fft_util.h index c744ff6c64..e38b0fffae 100644 --- a/modules/audio_processing/agc2/rnn_vad/fft_util.h +++ b/modules/audio_processing/agc2/rnn_vad/fft_util.h @@ -21,32 +21,31 @@ namespace webrtc { namespace rnn_vad { -// TODO(alessiob): Switch to PFFFT using its own wrapper. -// TODO(alessiob): Delete this class when switching to PFFFT. +// TODO(alessiob): Switch to PFFFT and remove this class. // FFT implementation wrapper for the band-wise analysis step in which 20 ms // frames at 24 kHz are analyzed in the frequency domain. The goal of this class // are (i) making easy to switch to another FFT implementation, (ii) own the // input buffer for the FFT and (iii) apply a windowing function before // computing the FFT. -class BandAnalysisFft { +class FftUtil { public: - BandAnalysisFft(); - BandAnalysisFft(const BandAnalysisFft&) = delete; - BandAnalysisFft& operator=(const BandAnalysisFft&) = delete; - ~BandAnalysisFft(); + FftUtil(); + FftUtil(const FftUtil&) = delete; + FftUtil& operator=(const FftUtil&) = delete; + ~FftUtil(); // Applies a windowing function to |samples|, computes the real forward FFT // and writes the result in |dst|. // The size of |samples| must be 480 (20 ms at 24 kHz). // The size of |dst| must be 241 since the complex conjugate is not written. 
- void ForwardFft(rtc::ArrayView samples, - rtc::ArrayView> dst); + void WindowedFft(rtc::ArrayView samples, + rtc::ArrayView> dst); private: static_assert((kFrameSize20ms24kHz & 1) == 0, "kFrameSize20ms24kHz must be even."); const std::array half_window_; - std::array, kFrameSize20ms24kHz> input_buf_{}; - std::array, kFrameSize20ms24kHz> output_buf_{}; + std::array, kFrameSize20ms24kHz> input_buf_; + std::array, kFrameSize20ms24kHz> output_buf_; rnnoise::KissFft fft_; }; diff --git a/modules/audio_processing/agc2/rnn_vad/fft_util_unittest.cc b/modules/audio_processing/agc2/rnn_vad/fft_util_unittest.cc index 28f56bd069..a7efa1eb41 100644 --- a/modules/audio_processing/agc2/rnn_vad/fft_util_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/fft_util_unittest.cc @@ -39,16 +39,16 @@ std::vector CreateSine(float amplitude, } // namespace -TEST(RnnVadTest, BandAnalysisFftTest) { +TEST(RnnVadTest, FftUtilTest) { for (float frequency_hz : {200.f, 450.f, 1500.f}) { SCOPED_TRACE(frequency_hz); auto x = CreateSine( /*amplitude=*/1000.f, frequency_hz, /*duration_s=*/0.02f, /*sample_rate_hz=*/kSampleRate24kHz); - BandAnalysisFft analyzer; + FftUtil analyzer; std::vector> x_fft(x.size() / 2 + 1); - analyzer.ForwardFft(x, x_fft); + analyzer.WindowedFft(x, x_fft); int peak_fft_bin_index = std::distance( x_fft.begin(), std::max_element(x_fft.begin(), x_fft.end(), diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features.cc b/modules/audio_processing/agc2/rnn_vad/spectral_features.cc index 84db2dfecd..82355798a0 100644 --- a/modules/audio_processing/agc2/rnn_vad/spectral_features.cc +++ b/modules/audio_processing/agc2/rnn_vad/spectral_features.cc @@ -15,7 +15,6 @@ #include #include -#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h" #include "rtc_base/checks.h" namespace webrtc { @@ -24,21 +23,21 @@ namespace { constexpr float kSilenceThreshold = 0.04f; -// Computes the new spectral difference stats and pushes them into the passed +// 
Computes the new cepstral difference stats and pushes them into the passed // symmetric matrix buffer. -void UpdateSpectralDifferenceStats( - rtc::ArrayView new_spectral_coeffs, - const RingBuffer& ring_buf, - SymmetricMatrixBuffer* sym_matrix_buf) { +void UpdateCepstralDifferenceStats( + rtc::ArrayView new_cepstral_coeffs, + const RingBuffer& ring_buf, + SymmetricMatrixBuffer* sym_matrix_buf) { RTC_DCHECK(sym_matrix_buf); - // Compute the new spectral distance stats. - std::array distances; - for (size_t i = 0; i < kSpectralCoeffsHistorySize - 1; ++i) { + // Compute the new cepstral distance stats. + std::array distances; + for (size_t i = 0; i < kCepstralCoeffsHistorySize - 1; ++i) { const size_t delay = i + 1; - auto old_spectral_coeffs = ring_buf.GetArrayView(delay); + auto old_cepstral_coeffs = ring_buf.GetArrayView(delay); distances[i] = 0.f; for (size_t k = 0; k < kNumBands; ++k) { - const float c = new_spectral_coeffs[k] - old_spectral_coeffs[k]; + const float c = new_cepstral_coeffs[k] - old_cepstral_coeffs[k]; distances[i] += c * c; } } @@ -48,96 +47,77 @@ void UpdateSpectralDifferenceStats( } // namespace -SpectralFeaturesView::SpectralFeaturesView( - rtc::ArrayView coeffs, - rtc::ArrayView average, - rtc::ArrayView first_derivative, - rtc::ArrayView second_derivative, - rtc::ArrayView cross_correlations, - float* variability) - : coeffs(coeffs), - average(average), - first_derivative(first_derivative), - second_derivative(second_derivative), - cross_correlations(cross_correlations), - variability(variability) {} - -SpectralFeaturesView::SpectralFeaturesView(const SpectralFeaturesView&) = - default; -SpectralFeaturesView::~SpectralFeaturesView() = default; - SpectralFeaturesExtractor::SpectralFeaturesExtractor() : fft_(), - reference_frame_fft_(kFrameSize20ms24kHz / 2 + 1), - lagged_frame_fft_(kFrameSize20ms24kHz / 2 + 1), - band_boundaries_( - ComputeBandBoundaryIndexes(kSampleRate24kHz, kFrameSize20ms24kHz)), + reference_frame_fft_(kFftSizeBy2Plus1), + 
lagged_frame_fft_(kFftSizeBy2Plus1), dct_table_(ComputeDctTable()) {} SpectralFeaturesExtractor::~SpectralFeaturesExtractor() = default; void SpectralFeaturesExtractor::Reset() { - spectral_coeffs_ring_buf_.Reset(); - spectral_diffs_buf_.Reset(); + cepstral_coeffs_ring_buf_.Reset(); + cepstral_diffs_buf_.Reset(); } bool SpectralFeaturesExtractor::CheckSilenceComputeFeatures( rtc::ArrayView reference_frame, rtc::ArrayView lagged_frame, - SpectralFeaturesView spectral_features) { - // Analyze reference frame. - fft_.ForwardFft(reference_frame, reference_frame_fft_); - ComputeBandEnergies(reference_frame_fft_, band_boundaries_, - reference_frame_energy_coeffs_); + rtc::ArrayView higher_bands_cepstrum, + rtc::ArrayView average, + rtc::ArrayView first_derivative, + rtc::ArrayView second_derivative, + rtc::ArrayView bands_cross_corr, + float* variability) { + // Compute the Opus band energies for the reference frame. + fft_.WindowedFft(reference_frame, reference_frame_fft_); + spectral_correlator_.ComputeAutoCorrelation( + {reference_frame_fft_.data(), kFftSizeBy2Plus1}, + reference_frame_bands_energy_); // Check if the reference frame has silence. const float tot_energy = - std::accumulate(reference_frame_energy_coeffs_.begin(), - reference_frame_energy_coeffs_.end(), 0.f); - if (tot_energy < kSilenceThreshold) + std::accumulate(reference_frame_bands_energy_.begin(), + reference_frame_bands_energy_.end(), 0.f); + if (tot_energy < kSilenceThreshold) { return true; - // Analyze lagged frame. - fft_.ForwardFft(lagged_frame, lagged_frame_fft_); - ComputeBandEnergies(lagged_frame_fft_, band_boundaries_, - lagged_frame_energy_coeffs_); + } + // Compute the Opus band energies for the lagged frame. + fft_.WindowedFft(lagged_frame, lagged_frame_fft_); + spectral_correlator_.ComputeAutoCorrelation( + {lagged_frame_fft_.data(), kFftSizeBy2Plus1}, lagged_frame_bands_energy_); // Log of the band energies for the reference frame. 
- std::array log_band_energy_coeffs; - ComputeLogBandEnergiesCoefficients(reference_frame_energy_coeffs_, - log_band_energy_coeffs); - // Decorrelate band-wise log energy coefficients via DCT. - std::array log_band_energy_coeffs_decorrelated; - ComputeDct(log_band_energy_coeffs, dct_table_, - log_band_energy_coeffs_decorrelated); - // Normalize (based on training set stats). - log_band_energy_coeffs_decorrelated[0] -= 12; - log_band_energy_coeffs_decorrelated[1] -= 4; - // Update the ring buffer and the spectral difference stats. - spectral_coeffs_ring_buf_.Push(log_band_energy_coeffs_decorrelated); - UpdateSpectralDifferenceStats(log_band_energy_coeffs_decorrelated, - spectral_coeffs_ring_buf_, - &spectral_diffs_buf_); - // Write the higher bands spectral coefficients. - auto coeffs_src = spectral_coeffs_ring_buf_.GetArrayView(0); - RTC_DCHECK_EQ(coeffs_src.size() - kNumLowerBands, - spectral_features.coeffs.size()); - std::copy(coeffs_src.begin() + kNumLowerBands, coeffs_src.end(), - spectral_features.coeffs.begin()); + std::array log_bands_energy; + ComputeSmoothedLogMagnitudeSpectrum(reference_frame_bands_energy_, + log_bands_energy); + // Reference frame cepstrum. + std::array cepstrum; + ComputeDct(log_bands_energy, dct_table_, cepstrum); + // Ad-hoc correction terms for the first two cepstral coefficients. + cepstrum[0] -= 12.f; + cepstrum[1] -= 4.f; + // Update the ring buffer and the cepstral difference stats. + cepstral_coeffs_ring_buf_.Push(cepstrum); + UpdateCepstralDifferenceStats(cepstrum, cepstral_coeffs_ring_buf_, + &cepstral_diffs_buf_); + // Write the higher bands cepstral coefficients. + RTC_DCHECK_EQ(cepstrum.size() - kNumLowerBands, higher_bands_cepstrum.size()); + std::copy(cepstrum.begin() + kNumLowerBands, cepstrum.end(), + higher_bands_cepstrum.begin()); // Compute and write remaining features. 
- ComputeAvgAndDerivatives(spectral_features.average, - spectral_features.first_derivative, - spectral_features.second_derivative); - ComputeCrossCorrelation(spectral_features.cross_correlations); - RTC_DCHECK(spectral_features.variability); - *(spectral_features.variability) = ComputeVariability(); + ComputeAvgAndDerivatives(average, first_derivative, second_derivative); + ComputeNormalizedCepstralCorrelation(bands_cross_corr); + RTC_DCHECK(variability); + *variability = ComputeVariability(); return false; } void SpectralFeaturesExtractor::ComputeAvgAndDerivatives( rtc::ArrayView average, rtc::ArrayView first_derivative, - rtc::ArrayView second_derivative) { - auto curr = spectral_coeffs_ring_buf_.GetArrayView(0); - auto prev1 = spectral_coeffs_ring_buf_.GetArrayView(1); - auto prev2 = spectral_coeffs_ring_buf_.GetArrayView(2); + rtc::ArrayView second_derivative) const { + auto curr = cepstral_coeffs_ring_buf_.GetArrayView(0); + auto prev1 = cepstral_coeffs_ring_buf_.GetArrayView(1); + auto prev2 = cepstral_coeffs_ring_buf_.GetArrayView(2); RTC_DCHECK_EQ(average.size(), first_derivative.size()); RTC_DCHECK_EQ(first_derivative.size(), second_derivative.size()); RTC_DCHECK_LE(average.size(), curr.size()); @@ -151,47 +131,41 @@ void SpectralFeaturesExtractor::ComputeAvgAndDerivatives( } } -void SpectralFeaturesExtractor::ComputeCrossCorrelation( - rtc::ArrayView cross_correlations) { - const auto& x = reference_frame_fft_; - const auto& y = lagged_frame_fft_; - auto cross_corr = [x, y](const size_t freq_bin_index) -> float { - return (x[freq_bin_index].real() * y[freq_bin_index].real() + - x[freq_bin_index].imag() * y[freq_bin_index].imag()); - }; - std::array cross_corr_coeffs; - constexpr size_t kNumFftPoints = kFrameSize20ms24kHz / 2 + 1; - ComputeBandCoefficients(cross_corr, band_boundaries_, kNumFftPoints - 1, - cross_corr_coeffs); +void SpectralFeaturesExtractor::ComputeNormalizedCepstralCorrelation( + rtc::ArrayView bands_cross_corr) { + 
spectral_correlator_.ComputeCrossCorrelation( + {reference_frame_fft_.data(), kFftSizeBy2Plus1}, + {lagged_frame_fft_.data(), kFftSizeBy2Plus1}, bands_cross_corr_); // Normalize. - for (size_t i = 0; i < cross_corr_coeffs.size(); ++i) { - cross_corr_coeffs[i] = - cross_corr_coeffs[i] / - std::sqrt(0.001f + reference_frame_energy_coeffs_[i] * - lagged_frame_energy_coeffs_[i]); + for (size_t i = 0; i < bands_cross_corr_.size(); ++i) { + bands_cross_corr_[i] = + bands_cross_corr_[i] / + std::sqrt(0.001f + reference_frame_bands_energy_[i] * + lagged_frame_bands_energy_[i]); } - // Decorrelate. - ComputeDct(cross_corr_coeffs, dct_table_, cross_correlations); - // Normalize (based on training set stats). - cross_correlations[0] -= 1.3f; - cross_correlations[1] -= 0.9f; + // Cepstrum. + ComputeDct(bands_cross_corr_, dct_table_, bands_cross_corr); + // Ad-hoc correction terms for the first two cepstral coefficients. + bands_cross_corr[0] -= 1.3f; + bands_cross_corr[1] -= 0.9f; } -float SpectralFeaturesExtractor::ComputeVariability() { - // Compute spectral variability score. - float spec_variability = 0.f; - for (size_t delay1 = 0; delay1 < kSpectralCoeffsHistorySize; ++delay1) { +float SpectralFeaturesExtractor::ComputeVariability() const { + // Compute cepstral variability score. + float variability = 0.f; + for (size_t delay1 = 0; delay1 < kCepstralCoeffsHistorySize; ++delay1) { float min_dist = std::numeric_limits::max(); - for (size_t delay2 = 0; delay2 < kSpectralCoeffsHistorySize; ++delay2) { + for (size_t delay2 = 0; delay2 < kCepstralCoeffsHistorySize; ++delay2) { if (delay1 == delay2) // The distance would be 0. continue; min_dist = - std::min(min_dist, spectral_diffs_buf_.GetValue(delay1, delay2)); + std::min(min_dist, cepstral_diffs_buf_.GetValue(delay1, delay2)); } - spec_variability += min_dist; + variability += min_dist; } // Normalize (based on training set stats). 
- return spec_variability / kSpectralCoeffsHistorySize - 2.1f; + // TODO(bugs.webrtc.org/10480): Isolate normalization from feature extraction. + return variability / kCepstralCoeffsHistorySize - 2.1f; } } // namespace rnn_vad diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features.h b/modules/audio_processing/agc2/rnn_vad/spectral_features.h index 5c33dcdd24..047af247ce 100644 --- a/modules/audio_processing/agc2/rnn_vad/spectral_features.h +++ b/modules/audio_processing/agc2/rnn_vad/spectral_features.h @@ -20,34 +20,12 @@ #include "modules/audio_processing/agc2/rnn_vad/common.h" #include "modules/audio_processing/agc2/rnn_vad/fft_util.h" #include "modules/audio_processing/agc2/rnn_vad/ring_buffer.h" +#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h" #include "modules/audio_processing/agc2/rnn_vad/symmetric_matrix_buffer.h" namespace webrtc { namespace rnn_vad { -// View on spectral features. -class SpectralFeaturesView { - public: - SpectralFeaturesView(rtc::ArrayView coeffs, - rtc::ArrayView average, - rtc::ArrayView first_derivative, - rtc::ArrayView second_derivative, - rtc::ArrayView cross_correlations, - float* variability); - SpectralFeaturesView(const SpectralFeaturesView&); - ~SpectralFeaturesView(); - // Higher bands spectral coefficients. - const rtc::ArrayView coeffs; - // Average and first and second derivative over time for the lower bands. - const rtc::ArrayView average; - const rtc::ArrayView first_derivative; - const rtc::ArrayView second_derivative; - // Spectral cross-correlation for the lower bands. - const rtc::ArrayView cross_correlations; - // Spectral variability score. - float* const variability; -}; - // Class to compute spectral features. 
class SpectralFeaturesExtractor { public: @@ -64,27 +42,33 @@ class SpectralFeaturesExtractor { bool CheckSilenceComputeFeatures( rtc::ArrayView reference_frame, rtc::ArrayView lagged_frame, - SpectralFeaturesView spectral_features); + rtc::ArrayView higher_bands_cepstrum, + rtc::ArrayView average, + rtc::ArrayView first_derivative, + rtc::ArrayView second_derivative, + rtc::ArrayView bands_cross_corr, + float* variability); private: void ComputeAvgAndDerivatives( rtc::ArrayView average, rtc::ArrayView first_derivative, - rtc::ArrayView second_derivative); - void ComputeCrossCorrelation( - rtc::ArrayView cross_correlations); - float ComputeVariability(); + rtc::ArrayView second_derivative) const; + void ComputeNormalizedCepstralCorrelation( + rtc::ArrayView bands_cross_corr); + float ComputeVariability() const; - BandAnalysisFft fft_; + FftUtil fft_; std::vector> reference_frame_fft_; std::vector> lagged_frame_fft_; - std::array reference_frame_energy_coeffs_{}; - std::array lagged_frame_energy_coeffs_{}; - const std::array band_boundaries_; + SpectralCorrelator spectral_correlator_; + std::array reference_frame_bands_energy_; + std::array lagged_frame_bands_energy_; + std::array bands_cross_corr_; const std::array dct_table_; - RingBuffer - spectral_coeffs_ring_buf_; - SymmetricMatrixBuffer spectral_diffs_buf_; + RingBuffer + cepstral_coeffs_ring_buf_; + SymmetricMatrixBuffer cepstral_diffs_buf_; }; } // namespace rnn_vad diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.cc b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.cc index 74211fe814..8135e3c333 100644 --- a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.cc +++ b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.cc @@ -20,85 +20,126 @@ namespace webrtc { namespace rnn_vad { namespace { -// DCT scaling factor. -static_assert( - kNumBands == 22, - "kNumBands changed! 
Please update the value of kDctScalingFactor"); -constexpr float kDctScalingFactor = 0.301511345f; // sqrt(2 / kNumBands) +// Weights for each FFT coefficient for each Opus band (Nyquist frequency +// excluded). The size of each band is specified in +// |kOpusScaleNumBins24kHz20ms|. +constexpr std::array kOpusBandWeights24kHz20ms = + {{ + 0.f, 0.25f, 0.5f, 0.75f, // Band 0 + 0.f, 0.25f, 0.5f, 0.75f, // Band 1 + 0.f, 0.25f, 0.5f, 0.75f, // Band 2 + 0.f, 0.25f, 0.5f, 0.75f, // Band 3 + 0.f, 0.25f, 0.5f, 0.75f, // Band 4 + 0.f, 0.25f, 0.5f, 0.75f, // Band 5 + 0.f, 0.25f, 0.5f, 0.75f, // Band 6 + 0.f, 0.25f, 0.5f, 0.75f, // Band 7 + 0.f, 0.125f, 0.25f, 0.375f, 0.5f, + 0.625f, 0.75f, 0.875f, // Band 8 + 0.f, 0.125f, 0.25f, 0.375f, 0.5f, + 0.625f, 0.75f, 0.875f, // Band 9 + 0.f, 0.125f, 0.25f, 0.375f, 0.5f, + 0.625f, 0.75f, 0.875f, // Band 10 + 0.f, 0.125f, 0.25f, 0.375f, 0.5f, + 0.625f, 0.75f, 0.875f, // Band 11 + 0.f, 0.0625f, 0.125f, 0.1875f, 0.25f, + 0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f, + 0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f, + 0.9375f, // Band 12 + 0.f, 0.0625f, 0.125f, 0.1875f, 0.25f, + 0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f, + 0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f, + 0.9375f, // Band 13 + 0.f, 0.0625f, 0.125f, 0.1875f, 0.25f, + 0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f, + 0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f, + 0.9375f, // Band 14 + 0.f, 0.0416667f, 0.0833333f, 0.125f, 0.166667f, + 0.208333f, 0.25f, 0.291667f, 0.333333f, 0.375f, + 0.416667f, 0.458333f, 0.5f, 0.541667f, 0.583333f, + 0.625f, 0.666667f, 0.708333f, 0.75f, 0.791667f, + 0.833333f, 0.875f, 0.916667f, 0.958333f, // Band 15 + 0.f, 0.0416667f, 0.0833333f, 0.125f, 0.166667f, + 0.208333f, 0.25f, 0.291667f, 0.333333f, 0.375f, + 0.416667f, 0.458333f, 0.5f, 0.541667f, 0.583333f, + 0.625f, 0.666667f, 0.708333f, 0.75f, 0.791667f, + 0.833333f, 0.875f, 0.916667f, 0.958333f, // Band 16 + 0.f, 0.03125f, 0.0625f, 0.09375f, 0.125f, + 0.15625f, 0.1875f, 0.21875f, 0.25f, 0.28125f, + 0.3125f, 0.34375f, 0.375f, 
0.40625f, 0.4375f, + 0.46875f, 0.5f, 0.53125f, 0.5625f, 0.59375f, + 0.625f, 0.65625f, 0.6875f, 0.71875f, 0.75f, + 0.78125f, 0.8125f, 0.84375f, 0.875f, 0.90625f, + 0.9375f, 0.96875f, // Band 17 + 0.f, 0.0208333f, 0.0416667f, 0.0625f, 0.0833333f, + 0.104167f, 0.125f, 0.145833f, 0.166667f, 0.1875f, + 0.208333f, 0.229167f, 0.25f, 0.270833f, 0.291667f, + 0.3125f, 0.333333f, 0.354167f, 0.375f, 0.395833f, + 0.416667f, 0.4375f, 0.458333f, 0.479167f, 0.5f, + 0.520833f, 0.541667f, 0.5625f, 0.583333f, 0.604167f, + 0.625f, 0.645833f, 0.666667f, 0.6875f, 0.708333f, + 0.729167f, 0.75f, 0.770833f, 0.791667f, 0.8125f, + 0.833333f, 0.854167f, 0.875f, 0.895833f, 0.916667f, + 0.9375f, 0.958333f, 0.979167f // Band 18 + }}; } // namespace -std::array ComputeBandBoundaryIndexes( - size_t sample_rate_hz, - size_t frame_size_samples) { - std::array indexes; - for (size_t i = 0; i < kNumBands; ++i) { - indexes[i] = - kBandFrequencyBoundaries[i] * frame_size_samples / sample_rate_hz; - } - return indexes; +SpectralCorrelator::SpectralCorrelator() + : weights_(kOpusBandWeights24kHz20ms.begin(), + kOpusBandWeights24kHz20ms.end()) {} + +SpectralCorrelator::~SpectralCorrelator() = default; + +void SpectralCorrelator::ComputeAutoCorrelation( + rtc::ArrayView, kFftSizeBy2Plus1> x, + rtc::ArrayView auto_corr) const { + ComputeCrossCorrelation(x, x, auto_corr); } -void ComputeBandCoefficients( - rtc::FunctionView functor, - rtc::ArrayView band_boundaries, - size_t max_freq_bin_index, - rtc::ArrayView coefficients) { - std::fill(coefficients.begin(), coefficients.end(), 0.f); - for (size_t i = 0; i < coefficients.size() - 1; ++i) { - RTC_DCHECK_EQ(0.f, coefficients[i + 1]); - RTC_DCHECK_GT(band_boundaries[i + 1], band_boundaries[i]); - const size_t first_freq_bin = band_boundaries[i]; - const size_t last_freq_bin = - std::min(max_freq_bin_index, first_freq_bin + band_boundaries[i + 1] - - band_boundaries[i] - 1); - // Depending on the sample rate, the highest bands can have no FFT - // coefficients. 
Stop the iteration when coming across the first empty band. - if (first_freq_bin >= last_freq_bin) - break; - const size_t band_size = last_freq_bin - first_freq_bin + 1; - // Compute the band coefficient using a triangular band with peak response - // at the band boundary. - for (size_t j = first_freq_bin; j <= last_freq_bin; ++j) { - const float w = static_cast(j - first_freq_bin) / band_size; - const float coefficient = functor(j); - coefficients[i] += (1.f - w) * coefficient; - coefficients[i + 1] += w * coefficient; +void SpectralCorrelator::ComputeCrossCorrelation( + rtc::ArrayView, kFftSizeBy2Plus1> x, + rtc::ArrayView, kFftSizeBy2Plus1> y, + rtc::ArrayView cross_corr) const { + constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms(); + size_t k = 0; // Next Fourier coefficient index. + cross_corr[0] = 0.f; + for (size_t i = 0; i < kOpusBands24kHz - 1; ++i) { + cross_corr[i + 1] = 0.f; + for (int j = 0; j < kOpusScaleNumBins24kHz20ms[i]; ++j) { // Band size. + const float v = x[k].real() * y[k].real() + x[k].imag() * y[k].imag(); + const float tmp = weights_[k] * v; + cross_corr[i] += v - tmp; + cross_corr[i + 1] += tmp; + k++; } } - // The first and the last bands in the loop above only got half contribution. - coefficients[0] *= 2.f; - coefficients[coefficients.size() - 1] *= 2.f; - // TODO(bugs.webrtc.org/9076): Replace the line above with - // "coefficients[i] *= 2.f" (*) since we now assume that the last band is - // always |kNumBands| - 1. - // (*): "size_t i" must be declared before the main loop. + cross_corr[0] *= 2.f; // The first band only gets half contribution. + // The Nyquist coefficient is never used. 
+ RTC_DCHECK_EQ(k, kFftSizeBy2Plus1 - 1); } -void ComputeBandEnergies( - rtc::ArrayView> fft_coeffs, - rtc::ArrayView band_boundaries, - rtc::ArrayView band_energies) { - RTC_DCHECK_EQ(band_boundaries.size(), band_energies.size()); - auto functor = [fft_coeffs](const size_t freq_bin_index) -> float { - return std::norm(fft_coeffs[freq_bin_index]); +void ComputeSmoothedLogMagnitudeSpectrum( + rtc::ArrayView bands_energy, + rtc::ArrayView log_bands_energy) { + RTC_DCHECK_LE(bands_energy.size(), kNumBands); + constexpr float kOneByHundred = 1e-2f; + constexpr float kLogOneByHundred = -2.f; + // Init. + float log_max = kLogOneByHundred; + float follow = kLogOneByHundred; + const auto smooth = [&log_max, &follow](float x) { + x = std::max(log_max - 7.f, std::max(follow - 1.5f, x)); + log_max = std::max(log_max, x); + follow = std::max(follow - 1.5f, x); + return x; }; - ComputeBandCoefficients(functor, band_boundaries, fft_coeffs.size() - 1, - band_energies); -} - -void ComputeLogBandEnergiesCoefficients( - rtc::ArrayView band_energy_coeffs, - rtc::ArrayView log_band_energy_coeffs) { - float log_max = -2.f; - float follow = -2.f; - for (size_t i = 0; i < band_energy_coeffs.size(); ++i) { - log_band_energy_coeffs[i] = std::log10(1e-2f + band_energy_coeffs[i]); - // Smoothing across frequency bands. - log_band_energy_coeffs[i] = std::max( - log_max - 7.f, std::max(follow - 1.5f, log_band_energy_coeffs[i])); - log_max = std::max(log_max, log_band_energy_coeffs[i]); - follow = std::max(follow - 1.5f, log_band_energy_coeffs[i]); + // Smoothing over the bands for which the band energy is defined. + for (size_t i = 0; i < bands_energy.size(); ++i) { + log_bands_energy[i] = smooth(std::log10(kOneByHundred + bands_energy[i])); + } + // Smoothing over the remaining bands (zero energy). 
+ for (size_t i = bands_energy.size(); i < kNumBands; ++i) { + log_bands_energy[i] = smooth(kLogOneByHundred); } } @@ -113,17 +154,28 @@ std::array ComputeDctTable() { return dct_table; } -void ComputeDct(rtc::ArrayView in, +void ComputeDct(rtc::ArrayView in, rtc::ArrayView dct_table, rtc::ArrayView out) { + // DCT scaling factor - i.e., sqrt(2 / kNumBands). + constexpr float kDctScalingFactor = 0.301511345f; + constexpr float kDctScalingFactorError = + kDctScalingFactor * kDctScalingFactor - + 2.f / static_cast(kNumBands); + static_assert( + (kDctScalingFactorError >= 0.f && kDctScalingFactorError < 1e-1f) || + (kDctScalingFactorError < 0.f && kDctScalingFactorError > -1e-1f), + "kNumBands changed and kDctScalingFactor has not been updated."); RTC_DCHECK_NE(in.data(), out.data()) << "In-place DCT is not supported."; + RTC_DCHECK_LE(in.size(), kNumBands); RTC_DCHECK_LE(1, out.size()); RTC_DCHECK_LE(out.size(), in.size()); - std::fill(out.begin(), out.end(), 0.f); for (size_t i = 0; i < out.size(); ++i) { + out[i] = 0.f; for (size_t j = 0; j < in.size(); ++j) { - out[i] += in[j] * dct_table[j * in.size() + i]; + out[i] += in[j] * dct_table[j * kNumBands + i]; } + // TODO(bugs.webrtc.org/10480): Scaling factor in the DCT table. out[i] *= kDctScalingFactor; } } diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h index 14ff56031f..0ec9652370 100644 --- a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h +++ b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h @@ -14,49 +14,75 @@ #include #include #include +#include #include "api/array_view.h" -#include "api/function_view.h" #include "modules/audio_processing/agc2/rnn_vad/common.h" namespace webrtc { namespace rnn_vad { -// Computes FFT boundary indexes corresponding to sub-bands. 
-std::array ComputeBandBoundaryIndexes( - size_t sample_rate_hz, - size_t frame_size_samples); +// At a sample rate of 24 kHz, the last 3 Opus bands are beyond the Nyquist +// frequency. However, band #19 gets the contributions from band #18 because +// of the symmetric triangular filter with peak response at 12 kHz. +constexpr size_t kOpusBands24kHz = 20; +static_assert(kOpusBands24kHz < kNumBands, + "The number of bands at 24 kHz must be less than those defined " + "in the Opus scale at 48 kHz."); -// Iterates through frequency bands and computes coefficients via |functor| for -// triangular bands with peak response at each band boundary. |functor| returns -// a floating point value for the FFT coefficient having index equal to the -// argument passed to |functor|; that argument is in the range {0, ... -// |max_freq_bin_index| - 1}. -void ComputeBandCoefficients( - rtc::FunctionView functor, - rtc::ArrayView band_boundaries, - const size_t max_freq_bin_index, - rtc::ArrayView coefficients); +// Number of FFT frequency bins covered by each band in the Opus scale at a +// sample rate of 24 kHz for 20 ms frames. +// Declared here for unit testing. +constexpr std::array GetOpusScaleNumBins24kHz20ms() { + return {4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 24, 24, 32, 48}; +} -// Given an array of FFT coefficients and a vector of band boundary indexes, -// computes band energy coefficients. -void ComputeBandEnergies( - rtc::ArrayView> fft_coeffs, - rtc::ArrayView band_boundaries, - rtc::ArrayView band_energies); +// TODO(bugs.webrtc.org/10480): Move to a separate file. +// Class to compute band-wise spectral features in the Opus perceptual scale +// for 20 ms frames sampled at 24 kHz. The analysis methods apply triangular +// filters with peak response at the each band boundary. +class SpectralCorrelator { + public: + // Ctor. 
+ SpectralCorrelator(); + SpectralCorrelator(const SpectralCorrelator&) = delete; + SpectralCorrelator& operator=(const SpectralCorrelator&) = delete; + ~SpectralCorrelator(); -// Computes log band energy coefficients. -void ComputeLogBandEnergiesCoefficients( - rtc::ArrayView band_energy_coeffs, - rtc::ArrayView log_band_energy_coeffs); + // Computes the band-wise spectral auto-correlations. + void ComputeAutoCorrelation( + rtc::ArrayView, kFftSizeBy2Plus1> x, + rtc::ArrayView auto_corr) const; -// Creates a DCT table for arrays having size equal to |kNumBands|. + // Computes the band-wise spectral cross-correlations. + void ComputeCrossCorrelation( + rtc::ArrayView, kFftSizeBy2Plus1> x, + rtc::ArrayView, kFftSizeBy2Plus1> y, + rtc::ArrayView cross_corr) const; + + private: + const std::vector weights_; // Weights for each Fourier coefficient. +}; + +// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in +// spectral_features.cc. Given a vector of Opus-bands energy coefficients, +// computes the log magnitude spectrum applying smoothing both over time and +// over frequency. Declared here for unit testing. +void ComputeSmoothedLogMagnitudeSpectrum( + rtc::ArrayView bands_energy, + rtc::ArrayView log_bands_energy); + +// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in +// spectral_features.cc. Creates a DCT table for arrays having size equal to +// |kNumBands|. Declared here for unit testing. std::array ComputeDctTable(); -// Computes DCT for |in| given a pre-computed DCT table. In-place computation is -// not allowed and |out| can be smaller than |in| in order to only compute the -// first DCT coefficients. -void ComputeDct(rtc::ArrayView in, +// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in +// spectral_features.cc. Computes DCT for |in| given a pre-computed DCT table. +// In-place computation is not allowed and |out| can be smaller than |in| in +// order to only compute the first DCT coefficients. 
Declared here for unit +// testing. +void ComputeDct(rtc::ArrayView in, rtc::ArrayView dct_table, rtc::ArrayView out); diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal_unittest.cc b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal_unittest.cc index 5e769bf674..4ff711883f 100644 --- a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal_unittest.cc @@ -10,6 +10,13 @@ #include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h" +#include +#include +#include +#include +#include + +#include "api/array_view.h" #include "modules/audio_processing/agc2/rnn_vad/test_utils.h" // TODO(bugs.webrtc.org/8948): Add when the issue is fixed. // #include "test/fpe_observer.h" @@ -20,58 +27,76 @@ namespace rnn_vad { namespace test { namespace { -constexpr size_t kSampleRate48kHz = 48000; -constexpr size_t kFrameSize20ms48kHz = 2 * kSampleRate48kHz / 100; -constexpr size_t kFftNumCoeffs20ms48kHz = kFrameSize20ms48kHz / 2 + 1; +// Generates the values for the array named |kOpusBandWeights24kHz20ms| in the +// anonymous namespace of the .cc file, which is the array of FFT coefficient +// weights for the Opus scale triangular filters. +std::vector ComputeTriangularFiltersWeights() { + constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms(); + const auto& v = kOpusScaleNumBins24kHz20ms; // Alias. 
+ const size_t num_weights = std::accumulate( + kOpusScaleNumBins24kHz20ms.begin(), kOpusScaleNumBins24kHz20ms.end(), 0); + std::vector weights(num_weights); + size_t next_fft_coeff_index = 0; + for (size_t band = 0; band < v.size(); ++band) { + const size_t band_size = v[band]; + for (size_t j = 0; j < band_size; ++j) { + weights[next_fft_coeff_index + j] = static_cast(j) / band_size; + } + next_fft_coeff_index += band_size; + } + return weights; +} } // namespace -// TODO(bugs.webrtc.org/9076): Remove this test before closing the issue. -// Check that when using precomputed FFT coefficients for frames at 48 kHz, the -// output of ComputeBandEnergies() is bit exact. -TEST(RnnVadTest, ComputeBandEnergies48kHzBitExactness) { - // Initialize input data reader and buffers. - auto fft_coeffs_reader = CreateFftCoeffsReader(); - const size_t num_frames = fft_coeffs_reader.second; - ASSERT_EQ( - kFftNumCoeffs20ms48kHz, - rtc::CheckedDivExact(fft_coeffs_reader.first->data_length(), num_frames) / - 2); - std::array fft_coeffs_real; - std::array fft_coeffs_imag; - std::array, kFftNumCoeffs20ms48kHz> fft_coeffs; - // Init expected output reader and buffer. - auto band_energies_reader = CreateBandEnergyCoeffsReader(); - ASSERT_EQ(num_frames, band_energies_reader.second); - std::array expected_band_energies; - // Init band energies coefficients computation. - const auto band_boundary_indexes = - ComputeBandBoundaryIndexes(kSampleRate48kHz, kFrameSize20ms48kHz); - std::array computed_band_energies; - - // Check output for every frame. - { - // TODO(bugs.webrtc.org/8948): Add when the issue is fixed. - // FloatingPointExceptionObserver fpe_observer; - for (size_t i = 0; i < num_frames; ++i) { - SCOPED_TRACE(i); - // Read input. 
- fft_coeffs_reader.first->ReadChunk(fft_coeffs_real); - fft_coeffs_reader.first->ReadChunk(fft_coeffs_imag); - for (size_t i = 0; i < kFftNumCoeffs20ms48kHz; ++i) { - fft_coeffs[i].real(fft_coeffs_real[i]); - fft_coeffs[i].imag(fft_coeffs_imag[i]); - } - band_energies_reader.first->ReadChunk(expected_band_energies); - // Compute band energy coefficients and check output. - ComputeBandEnergies(fft_coeffs, band_boundary_indexes, - computed_band_energies); - ExpectEqualFloatArray(expected_band_energies, computed_band_energies); - } +// Checks that the values returned by GetOpusScaleNumBins24kHz20ms() match the +// Opus scale frequency boundaries. +TEST(RnnVadTest, TestOpusScaleBoundaries) { + constexpr int kBandFrequencyBoundariesHz[kNumBands - 1] = { + 200, 400, 600, 800, 1000, 1200, 1400, 1600, 2000, 2400, 2800, + 3200, 4000, 4800, 5600, 6800, 8000, 9600, 12000, 15600, 20000}; + constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms(); + int prev = 0; + for (size_t i = 0; i < kOpusScaleNumBins24kHz20ms.size(); ++i) { + int boundary = + kBandFrequencyBoundariesHz[i] * kFrameSize20ms24kHz / kSampleRate24kHz; + EXPECT_EQ(kOpusScaleNumBins24kHz20ms[i], boundary - prev); + prev = boundary; } } -TEST(RnnVadTest, ComputeLogBandEnergiesCoefficientsBitExactness) { +// Checks that the computed triangular filters weights for the Opus scale are +// monotonic within each Opus band. This test should only be enabled when +// ComputeTriangularFiltersWeights() is changed and |kOpusBandWeights24kHz20ms| +// is updated accordingly.
+TEST(RnnVadTest, DISABLED_TestOpusScaleWeights) { + auto weights = ComputeTriangularFiltersWeights(); + size_t i = 0; + for (size_t band_size : GetOpusScaleNumBins24kHz20ms()) { + SCOPED_TRACE(band_size); + rtc::ArrayView band_weights(weights.data() + i, band_size); + float prev = -1.f; + for (float weight : band_weights) { + EXPECT_LT(prev, weight); + prev = weight; + } + i += band_size; + } +} + +TEST(RnnVadTest, SpectralCorrelatorValidOutput) { + SpectralCorrelator e; + std::array, kFftSizeBy2Plus1> in; + std::array out; + in.fill({1.f, 1.f}); + e.ComputeAutoCorrelation(in, out); + for (size_t i = 0; i < kOpusBands24kHz; ++i) { + SCOPED_TRACE(i); + EXPECT_GT(out[i], 0.f); + } +} + +TEST(RnnVadTest, ComputeSmoothedLogMagnitudeSpectrumWithinTolerance) { constexpr std::array input = { {86.060539245605f, 275.668334960938f, 43.406528472900f, 6.541896820068f, 17.964015960693f, 8.090919494629f, 1.261920094490f, 1.212702631950f, @@ -90,7 +115,7 @@ TEST(RnnVadTest, ComputeLogBandEnergiesCoefficientsBitExactness) { { // TODO(bugs.webrtc.org/8948): Add when the issue is fixed. 
// FloatingPointExceptionObserver fpe_observer; - ComputeLogBandEnergiesCoefficients(input, computed_output); + ComputeSmoothedLogMagnitudeSpectrum(input, computed_output); ExpectNearAbsolute(expected_output, computed_output, 1e-5f); } } diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features_unittest.cc b/modules/audio_processing/agc2/rnn_vad/spectral_features_unittest.cc index 557e41e97c..39b9f93eb1 100644 --- a/modules/audio_processing/agc2/rnn_vad/spectral_features_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/spectral_features_unittest.cc @@ -32,15 +32,35 @@ void WriteTestData(rtc::ArrayView samples) { } } -SpectralFeaturesView GetSpectralFeaturesView( +rtc::ArrayView GetHigherBandsSpectrum( std::array* feature_vector) { - return { - {feature_vector->data() + kNumLowerBands, kNumBands - kNumLowerBands}, - {feature_vector->data(), kNumLowerBands}, - {feature_vector->data() + kNumBands, kNumLowerBands}, - {feature_vector->data() + kNumBands + kNumLowerBands, kNumLowerBands}, - {feature_vector->data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands}, - &(*feature_vector)[kNumBands + 3 * kNumLowerBands]}; + return {feature_vector->data() + kNumLowerBands, kNumBands - kNumLowerBands}; +} + +rtc::ArrayView GetAverage( + std::array* feature_vector) { + return {feature_vector->data(), kNumLowerBands}; +} + +rtc::ArrayView GetFirstDerivative( + std::array* feature_vector) { + return {feature_vector->data() + kNumBands, kNumLowerBands}; +} + +rtc::ArrayView GetSecondDerivative( + std::array* feature_vector) { + return {feature_vector->data() + kNumBands + kNumLowerBands, kNumLowerBands}; +} + +rtc::ArrayView GetCepstralCrossCorrelation( + std::array* feature_vector) { + return {feature_vector->data() + kNumBands + 2 * kNumLowerBands, + kNumLowerBands}; +} + +float* GetCepstralVariability( + std::array* feature_vector) { + return feature_vector->data() + kNumBands + 3 * kNumLowerBands; } constexpr float kInitialFeatureVal = -9999.f; @@ -54,7 +74,6 
@@ TEST(RnnVadTest, SpectralFeaturesWithAndWithoutSilence) { rtc::ArrayView samples_view(samples); bool is_silence; std::array feature_vector; - auto feature_vector_view = GetSpectralFeaturesView(&feature_vector); // Write an initial value in the feature vector to detect changes. std::fill(feature_vector.begin(), feature_vector.end(), kInitialFeatureVal); @@ -64,8 +83,12 @@ TEST(RnnVadTest, SpectralFeaturesWithAndWithoutSilence) { // With silence. std::fill(samples.begin(), samples.end(), 0.f); - is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view, - feature_vector_view); + is_silence = sfe.CheckSilenceComputeFeatures( + samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector), + GetAverage(&feature_vector), GetFirstDerivative(&feature_vector), + GetSecondDerivative(&feature_vector), + GetCepstralCrossCorrelation(&feature_vector), + GetCepstralVariability(&feature_vector)); // Silence is expected, the output won't be overwritten. EXPECT_TRUE(is_silence); EXPECT_TRUE(std::all_of(feature_vector.begin(), feature_vector.end(), @@ -73,18 +96,22 @@ TEST(RnnVadTest, SpectralFeaturesWithAndWithoutSilence) { // With no silence. WriteTestData(samples); - is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view, - feature_vector_view); + is_silence = sfe.CheckSilenceComputeFeatures( + samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector), + GetAverage(&feature_vector), GetFirstDerivative(&feature_vector), + GetSecondDerivative(&feature_vector), + GetCepstralCrossCorrelation(&feature_vector), + GetCepstralVariability(&feature_vector)); // Silence is not expected, the output will be overwritten. EXPECT_FALSE(is_silence); EXPECT_FALSE(std::all_of(feature_vector.begin(), feature_vector.end(), [](float x) { return x == kInitialFeatureVal; })); } -// When the input signal does not change, the spectral coefficients average does -// not change and the derivatives are zero. 
Similarly, the spectral variability +// When the input signal does not change, the cepstral coefficients average does +// not change and the derivatives are zero. Similarly, the cepstral variability // score does not change either. -TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) { +TEST(RnnVadTest, CepstralFeaturesConstantAverageZeroDerivative) { // Initialize. SpectralFeaturesExtractor sfe; std::array samples; @@ -94,17 +121,24 @@ TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) { // Fill the spectral features with test data. std::array feature_vector; - auto feature_vector_view = GetSpectralFeaturesView(&feature_vector); - for (size_t i = 0; i < kSpectralCoeffsHistorySize; ++i) { - is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view, - feature_vector_view); + for (size_t i = 0; i < kCepstralCoeffsHistorySize; ++i) { + is_silence = sfe.CheckSilenceComputeFeatures( + samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector), + GetAverage(&feature_vector), GetFirstDerivative(&feature_vector), + GetSecondDerivative(&feature_vector), + GetCepstralCrossCorrelation(&feature_vector), + GetCepstralVariability(&feature_vector)); } // Feed the test data one last time but using a different output vector. std::array feature_vector_last; - auto feature_vector_last_view = GetSpectralFeaturesView(&feature_vector_last); - is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view, - feature_vector_last_view); + is_silence = sfe.CheckSilenceComputeFeatures( + samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector_last), + GetAverage(&feature_vector_last), + GetFirstDerivative(&feature_vector_last), + GetSecondDerivative(&feature_vector_last), + GetCepstralCrossCorrelation(&feature_vector_last), + GetCepstralVariability(&feature_vector_last)); // Average is unchanged. 
ExpectEqualFloatArray({feature_vector.data(), kNumLowerBands}, @@ -116,7 +150,7 @@ TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) { ExpectEqualFloatArray( {feature_vector_last.data() + kNumBands + kNumLowerBands, kNumLowerBands}, zeros); - // Spectral variability is unchanged. + // Variability is unchanged. EXPECT_FLOAT_EQ(feature_vector[kNumBands + 3 * kNumLowerBands], feature_vector_last[kNumBands + 3 * kNumLowerBands]); } diff --git a/modules/audio_processing/agc2/rnn_vad/test_utils.cc b/modules/audio_processing/agc2/rnn_vad/test_utils.cc index 4dae8cdb3a..14b84a461c 100644 --- a/modules/audio_processing/agc2/rnn_vad/test_utils.cc +++ b/modules/audio_processing/agc2/rnn_vad/test_utils.cc @@ -87,14 +87,6 @@ ReaderPairType CreateFftCoeffsReader() { return {std::move(ptr), rtc::CheckedDivExact(ptr->data_length(), row_size)}; } -ReaderPairType CreateBandEnergyCoeffsReader() { - constexpr size_t num_bands = 22; - auto ptr = absl::make_unique>( - test::ResourcePath("audio_processing/agc2/rnn_vad/band_energies", "dat"), - num_bands); - return {std::move(ptr), rtc::CheckedDivExact(ptr->data_length(), num_bands)}; -} - ReaderPairType CreateSilenceFlagsFeatureMatrixReader() { constexpr size_t feature_vector_size = 42; auto ptr = absl::make_unique>( diff --git a/modules/audio_processing/agc2/rnn_vad/test_utils.h b/modules/audio_processing/agc2/rnn_vad/test_utils.h index f9d7376d43..c11af7f8a3 100644 --- a/modules/audio_processing/agc2/rnn_vad/test_utils.h +++ b/modules/audio_processing/agc2/rnn_vad/test_utils.h @@ -110,9 +110,6 @@ CreateLpResidualAndPitchPeriodGainReader(); // Creates a reader for the FFT coefficients. std::pair>, const size_t> CreateFftCoeffsReader(); -// Instance a reader for the band energy coefficients. -std::pair>, const size_t> -CreateBandEnergyCoeffsReader(); // Creates a reader for the silence flags and the feature matrix. std::pair>, const size_t> CreateSilenceFlagsFeatureMatrixReader();