From 0a007597803efe16b1a427256f2039eea3f53e4c Mon Sep 17 00:00:00 2001 From: aluebs Date: Fri, 26 Feb 2016 17:17:38 -0800 Subject: [PATCH] Fix the stereo support in IntelligibilityEnhancer Review URL: https://codereview.webrtc.org/1729753003 Cr-Commit-Position: refs/heads/master@{#11795} --- .../intelligibility_enhancer.cc | 95 ++++++++----------- .../intelligibility_enhancer.h | 46 +++------ .../intelligibility_enhancer_unittest.cc | 10 +- .../intelligibility/intelligibility_utils.cc | 9 +- .../intelligibility/intelligibility_utils.h | 7 +- .../intelligibility_utils_unittest.cc | 6 +- .../test/intelligibility_proc.cc | 80 ++++++---------- 7 files changed, 92 insertions(+), 161 deletions(-) diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc index 38a7ea32cf..d8f95edcf6 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc @@ -54,29 +54,12 @@ void MapToErbBands(const float* pow, float* result) { for (size_t i = 0; i < filter_bank.size(); ++i) { RTC_DCHECK_GT(filter_bank[i].size(), 0u); - result[i] = DotProduct(&filter_bank[i][0], pow, filter_bank[i].size()); + result[i] = DotProduct(filter_bank[i].data(), pow, filter_bank[i].size()); } } } // namespace -IntelligibilityEnhancer::TransformCallback::TransformCallback( - IntelligibilityEnhancer* parent) - : parent_(parent) { -} - -void IntelligibilityEnhancer::TransformCallback::ProcessAudioBlock( - const std::complex* const* in_block, - size_t in_channels, - size_t frames, - size_t /* out_channels */, - std::complex* const* out_block) { - RTC_DCHECK_EQ(parent_->freqs_, frames); - for (size_t i = 0; i < in_channels; ++i) { - parent_->ProcessClearBlock(in_block[i], out_block[i]); - } -} - IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels) : freqs_(RealFourier::ComplexLength( @@ -88,24 +71,17 @@ IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, clear_power_estimator_(freqs_, kDecayRate), noise_power_estimator_( new intelligibility::PowerEstimator(freqs_, kDecayRate)), - filtered_clear_pow_(new float[bank_size_]), - filtered_noise_pow_(new float[bank_size_]), - center_freqs_(new float[bank_size_]), + filtered_clear_pow_(bank_size_, 0.f), + filtered_noise_pow_(bank_size_, 0.f), + center_freqs_(bank_size_), render_filter_bank_(CreateErbBank(freqs_)), - gains_eq_(new float[bank_size_]), + gains_eq_(bank_size_), gain_applier_(freqs_, kMaxRelativeGainChange), - temp_render_out_buffer_(chunk_length_, num_render_channels_), - render_callback_(this), audio_s16_(chunk_length_), chunks_since_voice_(kSpeechOffsetDelay), is_speech_(false) { RTC_DCHECK_LE(kRho, 1.f); - memset(filtered_clear_pow_.get(), 0, - bank_size_ * sizeof(filtered_clear_pow_[0])); - memset(filtered_noise_pow_.get(), 0, - bank_size_ * sizeof(filtered_noise_pow_[0])); - const size_t erb_index = static_cast( ceilf(11.17f * logf((kClipFreqKhz + 0.312f) / (kClipFreqKhz + 14.6575f)) + 43.f)); @@ -113,10 +89,11 @@ IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz, size_t window_size = static_cast(1 << RealFourier::FftOrder(freqs_)); std::vector kbd_window(window_size); - WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, &kbd_window[0]); + WindowGenerator::KaiserBesselDerived(kKbdAlpha, window_size, + kbd_window.data()); render_mangler_.reset(new LappedTransform( - num_render_channels_, num_render_channels_, chunk_length_, &kbd_window[0], - window_size, window_size / 2, &render_callback_)); + num_render_channels_, num_render_channels_, chunk_length_, + kbd_window.data(), window_size, window_size / 2, this)); } void IntelligibilityEnhancer::SetCaptureNoiseEstimate( @@ -127,7 +104,7 @@ void IntelligibilityEnhancer::SetCaptureNoiseEstimate( noise_power_estimator_.reset( new intelligibility::PowerEstimator(noise.size(), kDecayRate)); } - noise_power_estimator_->Step(&noise[0]); + noise_power_estimator_->Step(noise.data()); } void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, @@ -136,38 +113,40 @@ void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio, RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz); RTC_CHECK_EQ(num_render_channels_, num_channels); is_speech_ = IsSpeech(audio[0]); - render_mangler_->ProcessChunk(audio, temp_render_out_buffer_.channels()); - for (size_t i = 0; i < num_render_channels_; ++i) { - memcpy(audio[i], temp_render_out_buffer_.channels()[i], - chunk_length_ * sizeof(**audio)); - } + render_mangler_->ProcessChunk(audio, audio); } -void IntelligibilityEnhancer::ProcessClearBlock( - const std::complex* in_block, - std::complex* out_block) { +void IntelligibilityEnhancer::ProcessAudioBlock( + const std::complex* const* in_block, + size_t in_channels, + size_t frames, + size_t /* out_channels */, + std::complex* const* out_block) { + RTC_DCHECK_EQ(freqs_, frames); if (is_speech_) { - clear_power_estimator_.Step(in_block); + clear_power_estimator_.Step(in_block[0]); } const std::vector& clear_power = clear_power_estimator_.power(); const std::vector& noise_power = noise_power_estimator_->power(); - MapToErbBands(&clear_power[0], render_filter_bank_, - filtered_clear_pow_.get()); - MapToErbBands(&noise_power[0], capture_filter_bank_, - filtered_noise_pow_.get()); - SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.get()); + MapToErbBands(clear_power.data(), render_filter_bank_, + filtered_clear_pow_.data()); + MapToErbBands(noise_power.data(), capture_filter_bank_, + filtered_noise_pow_.data()); + SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data()); const float power_target = - std::accumulate(&clear_power[0], &clear_power[0] + freqs_, 0.f); + std::accumulate(clear_power.data(), clear_power.data() + freqs_, 0.f); const float power_top = - DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); - SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.get()); + DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); + SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data()); const float power_bot = - DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); + DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); if (power_target >= power_bot && power_target <= power_top) { SolveForLambda(power_target); UpdateErbGains(); } // Else experiencing power underflow, so do nothing. - gain_applier_.Apply(in_block, out_block); + for (size_t i = 0; i < in_channels; ++i) { + gain_applier_.Apply(in_block[i], out_block[i]); + } } void IntelligibilityEnhancer::SolveForLambda(float power_target) { @@ -182,9 +161,9 @@ void IntelligibilityEnhancer::SolveForLambda(float power_target) { int iters = 0; while (std::fabs(power_ratio - 1.f) > kConvergeThresh && iters <= kMaxIters) { const float lambda = (lambda_bot + lambda_top) / 2.f; - SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.get()); + SolveForGainsGivenLambda(lambda, start_freq_, gains_eq_.data()); const float power = - DotProduct(gains_eq_.get(), filtered_clear_pow_.get(), bank_size_); + DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_); if (power < power_target) { lambda_bot = lambda; } else { @@ -286,8 +265,8 @@ void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, float* sols) { const float kMinPower = 1e-5f; - const float* pow_x0 = filtered_clear_pow_.get(); - const float* pow_n0 = filtered_noise_pow_.get(); + const float* pow_x0 = filtered_clear_pow_.data(); + const float* pow_n0 = filtered_noise_pow_.data(); for (size_t n = 0; n < start_freq; ++n) { sols[n] = 1.f; @@ -316,8 +295,8 @@ void IntelligibilityEnhancer::SolveForGainsGivenLambda(float lambda, } bool IntelligibilityEnhancer::IsSpeech(const float* audio) { - FloatToS16(audio, chunk_length_, &audio_s16_[0]); - vad_.ProcessChunk(&audio_s16_[0], chunk_length_, sample_rate_hz_); + FloatToS16(audio, chunk_length_, audio_s16_.data()); + vad_.ProcessChunk(audio_s16_.data(), chunk_length_, sample_rate_hz_); if (vad_.last_voice_probability() > kVoiceProbabilityThreshold) { chunks_since_voice_ = 0; } else if (chunks_since_voice_ < kSpeechOffsetDelay) { diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h index 22a3eab697..3b46d16afe 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h @@ -27,7 +27,7 @@ namespace webrtc { // frequency bin to enhance speech against the noise background. // Details of the model and algorithm can be found in the original paper: // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788 -class IntelligibilityEnhancer { +class IntelligibilityEnhancer : public LappedTransform::Callback { public: IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels); @@ -40,32 +40,19 @@ class IntelligibilityEnhancer { size_t num_channels); bool active() const; + protected: + // All in frequency domain, receives input |in_block|, applies + // intelligibility enhancement, and writes result to |out_block|. + void ProcessAudioBlock(const std::complex* const* in_block, + size_t in_channels, + size_t frames, + size_t out_channels, + std::complex* const* out_block) override; + private: - // Provides access point to the frequency domain. - class TransformCallback : public LappedTransform::Callback { - public: - TransformCallback(IntelligibilityEnhancer* parent); - - // All in frequency domain, receives input |in_block|, applies - // intelligibility enhancement, and writes result to |out_block|. - void ProcessAudioBlock(const std::complex* const* in_block, - size_t in_channels, - size_t frames, - size_t out_channels, - std::complex* const* out_block) override; - - private: - IntelligibilityEnhancer* parent_; - }; - friend class TransformCallback; FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation); FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains); - // Updates power computation and analysis with |in_block_|, - // and writes modified speech to |out_block|. - void ProcessClearBlock(const std::complex* in_block, - std::complex* out_block); - // Bisection search for optimal |lambda|. void SolveForLambda(float power_target); @@ -94,21 +81,16 @@ class IntelligibilityEnhancer { intelligibility::PowerEstimator> clear_power_estimator_; std::unique_ptr> noise_power_estimator_; - std::unique_ptr filtered_clear_pow_; - std::unique_ptr filtered_noise_pow_; - std::unique_ptr center_freqs_; + std::vector filtered_clear_pow_; + std::vector filtered_noise_pow_; + std::vector center_freqs_; std::vector> capture_filter_bank_; std::vector> render_filter_bank_; size_t start_freq_; - std::unique_ptr gains_eq_; // Pre-filter modified gains. + std::vector gains_eq_; // Pre-filter modified gains. intelligibility::GainApplier gain_applier_; - // Destination buffers used to reassemble blocked chunks before overwriting - // the original input array with modifications. - ChannelBuffer temp_render_out_buffer_; - - TransformCallback render_callback_; std::unique_ptr render_mangler_; VoiceActivityDetector vad_; diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc index ebfb67a90d..dd5b681798 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc @@ -213,8 +213,8 @@ class IntelligibilityEnhancerTest : public ::testing::Test { bool CheckUpdate() { enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels)); - float* clear_cursor = &clear_data_[0]; - float* noise_cursor = &noise_data_[0]; + float* clear_cursor = clear_data_.data(); + float* noise_cursor = noise_data_.data(); for (int i = 0; i < kSamples; i += kFragmentSize) { enh_->ProcessRenderAudio(&clear_cursor, kSampleRate, kNumChannels); clear_cursor += kFragmentSize; @@ -273,7 +273,7 @@ TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) { enh_->filtered_clear_pow_[i] = 0.f; enh_->filtered_noise_pow_[i] = 0.f; } - enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]); + enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data()); for (size_t i = 0; i < enh_->bank_size_; i++) { EXPECT_NEAR(kTestZeroVar, sols[i], kMaxTestError); } @@ -281,12 +281,12 @@ TEST_F(IntelligibilityEnhancerTest, TestSolveForGains) { enh_->filtered_clear_pow_[i] = static_cast(i + 1); enh_->filtered_noise_pow_[i] = static_cast(enh_->bank_size_ - i); } - enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]); + enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data()); for (size_t i = 0; i < enh_->bank_size_; i++) { EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError); } lambda = -1.f; - enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, &sols[0]); + enh_->SolveForGainsGivenLambda(lambda, enh_->start_freq_, sols.data()); for (size_t i = 0; i < enh_->bank_size_; i++) { EXPECT_NEAR(kTestNonZeroVarLambdaTop[i], sols[i], kMaxTestError); } diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc index 6d37199a2c..3a9433b476 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.cc @@ -54,13 +54,8 @@ template class PowerEstimator>; GainApplier::GainApplier(size_t freqs, float relative_change_limit) : num_freqs_(freqs), relative_change_limit_(relative_change_limit), - target_(new float[freqs]()), - current_(new float[freqs]()) { - for (size_t i = 0; i < freqs; ++i) { - target_[i] = 1.f; - current_[i] = 1.f; - } -} + target_(freqs, 1.f), + current_(freqs, 1.f) {} void GainApplier::Apply(const std::complex* in_block, std::complex* out_block) { diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h index 3805a0cd15..11b9e49bd4 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h @@ -12,7 +12,6 @@ #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_UTILS_H_ #include -#include #include namespace webrtc { @@ -55,13 +54,13 @@ class GainApplier { std::complex* out_block); // Return the current target gain set. Modify this array to set the targets. - float* target() const { return target_.get(); } + float* target() { return target_.data(); } private: const size_t num_freqs_; const float relative_change_limit_; - std::unique_ptr target_; - std::unique_ptr current_; + std::vector target_; + std::vector current_; }; } // namespace intelligibility diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc index 28957bb80d..08e8368c08 100644 --- a/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc +++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_utils_unittest.cc @@ -43,9 +43,9 @@ TEST(IntelligibilityUtilsTest, TestPowerEstimator) { EXPECT_EQ(0, power_estimator.power()[0]); // Makes sure Step is doing something. - power_estimator.Step(&test_data[0][0]); + power_estimator.Step(test_data[0].data()); for (size_t i = 1; i < kSamples; ++i) { - power_estimator.Step(&test_data[i][0]); + power_estimator.Step(test_data[i].data()); for (size_t j = 0; j < kFreqs; ++j) { EXPECT_GE(power_estimator.power()[j], 0.f); EXPECT_LE(power_estimator.power()[j], 1.f); @@ -64,7 +64,7 @@ TEST(IntelligibilityUtilsTest, TestGainApplier) { std::vector>> out_data( GenerateTestData(kFreqs, kSamples)); for (size_t i = 0; i < kSamples; ++i) { - gain_applier.Apply(&in_data[i][0], &out_data[i][0]); + gain_applier.Apply(in_data[i].data(), out_data[i].data()); for (size_t j = 0; j < kFreqs; ++j) { EXPECT_GT(out_data[i][j].real(), 0.f); EXPECT_LT(out_data[i][j].real(), 1.f); diff --git a/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc b/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc index ab8524bb00..e196e29436 100644 --- a/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc +++ b/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc @@ -8,17 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -// -// Command line tool for speech intelligibility enhancement. Provides for -// running and testing intelligibility_enhancer as an independent process. -// Use --help for options. -// - -#include - #include "gflags/gflags.h" #include "testing/gtest/include/gtest/gtest.h" #include "webrtc/base/criticalsection.h" +#include "webrtc/common_audio/channel_buffer.h" #include "webrtc/common_audio/include/audio_util.h" #include "webrtc/common_audio/wav_file.h" #include "webrtc/modules/audio_processing/audio_buffer.h" @@ -40,62 +33,45 @@ void void_main(int argc, char* argv[]) { "\n\nInput files must be little-endian 16-bit signed raw PCM.\n"); google::ParseCommandLineFlags(&argc, &argv, true); - // Load settings and wav input. - struct stat in_stat, noise_stat; - ASSERT_EQ(stat(FLAGS_clear_file.c_str(), &in_stat), 0) - << "Empty speech file."; - ASSERT_EQ(stat(FLAGS_noise_file.c_str(), &noise_stat), 0) - << "Empty noise file."; - - const size_t samples = std::min(in_stat.st_size, noise_stat.st_size) / 2; - WavReader in_file(FLAGS_clear_file); - std::vector in_fpcm(samples); - in_file.ReadSamples(samples, &in_fpcm[0]); - FloatS16ToFloat(&in_fpcm[0], samples, &in_fpcm[0]); - WavReader noise_file(FLAGS_noise_file); - std::vector noise_fpcm(samples); - noise_file.ReadSamples(samples, &noise_fpcm[0]); - FloatS16ToFloat(&noise_fpcm[0], samples, &noise_fpcm[0]); - - // Run intelligibility enhancement. + WavWriter out_file(FLAGS_out_file, in_file.sample_rate(), + in_file.num_channels()); IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels()); rtc::CriticalSection crit; NoiseSuppressionImpl ns(&crit); ns.Initialize(noise_file.num_channels(), noise_file.sample_rate()); ns.Enable(true); - - // Mirror real time APM chunk size. Duplicates chunk_length_ in - // IntelligibilityEnhancer. - size_t fragment_size = in_file.sample_rate() / 100; - AudioBuffer capture_audio(fragment_size, noise_file.num_channels(), - fragment_size, noise_file.num_channels(), - fragment_size); - StreamConfig stream_config(in_file.sample_rate(), noise_file.num_channels()); - - // Slice the input into smaller chunks, as the APM would do, and feed them - // through the enhancer. - float* clear_cursor = &in_fpcm[0]; - float* noise_cursor = &noise_fpcm[0]; - - for (size_t i = 0; i < samples; i += fragment_size) { - capture_audio.CopyFrom(&noise_cursor, stream_config); + const size_t in_samples = noise_file.sample_rate() / 100; + const size_t noise_samples = noise_file.sample_rate() / 100; + std::vector in(in_samples * in_file.num_channels()); + std::vector noise(noise_samples * noise_file.num_channels()); + ChannelBuffer in_buf(in_samples, in_file.num_channels()); + ChannelBuffer noise_buf(noise_samples, noise_file.num_channels()); + AudioBuffer capture_audio(noise_samples, noise_file.num_channels(), + noise_samples, noise_file.num_channels(), + noise_samples); + StreamConfig stream_config(noise_file.sample_rate(), + noise_file.num_channels()); + while (in_file.ReadSamples(in.size(), in.data()) == in.size() && + noise_file.ReadSamples(noise.size(), noise.data()) == noise.size()) { + FloatS16ToFloat(in.data(), in.size(), in.data()); + FloatS16ToFloat(noise.data(), noise.size(), noise.data()); + Deinterleave(in.data(), in_buf.num_frames(), in_buf.num_channels(), + in_buf.channels()); + Deinterleave(noise.data(), noise_buf.num_frames(), noise_buf.num_channels(), + noise_buf.channels()); + capture_audio.CopyFrom(noise_buf.channels(), stream_config); ns.AnalyzeCaptureAudio(&capture_audio); ns.ProcessCaptureAudio(&capture_audio); enh.SetCaptureNoiseEstimate(ns.NoiseEstimate()); - enh.ProcessRenderAudio(&clear_cursor, in_file.sample_rate(), + enh.ProcessRenderAudio(in_buf.channels(), in_file.sample_rate(), in_file.num_channels()); - clear_cursor += fragment_size; - noise_cursor += fragment_size; + Interleave(in_buf.channels(), in_buf.num_frames(), in_buf.num_channels(), + in.data()); + FloatToFloatS16(in.data(), in.size(), in.data()); + out_file.WriteSamples(in.data(), in.size()); } - - FloatToFloatS16(&in_fpcm[0], samples, &in_fpcm[0]); - - WavWriter out_file(FLAGS_out_file, - in_file.sample_rate(), - in_file.num_channels()); - out_file.WriteSamples(&in_fpcm[0], samples); } } // namespace