diff --git a/webrtc/modules/audio_processing/audio_processing_impl.cc b/webrtc/modules/audio_processing/audio_processing_impl.cc
index e849822720..7715134feb 100644
--- a/webrtc/modules/audio_processing/audio_processing_impl.cc
+++ b/webrtc/modules/audio_processing/audio_processing_impl.cc
@@ -920,10 +920,6 @@ int AudioProcessingImpl::ProcessReverseStreamLocked() {
   }
 
   if (constants_.intelligibility_enabled) {
-    // Currently run in single-threaded mode when the intelligibility
-    // enhancer is activated.
-    // TODO(peah): Fix to be properly multi-threaded.
-    rtc::CritScope cs(&crit_capture_);
     public_submodules_->intelligibility_enhancer->ProcessRenderAudio(
         ra->split_channels_f(kBand0To8kHz), capture_nonlocked_.split_rate,
         ra->num_channels());
@@ -1235,7 +1231,8 @@ void AudioProcessingImpl::InitializeIntelligibility() {
   if (constants_.intelligibility_enabled) {
     public_submodules_->intelligibility_enhancer.reset(
         new IntelligibilityEnhancer(capture_nonlocked_.split_rate,
-                                    render_.render_audio->num_channels()));
+                                    render_.render_audio->num_channels(),
+                                    NoiseSuppressionImpl::num_noise_bins()));
   }
 }
 
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
index dc2dcdd261..04d3654521 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
@@ -61,25 +61,31 @@ void MapToErbBands(const float* pow,
 }  // namespace
 
 IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
-                                                 size_t num_render_channels)
+                                                 size_t num_render_channels,
+                                                 size_t num_noise_bins)
     : freqs_(RealFourier::ComplexLength(
           RealFourier::FftOrder(sample_rate_hz * kWindowSizeMs / 1000))),
+      num_noise_bins_(num_noise_bins),
       chunk_length_(static_cast<size_t>(sample_rate_hz * kChunkSizeMs / 1000)),
       bank_size_(GetBankSize(sample_rate_hz, kErbResolution)),
       sample_rate_hz_(sample_rate_hz),
       num_render_channels_(num_render_channels),
       clear_power_estimator_(freqs_, kDecayRate),
-      noise_power_estimator_(
-          new intelligibility::PowerEstimator<float>(freqs_, kDecayRate)),
+      noise_power_estimator_(num_noise_bins, kDecayRate),
       filtered_clear_pow_(bank_size_, 0.f),
-      filtered_noise_pow_(bank_size_, 0.f),
+      filtered_noise_pow_(num_noise_bins, 0.f),
       center_freqs_(bank_size_),
+      capture_filter_bank_(CreateErbBank(num_noise_bins)),
       render_filter_bank_(CreateErbBank(freqs_)),
       gains_eq_(bank_size_),
       gain_applier_(freqs_, kMaxRelativeGainChange),
       audio_s16_(chunk_length_),
       chunks_since_voice_(kSpeechOffsetDelay),
-      is_speech_(false) {
+      is_speech_(false),
+      noise_estimation_buffer_(num_noise_bins),
+      noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer,
+                              std::vector<float>(num_noise_bins),
+                              RenderQueueItemVerifier<float>(num_noise_bins)) {
   RTC_DCHECK_LE(kRho, 1.f);
 
   const size_t erb_index = static_cast<size_t>(
@@ -98,13 +104,11 @@ IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
 
 void IntelligibilityEnhancer::SetCaptureNoiseEstimate(
     std::vector<float> noise) {
-  if (capture_filter_bank_.size() != bank_size_ ||
-      capture_filter_bank_[0].size() != noise.size()) {
-    capture_filter_bank_ = CreateErbBank(noise.size());
-    noise_power_estimator_.reset(
-        new intelligibility::PowerEstimator<float>(noise.size(), kDecayRate));
-  }
-  noise_power_estimator_->Step(noise.data());
+  RTC_DCHECK_EQ(noise.size(), num_noise_bins_);
+  // Disregarding return value since buffer overflow is acceptable, because it
+  // is not critical to get each noise estimate.
+  if (noise_estimation_queue_.Insert(&noise)) {
+  };
 }
 
 void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
@@ -112,6 +116,9 @@ void IntelligibilityEnhancer::ProcessRenderAudio(float* const* audio,
                                                  size_t num_channels) {
   RTC_CHECK_EQ(sample_rate_hz_, sample_rate_hz);
   RTC_CHECK_EQ(num_render_channels_, num_channels);
+  while (noise_estimation_queue_.Remove(&noise_estimation_buffer_)) {
+    noise_power_estimator_.Step(noise_estimation_buffer_.data());
+  }
   is_speech_ = IsSpeech(audio[0]);
   render_mangler_->ProcessChunk(audio, audio);
 }
@@ -127,7 +134,7 @@ void IntelligibilityEnhancer::ProcessAudioBlock(
     clear_power_estimator_.Step(in_block[0]);
   }
   const std::vector<float>& clear_power = clear_power_estimator_.power();
-  const std::vector<float>& noise_power = noise_power_estimator_->power();
+  const std::vector<float>& noise_power = noise_power_estimator_.power();
   MapToErbBands(clear_power.data(), render_filter_bank_,
                 filtered_clear_pow_.data());
   MapToErbBands(noise_power.data(), capture_filter_bank_,
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
index 3b46d16afe..c857661473 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
@@ -17,7 +17,9 @@
 
 #include "webrtc/common_audio/lapped_transform.h"
 #include "webrtc/common_audio/channel_buffer.h"
+#include "webrtc/common_audio/swap_queue.h"
 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h"
+#include "webrtc/modules/audio_processing/processing_component.h"
 #include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
 
 namespace webrtc {
@@ -29,7 +31,9 @@ namespace webrtc {
 // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6882788
 class IntelligibilityEnhancer : public LappedTransform::Callback {
  public:
-  IntelligibilityEnhancer(int sample_rate_hz, size_t num_render_channels);
+  IntelligibilityEnhancer(int sample_rate_hz,
+                          size_t num_render_channels,
+                          size_t num_noise_bins);
 
   // Sets the capture noise magnitude spectrum estimate.
   void SetCaptureNoiseEstimate(std::vector<float> noise);
@@ -72,15 +76,17 @@ class IntelligibilityEnhancer : public LappedTransform::Callback {
   // Returns true if the audio is speech.
   bool IsSpeech(const float* audio);
 
+  static const size_t kMaxNumNoiseEstimatesToBuffer = 5;
+
   const size_t freqs_;         // Num frequencies in frequency domain.
+  const size_t num_noise_bins_;
   const size_t chunk_length_;  // Chunk size in samples.
   const size_t bank_size_;     // Num ERB filters.
   const int sample_rate_hz_;
   const size_t num_render_channels_;
 
   intelligibility::PowerEstimator<std::complex<float>> clear_power_estimator_;
-  std::unique_ptr<intelligibility::PowerEstimator<float>>
-      noise_power_estimator_;
+  intelligibility::PowerEstimator<float> noise_power_estimator_;
   std::vector<float> filtered_clear_pow_;
   std::vector<float> filtered_noise_pow_;
   std::vector<float> center_freqs_;
@@ -97,6 +103,10 @@ class IntelligibilityEnhancer : public LappedTransform::Callback {
   std::vector<int16_t> audio_s16_;
   size_t chunks_since_voice_;
   bool is_speech_;
+
+  std::vector<float> noise_estimation_buffer_;
+  SwapQueue<std::vector<float>, RenderQueueItemVerifier<float>>
+      noise_estimation_queue_;
 };
 
 }  // namespace webrtc
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc
index dd5b681798..f5ea7340b8 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer_unittest.cc
@@ -201,6 +201,7 @@ const int kSamples = 1000;
 const int kSampleRate = 4000;
 const int kNumChannels = 1;
 const int kFragmentSize = kSampleRate / 100;
+const size_t kNumNoiseBins = 129;
 
 }  // namespace
 
@@ -208,11 +209,13 @@ class IntelligibilityEnhancerTest : public ::testing::Test {
  protected:
  IntelligibilityEnhancerTest()
      : clear_data_(kSamples), noise_data_(kSamples), orig_data_(kSamples) {
-    enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
+    enh_.reset(
+        new IntelligibilityEnhancer(kSampleRate, kNumChannels, kNumNoiseBins));
  }
 
  bool CheckUpdate() {
-    enh_.reset(new IntelligibilityEnhancer(kSampleRate, kNumChannels));
+    enh_.reset(
+        new IntelligibilityEnhancer(kSampleRate, kNumChannels, kNumNoiseBins));
    float* clear_cursor = clear_data_.data();
    float* noise_cursor = noise_data_.data();
    for (int i = 0; i < kSamples; i += kFragmentSize) {
diff --git a/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc b/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
index e196e29436..b459c39b69 100644
--- a/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
+++ b/webrtc/modules/audio_processing/intelligibility/test/intelligibility_proc.cc
@@ -37,9 +37,10 @@ void void_main(int argc, char* argv[]) {
   WavReader noise_file(FLAGS_noise_file);
   WavWriter out_file(FLAGS_out_file, in_file.sample_rate(),
                      in_file.num_channels());
-  IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels());
   rtc::CriticalSection crit;
   NoiseSuppressionImpl ns(&crit);
+  IntelligibilityEnhancer enh(in_file.sample_rate(), in_file.num_channels(),
+                              NoiseSuppressionImpl::num_noise_bins());
   ns.Initialize(noise_file.num_channels(), noise_file.sample_rate());
   ns.Enable(true);
   const size_t in_samples = noise_file.sample_rate() / 100;
diff --git a/webrtc/modules/audio_processing/noise_suppression_impl.cc b/webrtc/modules/audio_processing/noise_suppression_impl.cc
index 7f19005924..a9d9f4a93b 100644
--- a/webrtc/modules/audio_processing/noise_suppression_impl.cc
+++ b/webrtc/modules/audio_processing/noise_suppression_impl.cc
@@ -200,4 +200,12 @@ std::vector<float> NoiseSuppressionImpl::NoiseEstimate() {
   return noise_estimate;
 }
 
+size_t NoiseSuppressionImpl::num_noise_bins() {
+#if defined(WEBRTC_NS_FLOAT)
+  return WebRtcNs_num_freq();
+#elif defined(WEBRTC_NS_FIXED)
+  return WebRtcNsx_num_freq();
+#endif
+}
+
 }  // namespace webrtc
diff --git a/webrtc/modules/audio_processing/noise_suppression_impl.h b/webrtc/modules/audio_processing/noise_suppression_impl.h
index ef30bb1167..111e8e17b6 100644
--- a/webrtc/modules/audio_processing/noise_suppression_impl.h
+++ b/webrtc/modules/audio_processing/noise_suppression_impl.h
@@ -39,6 +39,7 @@ class NoiseSuppressionImpl : public NoiseSuppression {
   Level level() const override;
   float speech_probability() const override;
   std::vector<float> NoiseEstimate() override;
+  static size_t num_noise_bins();
 
  private:
   class Suppressor;
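
Note (not part of the patch): the change above removes the capture-side lock from the render path by handing noise estimates over through a bounded swap queue. The capture thread publishes each estimate via SetCaptureNoiseEstimate(), silently dropping estimates when the queue is full, and the render thread drains every pending estimate at the start of ProcessRenderAudio() before stepping the noise power estimator. The sketch below illustrates only that handoff pattern; BoundedSwapQueue is a simplified, mutex-based stand-in for webrtc::SwapQueue, and the names PublishNoiseEstimate/DrainNoiseEstimates are illustrative, not WebRTC APIs. The constants reuse the values visible in the patch (129 bins in the unit test, queue depth 5).

#include <cstddef>
#include <mutex>
#include <utility>
#include <vector>

// Toy stand-in for webrtc::SwapQueue: fixed capacity, pre-allocated slots,
// items are exchanged with std::swap so neither audio thread allocates.
template <typename T>
class BoundedSwapQueue {
 public:
  BoundedSwapQueue(size_t max_items, const T& prototype)
      : slots_(max_items, prototype) {}

  // Swaps *item into the queue. Returns false when full, i.e. the estimate
  // is simply dropped, mirroring the "overflow is acceptable" comment above.
  bool Insert(T* item) {
    std::lock_guard<std::mutex> lock(mutex_);
    if (count_ == slots_.size())
      return false;
    std::swap(slots_[(begin_ + count_) % slots_.size()], *item);
    ++count_;
    return true;
  }

  // Swaps the oldest queued item into *item. Returns false when empty.
  bool Remove(T* item) {
    std::lock_guard<std::mutex> lock(mutex_);
    if (count_ == 0)
      return false;
    std::swap(slots_[begin_], *item);
    begin_ = (begin_ + 1) % slots_.size();
    --count_;
    return true;
  }

 private:
  std::mutex mutex_;
  std::vector<T> slots_;  // Pre-allocated buffers; swapped, never reallocated.
  size_t begin_ = 0;
  size_t count_ = 0;
};

constexpr size_t kNumNoiseBins = 129;               // As in the unit test.
constexpr size_t kMaxNumNoiseEstimatesToBuffer = 5; // As in the header change.

BoundedSwapQueue<std::vector<float>> noise_estimation_queue(
    kMaxNumNoiseEstimatesToBuffer, std::vector<float>(kNumNoiseBins));

// Capture-thread side, analogous to SetCaptureNoiseEstimate().
void PublishNoiseEstimate(std::vector<float> noise) {
  noise_estimation_queue.Insert(&noise);  // Return value intentionally ignored.
}

// Render-thread side, analogous to the drain loop added to ProcessRenderAudio().
void DrainNoiseEstimates(std::vector<float>* buffer) {
  while (noise_estimation_queue.Remove(buffer)) {
    // In the real code: noise_power_estimator_.Step(buffer->data());
  }
}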