diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
index de36b7a8bc..33de5c1f95 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.cc
@@ -38,6 +38,8 @@ const float kDecayRate = 0.994f;  // Power estimation decay rate.
 const float kMaxRelativeGainChange = 0.006f;
 const float kRho = 0.0004f;  // Default production and interpretation SNR.
 const float kPowerNormalizationFactor = 1.f / (1 << 30);
+const float kMaxActiveSNR = 128.f;  // 21dB
+const float kMinInactiveSNR = 32.f;  // 15dB
 
 // Returns dot product of vectors |a| and |b| with size |length|.
 float DotProduct(const float* a, const float* b, size_t length) {
@@ -84,6 +86,8 @@ IntelligibilityEnhancer::IntelligibilityEnhancer(int sample_rate_hz,
       audio_s16_(chunk_length_),
       chunks_since_voice_(kSpeechOffsetDelay),
       is_speech_(false),
+      snr_(kMaxActiveSNR),
+      is_active_(false),
       noise_estimation_buffer_(num_noise_bins),
       noise_estimation_queue_(kMaxNumNoiseEstimatesToBuffer,
                               std::vector<float>(num_noise_bins),
@@ -135,29 +139,62 @@ void IntelligibilityEnhancer::ProcessAudioBlock(
   if (is_speech_) {
     clear_power_estimator_.Step(in_block[0]);
   }
-  const std::vector<float>& clear_power = clear_power_estimator_.power();
-  const std::vector<float>& noise_power = noise_power_estimator_.power();
-  MapToErbBands(clear_power.data(), render_filter_bank_,
-                filtered_clear_pow_.data());
-  MapToErbBands(noise_power.data(), capture_filter_bank_,
-                filtered_noise_pow_.data());
-  SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());
-  const float power_target = std::accumulate(
-      filtered_clear_pow_.data(), filtered_clear_pow_.data() + bank_size_, 0.f);
-  const float power_top =
-      DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
-  SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());
-  const float power_bot =
-      DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
-  if (power_target >= power_bot && power_target <= power_top) {
-    SolveForLambda(power_target);
-    UpdateErbGains();
-  }  // Else experiencing power underflow, so do nothing.
+  SnrBasedEffectActivation();
+  // Only re-solve the ERB gains while the effect is active.
+  if (is_active_) {
+    MapToErbBands(clear_power_estimator_.power().data(), render_filter_bank_,
+                  filtered_clear_pow_.data());
+    MapToErbBands(noise_power_estimator_.power().data(), capture_filter_bank_,
+                  filtered_noise_pow_.data());
+    SolveForGainsGivenLambda(kLambdaTop, start_freq_, gains_eq_.data());
+    const float power_target = std::accumulate(
+        filtered_clear_pow_.data(),
+        filtered_clear_pow_.data() + bank_size_,
+        0.f);
+    const float power_top =
+        DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
+    SolveForGainsGivenLambda(kLambdaBot, start_freq_, gains_eq_.data());
+    const float power_bot =
+        DotProduct(gains_eq_.data(), filtered_clear_pow_.data(), bank_size_);
+    if (power_target >= power_bot && power_target <= power_top) {
+      SolveForLambda(power_target);
+      UpdateErbGains();
+    }  // Else experiencing power underflow, so do nothing.
+  }
   for (size_t i = 0; i < in_channels; ++i) {
     gain_applier_.Apply(in_block[i], out_block[i]);
   }
 }
 
+void IntelligibilityEnhancer::SnrBasedEffectActivation() {
+  const float* clear_psd = clear_power_estimator_.power().data();
+  const float* noise_psd = noise_power_estimator_.power().data();
+  const float clear_power =
+      std::accumulate(clear_psd, clear_psd + freqs_, 0.f);
+  const float noise_power =
+      std::accumulate(noise_psd, noise_psd + freqs_, 0.f);
+  // Exponentially smooth the clear-to-noise power ratio; epsilon keeps the
+  // division well-defined when the noise estimate is zero.
+  snr_ = kDecayRate * snr_ + (1.f - kDecayRate) * clear_power /
+      (noise_power + std::numeric_limits<float>::epsilon());
+  if (is_active_) {
+    // Hysteresis: once active, stay active until the SNR exceeds the upper
+    // threshold.
+    if (snr_ > kMaxActiveSNR) {
+      is_active_ = false;
+      // Set the target gains to unity.
+      float* gains = gain_applier_.target();
+      for (size_t i = 0; i < freqs_; ++i) {
+        gains[i] = 1.f;
+      }
+    }
+  } else {
+    // Once inactive, stay inactive until the SNR drops below the lower
+    // threshold.
+    is_active_ = snr_ < kMinInactiveSNR;
+  }
+}
+
 void IntelligibilityEnhancer::SolveForLambda(float power_target) {
   const float kConvergeThresh = 0.001f;  // TODO(ekmeyerson): Find best values
   const int kMaxIters = 100;             // for these, based on experiments.
diff --git a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
index 1413212934..111b765f97 100644
--- a/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
+++ b/webrtc/modules/audio_processing/intelligibility/intelligibility_enhancer.h
@@ -57,6 +57,10 @@ class IntelligibilityEnhancer : public LappedTransform::Callback {
   FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation);
   FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains);
 
+  // Updates the SNR estimation and enables or disables this component using a
+  // hysteresis.
+  void SnrBasedEffectActivation();
+
   // Bisection search for optimal |lambda|.
   void SolveForLambda(float power_target);
 
@@ -103,6 +107,8 @@ class IntelligibilityEnhancer : public LappedTransform::Callback {
   std::vector<int16_t> audio_s16_;
   size_t chunks_since_voice_;
   bool is_speech_;
+  float snr_;
+  bool is_active_;
   std::vector<float> noise_estimation_buffer_;
   SwapQueue<std::vector<float>, RenderQueueItemVerifier<float>>