From 980c4601e188a80bb435edf0c3e345899cd18c59 Mon Sep 17 00:00:00 2001 From: Alessio Bazzica Date: Wed, 14 Apr 2021 19:09:17 +0200 Subject: [PATCH] AGC2: retuning and large refactoring - Bug fix: the desired initial gain quickly dropped to 0 dB hence starting a call with a too low level - New tuning to make AGC2 more robust to VAD mistakes - Smarter max gain increase speed: to deal with an increased threshold of adjacent speech frames, the gain applier temporarily allows a faster gain increase to deal with a longer time spent waiting for enough speech frames in a row to be observed - Saturation protector isolated from `AdaptiveModeLevelEstimator` to simplify the unit tests for the latter (non bit-exact change) - AGC2 adaptive digital config: unnecessary params deprecated - Code readability improvements - Data dumps clean-up and better naming Bug: webrtc:7494 Change-Id: I4e36059bdf2566cc2a7e1a7e95b7430ba9ae9844 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/215140 Commit-Queue: Alessio Bazzica Reviewed-by: Jesus de Vicente Pena Cr-Commit-Position: refs/heads/master@{#33736} --- modules/audio_processing/agc2/BUILD.gn | 3 + modules/audio_processing/agc2/adaptive_agc.cc | 91 +++--- modules/audio_processing/agc2/adaptive_agc.h | 8 +- .../agc2/adaptive_digital_gain_applier.cc | 93 +++--- .../agc2/adaptive_digital_gain_applier.h | 29 +- .../adaptive_digital_gain_applier_unittest.cc | 199 ++++++------- .../agc2/adaptive_mode_level_estimator.cc | 156 ++++------ .../agc2/adaptive_mode_level_estimator.h | 18 +- .../adaptive_mode_level_estimator_unittest.cc | 254 +++++++--------- modules/audio_processing/agc2/agc2_common.h | 53 ++-- .../agc2/agc2_testing_common_unittest.cc | 2 +- .../agc2/fixed_digital_level_estimator.cc | 8 + .../fixed_digital_level_estimator_unittest.cc | 13 +- .../agc2/interpolated_gain_curve.h | 2 +- .../agc2/interpolated_gain_curve_unittest.cc | 22 +- .../agc2/noise_level_estimator.cc | 4 +- .../agc2/saturation_protector.cc | 205 
++++++++----- .../agc2/saturation_protector.h | 77 ++--- .../agc2/saturation_protector_buffer.cc | 77 +++++ .../agc2/saturation_protector_buffer.h | 59 ++++ .../saturation_protector_buffer_unittest.cc | 73 +++++ .../agc2/saturation_protector_unittest.cc | 273 +++++++++--------- .../audio_processing/agc2/vad_with_level.cc | 32 +- .../audio_processing/agc2/vad_with_level.h | 9 +- .../agc2/vad_with_level_unittest.cc | 61 +--- modules/audio_processing/gain_controller2.cc | 2 +- .../gain_controller2_unittest.cc | 59 ++-- .../include/audio_processing.cc | 29 +- .../include/audio_processing.h | 20 +- 29 files changed, 990 insertions(+), 941 deletions(-) create mode 100644 modules/audio_processing/agc2/saturation_protector_buffer.cc create mode 100644 modules/audio_processing/agc2/saturation_protector_buffer.h create mode 100644 modules/audio_processing/agc2/saturation_protector_buffer_unittest.cc diff --git a/modules/audio_processing/agc2/BUILD.gn b/modules/audio_processing/agc2/BUILD.gn index 910b58c9c2..4c6cfab576 100644 --- a/modules/audio_processing/agc2/BUILD.gn +++ b/modules/audio_processing/agc2/BUILD.gn @@ -25,6 +25,8 @@ rtc_library("adaptive_digital") { "adaptive_mode_level_estimator.h", "saturation_protector.cc", "saturation_protector.h", + "saturation_protector_buffer.cc", + "saturation_protector_buffer.h", ] configs += [ "..:apm_debug_dump" ] @@ -177,6 +179,7 @@ rtc_library("adaptive_digital_unittests") { "adaptive_digital_gain_applier_unittest.cc", "adaptive_mode_level_estimator_unittest.cc", "gain_applier_unittest.cc", + "saturation_protector_buffer_unittest.cc", "saturation_protector_unittest.cc", ] deps = [ diff --git a/modules/audio_processing/agc2/adaptive_agc.cc b/modules/audio_processing/agc2/adaptive_agc.cc index 37f11d2f25..8bf192e77f 100644 --- a/modules/audio_processing/agc2/adaptive_agc.cc +++ b/modules/audio_processing/agc2/adaptive_agc.cc @@ -25,15 +25,6 @@ using AdaptiveDigitalConfig = using NoiseEstimatorType = 
AudioProcessing::Config::GainController2::NoiseEstimator; -void DumpDebugData(const AdaptiveDigitalGainApplier::FrameInfo& info, - ApmDataDumper& dumper) { - dumper.DumpRaw("agc2_vad_probability", info.vad_result.speech_probability); - dumper.DumpRaw("agc2_vad_rms_dbfs", info.vad_result.rms_dbfs); - dumper.DumpRaw("agc2_vad_peak_dbfs", info.vad_result.peak_dbfs); - dumper.DumpRaw("agc2_noise_estimate_dbfs", info.input_noise_level_dbfs); - dumper.DumpRaw("agc2_last_limiter_audio_level", info.limiter_envelope_dbfs); -} - constexpr int kGainApplierAdjacentSpeechFramesThreshold = 1; constexpr float kMaxGainChangePerSecondDb = 3.0f; constexpr float kMaxOutputNoiseLevelDbfs = -50.0f; @@ -72,36 +63,42 @@ constexpr NoiseEstimatorType kDefaultNoiseLevelEstimatorType = AdaptiveAgc::AdaptiveAgc(ApmDataDumper* apm_data_dumper) : speech_level_estimator_(apm_data_dumper), - gain_applier_(apm_data_dumper, - kGainApplierAdjacentSpeechFramesThreshold, - kMaxGainChangePerSecondDb, - kMaxOutputNoiseLevelDbfs), + gain_controller_(apm_data_dumper, + kGainApplierAdjacentSpeechFramesThreshold, + kMaxGainChangePerSecondDb, + kMaxOutputNoiseLevelDbfs), apm_data_dumper_(apm_data_dumper), noise_level_estimator_( CreateNoiseLevelEstimator(kDefaultNoiseLevelEstimatorType, + apm_data_dumper)), + saturation_protector_( + CreateSaturationProtector(kSaturationProtectorInitialHeadroomDb, + kSaturationProtectorExtraHeadroomDb, + kGainApplierAdjacentSpeechFramesThreshold, apm_data_dumper)) { RTC_DCHECK(apm_data_dumper); } AdaptiveAgc::AdaptiveAgc(ApmDataDumper* apm_data_dumper, const AdaptiveDigitalConfig& config) - : speech_level_estimator_( - apm_data_dumper, - config.level_estimator, - config.level_estimator_adjacent_speech_frames_threshold, - config.initial_saturation_margin_db, - config.extra_saturation_margin_db), - vad_(config.vad_reset_period_ms, - config.vad_probability_attack, - GetAllowedCpuFeatures(config)), - gain_applier_(apm_data_dumper, - 
config.gain_applier_adjacent_speech_frames_threshold, - config.max_gain_change_db_per_second, - config.max_output_noise_level_dbfs), + : speech_level_estimator_(apm_data_dumper, + config.adjacent_speech_frames_threshold), + vad_(config.vad_reset_period_ms, GetAllowedCpuFeatures(config)), + gain_controller_(apm_data_dumper, + config.adjacent_speech_frames_threshold, + config.max_gain_change_db_per_second, + config.max_output_noise_level_dbfs), apm_data_dumper_(apm_data_dumper), noise_level_estimator_( - CreateNoiseLevelEstimator(config.noise_estimator, apm_data_dumper)) { + CreateNoiseLevelEstimator(config.noise_estimator, apm_data_dumper)), + saturation_protector_( + CreateSaturationProtector(kSaturationProtectorInitialHeadroomDb, + kSaturationProtectorExtraHeadroomDb, + config.adjacent_speech_frames_threshold, + apm_data_dumper)) { RTC_DCHECK(apm_data_dumper); + RTC_DCHECK(noise_level_estimator_); + RTC_DCHECK(saturation_protector_); if (!config.use_saturation_protector) { RTC_LOG(LS_WARNING) << "The saturation protector cannot be disabled."; } @@ -111,19 +108,39 @@ AdaptiveAgc::~AdaptiveAgc() = default; void AdaptiveAgc::Process(AudioFrameView frame, float limiter_envelope) { AdaptiveDigitalGainApplier::FrameInfo info; - info.vad_result = vad_.AnalyzeFrame(frame); - speech_level_estimator_.Update(info.vad_result); - info.input_level_dbfs = speech_level_estimator_.level_dbfs(); - info.input_noise_level_dbfs = noise_level_estimator_->Analyze(frame); - info.limiter_envelope_dbfs = - limiter_envelope > 0 ? 
FloatS16ToDbfs(limiter_envelope) : -90.0f; - info.estimate_is_confident = speech_level_estimator_.IsConfident(); - DumpDebugData(info, *apm_data_dumper_); - gain_applier_.Process(info, frame); + + VadLevelAnalyzer::Result vad_result = vad_.AnalyzeFrame(frame); + info.speech_probability = vad_result.speech_probability; + apm_data_dumper_->DumpRaw("agc2_speech_probability", + vad_result.speech_probability); + apm_data_dumper_->DumpRaw("agc2_input_rms_dbfs", vad_result.rms_dbfs); + apm_data_dumper_->DumpRaw("agc2_input_peak_dbfs", vad_result.peak_dbfs); + + speech_level_estimator_.Update(vad_result); + info.speech_level_dbfs = speech_level_estimator_.level_dbfs(); + info.speech_level_reliable = speech_level_estimator_.IsConfident(); + apm_data_dumper_->DumpRaw("agc2_speech_level_dbfs", info.speech_level_dbfs); + apm_data_dumper_->DumpRaw("agc2_speech_level_reliable", + info.speech_level_reliable); + + info.noise_rms_dbfs = noise_level_estimator_->Analyze(frame); + apm_data_dumper_->DumpRaw("agc2_noise_rms_dbfs", info.noise_rms_dbfs); + + saturation_protector_->Analyze(info.speech_probability, vad_result.peak_dbfs, + info.speech_level_dbfs); + info.headroom_db = saturation_protector_->HeadroomDb(); + apm_data_dumper_->DumpRaw("agc2_headroom_db", info.headroom_db); + + info.limiter_envelope_dbfs = FloatS16ToDbfs(limiter_envelope); + apm_data_dumper_->DumpRaw("agc2_limiter_envelope_dbfs", + info.limiter_envelope_dbfs); + + gain_controller_.Process(info, frame); } -void AdaptiveAgc::Reset() { +void AdaptiveAgc::HandleInputGainChange() { speech_level_estimator_.Reset(); + saturation_protector_->Reset(); } } // namespace webrtc diff --git a/modules/audio_processing/agc2/adaptive_agc.h b/modules/audio_processing/agc2/adaptive_agc.h index 525cab7561..fe814446ff 100644 --- a/modules/audio_processing/agc2/adaptive_agc.h +++ b/modules/audio_processing/agc2/adaptive_agc.h @@ -16,6 +16,7 @@ #include "modules/audio_processing/agc2/adaptive_digital_gain_applier.h" #include 
"modules/audio_processing/agc2/adaptive_mode_level_estimator.h" #include "modules/audio_processing/agc2/noise_level_estimator.h" +#include "modules/audio_processing/agc2/saturation_protector.h" #include "modules/audio_processing/agc2/vad_with_level.h" #include "modules/audio_processing/include/audio_frame_view.h" #include "modules/audio_processing/include/audio_processing.h" @@ -38,14 +39,17 @@ class AdaptiveAgc { // account the envelope measured by the limiter. // TODO(crbug.com/webrtc/7494): Make the class depend on the limiter. void Process(AudioFrameView frame, float limiter_envelope); - void Reset(); + + // Handles a gain change applied to the input signal (e.g., analog gain). + void HandleInputGainChange(); private: AdaptiveModeLevelEstimator speech_level_estimator_; VadLevelAnalyzer vad_; - AdaptiveDigitalGainApplier gain_applier_; + AdaptiveDigitalGainApplier gain_controller_; ApmDataDumper* const apm_data_dumper_; std::unique_ptr noise_level_estimator_; + std::unique_ptr saturation_protector_; }; } // namespace webrtc diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc b/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc index 8a564647d2..8a8a7fdc9b 100644 --- a/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc +++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc @@ -23,6 +23,9 @@ namespace webrtc { namespace { +constexpr int kHeadroomHistogramMin = 0; +constexpr int kHeadroomHistogramMax = 50; + // This function maps input level to desired applied gain. We want to // boost the signal so that peaks are at -kHeadroomDbfs. We can't // apply more than kMaxGainDb gain. @@ -31,17 +34,13 @@ float ComputeGainDb(float input_level_dbfs) { if (input_level_dbfs < -(kHeadroomDbfs + kMaxGainDb)) { return kMaxGainDb; } - // We expect to end up here most of the time: the level is below // -headroom, but we can boost it to -headroom. 
if (input_level_dbfs < -kHeadroomDbfs) { return -kHeadroomDbfs - input_level_dbfs; } - - // Otherwise, the level is too high and we can't boost. The - // LevelEstimator is responsible for not reporting bogus gain - // values. - RTC_DCHECK_LE(input_level_dbfs, 0.f); + // Otherwise, the level is too high and we can't boost. + RTC_DCHECK_GE(input_level_dbfs, -kHeadroomDbfs); return 0.f; } @@ -52,10 +51,11 @@ float LimitGainByNoise(float target_gain, float input_noise_level_dbfs, float max_output_noise_level_dbfs, ApmDataDumper& apm_data_dumper) { - const float noise_headroom_db = + const float max_allowed_gain_db = max_output_noise_level_dbfs - input_noise_level_dbfs; - apm_data_dumper.DumpRaw("agc2_noise_headroom_db", noise_headroom_db); - return std::min(target_gain, std::max(noise_headroom_db, 0.f)); + apm_data_dumper.DumpRaw("agc2_adaptive_gain_applier_max_allowed_gain_db", + max_allowed_gain_db); + return std::min(target_gain, std::max(max_allowed_gain_db, 0.f)); } float LimitGainByLowConfidence(float target_gain, @@ -68,8 +68,8 @@ float LimitGainByLowConfidence(float target_gain, } const float limiter_level_before_gain = limiter_audio_level_dbfs - last_gain; - // Compute a new gain so that limiter_level_before_gain + new_gain <= - // kLimiterThreshold. + // Compute a new gain so that `limiter_level_before_gain` + `new_target_gain` + // is not great than `kLimiterThresholdForAgcGainDbfs`. 
const float new_target_gain = std::max( kLimiterThresholdForAgcGainDbfs - limiter_level_before_gain, 0.f); return std::min(new_target_gain, target_gain); @@ -80,13 +80,16 @@ float LimitGainByLowConfidence(float target_gain, float ComputeGainChangeThisFrameDb(float target_gain_db, float last_gain_db, bool gain_increase_allowed, - float max_gain_change_db) { + float max_gain_decrease_db, + float max_gain_increase_db) { + RTC_DCHECK_GT(max_gain_decrease_db, 0); + RTC_DCHECK_GT(max_gain_increase_db, 0); float target_gain_difference_db = target_gain_db - last_gain_db; if (!gain_increase_allowed) { target_gain_difference_db = std::min(target_gain_difference_db, 0.f); } - return rtc::SafeClamp(target_gain_difference_db, -max_gain_change_db, - max_gain_change_db); + return rtc::SafeClamp(target_gain_difference_db, -max_gain_decrease_db, + max_gain_increase_db); } } // namespace @@ -115,7 +118,7 @@ AdaptiveDigitalGainApplier::AdaptiveDigitalGainApplier( void AdaptiveDigitalGainApplier::Process(const FrameInfo& info, AudioFrameView frame) { - RTC_DCHECK_GE(info.input_level_dbfs, -150.f); + RTC_DCHECK_GE(info.speech_level_dbfs, -150.f); RTC_DCHECK_GE(frame.num_channels(), 1); RTC_DCHECK( frame.samples_per_channel() == 80 || frame.samples_per_channel() == 160 || @@ -123,30 +126,46 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info, << "`frame` does not look like a 10 ms frame for an APM supported sample " "rate"; + // Compute the input level used to select the desired gain. 
+ RTC_DCHECK_GT(info.headroom_db, 0.0f); + const float input_level_dbfs = info.speech_level_dbfs + info.headroom_db; + const float target_gain_db = LimitGainByLowConfidence( - LimitGainByNoise(ComputeGainDb(std::min(info.input_level_dbfs, 0.f)), - info.input_noise_level_dbfs, + LimitGainByNoise(ComputeGainDb(input_level_dbfs), info.noise_rms_dbfs, max_output_noise_level_dbfs_, *apm_data_dumper_), - last_gain_db_, info.limiter_envelope_dbfs, info.estimate_is_confident); + last_gain_db_, info.limiter_envelope_dbfs, info.speech_level_reliable); // Forbid increasing the gain until enough adjacent speech frames are // observed. - if (info.vad_result.speech_probability < kVadConfidenceThreshold) { + bool first_confident_speech_frame = false; + if (info.speech_probability < kVadConfidenceThreshold) { frames_to_gain_increase_allowed_ = adjacent_speech_frames_threshold_; } else if (frames_to_gain_increase_allowed_ > 0) { frames_to_gain_increase_allowed_--; + first_confident_speech_frame = frames_to_gain_increase_allowed_ == 0; + } + apm_data_dumper_->DumpRaw( + "agc2_adaptive_gain_applier_frames_to_gain_increase_allowed", + frames_to_gain_increase_allowed_); + + const bool gain_increase_allowed = frames_to_gain_increase_allowed_ == 0; + + float max_gain_increase_db = max_gain_change_db_per_10ms_; + if (first_confident_speech_frame) { + // No gain increase happened while waiting for a long enough speech + // sequence. Therefore, temporarily allow a faster gain increase. 
+ RTC_DCHECK(gain_increase_allowed); + max_gain_increase_db *= adjacent_speech_frames_threshold_; } - apm_data_dumper_->DumpRaw("agc2_frames_to_gain_increase_allowed", - frames_to_gain_increase_allowed_); const float gain_change_this_frame_db = ComputeGainChangeThisFrameDb( - target_gain_db, last_gain_db_, - /*gain_increase_allowed=*/frames_to_gain_increase_allowed_ == 0, - max_gain_change_db_per_10ms_); + target_gain_db, last_gain_db_, gain_increase_allowed, + /*max_gain_decrease_db=*/max_gain_change_db_per_10ms_, + max_gain_increase_db); - apm_data_dumper_->DumpRaw("agc2_want_to_change_by_db", + apm_data_dumper_->DumpRaw("agc2_adaptive_gain_applier_want_to_change_by_db", target_gain_db - last_gain_db_); - apm_data_dumper_->DumpRaw("agc2_will_change_by_db", + apm_data_dumper_->DumpRaw("agc2_adaptive_gain_applier_will_change_by_db", gain_change_this_frame_db); // Optimization: avoid calling math functions if gain does not @@ -159,23 +178,29 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info, // Remember that the gain has changed for the next iteration. last_gain_db_ = last_gain_db_ + gain_change_this_frame_db; - apm_data_dumper_->DumpRaw("agc2_applied_gain_db", last_gain_db_); + apm_data_dumper_->DumpRaw("agc2_adaptive_gain_applier_applied_gain_db", + last_gain_db_); // Log every 10 seconds. 
calls_since_last_gain_log_++; if (calls_since_last_gain_log_ == 1000) { calls_since_last_gain_log_ = 0; + RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.EstimatedSpeechLevel", + -info.speech_level_dbfs, 0, 100, 101); + RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.EstimatedNoiseLevel", + -info.noise_rms_dbfs, 0, 100, 101); + RTC_HISTOGRAM_COUNTS_LINEAR( + "WebRTC.Audio.Agc2.Headroom", info.headroom_db, kHeadroomHistogramMin, + kHeadroomHistogramMax, + kHeadroomHistogramMax - kHeadroomHistogramMin + 1); RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.DigitalGainApplied", last_gain_db_, 0, kMaxGainDb, kMaxGainDb + 1); - RTC_HISTOGRAM_COUNTS_LINEAR( - "WebRTC.Audio.Agc2.EstimatedSpeechPlusNoiseLevel", - -info.input_level_dbfs, 0, 100, 101); - RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.EstimatedNoiseLevel", - -info.input_noise_level_dbfs, 0, 100, 101); RTC_LOG(LS_INFO) << "AGC2 adaptive digital" - << " | speech_plus_noise_dbfs: " << info.input_level_dbfs - << " | noise_dbfs: " << info.input_noise_level_dbfs + << " | speech_dbfs: " << info.speech_level_dbfs + << " | noise_dbfs: " << info.noise_rms_dbfs + << " | headroom_db: " << info.headroom_db << " | gain_db: " << last_gain_db_; } } + } // namespace webrtc diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier.h b/modules/audio_processing/agc2/adaptive_digital_gain_applier.h index a65379f5be..74220fa861 100644 --- a/modules/audio_processing/agc2/adaptive_digital_gain_applier.h +++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier.h @@ -12,33 +12,32 @@ #define MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_DIGITAL_GAIN_APPLIER_H_ #include "modules/audio_processing/agc2/gain_applier.h" -#include "modules/audio_processing/agc2/vad_with_level.h" #include "modules/audio_processing/include/audio_frame_view.h" namespace webrtc { class ApmDataDumper; -// Part of the adaptive digital controller that applies a digital adaptive gain. -// The gain is updated towards a target. 
The logic decides when gain updates are -// allowed, it controls the adaptation speed and caps the target based on the -// estimated noise level and the speech level estimate confidence. +// TODO(bugs.webrtc.org): Split into `GainAdaptor` and `GainApplier`. +// Selects the target digital gain, decides when and how quickly to adapt to the +// target and applies the current gain to 10 ms frames. class AdaptiveDigitalGainApplier { public: // Information about a frame to process. struct FrameInfo { - float input_level_dbfs; // Estimated speech plus noise level. - float input_noise_level_dbfs; // Estimated noise level. - VadLevelAnalyzer::Result vad_result; - float limiter_envelope_dbfs; // Envelope level from the limiter. - bool estimate_is_confident; + float speech_probability; // Probability of speech in the [0, 1] range. + float speech_level_dbfs; // Estimated speech level (dBFS). + bool speech_level_reliable; // True with reliable speech level estimation. + float noise_rms_dbfs; // Estimated noise RMS level (dBFS). + float headroom_db; // Headroom (dB). + float limiter_envelope_dbfs; // Envelope level from the limiter (dBFS). }; - // Ctor. - // `adjacent_speech_frames_threshold` indicates how many speech frames are - // required before a gain increase is allowed. `max_gain_change_db_per_second` - // limits the adaptation speed (uniformly operated across frames). - // `max_output_noise_level_dbfs` limits the output noise level. + // Ctor. `adjacent_speech_frames_threshold` indicates how many adjacent speech + // frames must be observed in order to consider the sequence as speech. + // `max_gain_change_db_per_second` limits the adaptation speed (uniformly + // operated across frames). `max_output_noise_level_dbfs` limits the output + // noise level. 
AdaptiveDigitalGainApplier(ApmDataDumper* apm_data_dumper, int adjacent_speech_frames_threshold, float max_gain_change_db_per_second, diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc b/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc index e2df700422..ee9cb02ed6 100644 --- a/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc +++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc @@ -11,6 +11,7 @@ #include "modules/audio_processing/agc2/adaptive_digital_gain_applier.h" #include +#include #include "common_audio/include/audio_util.h" #include "modules/audio_processing/agc2/agc2_common.h" @@ -26,104 +27,75 @@ constexpr int kStereo = 2; constexpr int kFrameLen10ms8kHz = 80; constexpr int kFrameLen10ms48kHz = 480; +constexpr float kMaxSpeechProbability = 1.0f; + // Constants used in place of estimated noise levels. -constexpr float kNoNoiseDbfs = -90.f; +constexpr float kNoNoiseDbfs = kMinLevelDbfs; constexpr float kWithNoiseDbfs = -20.f; -static_assert(std::is_trivially_destructible::value, - ""); -constexpr VadLevelAnalyzer::Result kVadSpeech{1.f, -20.f, 0.f}; -constexpr float kMaxGainChangePerSecondDb = 3.f; +constexpr float kMaxGainChangePerSecondDb = 3.0f; constexpr float kMaxGainChangePerFrameDb = - kMaxGainChangePerSecondDb * kFrameDurationMs / 1000.f; -constexpr float kMaxOutputNoiseLevelDbfs = -50.f; + kMaxGainChangePerSecondDb * kFrameDurationMs / 1000.0f; +constexpr float kMaxOutputNoiseLevelDbfs = -50.0f; -// Helper to instance `AdaptiveDigitalGainApplier`. +// Helper to create initialized `AdaptiveDigitalGainApplier` objects. 
struct GainApplierHelper { GainApplierHelper() : GainApplierHelper(/*adjacent_speech_frames_threshold=*/1) {} explicit GainApplierHelper(int adjacent_speech_frames_threshold) : apm_data_dumper(0), - gain_applier(&apm_data_dumper, - adjacent_speech_frames_threshold, - kMaxGainChangePerSecondDb, - kMaxOutputNoiseLevelDbfs) {} + gain_applier(std::make_unique( + &apm_data_dumper, + adjacent_speech_frames_threshold, + kMaxGainChangePerSecondDb, + kMaxOutputNoiseLevelDbfs)) {} ApmDataDumper apm_data_dumper; - AdaptiveDigitalGainApplier gain_applier; + std::unique_ptr gain_applier; }; -// Runs gain applier and returns the applied gain in linear scale. -float RunOnConstantLevel(int num_iterations, - VadLevelAnalyzer::Result vad_level, - float input_level_dbfs, - AdaptiveDigitalGainApplier* gain_applier) { - float gain_linear = 0.f; - - for (int i = 0; i < num_iterations; ++i) { - VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.f); - AdaptiveDigitalGainApplier::FrameInfo info; - info.input_level_dbfs = input_level_dbfs; - info.input_noise_level_dbfs = kNoNoiseDbfs; - info.vad_result = vad_level; - info.limiter_envelope_dbfs = -2.f; - info.estimate_is_confident = true; - gain_applier->Process(info, fake_audio.float_frame_view()); - gain_linear = fake_audio.float_frame_view().channel(0)[0]; - } - return gain_linear; -} - // Voice on, no noise, low limiter, confident level. 
+static_assert(std::is_trivially_destructible< + AdaptiveDigitalGainApplier::FrameInfo>::value, + ""); constexpr AdaptiveDigitalGainApplier::FrameInfo kFrameInfo{ - /*input_level_dbfs=*/-1.f, - /*input_noise_level_dbfs=*/kNoNoiseDbfs, - /*vad_result=*/kVadSpeech, - /*limiter_envelope_dbfs=*/-2.f, - /*estimate_is_confident=*/true}; + /*speech_probability=*/kMaxSpeechProbability, + /*speech_level_dbfs=*/kInitialSpeechLevelEstimateDbfs, + /*speech_level_reliable=*/true, + /*noise_rms_dbfs=*/kNoNoiseDbfs, + /*headroom_db=*/kSaturationProtectorInitialHeadroomDb, + /*limiter_envelope_dbfs=*/-2.0f}; -TEST(AutomaticGainController2AdaptiveGainApplier, GainApplierShouldNotCrash) { +TEST(GainController2AdaptiveGainApplier, GainApplierShouldNotCrash) { GainApplierHelper helper; // Make one call with reasonable audio level values and settings. - VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f); + VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.0f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; - info.input_level_dbfs = -5.0; - helper.gain_applier.Process(kFrameInfo, fake_audio.float_frame_view()); + info.speech_level_dbfs = -5.0f; + helper.gain_applier->Process(kFrameInfo, fake_audio.float_frame_view()); } -// Check that the output is -kHeadroom dBFS. -TEST(AutomaticGainController2AdaptiveGainApplier, TargetLevelIsReached) { - GainApplierHelper helper; - - constexpr float initial_level_dbfs = -5.f; - - const float applied_gain = RunOnConstantLevel( - 200, kVadSpeech, initial_level_dbfs, &helper.gain_applier); - - EXPECT_NEAR(applied_gain, DbToRatio(-kHeadroomDbfs - initial_level_dbfs), - 0.1f); -} - -// Check that the output is -kHeadroom dBFS -TEST(AutomaticGainController2AdaptiveGainApplier, GainApproachesMaxGain) { - GainApplierHelper helper; - - constexpr float initial_level_dbfs = -kHeadroomDbfs - kMaxGainDb - 10.f; - // A few extra frames for safety. +// Checks that the maximum allowed gain is applied. 
+TEST(GainController2AdaptiveGainApplier, MaxGainApplied) { constexpr int kNumFramesToAdapt = static_cast(kMaxGainDb / kMaxGainChangePerFrameDb) + 10; - const float applied_gain = RunOnConstantLevel( - kNumFramesToAdapt, kVadSpeech, initial_level_dbfs, &helper.gain_applier); - EXPECT_NEAR(applied_gain, DbToRatio(kMaxGainDb), 0.1f); - - const float applied_gain_db = 20.f * std::log10(applied_gain); + GainApplierHelper helper; + AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + info.speech_level_dbfs = -60.0f; + float applied_gain; + for (int i = 0; i < kNumFramesToAdapt; ++i) { + VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f); + helper.gain_applier->Process(info, fake_audio.float_frame_view()); + applied_gain = fake_audio.float_frame_view().channel(0)[0]; + } + const float applied_gain_db = 20.0f * std::log10f(applied_gain); EXPECT_NEAR(applied_gain_db, kMaxGainDb, 0.1f); } -TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) { +TEST(GainController2AdaptiveGainApplier, GainDoesNotChangeFast) { GainApplierHelper helper; - constexpr float initial_level_dbfs = -25.f; + constexpr float initial_level_dbfs = -25.0f; // A few extra frames for safety. 
constexpr int kNumFramesToAdapt = static_cast(initial_level_dbfs / kMaxGainChangePerFrameDb) + 10; @@ -133,10 +105,10 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) { float last_gain_linear = 1.f; for (int i = 0; i < kNumFramesToAdapt; ++i) { SCOPED_TRACE(i); - VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.f); + VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; - info.input_level_dbfs = initial_level_dbfs; - helper.gain_applier.Process(info, fake_audio.float_frame_view()); + info.speech_level_dbfs = initial_level_dbfs; + helper.gain_applier->Process(info, fake_audio.float_frame_view()); float current_gain_linear = fake_audio.float_frame_view().channel(0)[0]; EXPECT_LE(std::abs(current_gain_linear - last_gain_linear), kMaxChangePerFrameLinear); @@ -146,10 +118,10 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) { // Check that the same is true when gain decreases as well. 
for (int i = 0; i < kNumFramesToAdapt; ++i) { SCOPED_TRACE(i); - VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.f); + VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; - info.input_level_dbfs = 0.f; - helper.gain_applier.Process(info, fake_audio.float_frame_view()); + info.speech_level_dbfs = 0.f; + helper.gain_applier->Process(info, fake_audio.float_frame_view()); float current_gain_linear = fake_audio.float_frame_view().channel(0)[0]; EXPECT_LE(std::abs(current_gain_linear - last_gain_linear), kMaxChangePerFrameLinear); @@ -157,17 +129,17 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) { } } -TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) { +TEST(GainController2AdaptiveGainApplier, GainIsRampedInAFrame) { GainApplierHelper helper; - constexpr float initial_level_dbfs = -25.f; + constexpr float initial_level_dbfs = -25.0f; - VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f); + VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; - info.input_level_dbfs = initial_level_dbfs; - helper.gain_applier.Process(info, fake_audio.float_frame_view()); - float maximal_difference = 0.f; - float current_value = 1.f * DbToRatio(kInitialAdaptiveDigitalGainDb); + info.speech_level_dbfs = initial_level_dbfs; + helper.gain_applier->Process(info, fake_audio.float_frame_view()); + float maximal_difference = 0.0f; + float current_value = 1.0f * DbToRatio(kInitialAdaptiveDigitalGainDb); for (const auto& x : fake_audio.float_frame_view().channel(0)) { const float difference = std::abs(x - current_value); maximal_difference = std::max(maximal_difference, difference); @@ -181,10 +153,10 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) { EXPECT_LE(maximal_difference, kMaxChangePerSample); } -TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) { 
+TEST(GainController2AdaptiveGainApplier, NoiseLimitsGain) { GainApplierHelper helper; - constexpr float initial_level_dbfs = -25.f; + constexpr float initial_level_dbfs = -25.0f; constexpr int num_initial_frames = kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb; constexpr int num_frames = 50; @@ -193,11 +165,11 @@ TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) { << "kWithNoiseDbfs is too low"; for (int i = 0; i < num_initial_frames + num_frames; ++i) { - VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f); + VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; - info.input_level_dbfs = initial_level_dbfs; - info.input_noise_level_dbfs = kWithNoiseDbfs; - helper.gain_applier.Process(info, fake_audio.float_frame_view()); + info.speech_level_dbfs = initial_level_dbfs; + info.noise_rms_dbfs = kWithNoiseDbfs; + helper.gain_applier->Process(info, fake_audio.float_frame_view()); // Wait so that the adaptive gain applier has time to lower the gain. if (i > num_initial_frames) { @@ -205,25 +177,25 @@ TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) { *std::max_element(fake_audio.float_frame_view().channel(0).begin(), fake_audio.float_frame_view().channel(0).end()); - EXPECT_NEAR(maximal_ratio, 1.f, 0.001f); + EXPECT_NEAR(maximal_ratio, 1.0f, 0.001f); } } } -TEST(AutomaticGainController2GainApplier, CanHandlePositiveSpeechLevels) { +TEST(GainController2GainApplier, CanHandlePositiveSpeechLevels) { GainApplierHelper helper; // Make one call with positive audio level values and settings. 
- VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f); + VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.0f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; - info.input_level_dbfs = 5.f; - helper.gain_applier.Process(info, fake_audio.float_frame_view()); + info.speech_level_dbfs = 5.0f; + helper.gain_applier->Process(info, fake_audio.float_frame_view()); } -TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) { +TEST(GainController2GainApplier, AudioLevelLimitsGain) { GainApplierHelper helper; - constexpr float initial_level_dbfs = -25.f; + constexpr float initial_level_dbfs = -25.0f; constexpr int num_initial_frames = kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb; constexpr int num_frames = 50; @@ -232,12 +204,12 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) { << "kWithNoiseDbfs is too low"; for (int i = 0; i < num_initial_frames + num_frames; ++i) { - VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f); + VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; - info.input_level_dbfs = initial_level_dbfs; - info.limiter_envelope_dbfs = 1.f; - info.estimate_is_confident = false; - helper.gain_applier.Process(info, fake_audio.float_frame_view()); + info.speech_level_dbfs = initial_level_dbfs; + info.limiter_envelope_dbfs = 1.0f; + info.speech_level_reliable = false; + helper.gain_applier->Process(info, fake_audio.float_frame_view()); // Wait so that the adaptive gain applier has time to lower the gain. 
if (i > num_initial_frames) { @@ -245,7 +217,7 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) { *std::max_element(fake_audio.float_frame_view().channel(0).begin(), fake_audio.float_frame_view().channel(0).end()); - EXPECT_NEAR(maximal_ratio, 1.f, 0.001f); + EXPECT_NEAR(maximal_ratio, 1.0f, 0.001f); } } } @@ -260,14 +232,11 @@ TEST_P(AdaptiveDigitalGainApplierTest, const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold(); GainApplierHelper helper(adjacent_speech_frames_threshold); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; - info.input_level_dbfs = -25.0; - - float prev_gain = 0.f; + float prev_gain = 0.0f; for (int i = 0; i < adjacent_speech_frames_threshold; ++i) { SCOPED_TRACE(i); - VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f); - helper.gain_applier.Process(info, audio.float_frame_view()); + VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f); + helper.gain_applier->Process(kFrameInfo, audio.float_frame_view()); const float gain = audio.float_frame_view().channel(0)[0]; if (i > 0) { EXPECT_EQ(prev_gain, gain); // No gain increase. @@ -280,25 +249,23 @@ TEST_P(AdaptiveDigitalGainApplierTest, IncreaseGainWithEnoughSpeechFrames) { const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold(); GainApplierHelper helper(adjacent_speech_frames_threshold); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; - info.input_level_dbfs = -25.0; - - float prev_gain = 0.f; + float prev_gain = 0.0f; for (int i = 0; i < adjacent_speech_frames_threshold; ++i) { - VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f); - helper.gain_applier.Process(info, audio.float_frame_view()); + SCOPED_TRACE(i); + VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f); + helper.gain_applier->Process(kFrameInfo, audio.float_frame_view()); prev_gain = audio.float_frame_view().channel(0)[0]; } // Process one more speech frame. 
- VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f); - helper.gain_applier.Process(info, audio.float_frame_view()); + VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f); + helper.gain_applier->Process(kFrameInfo, audio.float_frame_view()); // The gain has increased. EXPECT_GT(audio.float_frame_view().channel(0)[0], prev_gain); } -INSTANTIATE_TEST_SUITE_P(AutomaticGainController2, +INSTANTIATE_TEST_SUITE_P(GainController2, AdaptiveDigitalGainApplierTest, ::testing::Values(1, 7, 31)); diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc index 9857471eb9..507aa12cb4 100644 --- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc +++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc @@ -22,37 +22,17 @@ namespace { using LevelEstimatorType = AudioProcessing::Config::GainController2::LevelEstimator; -// Combines a level estimation with the saturation protector margins. -float ComputeLevelEstimateDbfs(float level_estimate_dbfs, - float saturation_margin_db, - float extra_saturation_margin_db) { - return rtc::SafeClamp( - level_estimate_dbfs + saturation_margin_db + extra_saturation_margin_db, - -90.f, 30.f); -} - -// Returns the level of given type from `vad_level`. 
-float GetLevel(const VadLevelAnalyzer::Result& vad_level, - LevelEstimatorType type) { - switch (type) { - case LevelEstimatorType::kRms: - return vad_level.rms_dbfs; - break; - case LevelEstimatorType::kPeak: - return vad_level.peak_dbfs; - break; - } - RTC_CHECK_NOTREACHED(); +float ClampLevelEstimateDbfs(float level_estimate_dbfs) { + return rtc::SafeClamp(level_estimate_dbfs, -90.f, 30.f); } } // namespace bool AdaptiveModeLevelEstimator::LevelEstimatorState::operator==( const AdaptiveModeLevelEstimator::LevelEstimatorState& b) const { - return time_to_full_buffer_ms == b.time_to_full_buffer_ms && + return time_to_confidence_ms == b.time_to_confidence_ms && level_dbfs.numerator == b.level_dbfs.numerator && - level_dbfs.denominator == b.level_dbfs.denominator && - saturation_protector == b.saturation_protector; + level_dbfs.denominator == b.level_dbfs.denominator; } float AdaptiveModeLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const { @@ -64,25 +44,14 @@ AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator( ApmDataDumper* apm_data_dumper) : AdaptiveModeLevelEstimator( apm_data_dumper, - AudioProcessing::Config::GainController2::LevelEstimator::kRms, - kDefaultLevelEstimatorAdjacentSpeechFramesThreshold, - kDefaultInitialSaturationMarginDb, - kDefaultExtraSaturationMarginDb) {} + kDefaultLevelEstimatorAdjacentSpeechFramesThreshold) {} AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator( ApmDataDumper* apm_data_dumper, - AudioProcessing::Config::GainController2::LevelEstimator level_estimator, - int adjacent_speech_frames_threshold, - float initial_saturation_margin_db, - float extra_saturation_margin_db) + int adjacent_speech_frames_threshold) : apm_data_dumper_(apm_data_dumper), - level_estimator_type_(level_estimator), adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold), - initial_saturation_margin_db_(initial_saturation_margin_db), - extra_saturation_margin_db_(extra_saturation_margin_db), - 
level_dbfs_(ComputeLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs, - initial_saturation_margin_db_, - extra_saturation_margin_db_)) { + level_dbfs_(ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs)) { RTC_DCHECK(apm_data_dumper_); RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1); Reset(); @@ -96,8 +65,6 @@ void AdaptiveModeLevelEstimator::Update( RTC_DCHECK_LT(vad_level.peak_dbfs, 50.f); RTC_DCHECK_GE(vad_level.speech_probability, 0.f); RTC_DCHECK_LE(vad_level.speech_probability, 1.f); - DumpDebugData(); - if (vad_level.speech_probability < kVadConfidenceThreshold) { // Not a speech frame. if (adjacent_speech_frames_threshold_ > 1) { @@ -115,89 +82,82 @@ void AdaptiveModeLevelEstimator::Update( } } num_adjacent_speech_frames_ = 0; - return; - } - - // Speech frame observed. - num_adjacent_speech_frames_++; - - // Update preliminary level estimate. - RTC_DCHECK_GE(preliminary_state_.time_to_full_buffer_ms, 0); - const bool buffer_is_full = preliminary_state_.time_to_full_buffer_ms == 0; - if (!buffer_is_full) { - preliminary_state_.time_to_full_buffer_ms -= kFrameDurationMs; - } - // Weighted average of levels with speech probability as weight. - RTC_DCHECK_GT(vad_level.speech_probability, 0.f); - const float leak_factor = buffer_is_full ? kFullBufferLeakFactor : 1.f; - preliminary_state_.level_dbfs.numerator = - preliminary_state_.level_dbfs.numerator * leak_factor + - GetLevel(vad_level, level_estimator_type_) * vad_level.speech_probability; - preliminary_state_.level_dbfs.denominator = - preliminary_state_.level_dbfs.denominator * leak_factor + - vad_level.speech_probability; - - const float level_dbfs = preliminary_state_.level_dbfs.GetRatio(); - - UpdateSaturationProtectorState(vad_level.peak_dbfs, level_dbfs, - preliminary_state_.saturation_protector); - - if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) { - // `preliminary_state_` is now reliable. Update the last level estimation. 
- level_dbfs_ = ComputeLevelEstimateDbfs( - level_dbfs, preliminary_state_.saturation_protector.margin_db, - extra_saturation_margin_db_); + } else { + // Speech frame observed. + num_adjacent_speech_frames_++; + + // Update preliminary level estimate. + RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0); + const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0; + if (!buffer_is_full) { + preliminary_state_.time_to_confidence_ms -= kFrameDurationMs; + } + // Weighted average of levels with speech probability as weight. + RTC_DCHECK_GT(vad_level.speech_probability, 0.f); + const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.f; + preliminary_state_.level_dbfs.numerator = + preliminary_state_.level_dbfs.numerator * leak_factor + + vad_level.rms_dbfs * vad_level.speech_probability; + preliminary_state_.level_dbfs.denominator = + preliminary_state_.level_dbfs.denominator * leak_factor + + vad_level.speech_probability; + + const float level_dbfs = preliminary_state_.level_dbfs.GetRatio(); + + if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) { + // `preliminary_state_` is now reliable. Update the last level estimation. + level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs); + } } + DumpDebugData(); } bool AdaptiveModeLevelEstimator::IsConfident() const { if (adjacent_speech_frames_threshold_ == 1) { // Ignore `reliable_state_` when a single frame is enough to update the // level estimate (because it is not used). - return preliminary_state_.time_to_full_buffer_ms == 0; + return preliminary_state_.time_to_confidence_ms == 0; } // Once confident, it remains confident. - RTC_DCHECK(reliable_state_.time_to_full_buffer_ms != 0 || - preliminary_state_.time_to_full_buffer_ms == 0); + RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 || + preliminary_state_.time_to_confidence_ms == 0); // During the first long enough speech sequence, `reliable_state_` must be // ignored since `preliminary_state_` is used. 
- return reliable_state_.time_to_full_buffer_ms == 0 || + return reliable_state_.time_to_confidence_ms == 0 || (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ && - preliminary_state_.time_to_full_buffer_ms == 0); + preliminary_state_.time_to_confidence_ms == 0); } void AdaptiveModeLevelEstimator::Reset() { ResetLevelEstimatorState(preliminary_state_); ResetLevelEstimatorState(reliable_state_); - level_dbfs_ = ComputeLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs, - initial_saturation_margin_db_, - extra_saturation_margin_db_); + level_dbfs_ = ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs); num_adjacent_speech_frames_ = 0; } void AdaptiveModeLevelEstimator::ResetLevelEstimatorState( LevelEstimatorState& state) const { - state.time_to_full_buffer_ms = kFullBufferSizeMs; - state.level_dbfs.numerator = 0.f; - state.level_dbfs.denominator = 0.f; - ResetSaturationProtectorState(initial_saturation_margin_db_, - state.saturation_protector); + state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs; + state.level_dbfs.numerator = kInitialSpeechLevelEstimateDbfs; + state.level_dbfs.denominator = 1.0f; } void AdaptiveModeLevelEstimator::DumpDebugData() const { - apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_dbfs", level_dbfs_); - apm_data_dumper_->DumpRaw("agc2_adaptive_num_adjacent_speech_frames", - num_adjacent_speech_frames_); - apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_level_estimate_num", - preliminary_state_.level_dbfs.numerator); - apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_level_estimate_den", - preliminary_state_.level_dbfs.denominator); - apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_saturation_margin_db", - preliminary_state_.saturation_protector.margin_db); - apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_time_to_full_buffer_ms", - preliminary_state_.time_to_full_buffer_ms); - apm_data_dumper_->DumpRaw("agc2_adaptive_reliable_time_to_full_buffer_ms", - 
reliable_state_.time_to_full_buffer_ms); + apm_data_dumper_->DumpRaw( + "agc2_adaptive_level_estimator_num_adjacent_speech_frames", + num_adjacent_speech_frames_); + apm_data_dumper_->DumpRaw( + "agc2_adaptive_level_estimator_preliminary_level_estimate_num", + preliminary_state_.level_dbfs.numerator); + apm_data_dumper_->DumpRaw( + "agc2_adaptive_level_estimator_preliminary_level_estimate_den", + preliminary_state_.level_dbfs.denominator); + apm_data_dumper_->DumpRaw( + "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms", + preliminary_state_.time_to_confidence_ms); + apm_data_dumper_->DumpRaw( + "agc2_adaptive_level_estimator_reliable_time_to_confidence_ms", + reliable_state_.time_to_confidence_ms); } } // namespace webrtc diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h index 213fc0f0c8..6d44938587 100644 --- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h +++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h @@ -15,7 +15,6 @@ #include #include "modules/audio_processing/agc2/agc2_common.h" -#include "modules/audio_processing/agc2/saturation_protector.h" #include "modules/audio_processing/agc2/vad_with_level.h" #include "modules/audio_processing/include/audio_processing.h" @@ -29,12 +28,8 @@ class AdaptiveModeLevelEstimator { AdaptiveModeLevelEstimator(const AdaptiveModeLevelEstimator&) = delete; AdaptiveModeLevelEstimator& operator=(const AdaptiveModeLevelEstimator&) = delete; - AdaptiveModeLevelEstimator( - ApmDataDumper* apm_data_dumper, - AudioProcessing::Config::GainController2::LevelEstimator level_estimator, - int adjacent_speech_frames_threshold, - float initial_saturation_margin_db, - float extra_saturation_margin_db); + AdaptiveModeLevelEstimator(ApmDataDumper* apm_data_dumper, + int adjacent_speech_frames_threshold); // Updates the level estimation. 
void Update(const VadLevelAnalyzer::Result& vad_data); @@ -57,10 +52,9 @@ class AdaptiveModeLevelEstimator { float denominator; float GetRatio() const; }; - // TODO(crbug.com/webrtc/7494): Remove time_to_full_buffer_ms if redundant. - int time_to_full_buffer_ms; + // TODO(crbug.com/webrtc/7494): Remove time_to_confidence_ms if redundant. + int time_to_confidence_ms; Ratio level_dbfs; - SaturationProtectorState saturation_protector; }; static_assert(std::is_trivially_copyable::value, ""); @@ -70,11 +64,7 @@ class AdaptiveModeLevelEstimator { ApmDataDumper* const apm_data_dumper_; - const AudioProcessing::Config::GainController2::LevelEstimator - level_estimator_type_; const int adjacent_speech_frames_threshold_; - const float initial_saturation_margin_db_; - const float extra_saturation_margin_db_; LevelEstimatorState preliminary_state_; LevelEstimatorState reliable_state_; float level_dbfs_; diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc index ea35797f5e..c55950ac29 100644 --- a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc +++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc @@ -19,22 +19,34 @@ namespace webrtc { namespace { -constexpr float kInitialSaturationMarginDb = 20.f; -constexpr float kExtraSaturationMarginDb = 2.f; +// Number of speech frames that the level estimator must observe in order to +// become confident about the estimated level. +constexpr int kNumFramesToConfidence = + kLevelEstimatorTimeToConfidenceMs / kFrameDurationMs; +static_assert(kNumFramesToConfidence > 0, ""); -static_assert(kInitialSpeechLevelEstimateDbfs < 0.f, ""); -constexpr float kVadLevelRms = kInitialSpeechLevelEstimateDbfs / 2.f; -constexpr float kVadLevelPeak = kInitialSpeechLevelEstimateDbfs / 3.f; +// Fake levels and speech probabilities used in the tests. 
+static_assert(kInitialSpeechLevelEstimateDbfs < 0.0f, ""); +constexpr float kVadLevelRms = kInitialSpeechLevelEstimateDbfs / 2.0f; +constexpr float kVadLevelPeak = kInitialSpeechLevelEstimateDbfs / 3.0f; +static_assert(kVadLevelRms < kVadLevelPeak, ""); +static_assert(kVadLevelRms > kInitialSpeechLevelEstimateDbfs, ""); +static_assert(kVadLevelRms - kInitialSpeechLevelEstimateDbfs > 5.0f, + "Adjust `kVadLevelRms` so that the difference from the initial " + "level is wide enough for the tests."); -constexpr VadLevelAnalyzer::Result kVadDataSpeech{/*speech_probability=*/1.f, +constexpr VadLevelAnalyzer::Result kVadDataSpeech{/*speech_probability=*/1.0f, kVadLevelRms, kVadLevelPeak}; constexpr VadLevelAnalyzer::Result kVadDataNonSpeech{ - /*speech_probability=*/kVadConfidenceThreshold / 2.f, kVadLevelRms, + /*speech_probability=*/kVadConfidenceThreshold / 2.0f, kVadLevelRms, kVadLevelPeak}; -constexpr float kMinSpeechProbability = 0.f; -constexpr float kMaxSpeechProbability = 1.f; +constexpr float kMinSpeechProbability = 0.0f; +constexpr float kMaxSpeechProbability = 1.0f; +constexpr float kConvergenceSpeedTestsLevelTolerance = 0.5f; + +// Provides the `vad_level` value `num_iterations` times to `level_estimator`. void RunOnConstantLevel(int num_iterations, const VadLevelAnalyzer::Result& vad_level, AdaptiveModeLevelEstimator& level_estimator) { @@ -43,172 +55,125 @@ void RunOnConstantLevel(int num_iterations, } } +// Level estimator with data dumper. struct TestLevelEstimator { TestLevelEstimator() : data_dumper(0), estimator(std::make_unique<AdaptiveModeLevelEstimator>( &data_dumper, - AudioProcessing::Config::GainController2::LevelEstimator::kRms, - /*adjacent_speech_frames_threshold=*/1, - kInitialSaturationMarginDb, - kExtraSaturationMarginDb)) {} + /*adjacent_speech_frames_threshold=*/1)) {} ApmDataDumper data_dumper; std::unique_ptr<AdaptiveModeLevelEstimator> estimator; }; -TEST(AutomaticGainController2AdaptiveModeLevelEstimator, - EstimatorShouldNotCrash) { +// Checks the initially estimated level. 
+TEST(GainController2AdaptiveModeLevelEstimator, CheckInitialEstimate) { TestLevelEstimator level_estimator; - - VadLevelAnalyzer::Result vad_level{kMaxSpeechProbability, /*rms_dbfs=*/-20.f, - /*peak_dbfs=*/-10.f}; - level_estimator.estimator->Update(vad_level); - static_cast<void>(level_estimator.estimator->level_dbfs()); + EXPECT_FLOAT_EQ(level_estimator.estimator->level_dbfs(), + kInitialSpeechLevelEstimateDbfs); } -TEST(AutomaticGainController2AdaptiveModeLevelEstimator, LevelShouldStabilize) { +// Checks that the level estimator converges to a constant input speech level. +TEST(GainController2AdaptiveModeLevelEstimator, LevelStabilizes) { TestLevelEstimator level_estimator; - - constexpr float kSpeechPeakDbfs = -15.f; - RunOnConstantLevel(100, - VadLevelAnalyzer::Result{kMaxSpeechProbability, - /*rms_dbfs=*/kSpeechPeakDbfs - - kInitialSaturationMarginDb, - kSpeechPeakDbfs}, + RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech, *level_estimator.estimator); - - EXPECT_NEAR( - level_estimator.estimator->level_dbfs() - kExtraSaturationMarginDb, - kSpeechPeakDbfs, 0.1f); + const float estimated_level_dbfs = level_estimator.estimator->level_dbfs(); + RunOnConstantLevel(/*num_iterations=*/1, kVadDataSpeech, + *level_estimator.estimator); + EXPECT_NEAR(level_estimator.estimator->level_dbfs(), estimated_level_dbfs, + 0.1f); } -TEST(AutomaticGainController2AdaptiveModeLevelEstimator, - EstimatorIgnoresZeroProbabilityFrames) { +// Checks that the level estimator does not become confident when too few +// speech frames are observed. +TEST(GainController2AdaptiveModeLevelEstimator, IsNotConfident) { TestLevelEstimator level_estimator; + RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence / 2, + kVadDataSpeech, *level_estimator.estimator); + EXPECT_FALSE(level_estimator.estimator->IsConfident()); +} - // Run for one second of fake audio. 
- constexpr float kSpeechRmsDbfs = -25.f; - RunOnConstantLevel(100, - VadLevelAnalyzer::Result{kMaxSpeechProbability, - /*rms_dbfs=*/kSpeechRmsDbfs - - kInitialSaturationMarginDb, - /*peak_dbfs=*/kSpeechRmsDbfs}, +// Checks that the level estimator becomes confident when enough speech frames +// are observed. +TEST(GainController2AdaptiveModeLevelEstimator, IsConfident) { + TestLevelEstimator level_estimator; + RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech, *level_estimator.estimator); + EXPECT_TRUE(level_estimator.estimator->IsConfident()); +} - // Run for one more second, but mark as not speech. - constexpr float kNoiseRmsDbfs = 0.f; - RunOnConstantLevel(100, +// Checks that the estimated level is not affected by the level of non-speech +// frames. +TEST(GainController2AdaptiveModeLevelEstimator, + EstimatorIgnoresNonSpeechFrames) { + TestLevelEstimator level_estimator; + // Simulate speech. + RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech, + *level_estimator.estimator); + const float estimated_level_dbfs = level_estimator.estimator->level_dbfs(); + // Simulate full-scale non-speech. + RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, VadLevelAnalyzer::Result{kMinSpeechProbability, - /*rms_dbfs=*/kNoiseRmsDbfs, - /*peak_dbfs=*/kNoiseRmsDbfs}, + /*rms_dbfs=*/0.0f, + /*peak_dbfs=*/0.0f}, *level_estimator.estimator); - - // Level should not have changed. - EXPECT_NEAR( - level_estimator.estimator->level_dbfs() - kExtraSaturationMarginDb, - kSpeechRmsDbfs, 0.1f); + // No estimated level change is expected. + EXPECT_FLOAT_EQ(level_estimator.estimator->level_dbfs(), + estimated_level_dbfs); } -TEST(AutomaticGainController2AdaptiveModeLevelEstimator, TimeToAdapt) { +// Checks the convergence speed of the estimator before it becomes confident. 
+TEST(GainController2AdaptiveModeLevelEstimator, + ConvergenceSpeedBeforeConfidence) { TestLevelEstimator level_estimator; - - // Run for one 'window size' interval. - constexpr float kInitialSpeechRmsDbfs = -30.f; - RunOnConstantLevel( - kFullBufferSizeMs / kFrameDurationMs, - VadLevelAnalyzer::Result{ - kMaxSpeechProbability, - /*rms_dbfs=*/kInitialSpeechRmsDbfs - kInitialSaturationMarginDb, - /*peak_dbfs=*/kInitialSpeechRmsDbfs}, - *level_estimator.estimator); - - // Run for one half 'window size' interval. This should not be enough to - // adapt. - constexpr float kDifferentSpeechRmsDbfs = -10.f; - // It should at most differ by 25% after one half 'window size' interval. - // TODO(crbug.com/webrtc/7494): Add constexpr for repeated expressions. - const float kMaxDifferenceDb = - 0.25f * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs); - RunOnConstantLevel( - static_cast(kFullBufferSizeMs / kFrameDurationMs / 2), - VadLevelAnalyzer::Result{ - kMaxSpeechProbability, - /*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb, - /*peak_dbfs=*/kDifferentSpeechRmsDbfs}, - *level_estimator.estimator); - EXPECT_GT(std::abs(kDifferentSpeechRmsDbfs - - level_estimator.estimator->level_dbfs()), - kMaxDifferenceDb); - - // Run for some more time. Afterwards, we should have adapted. 
- RunOnConstantLevel( - static_cast(3 * kFullBufferSizeMs / kFrameDurationMs), - VadLevelAnalyzer::Result{ - kMaxSpeechProbability, - /*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb, - /*peak_dbfs=*/kDifferentSpeechRmsDbfs}, - *level_estimator.estimator); - EXPECT_NEAR( - level_estimator.estimator->level_dbfs() - kExtraSaturationMarginDb, - kDifferentSpeechRmsDbfs, kMaxDifferenceDb * 0.5f); + RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech, + *level_estimator.estimator); + EXPECT_NEAR(level_estimator.estimator->level_dbfs(), kVadDataSpeech.rms_dbfs, + kConvergenceSpeedTestsLevelTolerance); } -TEST(AutomaticGainController2AdaptiveModeLevelEstimator, - ResetGivesFastAdaptation) { +// Checks the convergence speed of the estimator after it becomes confident. +TEST(GainController2AdaptiveModeLevelEstimator, + ConvergenceSpeedAfterConfidence) { TestLevelEstimator level_estimator; - - // Run the level estimator for one window size interval. This gives time to - // adapt. - constexpr float kInitialSpeechRmsDbfs = -30.f; + // Reach confidence using the initial level estimate. RunOnConstantLevel( - kFullBufferSizeMs / kFrameDurationMs, + /*num_iterations=*/kNumFramesToConfidence, VadLevelAnalyzer::Result{ kMaxSpeechProbability, - /*rms_dbfs=*/kInitialSpeechRmsDbfs - kInitialSaturationMarginDb, - /*peak_dbfs=*/kInitialSpeechRmsDbfs}, + /*rms_dbfs=*/kInitialSpeechLevelEstimateDbfs, + /*peak_dbfs=*/kInitialSpeechLevelEstimateDbfs + 6.0f}, *level_estimator.estimator); - - constexpr float kDifferentSpeechRmsDbfs = -10.f; - // Reset and run one half window size interval. - level_estimator.estimator->Reset(); - + // No estimate change should occur, but confidence is achieved. + ASSERT_FLOAT_EQ(level_estimator.estimator->level_dbfs(), + kInitialSpeechLevelEstimateDbfs); + ASSERT_TRUE(level_estimator.estimator->IsConfident()); + // After confidence. + constexpr float kConvergenceTimeAfterConfidenceNumFrames = 600; // 6 seconds. 
+ static_assert( + kConvergenceTimeAfterConfidenceNumFrames > kNumFramesToConfidence, ""); RunOnConstantLevel( - kFullBufferSizeMs / kFrameDurationMs / 2, - VadLevelAnalyzer::Result{ - kMaxSpeechProbability, - /*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb, - /*peak_dbfs=*/kDifferentSpeechRmsDbfs}, - *level_estimator.estimator); - - // The level should be close to 'kDifferentSpeechRmsDbfs'. - const float kMaxDifferenceDb = - 0.1f * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs); - EXPECT_LT(std::abs(kDifferentSpeechRmsDbfs - - (level_estimator.estimator->level_dbfs() - - kExtraSaturationMarginDb)), - kMaxDifferenceDb); + /*num_iterations=*/kConvergenceTimeAfterConfidenceNumFrames, + kVadDataSpeech, *level_estimator.estimator); + EXPECT_NEAR(level_estimator.estimator->level_dbfs(), kVadDataSpeech.rms_dbfs, + kConvergenceSpeedTestsLevelTolerance); } -struct TestConfig { - int min_consecutive_speech_frames; - float initial_saturation_margin_db; - float extra_saturation_margin_db; +class AdaptiveModeLevelEstimatorParametrization + : public ::testing::TestWithParam<int> { + protected: + int adjacent_speech_frames_threshold() const { return GetParam(); } }; -class AdaptiveModeLevelEstimatorTest - : public ::testing::TestWithParam<TestConfig> {}; - -TEST_P(AdaptiveModeLevelEstimatorTest, DoNotAdaptToShortSpeechSegments) { - const auto params = GetParam(); +TEST_P(AdaptiveModeLevelEstimatorParametrization, + DoNotAdaptToShortSpeechSegments) { ApmDataDumper apm_data_dumper(0); AdaptiveModeLevelEstimator level_estimator( - &apm_data_dumper, - AudioProcessing::Config::GainController2::LevelEstimator::kRms, - params.min_consecutive_speech_frames, params.initial_saturation_margin_db, - params.extra_saturation_margin_db); + &apm_data_dumper, adjacent_speech_frames_threshold()); const float initial_level = level_estimator.level_dbfs(); - ASSERT_LT(initial_level, kVadDataSpeech.rms_dbfs); - for (int i = 0; i < params.min_consecutive_speech_frames - 1; ++i) { + 
ASSERT_LT(initial_level, kVadDataSpeech.peak_dbfs); + for (int i = 0; i < adjacent_speech_frames_threshold() - 1; ++i) { SCOPED_TRACE(i); level_estimator.Update(kVadDataSpeech); EXPECT_EQ(initial_level, level_estimator.level_dbfs()); @@ -217,26 +182,21 @@ TEST_P(AdaptiveModeLevelEstimatorTest, DoNotAdaptToShortSpeechSegments) { EXPECT_EQ(initial_level, level_estimator.level_dbfs()); } -TEST_P(AdaptiveModeLevelEstimatorTest, AdaptToEnoughSpeechSegments) { - const auto params = GetParam(); +TEST_P(AdaptiveModeLevelEstimatorParametrization, AdaptToEnoughSpeechSegments) { ApmDataDumper apm_data_dumper(0); AdaptiveModeLevelEstimator level_estimator( - &apm_data_dumper, - AudioProcessing::Config::GainController2::LevelEstimator::kRms, - params.min_consecutive_speech_frames, params.initial_saturation_margin_db, - params.extra_saturation_margin_db); + &apm_data_dumper, adjacent_speech_frames_threshold()); const float initial_level = level_estimator.level_dbfs(); - ASSERT_LT(initial_level, kVadDataSpeech.rms_dbfs); - for (int i = 0; i < params.min_consecutive_speech_frames; ++i) { + ASSERT_LT(initial_level, kVadDataSpeech.peak_dbfs); + for (int i = 0; i < adjacent_speech_frames_threshold(); ++i) { level_estimator.Update(kVadDataSpeech); } EXPECT_LT(initial_level, level_estimator.level_dbfs()); } -INSTANTIATE_TEST_SUITE_P(AutomaticGainController2, - AdaptiveModeLevelEstimatorTest, - ::testing::Values(TestConfig{1, 0.f, 0.f}, - TestConfig{9, 0.f, 0.f})); +INSTANTIATE_TEST_SUITE_P(GainController2, + AdaptiveModeLevelEstimatorParametrization, + ::testing::Values(1, 9, 17)); } // namespace } // namespace webrtc diff --git a/modules/audio_processing/agc2/agc2_common.h b/modules/audio_processing/agc2/agc2_common.h index ccd04bcc04..0f806d3938 100644 --- a/modules/audio_processing/agc2/agc2_common.h +++ b/modules/audio_processing/agc2/agc2_common.h @@ -11,20 +11,19 @@ #ifndef MODULES_AUDIO_PROCESSING_AGC2_AGC2_COMMON_H_ #define MODULES_AUDIO_PROCESSING_AGC2_AGC2_COMMON_H_ -#include 
- namespace webrtc { constexpr float kMinFloatS16Value = -32768.0f; constexpr float kMaxFloatS16Value = 32767.0f; constexpr float kMaxAbsFloatS16Value = 32768.0f; +// Minimum audio level in dBFS scale for S16 samples. +constexpr float kMinLevelDbfs = -90.31f; + constexpr int kFrameDurationMs = 10; constexpr int kSubFramesInFrame = 20; constexpr int kMaximalNumberOfSamplesPerChannel = 480; -constexpr float kAttackFilterConstant = 0.0f; - // Adaptive digital gain applier settings below. constexpr float kHeadroomDbfs = 1.0f; constexpr float kMaxGainDb = 30.0f; @@ -37,43 +36,29 @@ constexpr float kLimiterThresholdForAgcGainDbfs = -kHeadroomDbfs; // gain reduction. constexpr float kVadConfidenceThreshold = 0.95f; -// The amount of 'memory' of the Level Estimator. Decides leak factors. -constexpr int kFullBufferSizeMs = 1200; -constexpr float kFullBufferLeakFactor = 1.0f - 1.0f / kFullBufferSizeMs; - -constexpr float kInitialSpeechLevelEstimateDbfs = -30.0f; +// Adaptive digital level estimator parameters. +// Number of milliseconds of speech frames to observe to make the estimator +// confident. +constexpr float kLevelEstimatorTimeToConfidenceMs = 400; +constexpr float kLevelEstimatorLeakFactor = + 1.0f - 1.0f / kLevelEstimatorTimeToConfidenceMs; // Robust VAD probability and speech decisions. constexpr int kDefaultVadRnnResetPeriodMs = 1500; static_assert(kDefaultVadRnnResetPeriodMs % kFrameDurationMs == 0, ""); -constexpr float kDefaultSmoothedVadProbabilityAttack = 1.0f; -constexpr int kDefaultLevelEstimatorAdjacentSpeechFramesThreshold = 1; +constexpr int kDefaultLevelEstimatorAdjacentSpeechFramesThreshold = 12; // Saturation Protector settings. 
-constexpr float kDefaultInitialSaturationMarginDb = 20.0f; -constexpr float kDefaultExtraSaturationMarginDb = 2.0f; +constexpr float kSaturationProtectorInitialHeadroomDb = 20.0f; +constexpr float kSaturationProtectorExtraHeadroomDb = 5.0f; +constexpr int kSaturationProtectorBufferSize = 4; -constexpr int kPeakEnveloperSuperFrameLengthMs = 400; -static_assert(kFullBufferSizeMs % kPeakEnveloperSuperFrameLengthMs == 0, - "Full buffer size should be a multiple of super frame length for " - "optimal Saturation Protector performance."); - -constexpr int kPeakEnveloperBufferSize = - kFullBufferSizeMs / kPeakEnveloperSuperFrameLengthMs + 1; - -// This value is 10 ** (-1/20 * frame_size_ms / satproc_attack_ms), -// where satproc_attack_ms is 5000. -constexpr float kSaturationProtectorAttackConstant = 0.9988493699365052f; - -// This value is 10 ** (-1/20 * frame_size_ms / satproc_decay_ms), -// where satproc_decay_ms is 1000. -constexpr float kSaturationProtectorDecayConstant = 0.9997697679981565f; - -// This is computed from kDecayMs by -// 10 ** (-1/20 * subframe_duration / kDecayMs). -// |subframe_duration| is |kFrameDurationMs / kSubFramesInFrame|. -// kDecayMs is defined in agc2_testing_common.h -constexpr float kDecayFilterConstant = 0.9998848773724686f; +// Set the initial speech level estimate so that `kInitialAdaptiveDigitalGainDb` +// is applied at the beginning of the call. +constexpr float kInitialSpeechLevelEstimateDbfs = + -kSaturationProtectorExtraHeadroomDb - + kSaturationProtectorInitialHeadroomDb - kInitialAdaptiveDigitalGainDb - + kHeadroomDbfs; // Number of interpolation points for each region of the limiter. 
// These values have been tuned to limit the interpolated gain curve error given diff --git a/modules/audio_processing/agc2/agc2_testing_common_unittest.cc b/modules/audio_processing/agc2/agc2_testing_common_unittest.cc index f52ea3caf5..79c3cc95d9 100644 --- a/modules/audio_processing/agc2/agc2_testing_common_unittest.cc +++ b/modules/audio_processing/agc2/agc2_testing_common_unittest.cc @@ -14,7 +14,7 @@ namespace webrtc { -TEST(AutomaticGainController2Common, TestLinSpace) { +TEST(GainController2TestingCommon, LinSpace) { std::vector points1 = test::LinSpace(-1.0, 2.0, 4); const std::vector expected_points1{{-1.0, 0.0, 1.0, 2.0}}; EXPECT_EQ(expected_points1, points1); diff --git a/modules/audio_processing/agc2/fixed_digital_level_estimator.cc b/modules/audio_processing/agc2/fixed_digital_level_estimator.cc index 9636136e4a..3e9bb2efbd 100644 --- a/modules/audio_processing/agc2/fixed_digital_level_estimator.cc +++ b/modules/audio_processing/agc2/fixed_digital_level_estimator.cc @@ -22,6 +22,14 @@ namespace { constexpr float kInitialFilterStateLevel = 0.f; +// Instant attack. +constexpr float kAttackFilterConstant = 0.f; +// This is computed from kDecayMs by +// 10 ** (-1/20 * subframe_duration / kDecayMs). +// |subframe_duration| is |kFrameDurationMs / kSubFramesInFrame|. 
+// kDecayMs is defined in agc2_testing_common.h +constexpr float kDecayFilterConstant = 0.9998848773724686f; + } // namespace FixedDigitalLevelEstimator::FixedDigitalLevelEstimator( diff --git a/modules/audio_processing/agc2/fixed_digital_level_estimator_unittest.cc b/modules/audio_processing/agc2/fixed_digital_level_estimator_unittest.cc index 7547f8e2ed..97b421d04c 100644 --- a/modules/audio_processing/agc2/fixed_digital_level_estimator_unittest.cc +++ b/modules/audio_processing/agc2/fixed_digital_level_estimator_unittest.cc @@ -101,25 +101,25 @@ float TimeMsToDecreaseLevel(int sample_rate_hz, } } // namespace -TEST(AutomaticGainController2LevelEstimator, EstimatorShouldNotCrash) { +TEST(GainController2FixedDigitalLevelEstimator, EstimatorShouldNotCrash) { TestLevelEstimator(8000, 1, 0, std::numeric_limits::lowest(), std::numeric_limits::max()); } -TEST(AutomaticGainController2LevelEstimator, +TEST(GainController2FixedDigitalLevelEstimator, EstimatorShouldEstimateConstantLevel) { TestLevelEstimator(10000, 1, kInputLevel, kInputLevel * 0.99, kInputLevel * 1.01); } -TEST(AutomaticGainController2LevelEstimator, +TEST(GainController2FixedDigitalLevelEstimator, EstimatorShouldEstimateConstantLevelForManyChannels) { constexpr size_t num_channels = 10; TestLevelEstimator(20000, num_channels, kInputLevel, kInputLevel * 0.99, kInputLevel * 1.01); } -TEST(AutomaticGainController2LevelEstimator, TimeToDecreaseForLowLevel) { +TEST(GainController2FixedDigitalLevelEstimator, TimeToDecreaseForLowLevel) { constexpr float kLevelReductionDb = 25; constexpr float kInitialLowLevel = -40; constexpr float kExpectedTime = kLevelReductionDb * test::kDecayMs; @@ -131,7 +131,8 @@ TEST(AutomaticGainController2LevelEstimator, TimeToDecreaseForLowLevel) { EXPECT_LE(time_to_decrease, kExpectedTime * 1.1); } -TEST(AutomaticGainController2LevelEstimator, TimeToDecreaseForFullScaleLevel) { +TEST(GainController2FixedDigitalLevelEstimator, + TimeToDecreaseForFullScaleLevel) { constexpr float 
kLevelReductionDb = 25; constexpr float kExpectedTime = kLevelReductionDb * test::kDecayMs; @@ -142,7 +143,7 @@ TEST(AutomaticGainController2LevelEstimator, TimeToDecreaseForFullScaleLevel) { EXPECT_LE(time_to_decrease, kExpectedTime * 1.1); } -TEST(AutomaticGainController2LevelEstimator, +TEST(GainController2FixedDigitalLevelEstimator, TimeToDecreaseForMultipleChannels) { constexpr float kLevelReductionDb = 25; constexpr float kExpectedTime = kLevelReductionDb * test::kDecayMs; diff --git a/modules/audio_processing/agc2/interpolated_gain_curve.h b/modules/audio_processing/agc2/interpolated_gain_curve.h index 69652c5a72..af993204ce 100644 --- a/modules/audio_processing/agc2/interpolated_gain_curve.h +++ b/modules/audio_processing/agc2/interpolated_gain_curve.h @@ -75,7 +75,7 @@ class InterpolatedGainCurve { private: // For comparing 'approximation_params_*_' with ones computed by // ComputeInterpolatedGainCurve. - FRIEND_TEST_ALL_PREFIXES(AutomaticGainController2InterpolatedGainCurve, + FRIEND_TEST_ALL_PREFIXES(GainController2InterpolatedGainCurve, CheckApproximationParams); struct RegionLogger { diff --git a/modules/audio_processing/agc2/interpolated_gain_curve_unittest.cc b/modules/audio_processing/agc2/interpolated_gain_curve_unittest.cc index 67d34e517b..7861ae997d 100644 --- a/modules/audio_processing/agc2/interpolated_gain_curve_unittest.cc +++ b/modules/audio_processing/agc2/interpolated_gain_curve_unittest.cc @@ -34,7 +34,7 @@ const LimiterDbGainCurve limiter; } // namespace -TEST(AutomaticGainController2InterpolatedGainCurve, CreateUse) { +TEST(GainController2InterpolatedGainCurve, CreateUse) { InterpolatedGainCurve igc(&apm_data_dumper, ""); const auto levels = test::LinSpace( @@ -44,7 +44,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CreateUse) { } } -TEST(AutomaticGainController2InterpolatedGainCurve, CheckValidOutput) { +TEST(GainController2InterpolatedGainCurve, CheckValidOutput) { InterpolatedGainCurve igc(&apm_data_dumper, ""); const auto 
levels = test::LinSpace( @@ -57,7 +57,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CheckValidOutput) { } } -TEST(AutomaticGainController2InterpolatedGainCurve, CheckMonotonicity) { +TEST(GainController2InterpolatedGainCurve, CheckMonotonicity) { InterpolatedGainCurve igc(&apm_data_dumper, ""); const auto levels = test::LinSpace( @@ -71,7 +71,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CheckMonotonicity) { } } -TEST(AutomaticGainController2InterpolatedGainCurve, CheckApproximation) { +TEST(GainController2InterpolatedGainCurve, CheckApproximation) { InterpolatedGainCurve igc(&apm_data_dumper, ""); const auto levels = test::LinSpace( @@ -84,7 +84,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CheckApproximation) { } } -TEST(AutomaticGainController2InterpolatedGainCurve, CheckRegionBoundaries) { +TEST(GainController2InterpolatedGainCurve, CheckRegionBoundaries) { InterpolatedGainCurve igc(&apm_data_dumper, ""); const std::vector levels{ @@ -102,7 +102,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CheckRegionBoundaries) { EXPECT_EQ(1ul, stats.look_ups_saturation_region); } -TEST(AutomaticGainController2InterpolatedGainCurve, CheckIdentityRegion) { +TEST(GainController2InterpolatedGainCurve, CheckIdentityRegion) { constexpr size_t kNumSteps = 10; InterpolatedGainCurve igc(&apm_data_dumper, ""); @@ -120,8 +120,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, CheckIdentityRegion) { EXPECT_EQ(0ul, stats.look_ups_saturation_region); } -TEST(AutomaticGainController2InterpolatedGainCurve, - CheckNoOverApproximationKnee) { +TEST(GainController2InterpolatedGainCurve, CheckNoOverApproximationKnee) { constexpr size_t kNumSteps = 10; InterpolatedGainCurve igc(&apm_data_dumper, ""); @@ -142,8 +141,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, EXPECT_EQ(0ul, stats.look_ups_saturation_region); } -TEST(AutomaticGainController2InterpolatedGainCurve, - CheckNoOverApproximationBeyondKnee) { 
+TEST(GainController2InterpolatedGainCurve, CheckNoOverApproximationBeyondKnee) { constexpr size_t kNumSteps = 10; InterpolatedGainCurve igc(&apm_data_dumper, ""); @@ -164,7 +162,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, EXPECT_EQ(0ul, stats.look_ups_saturation_region); } -TEST(AutomaticGainController2InterpolatedGainCurve, +TEST(GainController2InterpolatedGainCurve, CheckNoOverApproximationWithSaturation) { constexpr size_t kNumSteps = 3; InterpolatedGainCurve igc(&apm_data_dumper, ""); @@ -184,7 +182,7 @@ TEST(AutomaticGainController2InterpolatedGainCurve, EXPECT_EQ(kNumSteps, stats.look_ups_saturation_region); } -TEST(AutomaticGainController2InterpolatedGainCurve, CheckApproximationParams) { +TEST(GainController2InterpolatedGainCurve, CheckApproximationParams) { test::InterpolatedParameters parameters = test::ComputeInterpolatedGainCurveApproximationParams(); diff --git a/modules/audio_processing/agc2/noise_level_estimator.cc b/modules/audio_processing/agc2/noise_level_estimator.cc index ae8a50113e..10e8437d3f 100644 --- a/modules/audio_processing/agc2/noise_level_estimator.cc +++ b/modules/audio_processing/agc2/noise_level_estimator.cc @@ -184,7 +184,7 @@ class NoiseFloorEstimator : public NoiseLevelEstimator { const float frame_energy = FrameEnergy(frame); if (frame_energy <= min_noise_energy_) { // Ignore frames when muted or below the minimum measurable energy. 
- data_dumper_->DumpRaw("agc2_noise_floor_preliminary_level", + data_dumper_->DumpRaw("agc2_noise_floor_estimator_preliminary_level", noise_energy_); return EnergyToDbfs(noise_energy_, frame.samples_per_channel()); } @@ -196,7 +196,7 @@ class NoiseFloorEstimator : public NoiseLevelEstimator { preliminary_noise_energy_ = frame_energy; preliminary_noise_energy_set_ = true; } - data_dumper_->DumpRaw("agc2_noise_floor_preliminary_level", + data_dumper_->DumpRaw("agc2_noise_floor_estimator_preliminary_level", preliminary_noise_energy_); if (counter_ == 0) { diff --git a/modules/audio_processing/agc2/saturation_protector.cc b/modules/audio_processing/agc2/saturation_protector.cc index b64fcdb71f..d6f21ef891 100644 --- a/modules/audio_processing/agc2/saturation_protector.cc +++ b/modules/audio_processing/agc2/saturation_protector.cc @@ -10,84 +10,59 @@ #include "modules/audio_processing/agc2/saturation_protector.h" +#include + +#include "modules/audio_processing/agc2/agc2_common.h" +#include "modules/audio_processing/agc2/saturation_protector_buffer.h" #include "modules/audio_processing/logging/apm_data_dumper.h" +#include "rtc_base/checks.h" #include "rtc_base/numerics/safe_minmax.h" namespace webrtc { namespace { -constexpr float kMinLevelDbfs = -90.f; +constexpr int kPeakEnveloperSuperFrameLengthMs = 400; +constexpr float kMinMarginDb = 12.0f; +constexpr float kMaxMarginDb = 25.0f; +constexpr float kAttack = 0.9988493699365052f; +constexpr float kDecay = 0.9997697679981565f; -// Min/max margins are based on speech crest-factor. -constexpr float kMinMarginDb = 12.f; -constexpr float kMaxMarginDb = 25.f; - -using saturation_protector_impl::RingBuffer; - -} // namespace - -bool RingBuffer::operator==(const RingBuffer& b) const { - RTC_DCHECK_LE(size_, buffer_.size()); - RTC_DCHECK_LE(b.size_, b.buffer_.size()); - if (size_ != b.size_) { - return false; +// Saturation protector state. 
Defined outside of `SaturationProtectorImpl` to +// implement check-point and restore ops. +struct SaturationProtectorState { + bool operator==(const SaturationProtectorState& s) const { + return headroom_db == s.headroom_db && + peak_delay_buffer == s.peak_delay_buffer && + max_peaks_dbfs == s.max_peaks_dbfs && + time_since_push_ms == s.time_since_push_ms; } - for (int i = 0, i0 = FrontIndex(), i1 = b.FrontIndex(); i < size_; - ++i, ++i0, ++i1) { - if (buffer_[i0 % buffer_.size()] != b.buffer_[i1 % b.buffer_.size()]) { - return false; - } + inline bool operator!=(const SaturationProtectorState& s) const { + return !(*this == s); } - return true; -} -void RingBuffer::Reset() { - next_ = 0; - size_ = 0; -} + float headroom_db; + SaturationProtectorBuffer peak_delay_buffer; + float max_peaks_dbfs; + int time_since_push_ms; // Time since the last ring buffer push operation. +}; -void RingBuffer::PushBack(float v) { - RTC_DCHECK_GE(next_, 0); - RTC_DCHECK_GE(size_, 0); - RTC_DCHECK_LT(next_, buffer_.size()); - RTC_DCHECK_LE(size_, buffer_.size()); - buffer_[next_++] = v; - if (rtc::SafeEq(next_, buffer_.size())) { - next_ = 0; - } - if (rtc::SafeLt(size_, buffer_.size())) { - size_++; - } -} - -absl::optional RingBuffer::Front() const { - if (size_ == 0) { - return absl::nullopt; - } - RTC_DCHECK_LT(FrontIndex(), buffer_.size()); - return buffer_[FrontIndex()]; -} - -bool SaturationProtectorState::operator==( - const SaturationProtectorState& b) const { - return margin_db == b.margin_db && peak_delay_buffer == b.peak_delay_buffer && - max_peaks_dbfs == b.max_peaks_dbfs && - time_since_push_ms == b.time_since_push_ms; -} - -void ResetSaturationProtectorState(float initial_margin_db, +// Resets the saturation protector state. 
+void ResetSaturationProtectorState(float initial_headroom_db, SaturationProtectorState& state) { - state.margin_db = initial_margin_db; + state.headroom_db = initial_headroom_db; state.peak_delay_buffer.Reset(); state.max_peaks_dbfs = kMinLevelDbfs; state.time_since_push_ms = 0; } -void UpdateSaturationProtectorState(float speech_peak_dbfs, +// Updates `state` by analyzing the estimated speech level `speech_level_dbfs` +// and the peak level `peak_dbfs` for an observed frame. `state` must not be +// modified without calling this function. +void UpdateSaturationProtectorState(float peak_dbfs, float speech_level_dbfs, SaturationProtectorState& state) { // Get the max peak over `kPeakEnveloperSuperFrameLengthMs` ms. - state.max_peaks_dbfs = std::max(state.max_peaks_dbfs, speech_peak_dbfs); + state.max_peaks_dbfs = std::max(state.max_peaks_dbfs, peak_dbfs); state.time_since_push_ms += kFrameDurationMs; if (rtc::SafeGt(state.time_since_push_ms, kPeakEnveloperSuperFrameLengthMs)) { // Push `max_peaks_dbfs` back into the ring buffer. @@ -97,25 +72,117 @@ void UpdateSaturationProtectorState(float speech_peak_dbfs, state.time_since_push_ms = 0; } - // Update margin by comparing the estimated speech level and the delayed max - // speech peak power. - // TODO(alessiob): Check with aleloi@ why we use a delay and how to tune it. + // Update the headroom by comparing the estimated speech level and the delayed + // max speech peak. const float delayed_peak_dbfs = state.peak_delay_buffer.Front().value_or(state.max_peaks_dbfs); const float difference_db = delayed_peak_dbfs - speech_level_dbfs; - if (difference_db > state.margin_db) { + if (difference_db > state.headroom_db) { // Attack. - state.margin_db = - state.margin_db * kSaturationProtectorAttackConstant + - difference_db * (1.f - kSaturationProtectorAttackConstant); + state.headroom_db = + state.headroom_db * kAttack + difference_db * (1.0f - kAttack); } else { // Decay. 
- state.margin_db = state.margin_db * kSaturationProtectorDecayConstant + - difference_db * (1.f - kSaturationProtectorDecayConstant); + state.headroom_db = + state.headroom_db * kDecay + difference_db * (1.0f - kDecay); } - state.margin_db = - rtc::SafeClamp(state.margin_db, kMinMarginDb, kMaxMarginDb); + state.headroom_db = + rtc::SafeClamp(state.headroom_db, kMinMarginDb, kMaxMarginDb); +} + +// Saturation protector which recommends a headroom based on the recent peaks. +class SaturationProtectorImpl : public SaturationProtector { + public: + explicit SaturationProtectorImpl(float initial_headroom_db, + float extra_headroom_db, + int adjacent_speech_frames_threshold, + ApmDataDumper* apm_data_dumper) + : apm_data_dumper_(apm_data_dumper), + initial_headroom_db_(initial_headroom_db), + extra_headroom_db_(extra_headroom_db), + adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold) { + Reset(); + } + SaturationProtectorImpl(const SaturationProtectorImpl&) = delete; + SaturationProtectorImpl& operator=(const SaturationProtectorImpl&) = delete; + ~SaturationProtectorImpl() = default; + + float HeadroomDb() override { return headroom_db_; } + + void Analyze(float speech_probability, + float peak_dbfs, + float speech_level_dbfs) override { + if (speech_probability < kVadConfidenceThreshold) { + // Not a speech frame. + if (adjacent_speech_frames_threshold_ > 1) { + // When two or more adjacent speech frames are required in order to + // update the state, we need to decide whether to discard or confirm the + // updates based on the speech sequence length. + if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) { + // First non-speech frame after a long enough sequence of speech + // frames. Update the reliable state. + reliable_state_ = preliminary_state_; + } else if (num_adjacent_speech_frames_ > 0) { + // First non-speech frame after a too short sequence of speech frames. + // Reset to the last reliable state. 
+ preliminary_state_ = reliable_state_; + } + } + num_adjacent_speech_frames_ = 0; + } else { + // Speech frame observed. + num_adjacent_speech_frames_++; + + // Update preliminary level estimate. + UpdateSaturationProtectorState(peak_dbfs, speech_level_dbfs, + preliminary_state_); + + if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) { + // `preliminary_state_` is now reliable. Update the headroom. + headroom_db_ = preliminary_state_.headroom_db + extra_headroom_db_; + } + } + DumpDebugData(); + } + + void Reset() override { + num_adjacent_speech_frames_ = 0; + headroom_db_ = initial_headroom_db_ + extra_headroom_db_; + ResetSaturationProtectorState(initial_headroom_db_, preliminary_state_); + ResetSaturationProtectorState(initial_headroom_db_, reliable_state_); + } + + private: + void DumpDebugData() { + apm_data_dumper_->DumpRaw( + "agc2_saturation_protector_preliminary_max_peak_dbfs", + preliminary_state_.max_peaks_dbfs); + apm_data_dumper_->DumpRaw( + "agc2_saturation_protector_reliable_max_peak_dbfs", + reliable_state_.max_peaks_dbfs); + } + + ApmDataDumper* const apm_data_dumper_; + const float initial_headroom_db_; + const float extra_headroom_db_; + const int adjacent_speech_frames_threshold_; + int num_adjacent_speech_frames_; + float headroom_db_; + SaturationProtectorState preliminary_state_; + SaturationProtectorState reliable_state_; +}; + +} // namespace + +std::unique_ptr<SaturationProtector> CreateSaturationProtector( + float initial_headroom_db, + float extra_headroom_db, + int adjacent_speech_frames_threshold, + ApmDataDumper* apm_data_dumper) { + return std::make_unique<SaturationProtectorImpl>( + initial_headroom_db, extra_headroom_db, adjacent_speech_frames_threshold, + apm_data_dumper); } } // namespace webrtc diff --git a/modules/audio_processing/agc2/saturation_protector.h b/modules/audio_processing/agc2/saturation_protector.h index 88be91a79b..0c384f1fa0 100644 --- a/modules/audio_processing/agc2/saturation_protector.h +++
b/modules/audio_processing/agc2/saturation_protector.h @@ -11,71 +11,36 @@ #ifndef MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_ #define MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_ -#include - -#include "absl/types/optional.h" -#include "modules/audio_processing/agc2/agc2_common.h" -#include "rtc_base/numerics/safe_compare.h" +#include namespace webrtc { -namespace saturation_protector_impl { +class ApmDataDumper; -// Ring buffer which only supports (i) push back and (ii) read oldest item. -class RingBuffer { +// Saturation protector. Analyzes peak levels and recommends a headroom to +// reduce the chances of clipping. +class SaturationProtector { public: - bool operator==(const RingBuffer& b) const; - inline bool operator!=(const RingBuffer& b) const { return !(*this == b); } + virtual ~SaturationProtector() = default; - // Maximum number of values that the buffer can contain. - int Capacity() const { return buffer_.size(); } - // Number of values in the buffer. - int Size() const { return size_; } + // Returns the recommended headroom in dB. + virtual float HeadroomDb() = 0; - void Reset(); - // Pushes back `v`. If the buffer is full, the oldest value is replaced. - void PushBack(float v); - // Returns the oldest item in the buffer. Returns an empty value if the - // buffer is empty. - absl::optional Front() const; + // Analyzes the peak level of a 10 ms frame along with its speech probability + // and the current speech level estimate to update the recommended headroom. + virtual void Analyze(float speech_probability, + float peak_dbfs, + float speech_level_dbfs) = 0; - private: - inline int FrontIndex() const { - return rtc::SafeEq(size_, buffer_.size()) ? next_ : 0; - } - // `buffer_` has `size_` elements (up to the size of `buffer_`) and `next_` is - // the position where the next new value is written in `buffer_`. - std::array buffer_; - int next_ = 0; - int size_ = 0; + // Resets the internal state. 
+ virtual void Reset() = 0; }; -} // namespace saturation_protector_impl - -// Saturation protector state. Exposed publicly for check-pointing and restore -// ops. -struct SaturationProtectorState { - bool operator==(const SaturationProtectorState& s) const; - inline bool operator!=(const SaturationProtectorState& s) const { - return !(*this == s); - } - - float margin_db; // Recommended margin. - saturation_protector_impl::RingBuffer peak_delay_buffer; - float max_peaks_dbfs; - int time_since_push_ms; // Time since the last ring buffer push operation. -}; - -// Resets the saturation protector state. -void ResetSaturationProtectorState(float initial_margin_db, - SaturationProtectorState& state); - -// Updates `state` by analyzing the estimated speech level `speech_level_dbfs` -// and the peak power `speech_peak_dbfs` for an observed frame which is -// reliably classified as "speech". `state` must not be modified without calling -// this function. -void UpdateSaturationProtectorState(float speech_peak_dbfs, - float speech_level_dbfs, - SaturationProtectorState& state); +// Creates a saturation protector that starts at `initial_headroom_db`. +std::unique_ptr<SaturationProtector> CreateSaturationProtector( + float initial_headroom_db, + float extra_headroom_db, + int adjacent_speech_frames_threshold, + ApmDataDumper* apm_data_dumper); } // namespace webrtc diff --git a/modules/audio_processing/agc2/saturation_protector_buffer.cc b/modules/audio_processing/agc2/saturation_protector_buffer.cc new file mode 100644 index 0000000000..41efdad2c8 --- /dev/null +++ b/modules/audio_processing/agc2/saturation_protector_buffer.cc @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS.
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/agc2/saturation_protector_buffer.h" + +#include "rtc_base/checks.h" +#include "rtc_base/numerics/safe_compare.h" + +namespace webrtc { + +SaturationProtectorBuffer::SaturationProtectorBuffer() = default; + +SaturationProtectorBuffer::~SaturationProtectorBuffer() = default; + +bool SaturationProtectorBuffer::operator==( + const SaturationProtectorBuffer& b) const { + RTC_DCHECK_LE(size_, buffer_.size()); + RTC_DCHECK_LE(b.size_, b.buffer_.size()); + if (size_ != b.size_) { + return false; + } + for (int i = 0, i0 = FrontIndex(), i1 = b.FrontIndex(); i < size_; + ++i, ++i0, ++i1) { + if (buffer_[i0 % buffer_.size()] != b.buffer_[i1 % b.buffer_.size()]) { + return false; + } + } + return true; +} + +int SaturationProtectorBuffer::Capacity() const { + return buffer_.size(); +} + +int SaturationProtectorBuffer::Size() const { + return size_; +} + +void SaturationProtectorBuffer::Reset() { + next_ = 0; + size_ = 0; +} + +void SaturationProtectorBuffer::PushBack(float v) { + RTC_DCHECK_GE(next_, 0); + RTC_DCHECK_GE(size_, 0); + RTC_DCHECK_LT(next_, buffer_.size()); + RTC_DCHECK_LE(size_, buffer_.size()); + buffer_[next_++] = v; + if (rtc::SafeEq(next_, buffer_.size())) { + next_ = 0; + } + if (rtc::SafeLt(size_, buffer_.size())) { + size_++; + } +} + +absl::optional<float> SaturationProtectorBuffer::Front() const { + if (size_ == 0) { + return absl::nullopt; + } + RTC_DCHECK_LT(FrontIndex(), buffer_.size()); + return buffer_[FrontIndex()]; +} + +int SaturationProtectorBuffer::FrontIndex() const { + return rtc::SafeEq(size_, buffer_.size()) ?
next_ : 0; +} + +} // namespace webrtc diff --git a/modules/audio_processing/agc2/saturation_protector_buffer.h b/modules/audio_processing/agc2/saturation_protector_buffer.h new file mode 100644 index 0000000000..e17d0998c4 --- /dev/null +++ b/modules/audio_processing/agc2/saturation_protector_buffer.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_BUFFER_H_ +#define MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_BUFFER_H_ + +#include <array> + +#include "absl/types/optional.h" +#include "modules/audio_processing/agc2/agc2_common.h" + +namespace webrtc { + +// Ring buffer for the saturation protector which only supports (i) push back +// and (ii) read oldest item. +class SaturationProtectorBuffer { + public: + SaturationProtectorBuffer(); + ~SaturationProtectorBuffer(); + + bool operator==(const SaturationProtectorBuffer& b) const; + inline bool operator!=(const SaturationProtectorBuffer& b) const { + return !(*this == b); + } + + // Maximum number of values that the buffer can contain. + int Capacity() const; + + // Number of values in the buffer. + int Size() const; + + void Reset(); + + // Pushes back `v`. If the buffer is full, the oldest value is replaced. + void PushBack(float v); + + // Returns the oldest item in the buffer. Returns an empty value if the + // buffer is empty. + absl::optional<float> Front() const; + + private: + int FrontIndex() const; + // `buffer_` has `size_` elements (up to the size of `buffer_`) and `next_` is + // the position where the next new value is written in `buffer_`.
+ std::array<float, kSaturationProtectorBufferSize> buffer_; + int next_ = 0; + int size_ = 0; +}; + +} // namespace webrtc + +#endif // MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_BUFFER_H_ diff --git a/modules/audio_processing/agc2/saturation_protector_buffer_unittest.cc b/modules/audio_processing/agc2/saturation_protector_buffer_unittest.cc new file mode 100644 index 0000000000..22187bf027 --- /dev/null +++ b/modules/audio_processing/agc2/saturation_protector_buffer_unittest.cc @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2021 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/agc2/saturation_protector_buffer.h" + +#include "test/gmock.h" +#include "test/gtest.h" + +namespace webrtc { +namespace { + +using ::testing::Eq; +using ::testing::Optional; + +TEST(GainController2SaturationProtectorBuffer, Init) { + SaturationProtectorBuffer b; + EXPECT_EQ(b.Size(), 0); + EXPECT_FALSE(b.Front().has_value()); +} + +TEST(GainController2SaturationProtectorBuffer, PushBack) { + SaturationProtectorBuffer b; + constexpr float kValue = 123.0f; + b.PushBack(kValue); + EXPECT_EQ(b.Size(), 1); + EXPECT_THAT(b.Front(), Optional(Eq(kValue))); +} + +TEST(GainController2SaturationProtectorBuffer, Reset) { + SaturationProtectorBuffer b; + b.PushBack(123.0f); + b.Reset(); + EXPECT_EQ(b.Size(), 0); + EXPECT_FALSE(b.Front().has_value()); +} + +// Checks that the front value does not change until the ring buffer gets full.
+TEST(GainController2SaturationProtectorBuffer, FrontUntilBufferIsFull) { + SaturationProtectorBuffer b; + constexpr float kValue = 123.0f; + b.PushBack(kValue); + for (int i = 1; i < b.Capacity(); ++i) { + SCOPED_TRACE(i); + EXPECT_THAT(b.Front(), Optional(Eq(kValue))); + b.PushBack(kValue + i); + } +} + +// Checks that when the buffer is full it behaves as a shift register. +TEST(GainController2SaturationProtectorBuffer, FrontIsDelayed) { + SaturationProtectorBuffer b; + // Fill the buffer. + for (int i = 0; i < b.Capacity(); ++i) { + b.PushBack(i); + } + // The ring buffer should now behave as a shift register with a delay equal to + // its capacity. + for (int i = b.Capacity(); i < 2 * b.Capacity() + 1; ++i) { + SCOPED_TRACE(i); + EXPECT_THAT(b.Front(), Optional(Eq(i - b.Capacity()))); + b.PushBack(i); + } +} + +} // namespace +} // namespace webrtc diff --git a/modules/audio_processing/agc2/saturation_protector_unittest.cc b/modules/audio_processing/agc2/saturation_protector_unittest.cc index 2c5ee5b036..dc16dc254c 100644 --- a/modules/audio_processing/agc2/saturation_protector_unittest.cc +++ b/modules/audio_processing/agc2/saturation_protector_unittest.cc @@ -10,181 +10,166 @@ #include "modules/audio_processing/agc2/saturation_protector.h" -#include - #include "modules/audio_processing/agc2/agc2_common.h" #include "modules/audio_processing/logging/apm_data_dumper.h" #include "rtc_base/gunit.h" -#include "test/gmock.h" namespace webrtc { namespace { -constexpr float kInitialMarginDb = 20.f; +constexpr float kInitialHeadroomDb = 20.0f; +constexpr float kNoExtraHeadroomDb = 0.0f; +constexpr int kNoAdjacentSpeechFramesRequired = 1; +constexpr float kMaxSpeechProbability = 1.0f; -using saturation_protector_impl::RingBuffer; - -SaturationProtectorState CreateSaturationProtectorState() { - SaturationProtectorState state; - ResetSaturationProtectorState(kInitialMarginDb, state); - return state; -} - -// Updates `state` for `num_iterations` times with constant speech 
level and -// peak powers and returns the maximum margin. +// Calls `Analyze(speech_probability, peak_dbfs, speech_level_dbfs)` +// `num_iterations` times on `saturation_protector` and return the largest +// headroom difference between two consecutive calls. float RunOnConstantLevel(int num_iterations, - float speech_peak_dbfs, + float speech_probability, + float peak_dbfs, float speech_level_dbfs, - SaturationProtectorState& state) { - float last_margin = state.margin_db; - float max_difference = 0.f; + SaturationProtector& saturation_protector) { + float last_headroom = saturation_protector.HeadroomDb(); + float max_difference = 0.0f; for (int i = 0; i < num_iterations; ++i) { - UpdateSaturationProtectorState(speech_peak_dbfs, speech_level_dbfs, state); - const float new_margin = state.margin_db; + saturation_protector.Analyze(speech_probability, peak_dbfs, + speech_level_dbfs); + const float new_headroom = saturation_protector.HeadroomDb(); max_difference = - std::max(max_difference, std::abs(new_margin - last_margin)); - last_margin = new_margin; + std::max(max_difference, std::fabs(new_headroom - last_headroom)); + last_headroom = new_headroom; } return max_difference; } -} // namespace - -TEST(AutomaticGainController2SaturationProtector, RingBufferInit) { - RingBuffer b; - EXPECT_EQ(b.Size(), 0); - EXPECT_FALSE(b.Front().has_value()); -} - -TEST(AutomaticGainController2SaturationProtector, RingBufferPushBack) { - RingBuffer b; - constexpr float kValue = 123.f; - b.PushBack(kValue); - EXPECT_EQ(b.Size(), 1); - ASSERT_TRUE(b.Front().has_value()); - EXPECT_EQ(b.Front().value(), kValue); -} - -TEST(AutomaticGainController2SaturationProtector, RingBufferReset) { - RingBuffer b; - b.PushBack(123.f); - b.Reset(); - EXPECT_EQ(b.Size(), 0); - EXPECT_FALSE(b.Front().has_value()); -} - -// Checks that the front value does not change until the ring buffer gets full. 
-TEST(AutomaticGainController2SaturationProtector, - RingBufferFrontUntilBufferIsFull) { - RingBuffer b; - constexpr float kValue = 123.f; - b.PushBack(kValue); - for (int i = 1; i < b.Capacity(); ++i) { - EXPECT_EQ(b.Front().value(), kValue); - b.PushBack(kValue + i); - } -} - -// Checks that when the buffer is full it behaves as a shift register. -TEST(AutomaticGainController2SaturationProtector, - FullRingBufferFrontIsDelayed) { - RingBuffer b; - // Fill the buffer. - for (int i = 0; i < b.Capacity(); ++i) { - b.PushBack(i); - } - // The ring buffer should now behave as a shift register with a delay equal to - // its capacity. - for (int i = b.Capacity(); i < 2 * b.Capacity() + 1; ++i) { - EXPECT_EQ(b.Front().value(), i - b.Capacity()); - b.PushBack(i); - } -} - -// Checks that a state after reset equals a state after construction. -TEST(AutomaticGainController2SaturationProtector, ResetState) { - SaturationProtectorState init_state; - ResetSaturationProtectorState(kInitialMarginDb, init_state); - - SaturationProtectorState state; - ResetSaturationProtectorState(kInitialMarginDb, state); - RunOnConstantLevel(/*num_iterations=*/10, /*speech_level_dbfs=*/-20.f, - /*speech_peak_dbfs=*/-10.f, state); - ASSERT_NE(init_state, state); // Make sure that there are side-effects. - ResetSaturationProtectorState(kInitialMarginDb, state); - - EXPECT_EQ(init_state, state); +// Checks that the returned headroom value is correctly reset. +TEST(GainController2SaturationProtector, Reset) { + ApmDataDumper apm_data_dumper(0); + auto saturation_protector = CreateSaturationProtector( + kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired, + &apm_data_dumper); + const float initial_headroom_db = saturation_protector->HeadroomDb(); + RunOnConstantLevel(/*num_iterations=*/10, kMaxSpeechProbability, + /*peak_dbfs=*/0.0f, + /*speech_level_dbfs=*/-10.0f, *saturation_protector); + // Make sure that there are side-effects. 
+ ASSERT_NE(initial_headroom_db, saturation_protector->HeadroomDb()); + saturation_protector->Reset(); + EXPECT_EQ(initial_headroom_db, saturation_protector->HeadroomDb()); } // Checks that the estimate converges to the ratio between peaks and level // estimator values after a while. -TEST(AutomaticGainController2SaturationProtector, - ProtectorEstimatesCrestRatio) { +TEST(GainController2SaturationProtector, EstimatesCrestRatio) { constexpr int kNumIterations = 2000; - constexpr float kPeakLevel = -20.f; - constexpr float kCrestFactor = kInitialMarginDb + 1.f; - constexpr float kSpeechLevel = kPeakLevel - kCrestFactor; - const float kMaxDifference = 0.5f * std::abs(kInitialMarginDb - kCrestFactor); + constexpr float kPeakLevelDbfs = -20.0f; + constexpr float kCrestFactorDb = kInitialHeadroomDb + 1.0f; + constexpr float kSpeechLevelDbfs = kPeakLevelDbfs - kCrestFactorDb; + const float kMaxDifferenceDb = + 0.5f * std::fabs(kInitialHeadroomDb - kCrestFactorDb); - auto state = CreateSaturationProtectorState(); - RunOnConstantLevel(kNumIterations, kPeakLevel, kSpeechLevel, state); - - EXPECT_NEAR(state.margin_db, kCrestFactor, kMaxDifference); + ApmDataDumper apm_data_dumper(0); + auto saturation_protector = CreateSaturationProtector( + kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired, + &apm_data_dumper); + RunOnConstantLevel(kNumIterations, kMaxSpeechProbability, kPeakLevelDbfs, + kSpeechLevelDbfs, *saturation_protector); + EXPECT_NEAR(saturation_protector->HeadroomDb(), kCrestFactorDb, + kMaxDifferenceDb); } -// Checks that the margin does not change too quickly. -TEST(AutomaticGainController2SaturationProtector, ChangeSlowly) { +// Checks that the extra headroom is applied. 
+TEST(GainController2SaturationProtector, ExtraHeadroomApplied) { + constexpr float kExtraHeadroomDb = 5.1234f; + constexpr int kNumIterations = 10; + constexpr float kPeakLevelDbfs = -20.0f; + constexpr float kSpeechLevelDbfs = kPeakLevelDbfs - 15.0f; + + ApmDataDumper apm_data_dumper(0); + + auto saturation_protector_no_extra = CreateSaturationProtector( + kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired, + &apm_data_dumper); + for (int i = 0; i < kNumIterations; ++i) { + saturation_protector_no_extra->Analyze(kMaxSpeechProbability, + kPeakLevelDbfs, kSpeechLevelDbfs); + } + + auto saturation_protector_extra = CreateSaturationProtector( + kInitialHeadroomDb, kExtraHeadroomDb, kNoAdjacentSpeechFramesRequired, + &apm_data_dumper); + for (int i = 0; i < kNumIterations; ++i) { + saturation_protector_extra->Analyze(kMaxSpeechProbability, kPeakLevelDbfs, + kSpeechLevelDbfs); + } + + EXPECT_EQ(saturation_protector_no_extra->HeadroomDb() + kExtraHeadroomDb, + saturation_protector_extra->HeadroomDb()); +} + +// Checks that the headroom does not change too quickly. 
+TEST(GainController2SaturationProtector, ChangeSlowly) { constexpr int kNumIterations = 1000; - constexpr float kPeakLevel = -20.f; - constexpr float kCrestFactor = kInitialMarginDb - 5.f; - constexpr float kOtherCrestFactor = kInitialMarginDb; - constexpr float kSpeechLevel = kPeakLevel - kCrestFactor; - constexpr float kOtherSpeechLevel = kPeakLevel - kOtherCrestFactor; - - auto state = CreateSaturationProtectorState(); - float max_difference = - RunOnConstantLevel(kNumIterations, kPeakLevel, kSpeechLevel, state); - max_difference = std::max( - RunOnConstantLevel(kNumIterations, kPeakLevel, kOtherSpeechLevel, state), - max_difference); + constexpr float kPeakLevelDbfs = -20.f; + constexpr float kCrestFactorDb = kInitialHeadroomDb - 5.f; + constexpr float kOtherCrestFactorDb = kInitialHeadroomDb; + constexpr float kSpeechLevelDbfs = kPeakLevelDbfs - kCrestFactorDb; + constexpr float kOtherSpeechLevelDbfs = kPeakLevelDbfs - kOtherCrestFactorDb; + ApmDataDumper apm_data_dumper(0); + auto saturation_protector = CreateSaturationProtector( + kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired, + &apm_data_dumper); + float max_difference_db = + RunOnConstantLevel(kNumIterations, kMaxSpeechProbability, kPeakLevelDbfs, + kSpeechLevelDbfs, *saturation_protector); + max_difference_db = std::max( + RunOnConstantLevel(kNumIterations, kMaxSpeechProbability, kPeakLevelDbfs, + kOtherSpeechLevelDbfs, *saturation_protector), + max_difference_db); constexpr float kMaxChangeSpeedDbPerSecond = 0.5f; // 1 db / 2 seconds. - EXPECT_LE(max_difference, + EXPECT_LE(max_difference_db, kMaxChangeSpeedDbPerSecond / 1000 * kFrameDurationMs); } -// Checks that there is a delay between input change and margin adaptations. 
-TEST(AutomaticGainController2SaturationProtector, AdaptToDelayedChanges) { - constexpr int kDelayIterations = kFullBufferSizeMs / kFrameDurationMs; - constexpr float kInitialSpeechLevelDbfs = -30.f; - constexpr float kLaterSpeechLevelDbfs = -15.f; +class SaturationProtectorParametrization + : public ::testing::TestWithParam { + protected: + int adjacent_speech_frames_threshold() const { return GetParam(); } +}; - auto state = CreateSaturationProtectorState(); - // First run on initial level. - float max_difference = RunOnConstantLevel( - kDelayIterations, kInitialSpeechLevelDbfs + kInitialMarginDb, - kInitialSpeechLevelDbfs, state); - // Then peak changes, but not RMS. - max_difference = - std::max(RunOnConstantLevel(kDelayIterations, - kLaterSpeechLevelDbfs + kInitialMarginDb, - kInitialSpeechLevelDbfs, state), - max_difference); - // Then both change. - max_difference = - std::max(RunOnConstantLevel(kDelayIterations, - kLaterSpeechLevelDbfs + kInitialMarginDb, - kLaterSpeechLevelDbfs, state), - max_difference); - - // The saturation protector expects that the RMS changes roughly - // 'kFullBufferSizeMs' after peaks change. This is to account for delay - // introduced by the level estimator. Therefore, the input above is 'normal' - // and 'expected', and shouldn't influence the margin by much. 
- const float total_difference = std::abs(state.margin_db - kInitialMarginDb); - - EXPECT_LE(total_difference, 0.05f); - EXPECT_LE(max_difference, 0.01f); +TEST_P(SaturationProtectorParametrization, DoNotAdaptToShortSpeechSegments) { + ApmDataDumper apm_data_dumper(0); + auto saturation_protector = CreateSaturationProtector( + kInitialHeadroomDb, kNoExtraHeadroomDb, + adjacent_speech_frames_threshold(), &apm_data_dumper); + const float initial_headroom_db = saturation_protector->HeadroomDb(); + RunOnConstantLevel(/*num_iterations=*/adjacent_speech_frames_threshold() - 1, + kMaxSpeechProbability, + /*peak_dbfs=*/0.0f, + /*speech_level_dbfs=*/-10.0f, *saturation_protector); + // No adaptation expected. + EXPECT_EQ(initial_headroom_db, saturation_protector->HeadroomDb()); } +TEST_P(SaturationProtectorParametrization, AdaptToEnoughSpeechSegments) { + ApmDataDumper apm_data_dumper(0); + auto saturation_protector = CreateSaturationProtector( + kInitialHeadroomDb, kNoExtraHeadroomDb, + adjacent_speech_frames_threshold(), &apm_data_dumper); + const float initial_headroom_db = saturation_protector->HeadroomDb(); + RunOnConstantLevel(/*num_iterations=*/adjacent_speech_frames_threshold() + 1, + kMaxSpeechProbability, + /*peak_dbfs=*/0.0f, + /*speech_level_dbfs=*/-10.0f, *saturation_protector); + // Adaptation expected. 
+ EXPECT_NE(initial_headroom_db, saturation_protector->HeadroomDb()); +} + +INSTANTIATE_TEST_SUITE_P(GainController2, + SaturationProtectorParametrization, + ::testing::Values(2, 9, 17)); + +} // namespace } // namespace webrtc diff --git a/modules/audio_processing/agc2/vad_with_level.cc b/modules/audio_processing/agc2/vad_with_level.cc index 597c09c902..034f2b6ac0 100644 --- a/modules/audio_processing/agc2/vad_with_level.cc +++ b/modules/audio_processing/agc2/vad_with_level.cc @@ -65,43 +65,23 @@ class Vad : public VoiceActivityDetector { rnn_vad::RnnVad rnn_vad_; }; -// Returns an updated version of `p_old` by using instant decay and the given -// `attack` on a new VAD probability value `p_new`. -float SmoothedVadProbability(float p_old, float p_new, float attack) { - RTC_DCHECK_GT(attack, 0.0f); - RTC_DCHECK_LE(attack, 1.0f); - if (p_new < p_old || attack == 1.0f) { - // Instant decay (or no smoothing). - return p_new; - } else { - // Attack phase. - return attack * p_new + (1.0f - attack) * p_old; - } -} - } // namespace VadLevelAnalyzer::VadLevelAnalyzer() - : VadLevelAnalyzer(kDefaultVadRnnResetPeriodMs, - kDefaultSmoothedVadProbabilityAttack, - GetAvailableCpuFeatures()) {} + : VadLevelAnalyzer(kDefaultVadRnnResetPeriodMs, GetAvailableCpuFeatures()) { +} VadLevelAnalyzer::VadLevelAnalyzer(int vad_reset_period_ms, - float vad_probability_attack, const AvailableCpuFeatures& cpu_features) : VadLevelAnalyzer(vad_reset_period_ms, - vad_probability_attack, std::make_unique(cpu_features)) {} VadLevelAnalyzer::VadLevelAnalyzer(int vad_reset_period_ms, - float vad_probability_attack, std::unique_ptr vad) : vad_(std::move(vad)), vad_reset_period_frames_( rtc::CheckedDivExact(vad_reset_period_ms, kFrameDurationMs)), - vad_probability_attack_(vad_probability_attack), - time_to_vad_reset_(vad_reset_period_frames_), - vad_probability_(0.0f) { + time_to_vad_reset_(vad_reset_period_frames_) { RTC_DCHECK(vad_); RTC_DCHECK_GT(vad_reset_period_frames_, 1); } @@ -123,11 +103,7 
@@ VadLevelAnalyzer::Result VadLevelAnalyzer::AnalyzeFrame( peak = std::max(std::fabs(x), peak); rms += x * x; } - // Compute smoothed speech probability. - vad_probability_ = SmoothedVadProbability( - /*p_old=*/vad_probability_, /*p_new=*/vad_->ComputeProbability(frame), - vad_probability_attack_); - return {vad_probability_, + return {vad_->ComputeProbability(frame), FloatS16ToDbfs(std::sqrt(rms / frame.samples_per_channel())), FloatS16ToDbfs(peak)}; } diff --git a/modules/audio_processing/agc2/vad_with_level.h b/modules/audio_processing/agc2/vad_with_level.h index 386f162de6..7cd93d6f2b 100644 --- a/modules/audio_processing/agc2/vad_with_level.h +++ b/modules/audio_processing/agc2/vad_with_level.h @@ -37,18 +37,15 @@ class VadLevelAnalyzer { virtual float ComputeProbability(AudioFrameView frame) = 0; }; - // Ctor. Uses the default VAD. + // Ctor. Uses the default VAD with the default settings. VadLevelAnalyzer(); // Ctor. `vad_reset_period_ms` indicates the period in milliseconds to call // `VadLevelAnalyzer::Reset()`; it must be equal to or greater than the - // duration of two frames. `vad_probability_attack` is a number in (0,1] used - // to smooth the speech probability (instant decay, slow attack). + // duration of two frames. Uses `cpu_features` to instantiate the default VAD. VadLevelAnalyzer(int vad_reset_period_ms, - float vad_probability_attack, const AvailableCpuFeatures& cpu_features); // Ctor. Uses a custom `vad`. 
VadLevelAnalyzer(int vad_reset_period_ms, - float vad_probability_attack, std::unique_ptr vad); VadLevelAnalyzer(const VadLevelAnalyzer&) = delete; @@ -61,9 +58,7 @@ class VadLevelAnalyzer { private: std::unique_ptr vad_; const int vad_reset_period_frames_; - const float vad_probability_attack_; int time_to_vad_reset_; - float vad_probability_; }; } // namespace webrtc diff --git a/modules/audio_processing/agc2/vad_with_level_unittest.cc b/modules/audio_processing/agc2/vad_with_level_unittest.cc index fd8265e9b4..99b0136376 100644 --- a/modules/audio_processing/agc2/vad_with_level_unittest.cc +++ b/modules/audio_processing/agc2/vad_with_level_unittest.cc @@ -29,9 +29,6 @@ using ::testing::ReturnRoundRobin; constexpr int kNoVadPeriodicReset = kFrameDurationMs * (std::numeric_limits::max() / kFrameDurationMs); -constexpr float kInstantAttack = 1.0f; -constexpr float kSlowAttack = 0.1f; - constexpr int kSampleRateHz = 8000; class MockVad : public VadLevelAnalyzer::VoiceActivityDetector { @@ -48,7 +45,6 @@ class MockVad : public VadLevelAnalyzer::VoiceActivityDetector { // restart from the beginning. std::unique_ptr CreateVadLevelAnalyzerWithMockVad( int vad_reset_period_ms, - float vad_probability_attack, const std::vector& speech_probabilities, int expected_vad_reset_calls = 0) { auto vad = std::make_unique(); @@ -58,8 +54,8 @@ std::unique_ptr CreateVadLevelAnalyzerWithMockVad( if (expected_vad_reset_calls >= 0) { EXPECT_CALL(*vad, Reset).Times(expected_vad_reset_calls); } - return std::make_unique( - vad_reset_period_ms, vad_probability_attack, std::move(vad)); + return std::make_unique(vad_reset_period_ms, + std::move(vad)); } // 10 ms mono frame. @@ -75,7 +71,7 @@ struct FrameWithView { const AudioFrameView view; }; -TEST(AutomaticGainController2VadLevelAnalyzer, PeakLevelGreaterThanRmsLevel) { +TEST(GainController2VadLevelAnalyzer, PeakLevelGreaterThanRmsLevel) { // Handcrafted frame so that the average is lower than the peak value. 
FrameWithView frame(1000.0f); // Constant frame. frame.samples[10] = 2000.0f; // Except for one peak value. @@ -88,14 +84,13 @@ TEST(AutomaticGainController2VadLevelAnalyzer, PeakLevelGreaterThanRmsLevel) { EXPECT_LT(levels_and_vad_prob.rms_dbfs, levels_and_vad_prob.peak_dbfs); } -// Checks that the unprocessed and the smoothed speech probabilities match when -// instant attack is used. -TEST(AutomaticGainController2VadLevelAnalyzer, NoSpeechProbabilitySmoothing) { +// Checks that the expected VAD probabilities are returned. +TEST(GainController2VadLevelAnalyzer, NoSpeechProbabilitySmoothing) { const std::vector speech_probabilities{0.709f, 0.484f, 0.882f, 0.167f, 0.44f, 0.525f, 0.858f, 0.314f, 0.653f, 0.965f, 0.413f, 0.0f}; - auto analyzer = CreateVadLevelAnalyzerWithMockVad( - kNoVadPeriodicReset, kInstantAttack, speech_probabilities); + auto analyzer = CreateVadLevelAnalyzerWithMockVad(kNoVadPeriodicReset, + speech_probabilities); FrameWithView frame; for (int i = 0; rtc::SafeLt(i, speech_probabilities.size()); ++i) { SCOPED_TRACE(i); @@ -104,45 +99,11 @@ TEST(AutomaticGainController2VadLevelAnalyzer, NoSpeechProbabilitySmoothing) { } } -// Checks that the smoothed speech probability does not instantly converge to -// the unprocessed one when slow attack is used. -TEST(AutomaticGainController2VadLevelAnalyzer, - SlowAttackSpeechProbabilitySmoothing) { - const std::vector speech_probabilities{0.0f, 0.0f, 1.0f, - 1.0f, 1.0f, 1.0f}; - auto analyzer = CreateVadLevelAnalyzerWithMockVad( - kNoVadPeriodicReset, kSlowAttack, speech_probabilities); - FrameWithView frame; - float prev_probability = 0.0f; - for (int i = 0; rtc::SafeLt(i, speech_probabilities.size()); ++i) { - SCOPED_TRACE(i); - const float smoothed_probability = - analyzer->AnalyzeFrame(frame.view).speech_probability; - EXPECT_LT(smoothed_probability, 1.0f); // Not enough time to reach 1. - EXPECT_LE(prev_probability, smoothed_probability); // Converge towards 1.
- prev_probability = smoothed_probability; - } -} - -// Checks that the smoothed speech probability instantly decays to the -// unprocessed one when slow attack is used. -TEST(AutomaticGainController2VadLevelAnalyzer, SpeechProbabilityInstantDecay) { - const std::vector speech_probabilities{1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 0.0f}; - auto analyzer = CreateVadLevelAnalyzerWithMockVad( - kNoVadPeriodicReset, kSlowAttack, speech_probabilities); - FrameWithView frame; - for (int i = 0; rtc::SafeLt(i, speech_probabilities.size() - 1); ++i) { - analyzer->AnalyzeFrame(frame.view); - } - EXPECT_EQ(0.0f, analyzer->AnalyzeFrame(frame.view).speech_probability); -} - // Checks that the VAD is not periodically reset. -TEST(AutomaticGainController2VadLevelAnalyzer, VadNoPeriodicReset) { +TEST(GainController2VadLevelAnalyzer, VadNoPeriodicReset) { constexpr int kNumFrames = 19; auto analyzer = CreateVadLevelAnalyzerWithMockVad( - kNoVadPeriodicReset, kSlowAttack, /*speech_probabilities=*/{1.0f}, + kNoVadPeriodicReset, /*speech_probabilities=*/{1.0f}, /*expected_vad_reset_calls=*/0); FrameWithView frame; for (int i = 0; i < kNumFrames; ++i) { @@ -161,7 +122,7 @@ class VadPeriodResetParametrization TEST_P(VadPeriodResetParametrization, VadPeriodicReset) { auto analyzer = CreateVadLevelAnalyzerWithMockVad( /*vad_reset_period_ms=*/vad_reset_period_frames() * kFrameDurationMs, - kSlowAttack, /*speech_probabilities=*/{1.0f}, + /*speech_probabilities=*/{1.0f}, /*expected_vad_reset_calls=*/num_frames() / vad_reset_period_frames()); FrameWithView frame; for (int i = 0; i < num_frames(); ++i) { @@ -169,7 +130,7 @@ TEST_P(VadPeriodResetParametrization, VadPeriodicReset) { } } -INSTANTIATE_TEST_SUITE_P(AutomaticGainController2VadLevelAnalyzer, +INSTANTIATE_TEST_SUITE_P(GainController2VadLevelAnalyzer, VadPeriodResetParametrization, ::testing::Combine(::testing::Values(1, 19, 123), ::testing::Values(2, 5, 20, 53))); diff --git a/modules/audio_processing/gain_controller2.cc 
b/modules/audio_processing/gain_controller2.cc index 6c5e24e165..9e3e8e7cae 100644 --- a/modules/audio_processing/gain_controller2.cc +++ b/modules/audio_processing/gain_controller2.cc @@ -73,7 +73,7 @@ void GainController2::Process(AudioBuffer* audio) { void GainController2::NotifyAnalogLevel(int level) { if (analog_level_ != level && adaptive_agc_) { - adaptive_agc_->Reset(); + adaptive_agc_->HandleInputGainChange(); } analog_level_ = level; } diff --git a/modules/audio_processing/gain_controller2_unittest.cc b/modules/audio_processing/gain_controller2_unittest.cc index 274c821081..815d58efe7 100644 --- a/modules/audio_processing/gain_controller2_unittest.cc +++ b/modules/audio_processing/gain_controller2_unittest.cc @@ -11,6 +11,7 @@ #include "modules/audio_processing/gain_controller2.h" #include +#include #include #include "api/array_view.h" @@ -68,7 +69,8 @@ std::unique_ptr CreateAgc2FixedDigitalMode( return agc2; } -float GainAfterProcessingFile(GainController2* gain_controller) { +float GainDbAfterProcessingFile(GainController2& gain_controller, + int max_duration_ms) { // Set up an AudioBuffer to be filled from the speech file. constexpr size_t kStereo = 2u; const StreamConfig capture_config(AudioProcessing::kSampleRate48kHz, kStereo, @@ -82,24 +84,29 @@ float GainAfterProcessingFile(GainController2* gain_controller) { std::vector capture_input(capture_config.num_frames() * capture_config.num_channels()); - // The file should contain at least this many frames. Every iteration, we put - // a frame through the gain controller. - const int kNumFramesToProcess = 100; - for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) { + // Process the input file which must be long enough to cover + // `max_duration_ms`. 
+ RTC_DCHECK_GT(max_duration_ms, 0); + const int num_frames = rtc::CheckedDivExact(max_duration_ms, 10); + for (int i = 0; i < num_frames; ++i) { ReadFloatSamplesFromStereoFile(capture_config.num_frames(), capture_config.num_channels(), &capture_file, capture_input); - test::CopyVectorToAudioBuffer(capture_config, capture_input, &ab); - gain_controller->Process(&ab); + gain_controller.Process(&ab); } - // Send in a last frame with values constant 1 (It's low enough to detect high - // gain, and for ease of computation). The applied gain is the result. + // Send in a last frame with minimum dBFS level. constexpr float sample_value = 1.f; SetAudioBufferSamples(sample_value, &ab); - gain_controller->Process(&ab); - return ab.channels()[0][0]; + gain_controller.Process(&ab); + // Measure the RMS level after processing. + float rms = 0.0f; + for (size_t i = 0; i < capture_config.num_frames(); ++i) { + rms += ab.channels()[0][i] * ab.channels()[0][i]; + } + // Return the applied gain in dB. + return 20.0f * std::log10(std::sqrt(rms / capture_config.num_frames())); } } // namespace @@ -324,34 +331,20 @@ INSTANTIATE_TEST_SUITE_P( 48000, true))); -TEST(GainController2, UsageSaturationMargin) { +// Checks that the gain applied at the end of a PCM samples file is close to the +// expected value. +TEST(GainController2, CheckGainAdaptiveDigital) { + constexpr float kExpectedGainDb = 4.3f; + constexpr float kToleranceDb = 0.5f; GainController2 gain_controller2; gain_controller2.Initialize(AudioProcessing::kSampleRate48kHz); - AudioProcessing::Config::GainController2 config; - // Check that samples are not amplified as much when extra margin is - // high. They should not be amplified at all, but only after convergence. GC2 - // starts with a gain, and it takes time until it's down to 0 dB. 
config.fixed_digital.gain_db = 0.f; config.adaptive_digital.enabled = true; - config.adaptive_digital.extra_saturation_margin_db = 50.f; gain_controller2.ApplyConfig(config); - - EXPECT_LT(GainAfterProcessingFile(&gain_controller2), 2.f); -} - -TEST(GainController2, UsageNoSaturationMargin) { - GainController2 gain_controller2; - gain_controller2.Initialize(AudioProcessing::kSampleRate48kHz); - - AudioProcessing::Config::GainController2 config; - // Check that some gain is applied if there is no margin. - config.fixed_digital.gain_db = 0.f; - config.adaptive_digital.enabled = true; - config.adaptive_digital.extra_saturation_margin_db = 0.f; - gain_controller2.ApplyConfig(config); - - EXPECT_GT(GainAfterProcessingFile(&gain_controller2), 1.9f); + EXPECT_NEAR( + GainDbAfterProcessingFile(gain_controller2, /*max_duration_ms=*/2000), + kExpectedGainDb, kToleranceDb); } } // namespace test diff --git a/modules/audio_processing/include/audio_processing.cc b/modules/audio_processing/include/audio_processing.cc index 790b1a71dc..fa45230c6b 100644 --- a/modules/audio_processing/include/audio_processing.cc +++ b/modules/audio_processing/include/audio_processing.cc @@ -46,17 +46,6 @@ std::string GainController1ModeToString(const Agc1Config::Mode& mode) { RTC_CHECK_NOTREACHED(); } -std::string GainController2LevelEstimatorToString( - const Agc2Config::LevelEstimator& level) { - switch (level) { - case Agc2Config::LevelEstimator::kRms: - return "Rms"; - case Agc2Config::LevelEstimator::kPeak: - return "Peak"; - } - RTC_CHECK_NOTREACHED(); -} - std::string GainController2NoiseEstimatorToString( const Agc2Config::NoiseEstimator& type) { switch (type) { @@ -174,20 +163,10 @@ std::string AudioProcessing::Config::ToString() const { << gain_controller2.adaptive_digital.enabled << ", noise_estimator: " << GainController2NoiseEstimatorToString( gain_controller2.adaptive_digital.noise_estimator) - << ", level_estimator: { vad_probability_attack: " - << 
gain_controller2.adaptive_digital.vad_probability_attack << ", type: " - << GainController2LevelEstimatorToString( - gain_controller2.adaptive_digital.level_estimator) + << ", vad_reset_period_ms: " + << gain_controller2.adaptive_digital.vad_reset_period_ms << ", adjacent_speech_frames_threshold: " - << gain_controller2.adaptive_digital - .level_estimator_adjacent_speech_frames_threshold - << ", initial_saturation_margin_db: " - << gain_controller2.adaptive_digital.initial_saturation_margin_db - << ", extra_saturation_margin_db: " - << gain_controller2.adaptive_digital.extra_saturation_margin_db - << " }, gain_applier: { adjacent_speech_frames_threshold: " - << gain_controller2.adaptive_digital - .gain_applier_adjacent_speech_frames_threshold + << gain_controller2.adaptive_digital.adjacent_speech_frames_threshold << ", max_gain_change_db_per_second: " << gain_controller2.adaptive_digital.max_gain_change_db_per_second << ", max_output_noise_level_dbfs: " @@ -195,7 +174,7 @@ std::string AudioProcessing::Config::ToString() const { << ", sse2_allowed: " << gain_controller2.adaptive_digital.sse2_allowed << ", avx2_allowed: " << gain_controller2.adaptive_digital.avx2_allowed << ", neon_allowed: " << gain_controller2.adaptive_digital.neon_allowed - << " }}}, residual_echo_detector: { enabled: " + << "}}, residual_echo_detector: { enabled: " << residual_echo_detector.enabled << " }, level_estimation: { enabled: " << level_estimation.enabled << " }}"; diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h index 781b17e44d..01bb7c33c7 100644 --- a/modules/audio_processing/include/audio_processing.h +++ b/modules/audio_processing/include/audio_processing.h @@ -349,6 +349,7 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface { return !(*this == rhs); } + // TODO(crbug.com/webrtc/7494): Remove `LevelEstimator`. 
enum LevelEstimator { kRms, kPeak }; enum NoiseEstimator { kStationaryNoise, kNoiseFloor }; bool enabled = false; @@ -359,19 +360,20 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface { bool enabled = false; NoiseEstimator noise_estimator = kNoiseFloor; int vad_reset_period_ms = 1500; - float vad_probability_attack = 0.9f; - LevelEstimator level_estimator = kRms; - int level_estimator_adjacent_speech_frames_threshold = 11; - // TODO(crbug.com/webrtc/7494): Remove `use_saturation_protector`. - bool use_saturation_protector = true; - float initial_saturation_margin_db = 20.0f; - float extra_saturation_margin_db = 5.0f; - int gain_applier_adjacent_speech_frames_threshold = 11; + int adjacent_speech_frames_threshold = 12; float max_gain_change_db_per_second = 3.0f; - float max_output_noise_level_dbfs = -55.0f; + float max_output_noise_level_dbfs = -50.0f; bool sse2_allowed = true; bool avx2_allowed = true; bool neon_allowed = true; + // TODO(crbug.com/webrtc/7494): Remove deprecated settings below. + float vad_probability_attack = 1.0f; + LevelEstimator level_estimator = kRms; + int level_estimator_adjacent_speech_frames_threshold = 12; + bool use_saturation_protector = true; + float initial_saturation_margin_db = 25.0f; + float extra_saturation_margin_db = 5.0f; + int gain_applier_adjacent_speech_frames_threshold = 12; } adaptive_digital; } gain_controller2;