From 87b86acde990a0288b2c75be4f03d8bd5e1be74b Mon Sep 17 00:00:00 2001 From: Alessio Bazzica Date: Wed, 30 Sep 2020 22:50:18 +0200 Subject: [PATCH] AGC2: gain increase allowed once enough adjacent speech frames observed Make the digital adaptive gain applier more robust to VAD false positives. Achieved by allowing a gain increase only if enough adjacent speech frames are observed. Tested: - Bit-exactness verified with audioproc_f - If `kDefaultDigitalGainApplierAdjacentSpeechFramesThreshold` == 2 then not bit-exact Bug: webrtc:7494 Change-Id: I3bab5a449aaf0ef1a64b671b413ba2ddb4688cd2 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/186042 Commit-Queue: Alessio Bazzica Reviewed-by: Ivo Creusen Cr-Commit-Position: refs/heads/master@{#32263} --- .../agc2/adaptive_digital_gain_applier.cc | 27 +++++-- .../agc2/adaptive_digital_gain_applier.h | 8 +- .../adaptive_digital_gain_applier_unittest.cc | 80 ++++++++++++++++--- .../agc2/adaptive_mode_level_estimator.cc | 13 +-- modules/audio_processing/agc2/agc2_common.h | 3 +- 5 files changed, 105 insertions(+), 26 deletions(-) diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc b/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc index 14ca9853a8..50d7e05b87 100644 --- a/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc +++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc @@ -87,13 +87,23 @@ float ComputeGainChangeThisFrameDb(float target_gain_db, AdaptiveDigitalGainApplier::AdaptiveDigitalGainApplier( ApmDataDumper* apm_data_dumper) + : AdaptiveDigitalGainApplier( + apm_data_dumper, + kDefaultDigitalGainApplierAdjacentSpeechFramesThreshold) {} + +AdaptiveDigitalGainApplier::AdaptiveDigitalGainApplier( + ApmDataDumper* apm_data_dumper, + int adjacent_speech_frames_threshold) : apm_data_dumper_(apm_data_dumper), gain_applier_( /*hard_clip_samples=*/false, /*initial_gain_factor=*/DbToRatio(kInitialAdaptiveDigitalGainDb)), + 
adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold), calls_since_last_gain_log_(0), - gain_increase_allowed_(true), - last_gain_db_(kInitialAdaptiveDigitalGainDb) {} + frames_to_gain_increase_allowed_(adjacent_speech_frames_threshold_), + last_gain_db_(kInitialAdaptiveDigitalGainDb) { + RTC_DCHECK_GE(frames_to_gain_increase_allowed_, 1); +} void AdaptiveDigitalGainApplier::Process(const FrameInfo& info, AudioFrameView<float> frame) { @@ -116,12 +126,17 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info, info.input_noise_level_dbfs, apm_data_dumper_), last_gain_db_, info.limiter_envelope_dbfs, info.estimate_is_confident); - // Forbid increasing the gain when there is no speech. - gain_increase_allowed_ = - info.vad_result.speech_probability > kVadConfidenceThreshold; + // Forbid increasing the gain until enough adjacent speech frames are + // observed. + if (info.vad_result.speech_probability < kVadConfidenceThreshold) { + frames_to_gain_increase_allowed_ = adjacent_speech_frames_threshold_; + } else if (frames_to_gain_increase_allowed_ > 0) { + frames_to_gain_increase_allowed_--; + } const float gain_change_this_frame_db = ComputeGainChangeThisFrameDb( - target_gain_db, last_gain_db_, gain_increase_allowed_); + target_gain_db, last_gain_db_, + /*gain_increase_allowed=*/frames_to_gain_increase_allowed_ == 0); apm_data_dumper_->DumpRaw("agc2_want_to_change_by_db", target_gain_db - last_gain_db_); diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier.h b/modules/audio_processing/agc2/adaptive_digital_gain_applier.h index 4dc4863b56..ad3f39ca56 100644 --- a/modules/audio_processing/agc2/adaptive_digital_gain_applier.h +++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier.h @@ -35,6 +35,10 @@ class AdaptiveDigitalGainApplier { }; explicit AdaptiveDigitalGainApplier(ApmDataDumper* apm_data_dumper); + // Ctor. 
`adjacent_speech_frames_threshold` indicates how many speech frames + // are required before a gain increase is allowed. + AdaptiveDigitalGainApplier(ApmDataDumper* apm_data_dumper, + int adjacent_speech_frames_threshold); AdaptiveDigitalGainApplier(const AdaptiveDigitalGainApplier&) = delete; AdaptiveDigitalGainApplier& operator=(const AdaptiveDigitalGainApplier&) = delete; @@ -46,8 +50,10 @@ class AdaptiveDigitalGainApplier { ApmDataDumper* const apm_data_dumper_; GainApplier gain_applier_; + const int adjacent_speech_frames_threshold_; + int calls_since_last_gain_log_; - bool gain_increase_allowed_; + int frames_to_gain_increase_allowed_; float last_gain_db_; }; diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc b/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc index a607e7527f..c8fb6ca0b0 100644 --- a/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc +++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc @@ -21,6 +21,10 @@ namespace webrtc { namespace { +constexpr int kMono = 1; +constexpr int kStereo = 2; +constexpr int kFrameLen10ms48kHz = 480; + // Constants used in place of estimated noise levels. constexpr float kNoNoiseDbfs = -90.f; constexpr float kWithNoiseDbfs = -20.f; @@ -36,7 +40,7 @@ float RunOnConstantLevel(int num_iterations, float gain_linear = 0.f; for (int i = 0; i < num_iterations; ++i) { - VectorFloatFrame fake_audio(1, 1, 1.f); + VectorFloatFrame fake_audio(kMono, 1, 1.f); AdaptiveDigitalGainApplier::FrameInfo info; info.input_level_dbfs = input_level_dbfs; info.input_noise_level_dbfs = kNoNoiseDbfs; @@ -62,7 +66,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainApplierShouldNotCrash) { AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper); // Make one call with reasonable audio level values and settings. 
- VectorFloatFrame fake_audio(2, 480, 10000.f); + VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; info.input_level_dbfs = -5.0; gain_applier.Process(kFrameInfo, fake_audio.float_frame_view()); @@ -114,7 +118,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) { float last_gain_linear = 1.f; for (int i = 0; i < kNumFramesToAdapt; ++i) { SCOPED_TRACE(i); - VectorFloatFrame fake_audio(1, 1, 1.f); + VectorFloatFrame fake_audio(kMono, 1, 1.f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; info.input_level_dbfs = initial_level_dbfs; gain_applier.Process(info, fake_audio.float_frame_view()); @@ -127,7 +131,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) { // Check that the same is true when gain decreases as well. for (int i = 0; i < kNumFramesToAdapt; ++i) { SCOPED_TRACE(i); - VectorFloatFrame fake_audio(1, 1, 1.f); + VectorFloatFrame fake_audio(kMono, 1, 1.f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; info.input_level_dbfs = 0.f; gain_applier.Process(info, fake_audio.float_frame_view()); @@ -143,9 +147,8 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) { AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper); constexpr float initial_level_dbfs = -25.f; - constexpr int num_samples = 480; - VectorFloatFrame fake_audio(1, num_samples, 1.f); + VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; info.input_level_dbfs = initial_level_dbfs; gain_applier.Process(info, fake_audio.float_frame_view()); @@ -158,7 +161,8 @@ TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) { } const float kMaxChangePerFrameLinear = DbToRatio(kMaxGainChangePerFrameDb); - const float kMaxChangePerSample = kMaxChangePerFrameLinear / num_samples; + const float kMaxChangePerSample = + kMaxChangePerFrameLinear / kFrameLen10ms48kHz; 
EXPECT_LE(maximal_difference, kMaxChangePerSample); } @@ -168,7 +172,6 @@ TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) { AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper); constexpr float initial_level_dbfs = -25.f; - constexpr int num_samples = 480; constexpr int num_initial_frames = kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb; constexpr int num_frames = 50; @@ -176,7 +179,7 @@ TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) { ASSERT_GT(kWithNoiseDbfs, kMaxNoiseLevelDbfs) << "kWithNoiseDbfs is too low"; for (int i = 0; i < num_initial_frames + num_frames; ++i) { - VectorFloatFrame fake_audio(1, num_samples, 1.f); + VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; info.input_level_dbfs = initial_level_dbfs; info.input_noise_level_dbfs = kWithNoiseDbfs; @@ -198,7 +201,7 @@ TEST(AutomaticGainController2GainApplier, CanHandlePositiveSpeechLevels) { AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper); // Make one call with positive audio level values and settings. 
- VectorFloatFrame fake_audio(2, 480, 10000.f); + VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; info.input_level_dbfs = 5.f; gain_applier.Process(info, fake_audio.float_frame_view()); @@ -209,7 +212,6 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) { AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper); constexpr float initial_level_dbfs = -25.f; - constexpr int num_samples = 480; constexpr int num_initial_frames = kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb; constexpr int num_frames = 50; @@ -217,7 +219,7 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) { ASSERT_GT(kWithNoiseDbfs, kMaxNoiseLevelDbfs) << "kWithNoiseDbfs is too low"; for (int i = 0; i < num_initial_frames + num_frames; ++i) { - VectorFloatFrame fake_audio(1, num_samples, 1.f); + VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f); AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; info.input_level_dbfs = initial_level_dbfs; info.limiter_envelope_dbfs = 1.f; @@ -235,5 +237,59 @@ TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) { } } +class AdaptiveDigitalGainApplierTest : public ::testing::TestWithParam<int> { + protected: + int AdjacentSpeechFramesThreshold() const { return GetParam(); } +}; + +TEST_P(AdaptiveDigitalGainApplierTest, + DoNotIncreaseGainWithTooFewSpeechFrames) { + const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold(); + ApmDataDumper apm_data_dumper(0); + AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper, + adjacent_speech_frames_threshold); + AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + info.input_level_dbfs = -25.0; + + float prev_gain = 0.f; + for (int i = 0; i < adjacent_speech_frames_threshold; ++i) { + SCOPED_TRACE(i); + VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f); + gain_applier.Process(info, audio.float_frame_view()); + const float gain = 
audio.float_frame_view().channel(0)[0]; + if (i > 0) { + EXPECT_EQ(prev_gain, gain); // No gain increase. + } + prev_gain = gain; + } +} + +TEST_P(AdaptiveDigitalGainApplierTest, IncreaseGainWithEnoughSpeechFrames) { + const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold(); + ApmDataDumper apm_data_dumper(0); + AdaptiveDigitalGainApplier gain_applier(&apm_data_dumper, + adjacent_speech_frames_threshold); + AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + info.input_level_dbfs = -25.0; + + float prev_gain = 0.f; + for (int i = 0; i < adjacent_speech_frames_threshold; ++i) { + VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f); + gain_applier.Process(info, audio.float_frame_view()); + prev_gain = audio.float_frame_view().channel(0)[0]; + } + + // Process one more speech frame. + VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f); + gain_applier.Process(info, audio.float_frame_view()); + + // The gain has increased. + EXPECT_GT(audio.float_frame_view().channel(0)[0], prev_gain); +} + +INSTANTIATE_TEST_SUITE_P(AutomaticGainController2, + AdaptiveDigitalGainApplierTest, + ::testing::Values(1, 7, 31)); + } // namespace } // namespace webrtc diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc index 246b34707a..9a8d6482bf 100644 --- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc +++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc @@ -64,7 +64,7 @@ AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator( : AdaptiveModeLevelEstimator( apm_data_dumper, AudioProcessing::Config::GainController2::LevelEstimator::kRms, - kDefaultAdjacentSpeechFramesThreshold, + kDefaultLevelEstimatorAdjacentSpeechFramesThreshold, kDefaultInitialSaturationMarginDb, kDefaultExtraSaturationMarginDb) {} @@ -73,11 +73,12 @@ AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator( AudioProcessing::Config::GainController2::LevelEstimator 
level_estimator, bool use_saturation_protector, float extra_saturation_margin_db) - : AdaptiveModeLevelEstimator(apm_data_dumper, - level_estimator, - kDefaultAdjacentSpeechFramesThreshold, - kDefaultInitialSaturationMarginDb, - extra_saturation_margin_db) { + : AdaptiveModeLevelEstimator( + apm_data_dumper, + level_estimator, + kDefaultLevelEstimatorAdjacentSpeechFramesThreshold, + kDefaultInitialSaturationMarginDb, + extra_saturation_margin_db) { if (!use_saturation_protector) { RTC_LOG(LS_WARNING) << "The saturation protector cannot be disabled."; } diff --git a/modules/audio_processing/agc2/agc2_common.h b/modules/audio_processing/agc2/agc2_common.h index cffe8b5989..30880e2a0a 100644 --- a/modules/audio_processing/agc2/agc2_common.h +++ b/modules/audio_processing/agc2/agc2_common.h @@ -51,7 +51,8 @@ constexpr float kInitialSpeechLevelEstimateDbfs = -30.f; // Robust VAD probability and speech decisions. constexpr float kDefaultSmoothedVadProbabilityAttack = 1.f; -constexpr int kDefaultAdjacentSpeechFramesThreshold = 1; +constexpr int kDefaultDigitalGainApplierAdjacentSpeechFramesThreshold = 1; +constexpr int kDefaultLevelEstimatorAdjacentSpeechFramesThreshold = 1; // Saturation Protector settings. constexpr float kDefaultInitialSaturationMarginDb = 20.f;