From a850e6c8b6776f17e2c85124206a7310060ceb2e Mon Sep 17 00:00:00 2001 From: Alessio Bazzica Date: Mon, 4 Oct 2021 13:35:55 +0200 Subject: [PATCH] AGC2 config: allow tuning of headroom, max gain and initial gain This CL does *not* change the behavior of the AGC2 adaptive digital controller - bitexactness verified with audioproc_f on a collection of AEC dumps and Wav files (42 recordings in total). Tested: compiled Chrome with this patch and made an appr.tc test call Bug: webrtc:7494 Change-Id: Ia8a9f6fbc3a3459b888a2eed87e108f0d39cfe99 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/233520 Commit-Queue: Alessio Bazzica Reviewed-by: Sam Zackrisson Cr-Commit-Position: refs/heads/main@{#35140} --- modules/audio_processing/agc2/BUILD.gn | 1 + modules/audio_processing/agc2/adaptive_agc.cc | 9 +- .../agc2/adaptive_digital_gain_applier.cc | 106 +++++---- .../agc2/adaptive_digital_gain_applier.h | 18 +- .../adaptive_digital_gain_applier_unittest.cc | 217 ++++++++++-------- .../agc2/adaptive_mode_level_estimator.cc | 27 ++- .../agc2/adaptive_mode_level_estimator.h | 7 +- .../adaptive_mode_level_estimator_unittest.cc | 135 +++++------ modules/audio_processing/agc2/agc2_common.h | 18 +- .../audio_processing_unittest.cc | 24 ++ modules/audio_processing/gain_controller2.cc | 4 +- .../gain_controller2_unittest.cc | 30 +++ .../include/audio_processing.cc | 6 + .../include/audio_processing.h | 9 +- 14 files changed, 350 insertions(+), 261 deletions(-) diff --git a/modules/audio_processing/agc2/BUILD.gn b/modules/audio_processing/agc2/BUILD.gn index 6dd8babd78..ce70c5d2e2 100644 --- a/modules/audio_processing/agc2/BUILD.gn +++ b/modules/audio_processing/agc2/BUILD.gn @@ -178,6 +178,7 @@ rtc_library("adaptive_digital_unittests") { ":common", ":gain_applier", ":test_utils", + "..:api", "..:apm_logging", "..:audio_frame_view", "../../../api:array_view", diff --git a/modules/audio_processing/agc2/adaptive_agc.cc b/modules/audio_processing/agc2/adaptive_agc.cc index 
0e2535a444..eafbcc26ae 100644 --- a/modules/audio_processing/agc2/adaptive_agc.cc +++ b/modules/audio_processing/agc2/adaptive_agc.cc @@ -43,14 +43,9 @@ AvailableCpuFeatures GetAllowedCpuFeatures( AdaptiveAgc::AdaptiveAgc(ApmDataDumper* apm_data_dumper, const AdaptiveDigitalConfig& config) - : speech_level_estimator_(apm_data_dumper, - config.adjacent_speech_frames_threshold), + : speech_level_estimator_(apm_data_dumper, config), vad_(config.vad_reset_period_ms, GetAllowedCpuFeatures(config)), - gain_controller_(apm_data_dumper, - config.adjacent_speech_frames_threshold, - config.max_gain_change_db_per_second, - config.max_output_noise_level_dbfs, - config.dry_run), + gain_controller_(apm_data_dumper, config), apm_data_dumper_(apm_data_dumper), noise_level_estimator_(CreateNoiseFloorEstimator(apm_data_dumper)), saturation_protector_( diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc b/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc index e59b110efe..526ef06e24 100644 --- a/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc +++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc @@ -23,31 +23,38 @@ namespace webrtc { namespace { +using AdaptiveDigitalConfig = + AudioProcessing::Config::GainController2::AdaptiveDigital; + constexpr int kHeadroomHistogramMin = 0; constexpr int kHeadroomHistogramMax = 50; +constexpr int kGainDbHistogramMax = 30; -// This function maps input level to desired applied gain. We want to -// boost the signal so that peaks are at -kHeadroomDbfs. We can't -// apply more than kMaxGainDb gain. -float ComputeGainDb(float input_level_dbfs) { - // If the level is very low, boost it as much as we can. - if (input_level_dbfs < -(kHeadroomDbfs + kMaxGainDb)) { - return kMaxGainDb; +// Computes the gain for `input_level_dbfs` to reach `-config.headroom_db`. +// Clamps the gain in [0, `config.max_gain_db`]. 
`config.headroom_db` is a +// safety margin to allow transient peaks to exceed the target peak level +// without clipping. +float ComputeGainDb(float input_level_dbfs, + const AdaptiveDigitalConfig& config) { + // If the level is very low, apply the maximum gain. + if (input_level_dbfs < -(config.headroom_db + config.max_gain_db)) { + return config.max_gain_db; } // We expect to end up here most of the time: the level is below // -headroom, but we can boost it to -headroom. - if (input_level_dbfs < -kHeadroomDbfs) { - return -kHeadroomDbfs - input_level_dbfs; + if (input_level_dbfs < -config.headroom_db) { + return -config.headroom_db - input_level_dbfs; } - // Otherwise, the level is too high and we can't boost. - RTC_DCHECK_GE(input_level_dbfs, -kHeadroomDbfs); - return 0.f; + // The level is too high and we can't boost. + RTC_DCHECK_GE(input_level_dbfs, -config.headroom_db); + return 0.0f; } -// Returns `target_gain` if the output noise level is below -// `max_output_noise_level_dbfs`; otherwise returns a capped gain so that the -// output noise level equals `max_output_noise_level_dbfs`. -float LimitGainByNoise(float target_gain, +// Returns `target_gain_db` if applying such a gain to `input_noise_level_dbfs` +// does not exceed `max_output_noise_level_dbfs`. Otherwise lowers and returns +// `target_gain_db` so that the output noise level equals +// `max_output_noise_level_dbfs`. 
+float LimitGainByNoise(float target_gain_db, float input_noise_level_dbfs, float max_output_noise_level_dbfs, ApmDataDumper& apm_data_dumper) { @@ -55,24 +62,25 @@ float LimitGainByNoise(float target_gain, max_output_noise_level_dbfs - input_noise_level_dbfs; apm_data_dumper.DumpRaw("agc2_adaptive_gain_applier_max_allowed_gain_db", max_allowed_gain_db); - return std::min(target_gain, std::max(max_allowed_gain_db, 0.f)); + return std::min(target_gain_db, std::max(max_allowed_gain_db, 0.0f)); } -float LimitGainByLowConfidence(float target_gain, - float last_gain, +float LimitGainByLowConfidence(float target_gain_db, + float last_gain_db, float limiter_audio_level_dbfs, bool estimate_is_confident) { if (estimate_is_confident || limiter_audio_level_dbfs <= kLimiterThresholdForAgcGainDbfs) { - return target_gain; + return target_gain_db; } - const float limiter_level_before_gain = limiter_audio_level_dbfs - last_gain; + const float limiter_level_dbfs_before_gain = + limiter_audio_level_dbfs - last_gain_db; - // Compute a new gain so that `limiter_level_before_gain` + `new_target_gain` - // is not great than `kLimiterThresholdForAgcGainDbfs`. - const float new_target_gain = std::max( - kLimiterThresholdForAgcGainDbfs - limiter_level_before_gain, 0.f); - return std::min(new_target_gain, target_gain); + // Compute a new gain so that `limiter_level_dbfs_before_gain` + + // `new_target_gain_db` is not greater than `kLimiterThresholdForAgcGainDbfs`. + const float new_target_gain_db = std::max( + kLimiterThresholdForAgcGainDbfs - limiter_level_dbfs_before_gain, 0.0f); + return std::min(new_target_gain_db, target_gain_db); } // Computes how the gain should change during this frame.
@@ -86,7 +94,7 @@ float ComputeGainChangeThisFrameDb(float target_gain_db, RTC_DCHECK_GT(max_gain_increase_db, 0); float target_gain_difference_db = target_gain_db - last_gain_db; if (!gain_increase_allowed) { - target_gain_difference_db = std::min(target_gain_difference_db, 0.f); + target_gain_difference_db = std::min(target_gain_difference_db, 0.0f); } return rtc::SafeClamp(target_gain_difference_db, -max_gain_decrease_db, max_gain_increase_db); @@ -110,32 +118,28 @@ void CopyAudio(AudioFrameView src, AdaptiveDigitalGainApplier::AdaptiveDigitalGainApplier( ApmDataDumper* apm_data_dumper, - int adjacent_speech_frames_threshold, - float max_gain_change_db_per_second, - float max_output_noise_level_dbfs, - bool dry_run) + const AudioProcessing::Config::GainController2::AdaptiveDigital& config) : apm_data_dumper_(apm_data_dumper), gain_applier_( /*hard_clip_samples=*/false, - /*initial_gain_factor=*/DbToRatio(kInitialAdaptiveDigitalGainDb)), - adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold), - max_gain_change_db_per_10ms_(max_gain_change_db_per_second * - kFrameDurationMs / 1000.f), - max_output_noise_level_dbfs_(max_output_noise_level_dbfs), - dry_run_(dry_run), + /*initial_gain_factor=*/DbToRatio(config.initial_gain_db)), + config_(config), + max_gain_change_db_per_10ms_(config_.max_gain_change_db_per_second * + kFrameDurationMs / 1000.0f), calls_since_last_gain_log_(0), - frames_to_gain_increase_allowed_(adjacent_speech_frames_threshold_), - last_gain_db_(kInitialAdaptiveDigitalGainDb) { - RTC_DCHECK_GT(max_gain_change_db_per_second, 0.0f); + frames_to_gain_increase_allowed_( + config_.adjacent_speech_frames_threshold), + last_gain_db_(config_.initial_gain_db) { + RTC_DCHECK_GT(max_gain_change_db_per_10ms_, 0.0f); RTC_DCHECK_GE(frames_to_gain_increase_allowed_, 1); - RTC_DCHECK_GE(max_output_noise_level_dbfs_, -90.0f); - RTC_DCHECK_LE(max_output_noise_level_dbfs_, 0.0f); + RTC_DCHECK_GE(config_.max_output_noise_level_dbfs, -90.0f); + 
RTC_DCHECK_LE(config_.max_output_noise_level_dbfs, 0.0f); Initialize(/*sample_rate_hz=*/48000, /*num_channels=*/1); } void AdaptiveDigitalGainApplier::Initialize(int sample_rate_hz, int num_channels) { - if (!dry_run_) { + if (!config_.dry_run) { return; } RTC_DCHECK_GT(sample_rate_hz, 0); @@ -159,7 +163,7 @@ void AdaptiveDigitalGainApplier::Initialize(int sample_rate_hz, void AdaptiveDigitalGainApplier::Process(const FrameInfo& info, AudioFrameView frame) { - RTC_DCHECK_GE(info.speech_level_dbfs, -150.f); + RTC_DCHECK_GE(info.speech_level_dbfs, -150.0f); RTC_DCHECK_GE(frame.num_channels(), 1); RTC_DCHECK( frame.samples_per_channel() == 80 || frame.samples_per_channel() == 160 || @@ -172,15 +176,16 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info, const float input_level_dbfs = info.speech_level_dbfs + info.headroom_db; const float target_gain_db = LimitGainByLowConfidence( - LimitGainByNoise(ComputeGainDb(input_level_dbfs), info.noise_rms_dbfs, - max_output_noise_level_dbfs_, *apm_data_dumper_), + LimitGainByNoise(ComputeGainDb(input_level_dbfs, config_), + info.noise_rms_dbfs, config_.max_output_noise_level_dbfs, + *apm_data_dumper_), last_gain_db_, info.limiter_envelope_dbfs, info.speech_level_reliable); // Forbid increasing the gain until enough adjacent speech frames are // observed. bool first_confident_speech_frame = false; if (info.speech_probability < kVadConfidenceThreshold) { - frames_to_gain_increase_allowed_ = adjacent_speech_frames_threshold_; + frames_to_gain_increase_allowed_ = config_.adjacent_speech_frames_threshold; } else if (frames_to_gain_increase_allowed_ > 0) { frames_to_gain_increase_allowed_--; first_confident_speech_frame = frames_to_gain_increase_allowed_ == 0; @@ -196,7 +201,7 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info, // No gain increase happened while waiting for a long enough speech // sequence. Therefore, temporarily allow a faster gain increase. 
RTC_DCHECK(gain_increase_allowed); - max_gain_increase_db *= adjacent_speech_frames_threshold_; + max_gain_increase_db *= config_.adjacent_speech_frames_threshold; } const float gain_change_this_frame_db = ComputeGainChangeThisFrameDb( @@ -217,7 +222,7 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info, } // Modify `frame` only if not running in "dry run" mode. - if (!dry_run_) { + if (!config_.dry_run) { gain_applier_.ApplyGain(frame); } else { // Copy `frame` so that `ApplyGain()` is called (on a copy). @@ -247,7 +252,8 @@ void AdaptiveDigitalGainApplier::Process(const FrameInfo& info, kHeadroomHistogramMax, kHeadroomHistogramMax - kHeadroomHistogramMin + 1); RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.DigitalGainApplied", - last_gain_db_, 0, kMaxGainDb, kMaxGainDb + 1); + last_gain_db_, 0, kGainDbHistogramMax, + kGainDbHistogramMax + 1); RTC_LOG(LS_INFO) << "AGC2 adaptive digital" << " | speech_dbfs: " << info.speech_level_dbfs << " | noise_dbfs: " << info.noise_rms_dbfs diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier.h b/modules/audio_processing/agc2/adaptive_digital_gain_applier.h index 6fc8ac1c51..e254b516e4 100644 --- a/modules/audio_processing/agc2/adaptive_digital_gain_applier.h +++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier.h @@ -15,6 +15,7 @@ #include "modules/audio_processing/agc2/gain_applier.h" #include "modules/audio_processing/include/audio_frame_view.h" +#include "modules/audio_processing/include/audio_processing.h" namespace webrtc { @@ -35,16 +36,9 @@ class AdaptiveDigitalGainApplier { float limiter_envelope_dbfs; // Envelope level from the limiter (dBFS). }; - // Ctor. `adjacent_speech_frames_threshold` indicates how many adjacent speech - // frames must be observed in order to consider the sequence as speech. - // `max_gain_change_db_per_second` limits the adaptation speed (uniformly - // operated across frames). `max_output_noise_level_dbfs` limits the output - // noise level. 
If `dry_run` is true, `Process()` will not modify the audio. - AdaptiveDigitalGainApplier(ApmDataDumper* apm_data_dumper, - int adjacent_speech_frames_threshold, - float max_gain_change_db_per_second, - float max_output_noise_level_dbfs, - bool dry_run); + AdaptiveDigitalGainApplier( + ApmDataDumper* apm_data_dumper, + const AudioProcessing::Config::GainController2::AdaptiveDigital& config); AdaptiveDigitalGainApplier(const AdaptiveDigitalGainApplier&) = delete; AdaptiveDigitalGainApplier& operator=(const AdaptiveDigitalGainApplier&) = delete; @@ -59,10 +53,8 @@ class AdaptiveDigitalGainApplier { ApmDataDumper* const apm_data_dumper_; GainApplier gain_applier_; - const int adjacent_speech_frames_threshold_; + const AudioProcessing::Config::GainController2::AdaptiveDigital config_; const float max_gain_change_db_per_10ms_; - const float max_output_noise_level_dbfs_; - const bool dry_run_; int calls_since_last_gain_log_; int frames_to_gain_increase_allowed_; diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc b/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc index 3c5642bce0..efbc1e1799 100644 --- a/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc +++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc @@ -16,6 +16,7 @@ #include "common_audio/include/audio_util.h" #include "modules/audio_processing/agc2/agc2_common.h" #include "modules/audio_processing/agc2/vector_float_frame.h" +#include "modules/audio_processing/include/audio_processing.h" #include "modules/audio_processing/logging/apm_data_dumper.h" #include "rtc_base/gunit.h" @@ -33,57 +34,68 @@ constexpr float kMaxSpeechProbability = 1.0f; constexpr float kNoNoiseDbfs = kMinLevelDbfs; constexpr float kWithNoiseDbfs = -20.0f; -constexpr float kMaxGainChangePerSecondDb = 3.0f; -constexpr float kMaxGainChangePerFrameDb = - kMaxGainChangePerSecondDb * kFrameDurationMs / 1000.0f; -constexpr float 
kMaxOutputNoiseLevelDbfs = -50.0f; +// Number of additional frames to process in the tests to ensure that the tested +// adaptation processes have converged. +constexpr int kNumExtraFrames = 10; + +constexpr float GetMaxGainChangePerFrameDb( + float max_gain_change_db_per_second) { + return max_gain_change_db_per_second * kFrameDurationMs / 1000.0f; +} + +using AdaptiveDigitalConfig = + AudioProcessing::Config::GainController2::AdaptiveDigital; + +constexpr AdaptiveDigitalConfig kDefaultConfig{}; // Helper to create initialized `AdaptiveDigitalGainApplier` objects. struct GainApplierHelper { - GainApplierHelper() - : GainApplierHelper(/*adjacent_speech_frames_threshold=*/1) {} - explicit GainApplierHelper(int adjacent_speech_frames_threshold) + explicit GainApplierHelper(const AdaptiveDigitalConfig& config) : apm_data_dumper(0), - gain_applier(std::make_unique( - &apm_data_dumper, - adjacent_speech_frames_threshold, - kMaxGainChangePerSecondDb, - kMaxOutputNoiseLevelDbfs, - /*dry_run=*/false)) {} + gain_applier( + std::make_unique(&apm_data_dumper, + config)) {} ApmDataDumper apm_data_dumper; std::unique_ptr gain_applier; }; -// Sample frame information for the tests mocking noiseless speech detected -// with maximum probability and with level, headroom and limiter envelope chosen -// so that the resulting gain equals `kInitialAdaptiveDigitalGainDb` - i.e., no -// gain adaptation is expected. 
-constexpr AdaptiveDigitalGainApplier::FrameInfo kFrameInfo{ - /*speech_probability=*/kMaxSpeechProbability, - /*speech_level_dbfs=*/kInitialSpeechLevelEstimateDbfs, - /*speech_level_reliable=*/true, - /*noise_rms_dbfs=*/kNoNoiseDbfs, - /*headroom_db=*/kSaturationProtectorInitialHeadroomDb, - /*limiter_envelope_dbfs=*/-2.0f}; +// Returns a `FrameInfo` sample to simulate noiseless speech detected with +// maximum probability and with level, headroom and limiter envelope chosen +// so that the resulting gain equals the default initial adaptive digital gain, +// i.e., no gain adaptation is expected. +AdaptiveDigitalGainApplier::FrameInfo GetFrameInfoToNotAdapt( + const AdaptiveDigitalConfig& config) { + AdaptiveDigitalGainApplier::FrameInfo info; + info.speech_probability = kMaxSpeechProbability; + info.speech_level_dbfs = -config.initial_gain_db - config.headroom_db; + info.speech_level_reliable = true; + info.noise_rms_dbfs = kNoNoiseDbfs; + info.headroom_db = config.headroom_db; + info.limiter_envelope_dbfs = -2.0f; + return info; +} TEST(GainController2AdaptiveGainApplier, GainApplierShouldNotCrash) { - GainApplierHelper helper; + GainApplierHelper helper(kDefaultConfig); helper.gain_applier->Initialize(/*sample_rate_hz=*/48000, kStereo); // Make one call with reasonable audio level values and settings. VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.0f); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; - info.speech_level_dbfs = -5.0f; - helper.gain_applier->Process(kFrameInfo, fake_audio.float_frame_view()); + helper.gain_applier->Process(GetFrameInfoToNotAdapt(kDefaultConfig), + fake_audio.float_frame_view()); } // Checks that the maximum allowed gain is applied.
TEST(GainController2AdaptiveGainApplier, MaxGainApplied) { constexpr int kNumFramesToAdapt = - static_cast(kMaxGainDb / kMaxGainChangePerFrameDb) + 10; + static_cast(kDefaultConfig.max_gain_db / + GetMaxGainChangePerFrameDb( + kDefaultConfig.max_gain_change_db_per_second)) + + kNumExtraFrames; - GainApplierHelper helper; + GainApplierHelper helper(kDefaultConfig); helper.gain_applier->Initialize(/*sample_rate_hz=*/8000, kMono); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalGainApplier::FrameInfo info = + GetFrameInfoToNotAdapt(kDefaultConfig); info.speech_level_dbfs = -60.0f; float applied_gain; for (int i = 0; i < kNumFramesToAdapt; ++i) { @@ -92,30 +104,33 @@ TEST(GainController2AdaptiveGainApplier, MaxGainApplied) { applied_gain = fake_audio.float_frame_view().channel(0)[0]; } const float applied_gain_db = 20.0f * std::log10f(applied_gain); - EXPECT_NEAR(applied_gain_db, kMaxGainDb, 0.1f); + EXPECT_NEAR(applied_gain_db, kDefaultConfig.max_gain_db, 0.1f); } TEST(GainController2AdaptiveGainApplier, GainDoesNotChangeFast) { - GainApplierHelper helper; + GainApplierHelper helper(kDefaultConfig); helper.gain_applier->Initialize(/*sample_rate_hz=*/8000, kMono); constexpr float initial_level_dbfs = -25.0f; - // A few extra frames for safety. 
+ constexpr float kMaxGainChangeDbPerFrame = + GetMaxGainChangePerFrameDb(kDefaultConfig.max_gain_change_db_per_second); constexpr int kNumFramesToAdapt = - static_cast(initial_level_dbfs / kMaxGainChangePerFrameDb) + 10; + static_cast(initial_level_dbfs / kMaxGainChangeDbPerFrame) + + kNumExtraFrames; - const float kMaxChangePerFrameLinear = DbToRatio(kMaxGainChangePerFrameDb); + const float max_change_per_frame_linear = DbToRatio(kMaxGainChangeDbPerFrame); float last_gain_linear = 1.f; for (int i = 0; i < kNumFramesToAdapt; ++i) { SCOPED_TRACE(i); VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalGainApplier::FrameInfo info = + GetFrameInfoToNotAdapt(kDefaultConfig); info.speech_level_dbfs = initial_level_dbfs; helper.gain_applier->Process(info, fake_audio.float_frame_view()); float current_gain_linear = fake_audio.float_frame_view().channel(0)[0]; EXPECT_LE(std::abs(current_gain_linear - last_gain_linear), - kMaxChangePerFrameLinear); + max_change_per_frame_linear); last_gain_linear = current_gain_linear; } @@ -123,56 +138,61 @@ TEST(GainController2AdaptiveGainApplier, GainDoesNotChangeFast) { for (int i = 0; i < kNumFramesToAdapt; ++i) { SCOPED_TRACE(i); VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalGainApplier::FrameInfo info = + GetFrameInfoToNotAdapt(kDefaultConfig); info.speech_level_dbfs = 0.f; helper.gain_applier->Process(info, fake_audio.float_frame_view()); float current_gain_linear = fake_audio.float_frame_view().channel(0)[0]; EXPECT_LE(std::abs(current_gain_linear - last_gain_linear), - kMaxChangePerFrameLinear); + max_change_per_frame_linear); last_gain_linear = current_gain_linear; } } TEST(GainController2AdaptiveGainApplier, GainIsRampedInAFrame) { - GainApplierHelper helper; + GainApplierHelper helper(kDefaultConfig); 
helper.gain_applier->Initialize(/*sample_rate_hz=*/48000, kMono); constexpr float initial_level_dbfs = -25.0f; VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalGainApplier::FrameInfo info = + GetFrameInfoToNotAdapt(kDefaultConfig); info.speech_level_dbfs = initial_level_dbfs; helper.gain_applier->Process(info, fake_audio.float_frame_view()); float maximal_difference = 0.0f; - float current_value = 1.0f * DbToRatio(kInitialAdaptiveDigitalGainDb); + float current_value = 1.0f * DbToRatio(kDefaultConfig.initial_gain_db); for (const auto& x : fake_audio.float_frame_view().channel(0)) { const float difference = std::abs(x - current_value); maximal_difference = std::max(maximal_difference, difference); current_value = x; } - const float kMaxChangePerFrameLinear = DbToRatio(kMaxGainChangePerFrameDb); - const float kMaxChangePerSample = - kMaxChangePerFrameLinear / kFrameLen10ms48kHz; + const float max_change_per_frame_linear = DbToRatio( + GetMaxGainChangePerFrameDb(kDefaultConfig.max_gain_change_db_per_second)); + const float max_change_per_sample = + max_change_per_frame_linear / kFrameLen10ms48kHz; - EXPECT_LE(maximal_difference, kMaxChangePerSample); + EXPECT_LE(maximal_difference, max_change_per_sample); } TEST(GainController2AdaptiveGainApplier, NoiseLimitsGain) { - GainApplierHelper helper; + GainApplierHelper helper(kDefaultConfig); helper.gain_applier->Initialize(/*sample_rate_hz=*/48000, kMono); constexpr float initial_level_dbfs = -25.0f; constexpr int num_initial_frames = - kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb; + kDefaultConfig.initial_gain_db / + GetMaxGainChangePerFrameDb(kDefaultConfig.max_gain_change_db_per_second); constexpr int num_frames = 50; - ASSERT_GT(kWithNoiseDbfs, kMaxOutputNoiseLevelDbfs) + ASSERT_GT(kWithNoiseDbfs, kDefaultConfig.max_output_noise_level_dbfs) << "kWithNoiseDbfs is too low"; for (int i = 0; i < num_initial_frames + 
num_frames; ++i) { VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalGainApplier::FrameInfo info = + GetFrameInfoToNotAdapt(kDefaultConfig); info.speech_level_dbfs = initial_level_dbfs; info.noise_rms_dbfs = kWithNoiseDbfs; helper.gain_applier->Process(info, fake_audio.float_frame_view()); @@ -189,31 +209,34 @@ TEST(GainController2AdaptiveGainApplier, NoiseLimitsGain) { } TEST(GainController2GainApplier, CanHandlePositiveSpeechLevels) { - GainApplierHelper helper; + GainApplierHelper helper(kDefaultConfig); helper.gain_applier->Initialize(/*sample_rate_hz=*/48000, kStereo); // Make one call with positive audio level values and settings. VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.0f); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalGainApplier::FrameInfo info = + GetFrameInfoToNotAdapt(kDefaultConfig); info.speech_level_dbfs = 5.0f; helper.gain_applier->Process(info, fake_audio.float_frame_view()); } TEST(GainController2GainApplier, AudioLevelLimitsGain) { - GainApplierHelper helper; + GainApplierHelper helper(kDefaultConfig); helper.gain_applier->Initialize(/*sample_rate_hz=*/48000, kMono); constexpr float initial_level_dbfs = -25.0f; constexpr int num_initial_frames = - kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb; + kDefaultConfig.initial_gain_db / + GetMaxGainChangePerFrameDb(kDefaultConfig.max_gain_change_db_per_second); constexpr int num_frames = 50; - ASSERT_GT(kWithNoiseDbfs, kMaxOutputNoiseLevelDbfs) + ASSERT_GT(kWithNoiseDbfs, kDefaultConfig.max_output_noise_level_dbfs) << "kWithNoiseDbfs is too low"; for (int i = 0; i < num_initial_frames + num_frames; ++i) { VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalGainApplier::FrameInfo info = + GetFrameInfoToNotAdapt(kDefaultConfig); info.speech_level_dbfs = initial_level_dbfs; 
info.limiter_envelope_dbfs = 1.0f; info.speech_level_reliable = false; @@ -232,21 +255,22 @@ TEST(GainController2GainApplier, AudioLevelLimitsGain) { class AdaptiveDigitalGainApplierTest : public ::testing::TestWithParam { protected: - int AdjacentSpeechFramesThreshold() const { return GetParam(); } + int adjacent_speech_frames_threshold() const { return GetParam(); } }; TEST_P(AdaptiveDigitalGainApplierTest, DoNotIncreaseGainWithTooFewSpeechFrames) { - const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold(); - GainApplierHelper helper(adjacent_speech_frames_threshold); + AdaptiveDigitalConfig config; + config.adjacent_speech_frames_threshold = adjacent_speech_frames_threshold(); + GainApplierHelper helper(config); helper.gain_applier->Initialize(/*sample_rate_hz=*/48000, kMono); // Lower the speech level so that the target gain will be increased. - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalGainApplier::FrameInfo info = GetFrameInfoToNotAdapt(config); info.speech_level_dbfs -= 12.0f; float prev_gain = 0.0f; - for (int i = 0; i < adjacent_speech_frames_threshold; ++i) { + for (int i = 0; i < config.adjacent_speech_frames_threshold; ++i) { SCOPED_TRACE(i); VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f); helper.gain_applier->Process(info, audio.float_frame_view()); @@ -259,16 +283,17 @@ TEST_P(AdaptiveDigitalGainApplierTest, } TEST_P(AdaptiveDigitalGainApplierTest, IncreaseGainWithEnoughSpeechFrames) { - const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold(); - GainApplierHelper helper(adjacent_speech_frames_threshold); + AdaptiveDigitalConfig config; + config.adjacent_speech_frames_threshold = adjacent_speech_frames_threshold(); + GainApplierHelper helper(config); helper.gain_applier->Initialize(/*sample_rate_hz=*/48000, kMono); // Lower the speech level so that the target gain will be increased. 
- AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalGainApplier::FrameInfo info = GetFrameInfoToNotAdapt(config); info.speech_level_dbfs -= 12.0f; float prev_gain = 0.0f; - for (int i = 0; i < adjacent_speech_frames_threshold; ++i) { + for (int i = 0; i < config.adjacent_speech_frames_threshold; ++i) { SCOPED_TRACE(i); VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f); helper.gain_applier->Process(info, audio.float_frame_view()); @@ -289,63 +314,65 @@ INSTANTIATE_TEST_SUITE_P(GainController2, // Checks that the input is never modified when running in dry run mode. TEST(GainController2GainApplier, DryRunDoesNotChangeInput) { - ApmDataDumper apm_data_dumper(0); - AdaptiveDigitalGainApplier gain_applier( - &apm_data_dumper, /*adjacent_speech_frames_threshold=*/1, - kMaxGainChangePerSecondDb, kMaxOutputNoiseLevelDbfs, /*dry_run=*/true); + AdaptiveDigitalConfig config; + config.dry_run = true; + GainApplierHelper helper(config); + // Simulate an input signal with log speech level. - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalGainApplier::FrameInfo info = GetFrameInfoToNotAdapt(config); info.speech_level_dbfs = -60.0f; - // Allow enough time to reach the maximum gain. - constexpr int kNumFramesToAdapt = - static_cast(kMaxGainDb / kMaxGainChangePerFrameDb) + 10; + const int num_frames_to_adapt = + static_cast( + config.max_gain_db / + GetMaxGainChangePerFrameDb(config.max_gain_change_db_per_second)) + + kNumExtraFrames; constexpr float kPcmSamples = 123.456f; // Run the gain applier and check that the PCM samples are not modified. 
- gain_applier.Initialize(/*sample_rate_hz=*/8000, kMono); - for (int i = 0; i < kNumFramesToAdapt; ++i) { + helper.gain_applier->Initialize(/*sample_rate_hz=*/8000, kMono); + for (int i = 0; i < num_frames_to_adapt; ++i) { SCOPED_TRACE(i); VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, kPcmSamples); - gain_applier.Process(info, fake_audio.float_frame_view()); + helper.gain_applier->Process(info, fake_audio.float_frame_view()); EXPECT_FLOAT_EQ(fake_audio.float_frame_view().channel(0)[0], kPcmSamples); } } // Checks that no sample is modified before and after the sample rate changes. TEST(GainController2GainApplier, DryRunHandlesSampleRateChange) { - ApmDataDumper apm_data_dumper(0); - AdaptiveDigitalGainApplier gain_applier( - &apm_data_dumper, /*adjacent_speech_frames_threshold=*/1, - kMaxGainChangePerSecondDb, kMaxOutputNoiseLevelDbfs, /*dry_run=*/true); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalConfig config; + config.dry_run = true; + GainApplierHelper helper(config); + + AdaptiveDigitalGainApplier::FrameInfo info = GetFrameInfoToNotAdapt(config); info.speech_level_dbfs = -60.0f; constexpr float kPcmSamples = 123.456f; VectorFloatFrame fake_audio_8k(kMono, kFrameLen10ms8kHz, kPcmSamples); - gain_applier.Initialize(/*sample_rate_hz=*/8000, kMono); - gain_applier.Process(info, fake_audio_8k.float_frame_view()); + helper.gain_applier->Initialize(/*sample_rate_hz=*/8000, kMono); + helper.gain_applier->Process(info, fake_audio_8k.float_frame_view()); EXPECT_FLOAT_EQ(fake_audio_8k.float_frame_view().channel(0)[0], kPcmSamples); - gain_applier.Initialize(/*sample_rate_hz=*/48000, kMono); + helper.gain_applier->Initialize(/*sample_rate_hz=*/48000, kMono); VectorFloatFrame fake_audio_48k(kMono, kFrameLen10ms48kHz, kPcmSamples); - gain_applier.Process(info, fake_audio_48k.float_frame_view()); + helper.gain_applier->Process(info, fake_audio_48k.float_frame_view()); EXPECT_FLOAT_EQ(fake_audio_48k.float_frame_view().channel(0)[0], 
kPcmSamples); } // Checks that no sample is modified before and after the number of channels // changes. TEST(GainController2GainApplier, DryRunHandlesNumChannelsChange) { - ApmDataDumper apm_data_dumper(0); - AdaptiveDigitalGainApplier gain_applier( - &apm_data_dumper, /*adjacent_speech_frames_threshold=*/1, - kMaxGainChangePerSecondDb, kMaxOutputNoiseLevelDbfs, /*dry_run=*/true); - AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo; + AdaptiveDigitalConfig config; + config.dry_run = true; + GainApplierHelper helper(config); + + AdaptiveDigitalGainApplier::FrameInfo info = GetFrameInfoToNotAdapt(config); info.speech_level_dbfs = -60.0f; constexpr float kPcmSamples = 123.456f; VectorFloatFrame fake_audio_8k(kMono, kFrameLen10ms8kHz, kPcmSamples); - gain_applier.Initialize(/*sample_rate_hz=*/8000, kMono); - gain_applier.Process(info, fake_audio_8k.float_frame_view()); + helper.gain_applier->Initialize(/*sample_rate_hz=*/8000, kMono); + helper.gain_applier->Process(info, fake_audio_8k.float_frame_view()); EXPECT_FLOAT_EQ(fake_audio_8k.float_frame_view().channel(0)[0], kPcmSamples); VectorFloatFrame fake_audio_48k(kStereo, kFrameLen10ms8kHz, kPcmSamples); - gain_applier.Initialize(/*sample_rate_hz=*/8000, kStereo); - gain_applier.Process(info, fake_audio_48k.float_frame_view()); + helper.gain_applier->Initialize(/*sample_rate_hz=*/8000, kStereo); + helper.gain_applier->Process(info, fake_audio_48k.float_frame_view()); EXPECT_FLOAT_EQ(fake_audio_48k.float_frame_view().channel(0)[0], kPcmSamples); EXPECT_FLOAT_EQ(fake_audio_48k.float_frame_view().channel(1)[0], kPcmSamples); } diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc index ca3279e24f..81e7d291f6 100644 --- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc +++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc @@ -20,7 +20,14 @@ namespace webrtc { namespace { float 
ClampLevelEstimateDbfs(float level_estimate_dbfs) { - return rtc::SafeClamp(level_estimate_dbfs, -90.f, 30.f); + return rtc::SafeClamp(level_estimate_dbfs, -90.0f, 30.0f); +} + +// Returns the initial speech level estimate needed to apply the initial gain. +float GetInitialSpeechLevelEstimateDbfs( + const AudioProcessing::Config::GainController2::AdaptiveDigital& config) { + return ClampLevelEstimateDbfs(-kSaturationProtectorInitialHeadroomDb - + config.initial_gain_db - config.headroom_db); } } // namespace @@ -37,18 +44,14 @@ float AdaptiveModeLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const { return numerator / denominator; } -AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator( - ApmDataDumper* apm_data_dumper) - : AdaptiveModeLevelEstimator( - apm_data_dumper, - kDefaultLevelEstimatorAdjacentSpeechFramesThreshold) {} - AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator( ApmDataDumper* apm_data_dumper, - int adjacent_speech_frames_threshold) + const AudioProcessing::Config::GainController2::AdaptiveDigital& config) : apm_data_dumper_(apm_data_dumper), - adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold), - level_dbfs_(ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs)) { + initial_speech_level_dbfs_(GetInitialSpeechLevelEstimateDbfs(config)), + adjacent_speech_frames_threshold_( + config.adjacent_speech_frames_threshold), + level_dbfs_(initial_speech_level_dbfs_) { RTC_DCHECK(apm_data_dumper_); RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1); Reset(); @@ -128,14 +131,14 @@ bool AdaptiveModeLevelEstimator::IsConfident() const { void AdaptiveModeLevelEstimator::Reset() { ResetLevelEstimatorState(preliminary_state_); ResetLevelEstimatorState(reliable_state_); - level_dbfs_ = ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs); + level_dbfs_ = initial_speech_level_dbfs_; num_adjacent_speech_frames_ = 0; } void AdaptiveModeLevelEstimator::ResetLevelEstimatorState( LevelEstimatorState& state) const { 
state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs; - state.level_dbfs.numerator = kInitialSpeechLevelEstimateDbfs; + state.level_dbfs.numerator = initial_speech_level_dbfs_; state.level_dbfs.denominator = 1.0f; } diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h index e39b6cecd7..e15c6afe4d 100644 --- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h +++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h @@ -24,12 +24,12 @@ class ApmDataDumper; // Level estimator for the digital adaptive gain controller. class AdaptiveModeLevelEstimator { public: - explicit AdaptiveModeLevelEstimator(ApmDataDumper* apm_data_dumper); + AdaptiveModeLevelEstimator( + ApmDataDumper* apm_data_dumper, + const AudioProcessing::Config::GainController2::AdaptiveDigital& config); AdaptiveModeLevelEstimator(const AdaptiveModeLevelEstimator&) = delete; AdaptiveModeLevelEstimator& operator=(const AdaptiveModeLevelEstimator&) = delete; - AdaptiveModeLevelEstimator(ApmDataDumper* apm_data_dumper, - int adjacent_speech_frames_threshold); // Updates the level estimation. 
void Update(const VadLevelAnalyzer::Result& vad_data); @@ -63,6 +63,7 @@ class AdaptiveModeLevelEstimator { ApmDataDumper* const apm_data_dumper_; + const float initial_speech_level_dbfs_; const int adjacent_speech_frames_threshold_; LevelEstimatorState preliminary_state_; LevelEstimatorState reliable_state_; diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc index c55950ac29..1cdd91d5d8 100644 --- a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc +++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc @@ -13,37 +13,22 @@ #include #include "modules/audio_processing/agc2/agc2_common.h" +#include "modules/audio_processing/include/audio_processing.h" #include "modules/audio_processing/logging/apm_data_dumper.h" #include "rtc_base/gunit.h" namespace webrtc { namespace { +using AdaptiveDigitalConfig = + AudioProcessing::Config::GainController2::AdaptiveDigital; + // Number of speech frames that the level estimator must observe in order to // become confident about the estimated level. constexpr int kNumFramesToConfidence = kLevelEstimatorTimeToConfidenceMs / kFrameDurationMs; static_assert(kNumFramesToConfidence > 0, ""); -// Fake levels and speech probabilities used in the tests. 
-static_assert(kInitialSpeechLevelEstimateDbfs < 0.0f, ""); -constexpr float kVadLevelRms = kInitialSpeechLevelEstimateDbfs / 2.0f; -constexpr float kVadLevelPeak = kInitialSpeechLevelEstimateDbfs / 3.0f; -static_assert(kVadLevelRms < kVadLevelPeak, ""); -static_assert(kVadLevelRms > kInitialSpeechLevelEstimateDbfs, ""); -static_assert(kVadLevelRms - kInitialSpeechLevelEstimateDbfs > 5.0f, - "Adjust `kVadLevelRms` so that the difference from the initial " - "level is wide enough for the tests."); - -constexpr VadLevelAnalyzer::Result kVadDataSpeech{/*speech_probability=*/1.0f, - kVadLevelRms, kVadLevelPeak}; -constexpr VadLevelAnalyzer::Result kVadDataNonSpeech{ - /*speech_probability=*/kVadConfidenceThreshold / 2.0f, kVadLevelRms, - kVadLevelPeak}; - -constexpr float kMinSpeechProbability = 0.0f; -constexpr float kMaxSpeechProbability = 1.0f; - constexpr float kConvergenceSpeedTestsLevelTolerance = 0.5f; // Provides the `vad_level` value `num_iterations` times to `level_estimator`. @@ -55,31 +40,51 @@ void RunOnConstantLevel(int num_iterations, } } +constexpr AdaptiveDigitalConfig GetAdaptiveDigitalConfig( + int adjacent_speech_frames_threshold) { + AdaptiveDigitalConfig config; + config.adjacent_speech_frames_threshold = adjacent_speech_frames_threshold; + return config; +} + // Level estimator with data dumper. 
struct TestLevelEstimator { - TestLevelEstimator() + explicit TestLevelEstimator(int adjacent_speech_frames_threshold) : data_dumper(0), estimator(std::make_unique( &data_dumper, - /*adjacent_speech_frames_threshold=*/1)) {} + GetAdaptiveDigitalConfig(adjacent_speech_frames_threshold))), + initial_speech_level_dbfs(estimator->level_dbfs()), + vad_level_rms(initial_speech_level_dbfs / 2.0f), + vad_level_peak(initial_speech_level_dbfs / 3.0f), + vad_data_speech( + {/*speech_probability=*/1.0f, vad_level_rms, vad_level_peak}), + vad_data_non_speech( + {/*speech_probability=*/kVadConfidenceThreshold / 2.0f, + vad_level_rms, vad_level_peak}) { + RTC_DCHECK_LT(vad_level_rms, vad_level_peak); + RTC_DCHECK_LT(initial_speech_level_dbfs, vad_level_rms); + RTC_DCHECK_GT(vad_level_rms - initial_speech_level_dbfs, 5.0f) + << "Adjust `vad_level_rms` so that the difference from the initial " + "level is wide enough for the tests"; + } ApmDataDumper data_dumper; std::unique_ptr estimator; + const float initial_speech_level_dbfs; + const float vad_level_rms; + const float vad_level_peak; + const VadLevelAnalyzer::Result vad_data_speech; + const VadLevelAnalyzer::Result vad_data_non_speech; }; -// Checks the initially estimated level. -TEST(GainController2AdaptiveModeLevelEstimator, CheckInitialEstimate) { - TestLevelEstimator level_estimator; - EXPECT_FLOAT_EQ(level_estimator.estimator->level_dbfs(), - kInitialSpeechLevelEstimateDbfs); -} - // Checks that the level estimator converges to a constant input speech level. 
TEST(GainController2AdaptiveModeLevelEstimator, LevelStabilizes) { - TestLevelEstimator level_estimator; - RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech, + TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1); + RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, + level_estimator.vad_data_speech, *level_estimator.estimator); const float estimated_level_dbfs = level_estimator.estimator->level_dbfs(); - RunOnConstantLevel(/*num_iterations=*/1, kVadDataSpeech, + RunOnConstantLevel(/*num_iterations=*/1, level_estimator.vad_data_speech, *level_estimator.estimator); EXPECT_NEAR(level_estimator.estimator->level_dbfs(), estimated_level_dbfs, 0.1f); @@ -88,17 +93,19 @@ TEST(GainController2AdaptiveModeLevelEstimator, LevelStabilizes) { // Checks that the level controller does not become confident when too few // speech frames are observed. TEST(GainController2AdaptiveModeLevelEstimator, IsNotConfident) { - TestLevelEstimator level_estimator; + TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1); RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence / 2, - kVadDataSpeech, *level_estimator.estimator); + level_estimator.vad_data_speech, + *level_estimator.estimator); EXPECT_FALSE(level_estimator.estimator->IsConfident()); } // Checks that the level controller becomes confident when enough speech frames // are observed. TEST(GainController2AdaptiveModeLevelEstimator, IsConfident) { - TestLevelEstimator level_estimator; - RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech, + TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1); + RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, + level_estimator.vad_data_speech, *level_estimator.estimator); EXPECT_TRUE(level_estimator.estimator->IsConfident()); } @@ -107,14 +114,15 @@ TEST(GainController2AdaptiveModeLevelEstimator, IsConfident) { // frames. 
TEST(GainController2AdaptiveModeLevelEstimator, EstimatorIgnoresNonSpeechFrames) { - TestLevelEstimator level_estimator; + TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1); // Simulate speech. - RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech, + RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, + level_estimator.vad_data_speech, *level_estimator.estimator); const float estimated_level_dbfs = level_estimator.estimator->level_dbfs(); // Simulate full-scale non-speech. RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, - VadLevelAnalyzer::Result{kMinSpeechProbability, + VadLevelAnalyzer::Result{/*speech_probability=*/0.0f, /*rms_dbfs=*/0.0f, /*peak_dbfs=*/0.0f}, *level_estimator.estimator); @@ -126,28 +134,30 @@ TEST(GainController2AdaptiveModeLevelEstimator, // Checks the convergence speed of the estimator before it becomes confident. TEST(GainController2AdaptiveModeLevelEstimator, ConvergenceSpeedBeforeConfidence) { - TestLevelEstimator level_estimator; - RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech, + TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1); + RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, + level_estimator.vad_data_speech, *level_estimator.estimator); - EXPECT_NEAR(level_estimator.estimator->level_dbfs(), kVadDataSpeech.rms_dbfs, + EXPECT_NEAR(level_estimator.estimator->level_dbfs(), + level_estimator.vad_data_speech.rms_dbfs, kConvergenceSpeedTestsLevelTolerance); } // Checks the convergence speed of the estimator after it becomes confident. TEST(GainController2AdaptiveModeLevelEstimator, ConvergenceSpeedAfterConfidence) { - TestLevelEstimator level_estimator; + TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1); // Reach confidence using the initial level estimate. 
RunOnConstantLevel( /*num_iterations=*/kNumFramesToConfidence, VadLevelAnalyzer::Result{ - kMaxSpeechProbability, - /*rms_dbfs=*/kInitialSpeechLevelEstimateDbfs, - /*peak_dbfs=*/kInitialSpeechLevelEstimateDbfs + 6.0f}, + /*speech_probability=*/1.0f, + /*rms_dbfs=*/level_estimator.initial_speech_level_dbfs, + /*peak_dbfs=*/level_estimator.initial_speech_level_dbfs + 6.0f}, *level_estimator.estimator); // No estimate change should occur, but confidence is achieved. ASSERT_FLOAT_EQ(level_estimator.estimator->level_dbfs(), - kInitialSpeechLevelEstimateDbfs); + level_estimator.initial_speech_level_dbfs); ASSERT_TRUE(level_estimator.estimator->IsConfident()); // After confidence. constexpr float kConvergenceTimeAfterConfidenceNumFrames = 600; // 6 seconds. @@ -155,8 +165,9 @@ TEST(GainController2AdaptiveModeLevelEstimator, kConvergenceTimeAfterConfidenceNumFrames > kNumFramesToConfidence, ""); RunOnConstantLevel( /*num_iterations=*/kConvergenceTimeAfterConfidenceNumFrames, - kVadDataSpeech, *level_estimator.estimator); - EXPECT_NEAR(level_estimator.estimator->level_dbfs(), kVadDataSpeech.rms_dbfs, + level_estimator.vad_data_speech, *level_estimator.estimator); + EXPECT_NEAR(level_estimator.estimator->level_dbfs(), + level_estimator.vad_data_speech.rms_dbfs, kConvergenceSpeedTestsLevelTolerance); } @@ -168,30 +179,26 @@ class AdaptiveModeLevelEstimatorParametrization TEST_P(AdaptiveModeLevelEstimatorParametrization, DoNotAdaptToShortSpeechSegments) { - ApmDataDumper apm_data_dumper(0); - AdaptiveModeLevelEstimator level_estimator( - &apm_data_dumper, adjacent_speech_frames_threshold()); - const float initial_level = level_estimator.level_dbfs(); - ASSERT_LT(initial_level, kVadDataSpeech.peak_dbfs); + TestLevelEstimator level_estimator(adjacent_speech_frames_threshold()); + const float initial_level = level_estimator.estimator->level_dbfs(); + ASSERT_LT(initial_level, level_estimator.vad_data_speech.peak_dbfs); for (int i = 0; i < adjacent_speech_frames_threshold() - 1; 
++i) { SCOPED_TRACE(i); - level_estimator.Update(kVadDataSpeech); - EXPECT_EQ(initial_level, level_estimator.level_dbfs()); + level_estimator.estimator->Update(level_estimator.vad_data_speech); + EXPECT_EQ(initial_level, level_estimator.estimator->level_dbfs()); } - level_estimator.Update(kVadDataNonSpeech); - EXPECT_EQ(initial_level, level_estimator.level_dbfs()); + level_estimator.estimator->Update(level_estimator.vad_data_non_speech); + EXPECT_EQ(initial_level, level_estimator.estimator->level_dbfs()); } TEST_P(AdaptiveModeLevelEstimatorParametrization, AdaptToEnoughSpeechSegments) { - ApmDataDumper apm_data_dumper(0); - AdaptiveModeLevelEstimator level_estimator( - &apm_data_dumper, adjacent_speech_frames_threshold()); - const float initial_level = level_estimator.level_dbfs(); - ASSERT_LT(initial_level, kVadDataSpeech.peak_dbfs); + TestLevelEstimator level_estimator(adjacent_speech_frames_threshold()); + const float initial_level = level_estimator.estimator->level_dbfs(); + ASSERT_LT(initial_level, level_estimator.vad_data_speech.peak_dbfs); for (int i = 0; i < adjacent_speech_frames_threshold(); ++i) { - level_estimator.Update(kVadDataSpeech); + level_estimator.estimator->Update(level_estimator.vad_data_speech); } - EXPECT_LT(initial_level, level_estimator.level_dbfs()); + EXPECT_LT(initial_level, level_estimator.estimator->level_dbfs()); } INSTANTIATE_TEST_SUITE_P(GainController2, diff --git a/modules/audio_processing/agc2/agc2_common.h b/modules/audio_processing/agc2/agc2_common.h index da28d8d9d0..4af85527b8 100644 --- a/modules/audio_processing/agc2/agc2_common.h +++ b/modules/audio_processing/agc2/agc2_common.h @@ -24,38 +24,26 @@ constexpr int kFrameDurationMs = 10; constexpr int kSubFramesInFrame = 20; constexpr int kMaximalNumberOfSamplesPerChannel = 480; -// Adaptive digital gain applier settings below. 
-constexpr float kHeadroomDbfs = 6.0f; -constexpr float kMaxGainDb = 30.0f; -constexpr float kInitialAdaptiveDigitalGainDb = 8.0f; +// Adaptive digital gain applier settings. + // At what limiter levels should we start decreasing the adaptive digital gain. constexpr float kLimiterThresholdForAgcGainDbfs = -1.0f; // This is the threshold for speech. Speech frames are used for updating the // speech level, measuring the amount of speech, and decide when to allow target -// gain reduction. +// gain changes. constexpr float kVadConfidenceThreshold = 0.95f; -// Adaptive digital level estimator parameters. // Number of milliseconds of speech frames to observe to make the estimator // confident. constexpr float kLevelEstimatorTimeToConfidenceMs = 400; constexpr float kLevelEstimatorLeakFactor = 1.0f - 1.0f / kLevelEstimatorTimeToConfidenceMs; -// Robust VAD probability and speech decisions. -constexpr int kDefaultLevelEstimatorAdjacentSpeechFramesThreshold = 12; - // Saturation Protector settings. constexpr float kSaturationProtectorInitialHeadroomDb = 20.0f; constexpr int kSaturationProtectorBufferSize = 4; -// Set the initial speech level estimate so that `kInitialAdaptiveDigitalGainDb` -// is applied at the beginning of the call. -constexpr float kInitialSpeechLevelEstimateDbfs = - -kSaturationProtectorInitialHeadroomDb - kInitialAdaptiveDigitalGainDb - - kHeadroomDbfs; - // Number of interpolation points for each region of the limiter. // These values have been tuned to limit the interpolated gain curve error given // the limiter parameters and allowing a maximum error of +/- 32768^-1. 
diff --git a/modules/audio_processing/audio_processing_unittest.cc b/modules/audio_processing/audio_processing_unittest.cc index 100a3c0d86..436effd57e 100644 --- a/modules/audio_processing/audio_processing_unittest.cc +++ b/modules/audio_processing/audio_processing_unittest.cc @@ -3107,6 +3107,18 @@ TEST(AudioProcessing, GainController2ConfigEqual) { b_adaptive.dry_run = a_adaptive.dry_run; EXPECT_EQ(a, b); + a_adaptive.headroom_db += 1.0f; + b_adaptive.headroom_db = a_adaptive.headroom_db; + EXPECT_EQ(a, b); + + a_adaptive.max_gain_db += 1.0f; + b_adaptive.max_gain_db = a_adaptive.max_gain_db; + EXPECT_EQ(a, b); + + a_adaptive.initial_gain_db += 1.0f; + b_adaptive.initial_gain_db = a_adaptive.initial_gain_db; + EXPECT_EQ(a, b); + a_adaptive.vad_reset_period_ms++; b_adaptive.vad_reset_period_ms = a_adaptive.vad_reset_period_ms; EXPECT_EQ(a, b); @@ -3164,6 +3176,18 @@ TEST(AudioProcessing, GainController2ConfigNotEqual) { EXPECT_NE(a, b); a_adaptive = b_adaptive; + a_adaptive.headroom_db += 1.0f; + EXPECT_NE(a, b); + a_adaptive = b_adaptive; + + a_adaptive.max_gain_db += 1.0f; + EXPECT_NE(a, b); + a_adaptive = b_adaptive; + + a_adaptive.initial_gain_db += 1.0f; + EXPECT_NE(a, b); + a_adaptive = b_adaptive; + a_adaptive.vad_reset_period_ms++; EXPECT_NE(a, b); a_adaptive = b_adaptive; diff --git a/modules/audio_processing/gain_controller2.cc b/modules/audio_processing/gain_controller2.cc index 195044adc2..8a22fed97e 100644 --- a/modules/audio_processing/gain_controller2.cc +++ b/modules/audio_processing/gain_controller2.cc @@ -105,7 +105,9 @@ bool GainController2::Validate( const AudioProcessing::Config::GainController2& config) { const auto& fixed = config.fixed_digital; const auto& adaptive = config.adaptive_digital; - return fixed.gain_db >= 0.f && fixed.gain_db < 50.f && + return fixed.gain_db >= 0.0f && fixed.gain_db < 50.f && + adaptive.headroom_db >= 0.0f && adaptive.max_gain_db > 0.0f && + adaptive.initial_gain_db >= 0.0f && 
adaptive.max_gain_change_db_per_second > 0.0f && adaptive.max_output_noise_level_dbfs <= 0.0f; } diff --git a/modules/audio_processing/gain_controller2_unittest.cc b/modules/audio_processing/gain_controller2_unittest.cc index c8ee113d56..d1c1f5b8d4 100644 --- a/modules/audio_processing/gain_controller2_unittest.cc +++ b/modules/audio_processing/gain_controller2_unittest.cc @@ -89,6 +89,36 @@ TEST(GainController2, CheckFixedDigitalConfig) { EXPECT_TRUE(GainController2::Validate(config)); } +TEST(GainController2, CheckHeadroomDb) { + AudioProcessing::Config::GainController2 config; + config.adaptive_digital.headroom_db = -1.0f; + EXPECT_FALSE(GainController2::Validate(config)); + config.adaptive_digital.headroom_db = 0.0f; + EXPECT_TRUE(GainController2::Validate(config)); + config.adaptive_digital.headroom_db = 5.0f; + EXPECT_TRUE(GainController2::Validate(config)); +} + +TEST(GainController2, CheckMaxGainDb) { + AudioProcessing::Config::GainController2 config; + config.adaptive_digital.max_gain_db = -1.0f; + EXPECT_FALSE(GainController2::Validate(config)); + config.adaptive_digital.max_gain_db = 0.0f; + EXPECT_FALSE(GainController2::Validate(config)); + config.adaptive_digital.max_gain_db = 5.0f; + EXPECT_TRUE(GainController2::Validate(config)); +} + +TEST(GainController2, CheckInitialGainDb) { + AudioProcessing::Config::GainController2 config; + config.adaptive_digital.initial_gain_db = -1.0f; + EXPECT_FALSE(GainController2::Validate(config)); + config.adaptive_digital.initial_gain_db = 0.0f; + EXPECT_TRUE(GainController2::Validate(config)); + config.adaptive_digital.initial_gain_db = 5.0f; + EXPECT_TRUE(GainController2::Validate(config)); +} + TEST(GainController2, CheckAdaptiveDigitalMaxGainChangeSpeedConfig) { AudioProcessing::Config::GainController2 config; config.adaptive_digital.max_gain_change_db_per_second = -1.0f; diff --git a/modules/audio_processing/include/audio_processing.cc b/modules/audio_processing/include/audio_processing.cc index 
228619636a..ddd8078c93 100644 --- a/modules/audio_processing/include/audio_processing.cc +++ b/modules/audio_processing/include/audio_processing.cc @@ -90,6 +90,8 @@ bool Agc1Config::operator==(const Agc1Config& rhs) const { bool Agc2Config::AdaptiveDigital::operator==( const Agc2Config::AdaptiveDigital& rhs) const { return enabled == rhs.enabled && dry_run == rhs.dry_run && + headroom_db == rhs.headroom_db && max_gain_db == rhs.max_gain_db && + initial_gain_db == rhs.initial_gain_db && vad_reset_period_ms == rhs.vad_reset_period_ms && adjacent_speech_frames_threshold == rhs.adjacent_speech_frames_threshold && @@ -197,6 +199,10 @@ std::string AudioProcessing::Config::ToString() const { << " }, adaptive_digital: { enabled: " << gain_controller2.adaptive_digital.enabled << ", dry_run: " << gain_controller2.adaptive_digital.dry_run + << ", headroom_db: " << gain_controller2.adaptive_digital.headroom_db + << ", max_gain_db: " << gain_controller2.adaptive_digital.max_gain_db + << ", initial_gain_db: " + << gain_controller2.adaptive_digital.initial_gain_db << ", vad_reset_period_ms: " << gain_controller2.adaptive_digital.vad_reset_period_ms << ", adjacent_speech_frames_threshold: " diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h index 8f07c6e3b7..121e430b60 100644 --- a/modules/audio_processing/include/audio_processing.h +++ b/modules/audio_processing/include/audio_processing.h @@ -367,12 +367,19 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface { } bool enabled = false; - // Run the adaptive digital controller but the signal is not modified. + // When true, the adaptive digital controller runs but the signal is not + // modified. bool dry_run = false; + float headroom_db = 6.0f; + // TODO(bugs.webrtc.org/7494): Consider removing and inferring from + // `max_output_noise_level_dbfs`. 
+ float max_gain_db = 30.0f; + float initial_gain_db = 8.0f; int vad_reset_period_ms = 1500; int adjacent_speech_frames_threshold = 12; float max_gain_change_db_per_second = 3.0f; float max_output_noise_level_dbfs = -50.0f; + // TODO(bugs.webrtc.org/7494): Replace with field trials. bool sse2_allowed = true; bool avx2_allowed = true; bool neon_allowed = true;