diff --git a/modules/audio_processing/BUILD.gn b/modules/audio_processing/BUILD.gn
index c8992b06b1..b36e76b1cc 100644
--- a/modules/audio_processing/BUILD.gn
+++ b/modules/audio_processing/BUILD.gn
@@ -548,6 +548,7 @@ if (rtc_include_tests) {
       "../../test:test_support",
       "../audio_coding:neteq_input_audio_tools",
       "aec_dump:mock_aec_dump_unittests",
+      "agc2:adaptive_digital_unittests",
       "agc2:fixed_digital_unittests",
       "test/conversational_speech:unittest",
       "vad:vad_unittests",
diff --git a/modules/audio_processing/agc2/BUILD.gn b/modules/audio_processing/agc2/BUILD.gn
index 61d17e79e0..df5ec6cf0b 100644
--- a/modules/audio_processing/agc2/BUILD.gn
+++ b/modules/audio_processing/agc2/BUILD.gn
@@ -25,6 +25,8 @@ rtc_source_set("adaptive_digital") {
     "adaptive_mode_level_estimator.h",
     "noise_level_estimator.cc",
     "noise_level_estimator.h",
+    "saturation_protector.cc",
+    "saturation_protector.h",
   ]
 
   configs += [ "..:apm_debug_dump" ]
@@ -126,3 +128,25 @@ rtc_source_set("fixed_digital_unittests") {
     "../../../rtc_base:rtc_base_tests_utils",
   ]
 }
+
+rtc_source_set("adaptive_digital_unittests") {
+  testonly = true
+  configs += [ "..:apm_debug_dump" ]
+
+  sources = [
+    "adaptive_mode_level_estimator_unittest.cc",
+  ]
+  deps = [
+    ":adaptive_digital",
+    ":common",
+    ":test_utils",
+    "..:apm_logging",
+    "..:audio_frame_view",
+    "../../../api:array_view",
+    "../../../common_audio",
+    "../../../rtc_base:checks",
+    "../../../rtc_base:rtc_base_approved",
+    "../../../rtc_base:rtc_base_tests_utils",
+    "../vad:vad_with_level",
+  ]
+}
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
index e293bab390..b1906079d2 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
@@ -17,7 +17,9 @@
 namespace webrtc {
 
 AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
-    ApmDataDumper* apm_data_dumper) {}
+    ApmDataDumper* apm_data_dumper)
+    : saturation_protector_(apm_data_dumper),
+      apm_data_dumper_(apm_data_dumper) {}
 
 void AdaptiveModeLevelEstimator::UpdateEstimation(
     const VadWithLevel::LevelAndProbability& vad_data) {
@@ -27,10 +29,40 @@ void AdaptiveModeLevelEstimator::UpdateEstimation(
   RTC_DCHECK_LT(vad_data.speech_peak_dbfs, 50.f);
   RTC_DCHECK_GE(vad_data.speech_probability, 0.f);
   RTC_DCHECK_LE(vad_data.speech_probability, 1.f);
+  // Frames below the VAD confidence threshold do not update the estimate.
+  if (vad_data.speech_probability < kVadConfidenceThreshold) {
+    DebugDumpEstimate();
+    return;
+  }
+  // Accumulate frame durations until kFullBufferSizeMs has been reached.
+  const bool buffer_is_full = buffer_size_ms_ >= kFullBufferSizeMs;
+  if (!buffer_is_full) {
+    buffer_size_ms_ += kFrameDurationMs;
+  }
+  // Earlier contributions only start to decay once the buffer is full.
+  const float leak_factor = buffer_is_full ? kFullBufferLeakFactor : 1.f;
+  // Leaky average of the speech RMS level, weighted by speech probability.
+  estimate_numerator_ = estimate_numerator_ * leak_factor +
+                        vad_data.speech_rms_dbfs * vad_data.speech_probability;
+  estimate_denominator_ =
+      estimate_denominator_ * leak_factor + vad_data.speech_probability;
+
+  last_estimate_with_offset_dbfs_ = estimate_numerator_ / estimate_denominator_;
+
+  saturation_protector_.UpdateMargin(vad_data, last_estimate_with_offset_dbfs_);
+  DebugDumpEstimate();
 }
 
 float AdaptiveModeLevelEstimator::LatestLevelEstimate() const {
-  // TODO(webrtc:7494): This is a stub. Add implementation.
-  return 0.f;
+  return rtc::SafeClamp(
+      last_estimate_with_offset_dbfs_ + saturation_protector_.LastMargin(),
+      -90.f, 0.f);  // Estimate plus margin, limited to [-90, 0].
+}
+// Dumps the estimate both without and with margin and clamping applied.
+void AdaptiveModeLevelEstimator::DebugDumpEstimate() {
+  apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_with_offset_dbfs",
+                            last_estimate_with_offset_dbfs_);
+  apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_dbfs",
+                            LatestLevelEstimate());
 }
 }  // namespace webrtc
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
index b8dcf1afcf..dfcaa53535 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
@@ -11,6 +11,7 @@
 #ifndef MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_MODE_LEVEL_ESTIMATOR_H_
 #define MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_MODE_LEVEL_ESTIMATOR_H_
 
+#include "modules/audio_processing/agc2/saturation_protector.h"
 #include "modules/audio_processing/vad/vad_with_level.h"
 
 namespace webrtc {
@@ -21,6 +22,16 @@ class AdaptiveModeLevelEstimator {
   explicit AdaptiveModeLevelEstimator(ApmDataDumper* apm_data_dumper);
   void UpdateEstimation(const VadWithLevel::LevelAndProbability& vad_data);
   float LatestLevelEstimate() const;
+
+ private:
+  void DebugDumpEstimate();  // Writes both estimates to |apm_data_dumper_|.
+
+  int buffer_size_ms_ = 0;  // Accepted frame time so far; stops once full.
+  float last_estimate_with_offset_dbfs_ = kInitialSpeechLevelEstimateDbfs;
+  float estimate_numerator_ = 0.f;    // Leaky probability-weighted level sum.
+  float estimate_denominator_ = 0.f;  // Leaky sum of the probabilities.
+  SaturationProtector saturation_protector_;
+  ApmDataDumper* const apm_data_dumper_;
 };
 
 }  // namespace webrtc
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
new file mode 100644
index 0000000000..71909d0626
--- /dev/null
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/adaptive_mode_level_estimator.h"
+
+#include "modules/audio_processing/agc2/agc2_common.h"
+#include "modules/audio_processing/logging/apm_data_dumper.h"
+#include "rtc_base/gunit.h"
+
+namespace webrtc {
+namespace {
+void RunOnConstantLevel(int num_iterations,
+                        VadWithLevel::LevelAndProbability vad_data,
+                        AdaptiveModeLevelEstimator* level_estimator) {
+  for (int i = 0; i < num_iterations; ++i) {  // Feed the same frame each time.
+    level_estimator->UpdateEstimation(vad_data);  // By copy
+  }
+}
+}  // namespace
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
+     EstimatorShouldNotCrash) {
+  ApmDataDumper apm_data_dumper(0);
+  AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+  VadWithLevel::LevelAndProbability vad_data(1.f, -20.f, -10.f);
+  level_estimator.UpdateEstimation(vad_data);
+  static_cast<void>(level_estimator.LatestLevelEstimate());  // Result unused.
+}
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator, LevelShouldStabilize) {
+  ApmDataDumper apm_data_dumper(0);
+  AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+  constexpr float kSpeechRmsDbfs = -15.f;
+  RunOnConstantLevel(
+      100,
+      VadWithLevel::LevelAndProbability(
+          1.f, kSpeechRmsDbfs - kInitialSaturationMarginDb, kSpeechRmsDbfs),
+      &level_estimator);
+
+  EXPECT_NEAR(level_estimator.LatestLevelEstimate(), kSpeechRmsDbfs, 0.1f);
+}
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
+     EstimatorIgnoresZeroProbabilityFrames) {
+  ApmDataDumper apm_data_dumper(0);
+  AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+  // Run for one second of fake audio.
+  constexpr float kSpeechRmsDbfs = -25.f;
+  RunOnConstantLevel(
+      100,
+      VadWithLevel::LevelAndProbability(
+          1.f, kSpeechRmsDbfs - kInitialSaturationMarginDb, kSpeechRmsDbfs),
+      &level_estimator);
+
+  // Run for one more second, but mark as not speech.
+  constexpr float kNoiseRmsDbfs = 0.f;
+  RunOnConstantLevel(
+      100, VadWithLevel::LevelAndProbability(0.f, kNoiseRmsDbfs, kNoiseRmsDbfs),
+      &level_estimator);
+
+  // Level should not have changed.
+  EXPECT_NEAR(level_estimator.LatestLevelEstimate(), kSpeechRmsDbfs, 0.1f);
+}
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator, TimeToAdapt) {
+  ApmDataDumper apm_data_dumper(0);
+  AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+  // Run for one 'window size' interval.
+  constexpr float kInitialSpeechRmsDbfs = -30.f;
+  RunOnConstantLevel(
+      kFullBufferSizeMs / kFrameDurationMs,
+      VadWithLevel::LevelAndProbability(
+          1.f, kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
+          kInitialSpeechRmsDbfs),
+      &level_estimator);
+
+  // Run for one half 'window size' interval. This should not be enough to
+  // adapt.
+  constexpr float kDifferentSpeechRmsDbfs = -10.f;
+  // It should at most differ by 25% after one 'window size' interval.
+  const float kMaxDifferenceDb =
+      0.25 * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs);
+  RunOnConstantLevel(
+      static_cast<int>(kFullBufferSizeMs / kFrameDurationMs / 2),
+      VadWithLevel::LevelAndProbability(
+          1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
+          kDifferentSpeechRmsDbfs),
+      &level_estimator);
+  EXPECT_GT(
+      std::abs(kDifferentSpeechRmsDbfs - level_estimator.LatestLevelEstimate()),
+      kMaxDifferenceDb);
+
+  // Run for some more time. Afterwards, we should have adapted.
+  RunOnConstantLevel(
+      static_cast<int>(3 * kFullBufferSizeMs / kFrameDurationMs),
+      VadWithLevel::LevelAndProbability(
+          1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
+          kDifferentSpeechRmsDbfs),
+      &level_estimator);
+  EXPECT_NEAR(level_estimator.LatestLevelEstimate(), kDifferentSpeechRmsDbfs,
+              kMaxDifferenceDb);
+}
+
+}  // namespace webrtc
diff --git a/modules/audio_processing/agc2/agc2_common.h b/modules/audio_processing/agc2/agc2_common.h
index ad0ab4ea62..d4aa3fbceb 100644
--- a/modules/audio_processing/agc2/agc2_common.h
+++ b/modules/audio_processing/agc2/agc2_common.h
@@ -27,6 +27,18 @@ constexpr size_t kMaximalNumberOfSamplesPerChannel = 480;
 
 constexpr float kAttackFilterConstant = 0.f;
 
+// Used in the Level Estimator for deciding when to update the speech
+// level estimate.
+constexpr float kVadConfidenceThreshold = 0.9f;
+
+// The amount of 'memory' of the Level Estimator. Decides leak factors.
+constexpr float kFullBufferSizeMs = 1000.f;
+constexpr float kFullBufferLeakFactor = 1.f - 1.f / kFullBufferSizeMs;
+// Starting value of the Level Estimator's estimate, before any update.
+constexpr float kInitialSpeechLevelEstimateDbfs = -30.f;
+// Margin returned by SaturationProtector::LastMargin().
+constexpr float kInitialSaturationMarginDb = 17.f;
+
 // This is computed from kDecayMs by
 // 10 ** (-1/20 * subframe_duration / kDecayMs).
 // |subframe_duration| is |kFrameDurationMs / kSubFramesInFrame|.
diff --git a/modules/audio_processing/agc2/noise_level_estimator.h b/modules/audio_processing/agc2/noise_level_estimator.h
index f22bfd8a31..f9e4abc8f5 100644
--- a/modules/audio_processing/agc2/noise_level_estimator.h
+++ b/modules/audio_processing/agc2/noise_level_estimator.h
@@ -20,7 +20,7 @@ class NoiseLevelEstimator {
  public:
   NoiseLevelEstimator() {}
 
-  // Returns the estimated noise level in DbFS.
+  // Returns the estimated noise level in dBFS.
   float Analyze(AudioFrameView frame);
 
  private:
diff --git a/modules/audio_processing/agc2/saturation_protector.cc b/modules/audio_processing/agc2/saturation_protector.cc
new file mode 100644
index 0000000000..a6f1a8350e
--- /dev/null
+++ b/modules/audio_processing/agc2/saturation_protector.cc
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/saturation_protector.h"
+
+#include  // NOTE(review): header name is missing here; restore or drop.
+
+#include "modules/audio_processing/logging/apm_data_dumper.h"
+#include "rtc_base/numerics/safe_minmax.h"
+
+namespace webrtc {
+
+SaturationProtector::SaturationProtector(ApmDataDumper* apm_data_dumper) {}
+// Stub: the arguments are ignored and the margin is left unchanged.
+void SaturationProtector::UpdateMargin(
+    const VadWithLevel::LevelAndProbability& vad_data,
+    float last_speech_level_estimate_dbfs) {}
+// Stub: always reports the initial margin.
+float SaturationProtector::LastMargin() const {
+  return kInitialSaturationMarginDb;
+}
+}  // namespace webrtc
diff --git a/modules/audio_processing/agc2/saturation_protector.h b/modules/audio_processing/agc2/saturation_protector.h
new file mode 100644
index 0000000000..dcf51842f0
--- /dev/null
+++ b/modules/audio_processing/agc2/saturation_protector.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
+#define MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
+
+#include  // NOTE(review): header name is missing here; restore or drop.
+
+#include "modules/audio_processing/agc2/agc2_common.h"
+#include "modules/audio_processing/vad/vad_with_level.h"
+
+namespace webrtc {
+
+class ApmDataDumper;
+// Supplies the margin that AdaptiveModeLevelEstimator adds to its estimate.
+class SaturationProtector {
+ public:
+  explicit SaturationProtector(ApmDataDumper* apm_data_dumper);
+
+  // Updates the margin estimate. This method should be called
+  // whenever a frame is reliably classified as 'speech'.
+  //
+  // Nothing is returned; read the margin back with LastMargin().
+  void UpdateMargin(const VadWithLevel::LevelAndProbability& vad_data,
+                    float last_speech_level_estimate_dbfs);
+
+  // Returns latest computed margin. Used in cases when speech is not
+  // detected.
+  float LastMargin() const;
+};
+
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_