From ac29b9c37f94b2035068ace268daf9c71d81da32 Mon Sep 17 00:00:00 2001 From: Alessio Bazzica Date: Mon, 20 Jun 2022 17:52:43 +0200 Subject: [PATCH] APM Transient Suppressor (TS): wire-up RNN VAD, TS and AGC2 When the `WebRTC-Audio-TransientSuppressorVadMode-RnnVad` field trial is set, APM now uses (i) its RNN VAD sub-module to compute the voice probability, (ii) that probability for TS and (iii) a temporally delayed version of it for AGC2 (the delay introduced by TS is taken into account). Bug: webrtc:13663 Change-Id: Ic0f245c3f00d318c19bb01d3dbc2d5176c90f851 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/266362 Commit-Queue: Alessio Bazzica Reviewed-by: Hanna Silen Cr-Commit-Position: refs/heads/main@{#37291} --- .../audio_processing/audio_processing_impl.cc | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc index 5714d6b09a..31a6a14d6a 100644 --- a/modules/audio_processing/audio_processing_impl.cc +++ b/modules/audio_processing/audio_processing_impl.cc @@ -1274,28 +1274,42 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() { capture_buffer->num_frames())); } + absl::optional<float> voice_probability; + if (!!submodules_.voice_activity_detector) { + voice_probability = submodules_.voice_activity_detector->Analyze( + AudioFrameView<const float>(capture_buffer->channels(), + capture_buffer->num_channels(), + capture_buffer->num_frames())); + } + if (submodules_.transient_suppressor) { - float voice_probability = 1.0f; + float transient_suppressor_voice_probability = 1.0f; switch (transient_suppressor_vad_mode_) { case TransientSuppressor::VadMode::kDefault: if (submodules_.agc_manager) { - voice_probability = submodules_.agc_manager->voice_probability(); + transient_suppressor_voice_probability = + submodules_.agc_manager->voice_probability(); } break; case TransientSuppressor::VadMode::kRnnVad: - // TODO(bugs.webrtc.org/13663): 
Use RNN VAD. + RTC_DCHECK(voice_probability.has_value()); + transient_suppressor_voice_probability = *voice_probability; break; case TransientSuppressor::VadMode::kNoVad: // The transient suppressor will ignore `voice_probability`. break; } - submodules_.transient_suppressor->Suppress( - capture_buffer->channels()[0], capture_buffer->num_frames(), - capture_buffer->num_channels(), - capture_buffer->split_bands_const(0)[kBand0To8kHz], - capture_buffer->num_frames_per_band(), - /*reference_data=*/nullptr, /*reference_length=*/0, voice_probability, - capture_.key_pressed); + float delayed_voice_probability = + submodules_.transient_suppressor->Suppress( + capture_buffer->channels()[0], capture_buffer->num_frames(), + capture_buffer->num_channels(), + capture_buffer->split_bands_const(0)[kBand0To8kHz], + capture_buffer->num_frames_per_band(), + /*reference_data=*/nullptr, /*reference_length=*/0, + transient_suppressor_voice_probability, capture_.key_pressed); + if (voice_probability.has_value()) { + *voice_probability = delayed_voice_probability; + } } // Experimental APM sub-module that analyzes `capture_buffer`. @@ -1303,19 +1317,10 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() { submodules_.capture_analyzer->Analyze(capture_buffer); } - absl::optional<float> voice_activity_probability = absl::nullopt; if (submodules_.gain_controller2) { submodules_.gain_controller2->NotifyAnalogLevel( recommended_stream_analog_level_locked()); - if (submodules_.voice_activity_detector) { - voice_activity_probability = - submodules_.voice_activity_detector->Analyze( - AudioFrameView<const float>(capture_buffer->channels(), - capture_buffer->num_channels(), - capture_buffer->num_frames())); - } - submodules_.gain_controller2->Process(voice_activity_probability, - capture_buffer); + submodules_.gain_controller2->Process(voice_probability, capture_buffer); } if (submodules_.capture_post_processor) {