From 09aaf6f7bcfb4da644bd86c76896a04a41f776e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Bostr=C3=B6m?= Date: Mon, 14 Feb 2022 12:02:45 +0000 Subject: [PATCH] Revert "Reland "Remove unused APM voice activity detection sub-module"" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 54d1344d985b00d4d1580dd18057d4618c11ad1f. Reason for revert: Breaks chromium roll, see https://ci.chromium.org/ui/p/chromium/builders/try/linux_chromium_tsan_rel_ng/1080583/overview https://chromium-review.googlesource.com/c/chromium/src/+/3461512 Original change's description: > Reland "Remove unused APM voice activity detection sub-module" > > This reverts commit a751f167c68343f76528436defdbc61600a8d7b3. > > Reason for revert: dependency in a downstream project removed > > Original change's description: > > Revert "Remove unused APM voice activity detection sub-module" > > > > This reverts commit b4e06d032e6f82a65c52ed0c5364ae9e7c0a0215. > > > > Reason for revert: breaking downstream projects > > > > Original change's description: > > > Remove unused APM voice activity detection sub-module > > > > > > API changes: > > > - webrtc::AudioProcessing::Config::VoiceDetection removed > > > - webrtc::AudioProcessingStats::voice_detected deprecated > > > - cricket::AudioOptions::typing_detection deprecated > > > - webrtc::StatsReport::StatsValueName:: > > > kStatsValueNameTypingNoiseState deprecated > > > > > > PSA: https://groups.google.com/g/discuss-webrtc/c/7X6uwmJarE0 > > > > > > Bug: webrtc:11226,webrtc:11292 > > > Change-Id: I8d008b56708cf62961b9857ec052b59fda3b41bf > > > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/250666 > > > Reviewed-by: Harald Alvestrand > > > Reviewed-by: Gustaf Ullberg > > > Reviewed-by: Sam Zackrisson > > > Reviewed-by: Björn Terelius > > > Commit-Queue: Alessio Bazzica > > > Cr-Commit-Position: refs/heads/main@{#35975} > > > > TBR=gustaf@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,terelius@webrtc.org,hta@webrtc.org,webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com > > > > Change-Id: Iee01fdb874b4e0331277f3ffe60dacaabc3859a2 > > No-Presubmit: true > > No-Tree-Checks: true > > No-Try: true > > Bug: webrtc:11226,webrtc:11292 > > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251600 > > Reviewed-by: Harald Alvestrand > > Reviewed-by: Gustaf Ullberg > > Commit-Queue: Mirko Bonadei > > Cr-Commit-Position: refs/heads/main@{#35977} > > # Not skipping CQ checks because this is a reland. > > Bug: webrtc:11226,webrtc:11292 > Change-Id: I2fcbc5fdade16bfe6a0f0a02841a33a598d4f2ad > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251660 > Reviewed-by: Alessio Bazzica > Reviewed-by: Harald Alvestrand > Commit-Queue: Alessio Bazzica > Cr-Commit-Position: refs/heads/main@{#35984} TBR=mbonadei@webrtc.org,gustaf@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,terelius@webrtc.org,hta@webrtc.org,webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com Change-Id: Ib308a3af2dcce85a0074ef5a4680ccec3f82712f No-Presubmit: true No-Tree-Checks: true No-Try: true Bug: webrtc:11226,webrtc:11292 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251688 Reviewed-by: Henrik Boström Bot-Commit: rubber-stamper@appspot.gserviceaccount.com Auto-Submit: Henrik Boström Reviewed-by: Harald Alvestrand Commit-Queue: Harald Alvestrand Cr-Commit-Position: refs/heads/main@{#35990} --- api/audio_options.h | 2 - api/stats_types.cc | 1 - api/stats_types.h | 1 - audio/audio_transport_impl.cc | 22 ++++ audio/audio_transport_impl.h | 5 +- media/engine/webrtc_voice_engine.cc | 4 +- media/engine/webrtc_voice_engine_unittest.cc | 39 +++++++ modules/audio_processing/BUILD.gn | 17 +++ .../audio_processing/audio_processing_impl.cc | 30 ++++- .../audio_processing/audio_processing_impl.h | 5 + .../audio_processing_impl_locking_unittest.cc | 1 + .../audio_processing_performance_unittest.cc | 3 + .../audio_processing_unittest.cc | 92 +++++++++++++++- .../include/audio_processing.cc | 1 + .../include/audio_processing.h | 7 ++ .../include/audio_processing_statistics.h | 2 - .../test/audio_processing_simulator.cc | 4 + .../test/audio_processing_simulator.h | 1 + .../test/audioproc_float_impl.cc | 6 + modules/audio_processing/voice_detection.cc | 92 ++++++++++++++++ modules/audio_processing/voice_detection.h | 59 ++++++++++ .../voice_detection_unittest.cc | 104 ++++++++++++++++++ .../audio_processing_configs_fuzzer.cc | 3 +- 23 files changed, 483 insertions(+), 18 deletions(-) create mode 100644 modules/audio_processing/voice_detection.cc create mode 100644 modules/audio_processing/voice_detection.h create mode 100644 modules/audio_processing/voice_detection_unittest.cc diff --git a/api/audio_options.h b/api/audio_options.h index 3fcc38d83f..15817db71e 100644 --- a/api/audio_options.h +++ b/api/audio_options.h @@ -60,8 +60,6 @@ struct RTC_EXPORT AudioOptions { absl::optional audio_jitter_buffer_min_delay_ms; // Audio receiver jitter buffer (NetEq) should handle retransmitted packets. absl::optional audio_jitter_buffer_enable_rtx_handling; - // Deprecated. - // TODO(bugs.webrtc.org/11226): Remove. // Audio processing to detect typing. absl::optional typing_detection; // TODO(bugs.webrtc.org/11539): Deprecated, replaced by diff --git a/api/stats_types.cc b/api/stats_types.cc index b044e4ab11..1090643f1c 100644 --- a/api/stats_types.cc +++ b/api/stats_types.cc @@ -648,7 +648,6 @@ const char* StatsReport::Value::display_name() const { return "googTrackId"; case kStatsValueNameTimingFrameInfo: return "googTimingFrameInfo"; - // TODO(bugs.webrtc.org/11226): Remove. case kStatsValueNameTypingNoiseState: return "googTypingNoiseState"; case kStatsValueNameWritable: diff --git a/api/stats_types.h b/api/stats_types.h index e7dd528e62..c3e4451ef6 100644 --- a/api/stats_types.h +++ b/api/stats_types.h @@ -235,7 +235,6 @@ class RTC_EXPORT StatsReport { kStatsValueNameTrackId, kStatsValueNameTransmitBitrate, kStatsValueNameTransportType, - // TODO(bugs.webrtc.org/11226): Remove. kStatsValueNameTypingNoiseState, kStatsValueNameWritable, kStatsValueNameAudioDeviceUnderrunCounter, diff --git a/audio/audio_transport_impl.cc b/audio/audio_transport_impl.cc index 194f09cf6c..a5c952f8bc 100644 --- a/audio/audio_transport_impl.cc +++ b/audio/audio_transport_impl.cc @@ -165,6 +165,24 @@ int32_t AudioTransportImpl::RecordedDataIsAvailable( audio_frame.get()); audio_frame->set_absolute_capture_timestamp_ms(estimated_capture_time_ns / 1000000); + // Typing detection (utilizes the APM/VAD decision). We let the VAD determine + // if we're using this feature or not. + // TODO(solenberg): GetConfig() takes a lock. Work around that. + bool typing_detected = false; + if (audio_processing_ && + audio_processing_->GetConfig().voice_detection.enabled) { + if (audio_frame->vad_activity_ != AudioFrame::kVadUnknown) { + bool vad_active = audio_frame->vad_activity_ == AudioFrame::kVadActive; + typing_detected = typing_detection_.Process(key_pressed, vad_active); + } + } + + // Copy frame and push to each sending stream. The copy is required since an + // encoding task will be posted internally to each stream. + { + MutexLock lock(&capture_lock_); + typing_noise_detected_ = typing_detected; + } RTC_DCHECK_GT(audio_frame->samples_per_channel_, 0); if (async_audio_processing_) @@ -272,4 +290,8 @@ void AudioTransportImpl::SetStereoChannelSwapping(bool enable) { swap_stereo_channels_ = enable; } +bool AudioTransportImpl::typing_noise_detected() const { + MutexLock lock(&capture_lock_); + return typing_noise_detected_; +} } // namespace webrtc diff --git a/audio/audio_transport_impl.h b/audio/audio_transport_impl.h index 89999560c6..0b1406f680 100644 --- a/audio/audio_transport_impl.h +++ b/audio/audio_transport_impl.h @@ -86,9 +86,7 @@ class AudioTransportImpl : public AudioTransport { int send_sample_rate_hz, size_t send_num_channels); void SetStereoChannelSwapping(bool enable); - // Deprecated. - // TODO(bugs.webrtc.org/11226): Remove. - bool typing_noise_detected() const { return false; } + bool typing_noise_detected() const; private: void SendProcessedData(std::unique_ptr audio_frame); @@ -105,6 +103,7 @@ class AudioTransportImpl : public AudioTransport { std::vector audio_senders_ RTC_GUARDED_BY(capture_lock_); int send_sample_rate_hz_ RTC_GUARDED_BY(capture_lock_) = 8000; size_t send_num_channels_ RTC_GUARDED_BY(capture_lock_) = 1; + bool typing_noise_detected_ RTC_GUARDED_BY(capture_lock_) = false; bool swap_stereo_channels_ RTC_GUARDED_BY(capture_lock_) = false; PushResampler capture_resampler_; TypingDetection typing_detection_; diff --git a/media/engine/webrtc_voice_engine.cc b/media/engine/webrtc_voice_engine.cc index 829cb82af1..8e6a62bff2 100644 --- a/media/engine/webrtc_voice_engine.cc +++ b/media/engine/webrtc_voice_engine.cc @@ -609,7 +609,9 @@ bool WebRtcVoiceEngine::ApplyOptions(const AudioOptions& options_in) { } if (options.typing_detection) { - RTC_LOG(LS_WARNING) << "Typing detection is requested, but unsupported."; + RTC_LOG(LS_INFO) << "Typing detection is enabled? " + << *options.typing_detection; + apm_config.voice_detection.enabled = *options.typing_detection; } ap->ApplyConfig(apm_config); diff --git a/media/engine/webrtc_voice_engine_unittest.cc b/media/engine/webrtc_voice_engine_unittest.cc index ad15a638bb..cf0c254039 100644 --- a/media/engine/webrtc_voice_engine_unittest.cc +++ b/media/engine/webrtc_voice_engine_unittest.cc @@ -221,6 +221,11 @@ class WebRtcVoiceEngineTestFake : public ::testing::TestWithParam { // Default Options. VerifyEchoCancellationSettings(/*enabled=*/true); EXPECT_TRUE(IsHighPassFilterEnabled()); +#if defined(WEBRTC_ANDROID) + EXPECT_FALSE(IsTypingDetectionEnabled()); +#else + EXPECT_TRUE(IsTypingDetectionEnabled()); +#endif EXPECT_TRUE(apm_config_.noise_suppression.enabled); EXPECT_EQ(apm_config_.noise_suppression.level, kDefaultNsLevel); VerifyGainControlEnabledCorrectly(); @@ -788,6 +793,10 @@ class WebRtcVoiceEngineTestFake : public ::testing::TestWithParam { return apm_config_.high_pass_filter.enabled; } + bool IsTypingDetectionEnabled() { + return apm_config_.voice_detection.enabled; + } + protected: const bool use_null_apm_; std::unique_ptr task_queue_factory_; @@ -2980,10 +2989,40 @@ TEST_P(WebRtcVoiceEngineTestFake, SetAudioOptions) { if (!use_null_apm_) { VerifyEchoCancellationSettings(/*enabled=*/true); EXPECT_TRUE(IsHighPassFilterEnabled()); +#if defined(WEBRTC_ANDROID) + EXPECT_FALSE(IsTypingDetectionEnabled()); +#else + EXPECT_TRUE(IsTypingDetectionEnabled()); +#endif } EXPECT_EQ(200u, GetRecvStreamConfig(kSsrcY).jitter_buffer_max_packets); EXPECT_FALSE(GetRecvStreamConfig(kSsrcY).jitter_buffer_fast_accelerate); + // Turn typing detection off. + send_parameters_.options.typing_detection = false; + SetSendParameters(send_parameters_); + if (!use_null_apm_) { + EXPECT_FALSE(IsTypingDetectionEnabled()); + } + + // Leave typing detection unchanged, but non-default. + send_parameters_.options.typing_detection = absl::nullopt; + SetSendParameters(send_parameters_); + if (!use_null_apm_) { + EXPECT_FALSE(IsTypingDetectionEnabled()); + } + + // Turn typing detection on. + send_parameters_.options.typing_detection = true; + SetSendParameters(send_parameters_); + if (!use_null_apm_) { +#if defined(WEBRTC_ANDROID) + EXPECT_FALSE(IsTypingDetectionEnabled()); +#else + EXPECT_TRUE(IsTypingDetectionEnabled()); +#endif + } + // Turn echo cancellation off send_parameters_.options.echo_cancellation = false; SetSendParameters(send_parameters_); diff --git a/modules/audio_processing/BUILD.gn b/modules/audio_processing/BUILD.gn index ee6b579617..f32058d62a 100644 --- a/modules/audio_processing/BUILD.gn +++ b/modules/audio_processing/BUILD.gn @@ -168,6 +168,7 @@ rtc_library("audio_processing") { ":high_pass_filter", ":optionally_built_submodule_creators", ":rms_level", + ":voice_detection", "../../api:array_view", "../../api:function_view", "../../api/audio:aec3_config", @@ -217,6 +218,20 @@ rtc_library("audio_processing") { } } +rtc_library("voice_detection") { + sources = [ + "voice_detection.cc", + "voice_detection.h", + ] + deps = [ + ":api", + ":audio_buffer", + "../../api/audio:audio_frame_api", + "../../common_audio:common_audio_c", + "../../rtc_base:checks", + ] +} + rtc_library("residual_echo_detector") { poisonous = [ "default_echo_detector" ] configs += [ ":apm_debug_dump" ] @@ -364,6 +379,7 @@ if (rtc_include_tests) { ":gain_controller2", ":high_pass_filter", ":mocks", + ":voice_detection", "../../api:array_view", "../../api:scoped_refptr", "../../api/audio:aec3_config", @@ -458,6 +474,7 @@ if (rtc_include_tests) { "test/echo_canceller_test_tools_unittest.cc", "test/echo_control_mock.h", "test/test_utils.h", + "voice_detection_unittest.cc", ] } } diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc index 9a1aaee821..8810efeddb 100644 --- a/modules/audio_processing/audio_processing_impl.cc +++ b/modules/audio_processing/audio_processing_impl.cc @@ -141,6 +141,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update( bool gain_controller2_enabled, bool gain_adjustment_enabled, bool echo_controller_enabled, + bool voice_detector_enabled, bool transient_suppressor_enabled) { bool changed = false; changed |= (high_pass_filter_enabled != high_pass_filter_enabled_); @@ -152,6 +153,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update( changed |= (gain_controller2_enabled != gain_controller2_enabled_); changed |= (gain_adjustment_enabled != gain_adjustment_enabled_); changed |= (echo_controller_enabled != echo_controller_enabled_); + changed |= (voice_detector_enabled != voice_detector_enabled_); changed |= (transient_suppressor_enabled != transient_suppressor_enabled_); if (changed) { high_pass_filter_enabled_ = high_pass_filter_enabled; @@ -161,6 +163,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update( gain_controller2_enabled_ = gain_controller2_enabled; gain_adjustment_enabled_ = gain_adjustment_enabled; echo_controller_enabled_ = echo_controller_enabled; + voice_detector_enabled_ = voice_detector_enabled; transient_suppressor_enabled_ = transient_suppressor_enabled; } @@ -171,7 +174,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update( bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandSubModulesActive() const { - return CaptureMultiBandProcessingPresent(); + return CaptureMultiBandProcessingPresent() || voice_detector_enabled_; } bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandProcessingPresent() @@ -368,6 +371,7 @@ void AudioProcessingImpl::InitializeLocked() { InitializeGainController1(); InitializeTransientSuppressor(); InitializeHighPassFilter(true); + InitializeVoiceDetector(); InitializeResidualEchoDetector(); InitializeEchoController(); InitializeGainController2(/*config_has_changed=*/true); @@ -502,6 +506,9 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) { const bool agc2_config_changed = config_.gain_controller2 != config.gain_controller2; + const bool voice_detection_config_changed = + config_.voice_detection.enabled != config.voice_detection.enabled; + const bool ns_config_changed = config_.noise_suppression.enabled != config.noise_suppression.enabled || config_.noise_suppression.level != config.noise_suppression.level; @@ -550,6 +557,10 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) { InitializeCaptureLevelsAdjuster(); } + if (voice_detection_config_changed) { + InitializeVoiceDetector(); + } + // Reinitialization must happen after all submodule configuration to avoid // additional reinitializations on the next capture / render processing call. if (pipeline_config_changed) { @@ -1204,6 +1215,13 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() { } } + if (config_.voice_detection.enabled) { + capture_.stats.voice_detected = + submodules_.voice_detector->ProcessCaptureAudio(capture_buffer); + } else { + capture_.stats.voice_detected = absl::nullopt; + } + if (submodules_.agc_manager) { submodules_.agc_manager->Process(capture_buffer); @@ -1664,7 +1682,7 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() { !!submodules_.gain_controller2, config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled, capture_nonlocked_.echo_controller_enabled, - !!submodules_.transient_suppressor); + config_.voice_detection.enabled, !!submodules_.transient_suppressor); } void AudioProcessingImpl::InitializeTransientSuppressor() { @@ -1714,6 +1732,14 @@ void AudioProcessingImpl::InitializeHighPassFilter(bool forced_reset) { } } +void AudioProcessingImpl::InitializeVoiceDetector() { + if (config_.voice_detection.enabled) { + submodules_.voice_detector = std::make_unique( + proc_split_sample_rate_hz(), VoiceDetection::kVeryLowLikelihood); + } else { + submodules_.voice_detector.reset(); + } +} void AudioProcessingImpl::InitializeEchoController() { bool use_echo_controller = echo_control_factory_ || diff --git a/modules/audio_processing/audio_processing_impl.h b/modules/audio_processing/audio_processing_impl.h index 344b8c5959..47dd62ed02 100644 --- a/modules/audio_processing/audio_processing_impl.h +++ b/modules/audio_processing/audio_processing_impl.h @@ -39,6 +39,7 @@ #include "modules/audio_processing/render_queue_item_verifier.h" #include "modules/audio_processing/rms_level.h" #include "modules/audio_processing/transient/transient_suppressor.h" +#include "modules/audio_processing/voice_detection.h" #include "rtc_base/gtest_prod_util.h" #include "rtc_base/ignore_wundef.h" #include "rtc_base/swap_queue.h" @@ -207,6 +208,7 @@ class AudioProcessingImpl : public AudioProcessing { bool gain_controller2_enabled, bool gain_adjustment_enabled, bool echo_controller_enabled, + bool voice_detector_enabled, bool transient_suppressor_enabled); bool CaptureMultiBandSubModulesActive() const; bool CaptureMultiBandProcessingPresent() const; @@ -229,6 +231,7 @@ class AudioProcessingImpl : public AudioProcessing { bool gain_controller2_enabled_ = false; bool gain_adjustment_enabled_ = false; bool echo_controller_enabled_ = false; + bool voice_detector_enabled_ = false; bool transient_suppressor_enabled_ = false; bool first_update_ = true; }; @@ -264,6 +267,7 @@ class AudioProcessingImpl : public AudioProcessing { // already acquired. void InitializeHighPassFilter(bool forced_reset) RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_); + void InitializeVoiceDetector() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_); void InitializeGainController1() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_); void InitializeTransientSuppressor() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_); @@ -396,6 +400,7 @@ class AudioProcessingImpl : public AudioProcessing { std::unique_ptr echo_control_mobile; std::unique_ptr noise_suppressor; std::unique_ptr transient_suppressor; + std::unique_ptr voice_detector; std::unique_ptr capture_levels_adjuster; } submodules_; diff --git a/modules/audio_processing/audio_processing_impl_locking_unittest.cc b/modules/audio_processing/audio_processing_impl_locking_unittest.cc index 7557e919d6..343f077464 100644 --- a/modules/audio_processing/audio_processing_impl_locking_unittest.cc +++ b/modules/audio_processing/audio_processing_impl_locking_unittest.cc @@ -483,6 +483,7 @@ AudioProcessing::Config GetApmTestConfig(AecType aec_type) { apm_config.gain_controller1.mode = AudioProcessing::Config::GainController1::kAdaptiveDigital; apm_config.noise_suppression.enabled = true; + apm_config.voice_detection.enabled = true; return apm_config; } diff --git a/modules/audio_processing/audio_processing_performance_unittest.cc b/modules/audio_processing/audio_processing_performance_unittest.cc index 57655aea6d..c885293a4f 100644 --- a/modules/audio_processing/audio_processing_performance_unittest.cc +++ b/modules/audio_processing/audio_processing_performance_unittest.cc @@ -441,6 +441,7 @@ class CallSimulator : public ::testing::TestWithParam { apm_config.gain_controller1.enabled = true; apm_config.gain_controller1.mode = AudioProcessing::Config::GainController1::kAdaptiveDigital; + apm_config.voice_detection.enabled = true; apm->ApplyConfig(apm_config); }; @@ -452,6 +453,7 @@ class CallSimulator : public ::testing::TestWithParam { apm_config.noise_suppression.enabled = true; apm_config.gain_controller1.mode = AudioProcessing::Config::GainController1::kAdaptiveDigital; + apm_config.voice_detection.enabled = true; apm->ApplyConfig(apm_config); }; @@ -462,6 +464,7 @@ class CallSimulator : public ::testing::TestWithParam { apm_config.echo_canceller.enabled = false; apm_config.gain_controller1.enabled = false; apm_config.noise_suppression.enabled = false; + apm_config.voice_detection.enabled = false; apm->ApplyConfig(apm_config); }; diff --git a/modules/audio_processing/audio_processing_unittest.cc b/modules/audio_processing/audio_processing_unittest.cc index b21a0227c5..96e2d846d9 100644 --- a/modules/audio_processing/audio_processing_unittest.cc +++ b/modules/audio_processing/audio_processing_unittest.cc @@ -190,6 +190,7 @@ void EnableAllAPComponents(AudioProcessing* ap) { apm_config.noise_suppression.enabled = true; apm_config.high_pass_filter.enabled = true; + apm_config.voice_detection.enabled = true; apm_config.pipeline.maximum_internal_processing_rate = 48000; ap->ApplyConfig(apm_config); } @@ -1225,6 +1226,7 @@ TEST_F(ApmTest, AllProcessingDisabledByDefault) { EXPECT_FALSE(config.high_pass_filter.enabled); EXPECT_FALSE(config.gain_controller1.enabled); EXPECT_FALSE(config.noise_suppression.enabled); + EXPECT_FALSE(config.voice_detection.enabled); } TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) { @@ -1365,6 +1367,48 @@ TEST_F(ApmTest, SplittingFilter) { EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy)); apm_->ApplyConfig(apm_config); + // 3. Only GetStatistics-reporting VAD is enabled... + SetFrameTo(&frame_, 1000); + frame_copy.CopyFrom(frame_); + apm_config.voice_detection.enabled = true; + apm_->ApplyConfig(apm_config); + EXPECT_EQ(apm_->kNoError, + apm_->ProcessStream( + frame_.data.data(), + StreamConfig(frame_.sample_rate_hz, frame_.num_channels), + StreamConfig(frame_.sample_rate_hz, frame_.num_channels), + frame_.data.data())); + EXPECT_EQ(apm_->kNoError, + apm_->ProcessStream( + frame_.data.data(), + StreamConfig(frame_.sample_rate_hz, frame_.num_channels), + StreamConfig(frame_.sample_rate_hz, frame_.num_channels), + frame_.data.data())); + EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy)); + apm_config.voice_detection.enabled = false; + apm_->ApplyConfig(apm_config); + + // 4. The VAD is enabled... + SetFrameTo(&frame_, 1000); + frame_copy.CopyFrom(frame_); + apm_config.voice_detection.enabled = true; + apm_->ApplyConfig(apm_config); + EXPECT_EQ(apm_->kNoError, + apm_->ProcessStream( + frame_.data.data(), + StreamConfig(frame_.sample_rate_hz, frame_.num_channels), + StreamConfig(frame_.sample_rate_hz, frame_.num_channels), + frame_.data.data())); + EXPECT_EQ(apm_->kNoError, + apm_->ProcessStream( + frame_.data.data(), + StreamConfig(frame_.sample_rate_hz, frame_.num_channels), + StreamConfig(frame_.sample_rate_hz, frame_.num_channels), + frame_.data.data())); + EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy)); + apm_config.voice_detection.enabled = false; + apm_->ApplyConfig(apm_config); + // Check the test is valid. We should have distortion from the filter // when AEC is enabled (which won't affect the audio). apm_config.echo_canceller.enabled = true; @@ -1692,6 +1736,7 @@ TEST_F(ApmTest, Process) { static_cast(test->num_reverse_channels()), true); int frame_count = 0; + int has_voice_count = 0; int analog_level = 127; int analog_level_average = 0; int max_output_average = 0; @@ -1727,6 +1772,8 @@ TEST_F(ApmTest, Process) { analog_level = apm_->recommended_stream_analog_level(); analog_level_average += analog_level; AudioProcessingStats stats = apm_->GetStatistics(); + EXPECT_TRUE(stats.voice_detected); + has_voice_count += *stats.voice_detected ? 1 : 0; size_t frame_size = frame_.samples_per_channel * frame_.num_channels; size_t write_count = @@ -1782,23 +1829,33 @@ TEST_F(ApmTest, Process) { if (!absl::GetFlag(FLAGS_write_apm_ref_data)) { const int kIntNear = 1; - // All numbers being consistently higher on N7 compare to the reference - // data. + // When running the test on a N7 we get a {2, 6} difference of + // `has_voice_count` and `max_output_average` is up to 18 higher. + // All numbers being consistently higher on N7 compare to ref_data. // TODO(bjornv): If we start getting more of these offsets on Android we // should consider a different approach. Either using one slack for all, // or generate a separate android reference. #if defined(WEBRTC_ANDROID) || defined(WEBRTC_IOS) + const int kHasVoiceCountOffset = 3; + const int kHasVoiceCountNear = 8; const int kMaxOutputAverageOffset = 9; const int kMaxOutputAverageNear = 26; #else + const int kHasVoiceCountOffset = 0; + const int kHasVoiceCountNear = kIntNear; const int kMaxOutputAverageOffset = 0; const int kMaxOutputAverageNear = kIntNear; #endif + EXPECT_NEAR(test->has_voice_count(), + has_voice_count - kHasVoiceCountOffset, kHasVoiceCountNear); + EXPECT_NEAR(test->analog_level_average(), analog_level_average, kIntNear); EXPECT_NEAR(test->max_output_average(), max_output_average - kMaxOutputAverageOffset, kMaxOutputAverageNear); } else { + test->set_has_voice_count(has_voice_count); + test->set_analog_level_average(analog_level_average); test->set_max_output_average(max_output_average); } @@ -2628,6 +2685,7 @@ rtc::scoped_refptr CreateApm(bool mobile_aec) { apm_config.echo_canceller.enabled = true; apm_config.echo_canceller.mobile_mode = mobile_aec; apm_config.noise_suppression.enabled = false; + apm_config.voice_detection.enabled = false; apm->ApplyConfig(apm_config); return apm; } @@ -2736,9 +2794,10 @@ TEST(MAYBE_ApmStatistics, AECMEnabledTest) { EXPECT_FALSE(stats.echo_return_loss_enhancement.has_value()); } -TEST(ApmStatistics, DoNotReportVoiceDetectedStat) { +TEST(ApmStatistics, ReportHasVoice) { ProcessingConfig processing_config = { {{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}}; + AudioProcessing::Config config; // Set up an audioframe. Int16FrameData frame; @@ -2755,14 +2814,37 @@ TEST(ApmStatistics, DoNotReportVoiceDetectedStat) { AudioProcessingBuilderForTesting().Create(); apm->Initialize(processing_config); - // No metric should be reported. + // If not enabled, no metric should be reported. EXPECT_EQ( apm->ProcessStream(frame.data.data(), StreamConfig(frame.sample_rate_hz, frame.num_channels), StreamConfig(frame.sample_rate_hz, frame.num_channels), frame.data.data()), 0); - EXPECT_FALSE(apm->GetStatistics().voice_detected.has_value()); + EXPECT_FALSE(apm->GetStatistics().voice_detected); + + // If enabled, metrics should be reported. + config.voice_detection.enabled = true; + apm->ApplyConfig(config); + EXPECT_EQ( + apm->ProcessStream(frame.data.data(), + StreamConfig(frame.sample_rate_hz, frame.num_channels), + StreamConfig(frame.sample_rate_hz, frame.num_channels), + frame.data.data()), + 0); + auto stats = apm->GetStatistics(); + EXPECT_TRUE(stats.voice_detected); + + // If re-disabled, the value is again not reported. + config.voice_detection.enabled = false; + apm->ApplyConfig(config); + EXPECT_EQ( + apm->ProcessStream(frame.data.data(), + StreamConfig(frame.sample_rate_hz, frame.num_channels), + StreamConfig(frame.sample_rate_hz, frame.num_channels), + frame.data.data()), + 0); + EXPECT_FALSE(apm->GetStatistics().voice_detected); } TEST(ApmStatistics, GetStatisticsReportsNoEchoDetectorStatsWhenDisabled) { diff --git a/modules/audio_processing/include/audio_processing.cc b/modules/audio_processing/include/audio_processing.cc index 86edaee087..9643b6ca0b 100644 --- a/modules/audio_processing/include/audio_processing.cc +++ b/modules/audio_processing/include/audio_processing.cc @@ -145,6 +145,7 @@ std::string AudioProcessing::Config::ToString() const { << NoiseSuppressionLevelToString(noise_suppression.level) << " }, transient_suppression: { enabled: " << transient_suppression.enabled + << " }, voice_detection: { enabled: " << voice_detection.enabled << " }, gain_controller1: { enabled: " << gain_controller1.enabled << ", mode: " << GainController1ModeToString(gain_controller1.mode) << ", target_level_dbfs: " << gain_controller1.target_level_dbfs diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h index 9d6824c038..8af5013e94 100644 --- a/modules/audio_processing/include/audio_processing.h +++ b/modules/audio_processing/include/audio_processing.h @@ -113,6 +113,8 @@ static constexpr int kClippedLevelMin = 70; // // config.high_pass_filter.enabled = true; // +// config.voice_detection.enabled = true; +// // apm->ApplyConfig(config) // // apm->noise_reduction()->set_level(kHighSuppression); @@ -230,6 +232,11 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface { bool enabled = false; } transient_suppression; + // Enables reporting of `voice_detected` in webrtc::AudioProcessingStats. + struct VoiceDetection { + bool enabled = false; + } voice_detection; + // Enables automatic gain control (AGC) functionality. // The automatic gain control (AGC) component brings the signal to an // appropriate range. This is done by applying a digital gain directly and, diff --git a/modules/audio_processing/include/audio_processing_statistics.h b/modules/audio_processing/include/audio_processing_statistics.h index 3b43319951..a31dafe49c 100644 --- a/modules/audio_processing/include/audio_processing_statistics.h +++ b/modules/audio_processing/include/audio_processing_statistics.h @@ -24,8 +24,6 @@ struct RTC_EXPORT AudioProcessingStats { AudioProcessingStats(const AudioProcessingStats& other); ~AudioProcessingStats(); - // Deprecated. - // TODO(bugs.webrtc.org/11226): Remove. // True if voice is detected in the last capture frame, after processing. // It is conservative in flagging audio as speech, with low likelihood of // incorrectly flagging a frame as voice. diff --git a/modules/audio_processing/test/audio_processing_simulator.cc b/modules/audio_processing/test/audio_processing_simulator.cc index 4915648fa9..b1edda18d6 100644 --- a/modules/audio_processing/test/audio_processing_simulator.cc +++ b/modules/audio_processing/test/audio_processing_simulator.cc @@ -543,6 +543,10 @@ void AudioProcessingSimulator::ConfigureAudioProcessor() { apm_config.high_pass_filter.enabled = *settings_.use_hpf; } + if (settings_.use_vad) { + apm_config.voice_detection.enabled = *settings_.use_vad; + } + if (settings_.use_agc) { apm_config.gain_controller1.enabled = *settings_.use_agc; } diff --git a/modules/audio_processing/test/audio_processing_simulator.h b/modules/audio_processing/test/audio_processing_simulator.h index af76d7e1c9..ae3cd4fbe5 100644 --- a/modules/audio_processing/test/audio_processing_simulator.h +++ b/modules/audio_processing/test/audio_processing_simulator.h @@ -105,6 +105,7 @@ struct SimulationSettings { absl::optional use_ns; absl::optional use_ts; absl::optional use_analog_agc; + absl::optional use_vad; absl::optional use_all; absl::optional analog_agc_disable_digital_adaptive; absl::optional agc_mode; diff --git a/modules/audio_processing/test/audioproc_float_impl.cc b/modules/audio_processing/test/audioproc_float_impl.cc index aab1881913..d4697e4493 100644 --- a/modules/audio_processing/test/audioproc_float_impl.cc +++ b/modules/audio_processing/test/audioproc_float_impl.cc @@ -117,6 +117,10 @@ ABSL_FLAG(int, analog_agc, kParameterNotSpecifiedValue, "Activate (1) or deactivate (0) the analog AGC"); +ABSL_FLAG(int, + vad, + kParameterNotSpecifiedValue, + "Activate (1) or deactivate (0) the voice activity detector"); ABSL_FLAG(bool, all_default, false, @@ -361,6 +365,7 @@ void SetSettingIfFlagSet(int32_t flag, absl::optional* parameter) { SimulationSettings CreateSettings() { SimulationSettings settings; if (absl::GetFlag(FLAGS_all_default)) { + settings.use_vad = true; settings.use_ts = true; settings.use_analog_agc = true; settings.use_ns = true; @@ -412,6 +417,7 @@ SimulationSettings CreateSettings() { SetSettingIfSpecified(absl::GetFlag(FLAGS_ts), &settings.use_ts); SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc), &settings.use_analog_agc); + SetSettingIfFlagSet(absl::GetFlag(FLAGS_vad), &settings.use_vad); SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc_disable_digital_adaptive), &settings.analog_agc_disable_digital_adaptive); SetSettingIfSpecified(absl::GetFlag(FLAGS_agc_mode), &settings.agc_mode); diff --git a/modules/audio_processing/voice_detection.cc b/modules/audio_processing/voice_detection.cc new file mode 100644 index 0000000000..1a633e2286 --- /dev/null +++ b/modules/audio_processing/voice_detection.cc @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_processing/voice_detection.h" + +#include "common_audio/vad/include/webrtc_vad.h" +#include "modules/audio_processing/audio_buffer.h" +#include "rtc_base/checks.h" + +namespace webrtc { +class VoiceDetection::Vad { + public: + Vad() { + state_ = WebRtcVad_Create(); + RTC_CHECK(state_); + int error = WebRtcVad_Init(state_); + RTC_DCHECK_EQ(0, error); + } + ~Vad() { WebRtcVad_Free(state_); } + + Vad(Vad&) = delete; + Vad& operator=(Vad&) = delete; + + VadInst* state() { return state_; } + + private: + VadInst* state_ = nullptr; +}; + +VoiceDetection::VoiceDetection(int sample_rate_hz, Likelihood likelihood) + : sample_rate_hz_(sample_rate_hz), + frame_size_samples_(static_cast(sample_rate_hz_ / 100)), + likelihood_(likelihood), + vad_(new Vad()) { + int mode = 2; + switch (likelihood) { + case VoiceDetection::kVeryLowLikelihood: + mode = 3; + break; + case VoiceDetection::kLowLikelihood: + mode = 2; + break; + case VoiceDetection::kModerateLikelihood: + mode = 1; + break; + case VoiceDetection::kHighLikelihood: + mode = 0; + break; + default: + RTC_DCHECK_NOTREACHED(); + break; + } + int error = WebRtcVad_set_mode(vad_->state(), mode); + RTC_DCHECK_EQ(0, error); +} + +VoiceDetection::~VoiceDetection() {} + +bool VoiceDetection::ProcessCaptureAudio(AudioBuffer* audio) { + RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength, + audio->num_frames_per_band()); + std::array mixed_low_pass_data; + rtc::ArrayView mixed_low_pass(mixed_low_pass_data.data(), + audio->num_frames_per_band()); + if (audio->num_channels() == 1) { + FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz], + audio->num_frames_per_band(), mixed_low_pass_data.data()); + } else { + const int num_channels = static_cast(audio->num_channels()); + for (size_t i = 0; i < audio->num_frames_per_band(); ++i) { + int32_t value = + FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[0][i]); + for (int j = 1; j < num_channels; ++j) { + value += FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[j][i]); + } + mixed_low_pass_data[i] = value / num_channels; + } + } + + int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_, + mixed_low_pass.data(), frame_size_samples_); + RTC_DCHECK(vad_ret == 0 || vad_ret == 1); + return vad_ret == 0 ? false : true; +} +} // namespace webrtc diff --git a/modules/audio_processing/voice_detection.h b/modules/audio_processing/voice_detection.h new file mode 100644 index 0000000000..79d44e647c --- /dev/null +++ b/modules/audio_processing/voice_detection.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_ +#define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_ + +#include + +#include + +#include "modules/audio_processing/include/audio_processing.h" + +namespace webrtc { + +class AudioBuffer; + +// The voice activity detection (VAD) component analyzes the stream to +// determine if voice is present. +class VoiceDetection { + public: + // Specifies the likelihood that a frame will be declared to contain voice. + // A higher value makes it more likely that speech will not be clipped, at + // the expense of more noise being detected as voice. + enum Likelihood { + kVeryLowLikelihood, + kLowLikelihood, + kModerateLikelihood, + kHighLikelihood + }; + + VoiceDetection(int sample_rate_hz, Likelihood likelihood); + ~VoiceDetection(); + + VoiceDetection(VoiceDetection&) = delete; + VoiceDetection& operator=(VoiceDetection&) = delete; + + // Returns true if voice is detected in the current frame. + bool ProcessCaptureAudio(AudioBuffer* audio); + + Likelihood likelihood() const { return likelihood_; } + + private: + class Vad; + + int sample_rate_hz_; + size_t frame_size_samples_; + Likelihood likelihood_; + std::unique_ptr vad_; +}; +} // namespace webrtc + +#endif // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_ diff --git a/modules/audio_processing/voice_detection_unittest.cc b/modules/audio_processing/voice_detection_unittest.cc new file mode 100644 index 0000000000..e1117e495d --- /dev/null +++ b/modules/audio_processing/voice_detection_unittest.cc @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include "api/array_view.h" +#include "modules/audio_processing/audio_buffer.h" +#include "modules/audio_processing/test/audio_buffer_tools.h" +#include "modules/audio_processing/test/bitexactness_tools.h" +#include "modules/audio_processing/voice_detection.h" +#include "test/gtest.h" + +namespace webrtc { +namespace { + +const int kNumFramesToProcess = 1000; + +// Process one frame of data and produce the output. +bool ProcessOneFrame(int sample_rate_hz, + AudioBuffer* audio_buffer, + VoiceDetection* voice_detection) { + if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) { + audio_buffer->SplitIntoFrequencyBands(); + } + + return voice_detection->ProcessCaptureAudio(audio_buffer); +} + +// Processes a specified amount of frames, verifies the results and reports +// any errors. +void RunBitexactnessTest(int sample_rate_hz, + size_t num_channels, + bool stream_has_voice_reference) { + int sample_rate_to_use = std::min(sample_rate_hz, 16000); + VoiceDetection voice_detection(sample_rate_to_use, + VoiceDetection::kLowLikelihood); + + int samples_per_channel = rtc::CheckedDivExact(sample_rate_hz, 100); + const StreamConfig capture_config(sample_rate_hz, num_channels); + AudioBuffer capture_buffer( + capture_config.sample_rate_hz(), capture_config.num_channels(), + capture_config.sample_rate_hz(), capture_config.num_channels(), + capture_config.sample_rate_hz(), capture_config.num_channels()); + test::InputAudioFile capture_file( + test::GetApmCaptureTestVectorFileName(sample_rate_hz)); + std::vector capture_input(samples_per_channel * num_channels); + bool stream_has_voice = false; + for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) { + ReadFloatSamplesFromStereoFile(samples_per_channel, num_channels, + &capture_file, capture_input); + + test::CopyVectorToAudioBuffer(capture_config, capture_input, + &capture_buffer); + + stream_has_voice = + ProcessOneFrame(sample_rate_hz, &capture_buffer, &voice_detection); + } + + EXPECT_EQ(stream_has_voice_reference, stream_has_voice); +} + +const bool kStreamHasVoiceReference = true; + +} // namespace + +TEST(VoiceDetectionBitExactnessTest, Mono8kHz) { + RunBitexactnessTest(8000, 1, kStreamHasVoiceReference); +} + +TEST(VoiceDetectionBitExactnessTest, Mono16kHz) { + RunBitexactnessTest(16000, 1, kStreamHasVoiceReference); +} + +TEST(VoiceDetectionBitExactnessTest, Mono32kHz) { + RunBitexactnessTest(32000, 1, kStreamHasVoiceReference); +} + +TEST(VoiceDetectionBitExactnessTest, Mono48kHz) { + RunBitexactnessTest(48000, 1, kStreamHasVoiceReference); +} + +TEST(VoiceDetectionBitExactnessTest, Stereo8kHz) { + RunBitexactnessTest(8000, 2, kStreamHasVoiceReference); +} + +TEST(VoiceDetectionBitExactnessTest, Stereo16kHz) { + RunBitexactnessTest(16000, 2, kStreamHasVoiceReference); +} + +TEST(VoiceDetectionBitExactnessTest, Stereo32kHz) { + RunBitexactnessTest(32000, 2, kStreamHasVoiceReference); +} + +TEST(VoiceDetectionBitExactnessTest, Stereo48kHz) { + RunBitexactnessTest(48000, 2, kStreamHasVoiceReference); +} + +} // namespace webrtc diff --git a/test/fuzzers/audio_processing_configs_fuzzer.cc b/test/fuzzers/audio_processing_configs_fuzzer.cc index f04ef773ac..54a43dfe2d 100644 --- a/test/fuzzers/audio_processing_configs_fuzzer.cc +++ b/test/fuzzers/audio_processing_configs_fuzzer.cc @@ -54,7 +54,7 @@ rtc::scoped_refptr CreateApm(test::FuzzDataHelper* fuzz_data, bool use_agc = fuzz_data->ReadOrDefaultValue(true); bool use_ns = fuzz_data->ReadOrDefaultValue(true); static_cast(fuzz_data->ReadOrDefaultValue(true)); - static_cast(fuzz_data->ReadOrDefaultValue(true)); + bool use_vad = fuzz_data->ReadOrDefaultValue(true); bool use_agc_limiter = fuzz_data->ReadOrDefaultValue(true); bool use_agc2 = fuzz_data->ReadOrDefaultValue(true); @@ -114,6 +114,7 @@ rtc::scoped_refptr CreateApm(test::FuzzDataHelper* fuzz_data, use_agc2_adaptive_digital; apm_config.noise_suppression.enabled = use_ns; apm_config.transient_suppression.enabled = use_ts; + apm_config.voice_detection.enabled = use_vad; rtc::scoped_refptr apm = AudioProcessingBuilderForTesting()