From 687ef0a136873c32c6e7d65c68f2cfcb03a717a0 Mon Sep 17 00:00:00 2001 From: Jeremy Leconte Date: Thu, 1 Feb 2024 15:11:31 +0000 Subject: [PATCH] Revert "Remove post-decode VAD" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 89cf26f1e0532130745f648cf16b1fb8af2f6b4f. Reason for revert: breaking upstream projects Original change's description: > Remove post-decode VAD > > Bug: webrtc:15806 > Change-Id: I6acf8734a70703085cfc1ccf82a79ee0931f59a4 > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/336460 > Reviewed-by: Sam Zackrisson > Commit-Queue: Tomas Lundqvist > Reviewed-by: Jakob Ivarsson‎ > Cr-Commit-Position: refs/heads/main@{#41653} Bug: webrtc:15806 Change-Id: I20e383a6b6d625d86830ecec1be01b42b22e86a2 No-Presubmit: true No-Tree-Checks: true No-Try: true Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/337420 Bot-Commit: rubber-stamper@appspot.gserviceaccount.com Owners-Override: Jeremy Leconte Commit-Queue: Jeremy Leconte Reviewed-by: Jakob Ivarsson‎ Cr-Commit-Position: refs/heads/main@{#41657} --- api/neteq/neteq.cc | 3 +- api/neteq/neteq.h | 8 ++ modules/audio_coding/BUILD.gn | 3 + modules/audio_coding/acm2/acm_receiver.cc | 6 +- .../acm2/acm_receiver_unittest.cc | 59 ++++++++++++ .../audio_coding/neteq/background_noise.cc | 29 +++--- modules/audio_coding/neteq/background_noise.h | 4 +- modules/audio_coding/neteq/neteq_impl.cc | 71 +++++++++++++-- modules/audio_coding/neteq/neteq_impl.h | 11 +++ modules/audio_coding/neteq/post_decode_vad.cc | 90 +++++++++++++++++++ modules/audio_coding/neteq/post_decode_vad.h | 71 +++++++++++++++ .../neteq/post_decode_vad_unittest.cc | 25 ++++++ test/fuzzers/neteq_signal_fuzzer.cc | 1 + 13 files changed, 358 insertions(+), 23 deletions(-) create mode 100644 modules/audio_coding/neteq/post_decode_vad.cc create mode 100644 modules/audio_coding/neteq/post_decode_vad.h create mode 100644 modules/audio_coding/neteq/post_decode_vad_unittest.cc diff 
--git a/api/neteq/neteq.cc b/api/neteq/neteq.cc index d237def23a..155ddf2cf3 100644 --- a/api/neteq/neteq.cc +++ b/api/neteq/neteq.cc @@ -24,7 +24,8 @@ NetEq::Config& NetEq::Config::operator=(Config&&) = default; std::string NetEq::Config::ToString() const { char buf[1024]; rtc::SimpleStringBuilder ss(buf); - ss << "sample_rate_hz=" << sample_rate_hz + ss << "sample_rate_hz=" << sample_rate_hz << ", enable_post_decode_vad=" + << (enable_post_decode_vad ? "true" : "false") << ", max_packets_in_buffer=" << max_packets_in_buffer << ", min_delay_ms=" << min_delay_ms << ", enable_fast_accelerate=" << (enable_fast_accelerate ? "true" : "false") diff --git a/api/neteq/neteq.h b/api/neteq/neteq.h index fc0c090254..43e0e09784 100644 --- a/api/neteq/neteq.h +++ b/api/neteq/neteq.h @@ -130,6 +130,7 @@ class NetEq { std::string ToString() const; int sample_rate_hz = 48000; // Initial value. Will change with input data. + bool enable_post_decode_vad = false; size_t max_packets_in_buffer = 200; int max_delay_ms = 0; int min_delay_ms = 0; @@ -277,6 +278,13 @@ class NetEq { // statistics are never reset. virtual NetEqOperationsAndState GetOperationsAndState() const = 0; + // Enables post-decode VAD. When enabled, GetAudio() will return + // kOutputVADPassive when the signal contains no speech. + virtual void EnableVad() = 0; + + // Disables post-decode VAD. + virtual void DisableVad() = 0; + // Returns the RTP timestamp for the last sample delivered by GetAudio(). // The return value will be empty if no valid timestamp is available. 
virtual absl::optional GetPlayoutTimestamp() const = 0; diff --git a/modules/audio_coding/BUILD.gn b/modules/audio_coding/BUILD.gn index 8b23955d5b..5de99efa45 100644 --- a/modules/audio_coding/BUILD.gn +++ b/modules/audio_coding/BUILD.gn @@ -689,6 +689,8 @@ rtc_library("neteq") { "neteq/packet_arrival_history.h", "neteq/packet_buffer.cc", "neteq/packet_buffer.h", + "neteq/post_decode_vad.cc", + "neteq/post_decode_vad.h", "neteq/preemptive_expand.cc", "neteq/preemptive_expand.h", "neteq/random_vector.cc", @@ -1653,6 +1655,7 @@ if (rtc_include_tests) { "neteq/normal_unittest.cc", "neteq/packet_arrival_history_unittest.cc", "neteq/packet_buffer_unittest.cc", + "neteq/post_decode_vad_unittest.cc", "neteq/random_vector_unittest.cc", "neteq/red_payload_splitter_unittest.cc", "neteq/reorder_optimizer_unittest.cc", diff --git a/modules/audio_coding/acm2/acm_receiver.cc b/modules/audio_coding/acm2/acm_receiver.cc index 24a49024c8..a77e472ec1 100644 --- a/modules/audio_coding/acm2/acm_receiver.cc +++ b/modules/audio_coding/acm2/acm_receiver.cc @@ -50,7 +50,11 @@ std::unique_ptr CreateNetEq( AcmReceiver::Config::Config( rtc::scoped_refptr decoder_factory) - : clock(*Clock::GetRealTimeClock()), decoder_factory(decoder_factory) {} + : clock(*Clock::GetRealTimeClock()), decoder_factory(decoder_factory) { + // Post-decode VAD is disabled by default in NetEq, however, Audio + // Conference Mixer relies on VAD decisions and fails without them. 
+ neteq_config.enable_post_decode_vad = true; +} AcmReceiver::Config::Config(const Config&) = default; AcmReceiver::Config::~Config() = default; diff --git a/modules/audio_coding/acm2/acm_receiver_unittest.cc b/modules/audio_coding/acm2/acm_receiver_unittest.cc index 8b35f4a621..cda6688157 100644 --- a/modules/audio_coding/acm2/acm_receiver_unittest.cc +++ b/modules/audio_coding/acm2/acm_receiver_unittest.cc @@ -190,6 +190,9 @@ class AcmReceiverTestFaxModeOldApi : public AcmReceiverTestOldApi { const size_t output_channels = info.num_channels; const size_t samples_per_ms = rtc::checked_cast( rtc::CheckedDivExact(output_sample_rate_hz, 1000)); + const AudioFrame::VADActivity expected_vad_activity = + output_sample_rate_hz > 16000 ? AudioFrame::kVadActive + : AudioFrame::kVadPassive; // Expect the first output timestamp to be 5*fs/8000 samples before the // first inserted timestamp (because of NetEq's look-ahead). (This value is @@ -214,6 +217,7 @@ class AcmReceiverTestFaxModeOldApi : public AcmReceiverTestOldApi { EXPECT_EQ(output_sample_rate_hz, frame.sample_rate_hz_); EXPECT_EQ(output_channels, frame.num_channels_); EXPECT_EQ(AudioFrame::kNormalSpeech, frame.speech_type_); + EXPECT_EQ(expected_vad_activity, frame.vad_activity_); EXPECT_FALSE(muted); } } @@ -238,6 +242,61 @@ TEST_F(AcmReceiverTestFaxModeOldApi, MAYBE_VerifyAudioFrameOpus) { RunVerifyAudioFrame({"opus", 48000, 2}); } +#if defined(WEBRTC_ANDROID) +#define MAYBE_PostdecodingVad DISABLED_PostdecodingVad +#else +#define MAYBE_PostdecodingVad PostdecodingVad +#endif +TEST_F(AcmReceiverTestOldApi, MAYBE_PostdecodingVad) { + EXPECT_TRUE(config_.neteq_config.enable_post_decode_vad); + constexpr int payload_type = 34; + const SdpAudioFormat codec = {"L16", 16000, 1}; + const AudioCodecInfo info = SetEncoder(payload_type, codec); + receiver_->SetCodecs({{payload_type, codec}}); + constexpr int kNumPackets = 5; + AudioFrame frame; + for (int n = 0; n < kNumPackets; ++n) { + const int num_10ms_frames = 
InsertOnePacketOfSilence(info); + for (int k = 0; k < num_10ms_frames; ++k) { + bool muted; + ASSERT_EQ(0, receiver_->GetAudio(info.sample_rate_hz, &frame, &muted)); + } + } + EXPECT_EQ(AudioFrame::kVadPassive, frame.vad_activity_); +} + +class AcmReceiverTestPostDecodeVadPassiveOldApi : public AcmReceiverTestOldApi { + protected: + AcmReceiverTestPostDecodeVadPassiveOldApi() { + config_.neteq_config.enable_post_decode_vad = false; + } +}; + +#if defined(WEBRTC_ANDROID) +#define MAYBE_PostdecodingVad DISABLED_PostdecodingVad +#else +#define MAYBE_PostdecodingVad PostdecodingVad +#endif +TEST_F(AcmReceiverTestPostDecodeVadPassiveOldApi, MAYBE_PostdecodingVad) { + EXPECT_FALSE(config_.neteq_config.enable_post_decode_vad); + constexpr int payload_type = 34; + const SdpAudioFormat codec = {"L16", 16000, 1}; + const AudioCodecInfo info = SetEncoder(payload_type, codec); + auto const value = encoder_factory_->QueryAudioEncoder(codec); + ASSERT_TRUE(value.has_value()); + receiver_->SetCodecs({{payload_type, codec}}); + const int kNumPackets = 5; + AudioFrame frame; + for (int n = 0; n < kNumPackets; ++n) { + const int num_10ms_frames = InsertOnePacketOfSilence(info); + for (int k = 0; k < num_10ms_frames; ++k) { + bool muted; + ASSERT_EQ(0, receiver_->GetAudio(info.sample_rate_hz, &frame, &muted)); + } + } + EXPECT_EQ(AudioFrame::kVadUnknown, frame.vad_activity_); +} + #if defined(WEBRTC_ANDROID) #define MAYBE_LastAudioCodec DISABLED_LastAudioCodec #else diff --git a/modules/audio_coding/neteq/background_noise.cc b/modules/audio_coding/neteq/background_noise.cc index 0c33dba47a..2c95d3b390 100644 --- a/modules/audio_coding/neteq/background_noise.cc +++ b/modules/audio_coding/neteq/background_noise.cc @@ -17,6 +17,7 @@ #include "common_audio/signal_processing/include/signal_processing_library.h" #include "modules/audio_coding/neteq/audio_multi_vector.h" #include "modules/audio_coding/neteq/cross_correlation.h" +#include "modules/audio_coding/neteq/post_decode_vad.h" 
namespace webrtc { namespace { @@ -43,11 +44,17 @@ void BackgroundNoise::Reset() { } } -bool BackgroundNoise::Update(const AudioMultiVector& sync_buffer) { +bool BackgroundNoise::Update(const AudioMultiVector& input, + const PostDecodeVad& vad) { bool filter_params_saved = false; + if (vad.running() && vad.active_speech()) { + // Do not update the background noise parameters if we know that the signal + // is active speech. + return filter_params_saved; + } int32_t auto_correlation[kMaxLpcOrder + 1]; - int16_t filter_output[kMaxLpcOrder + kResidualLength]; + int16_t fiter_output[kMaxLpcOrder + kResidualLength]; int16_t reflection_coefficients[kMaxLpcOrder]; int16_t lpc_coefficients[kMaxLpcOrder + 1]; @@ -55,13 +62,14 @@ bool BackgroundNoise::Update(const AudioMultiVector& sync_buffer) { ChannelParameters& parameters = channel_parameters_[channel_ix]; int16_t temp_signal_array[kVecLen + kMaxLpcOrder] = {0}; int16_t* temp_signal = &temp_signal_array[kMaxLpcOrder]; - RTC_DCHECK_GE(sync_buffer.Size(), kVecLen); - sync_buffer[channel_ix].CopyTo(kVecLen, sync_buffer.Size() - kVecLen, - temp_signal); + RTC_DCHECK_GE(input.Size(), kVecLen); + input[channel_ix].CopyTo(kVecLen, input.Size() - kVecLen, temp_signal); int32_t sample_energy = CalculateAutoCorrelation(temp_signal, kVecLen, auto_correlation); - if (sample_energy < parameters.energy_update_threshold) { + if ((!vad.running() && + sample_energy < parameters.energy_update_threshold) || + (vad.running() && !vad.active_speech())) { // Generate LPC coefficients. if (auto_correlation[0] <= 0) { // Center value in auto-correlation is not positive. Do not update. @@ -87,10 +95,10 @@ bool BackgroundNoise::Update(const AudioMultiVector& sync_buffer) { // Generate the CNG gain factor by looking at the energy of the residual. 
WebRtcSpl_FilterMAFastQ12(temp_signal + kVecLen - kResidualLength, - filter_output, lpc_coefficients, + fiter_output, lpc_coefficients, kMaxLpcOrder + 1, kResidualLength); int32_t residual_energy = WebRtcSpl_DotProductWithScale( - filter_output, filter_output, kResidualLength, 0); + fiter_output, fiter_output, kResidualLength, 0); // Check spectral flatness. // Comparing the residual variance with the input signal variance tells @@ -109,8 +117,9 @@ bool BackgroundNoise::Update(const AudioMultiVector& sync_buffer) { filter_params_saved = true; } } else { - // Will only happen if `sample_energy` is not low enough. Increase the - // threshold for update so that it increases by a factor 4 in 4 seconds. + // Will only happen if post-decode VAD is disabled and `sample_energy` is + // not low enough. Increase the threshold for update so that it increases + // by a factor 4 in 4 seconds. IncrementEnergyThreshold(channel_ix, sample_energy); } } diff --git a/modules/audio_coding/neteq/background_noise.h b/modules/audio_coding/neteq/background_noise.h index 9ef0131c92..8e6d5890a0 100644 --- a/modules/audio_coding/neteq/background_noise.h +++ b/modules/audio_coding/neteq/background_noise.h @@ -39,9 +39,9 @@ class BackgroundNoise { void Reset(); // Updates the parameter estimates based on the signal currently in the - // `sync_buffer`. + // `sync_buffer`, and on the latest decision in `vad` if it is running. // Returns true if the filter parameters are updated. - bool Update(const AudioMultiVector& sync_buffer); + bool Update(const AudioMultiVector& sync_buffer, const PostDecodeVad& vad); // Generates background noise given a random vector and writes the output to // `buffer`. 
diff --git a/modules/audio_coding/neteq/neteq_impl.cc b/modules/audio_coding/neteq/neteq_impl.cc index 81b46e20e2..e5c8bf6c08 100644 --- a/modules/audio_coding/neteq/neteq_impl.cc +++ b/modules/audio_coding/neteq/neteq_impl.cc @@ -36,6 +36,7 @@ #include "modules/audio_coding/neteq/normal.h" #include "modules/audio_coding/neteq/packet.h" #include "modules/audio_coding/neteq/packet_buffer.h" +#include "modules/audio_coding/neteq/post_decode_vad.h" #include "modules/audio_coding/neteq/preemptive_expand.h" #include "modules/audio_coding/neteq/red_payload_splitter.h" #include "modules/audio_coding/neteq/statistics_calculator.h" @@ -69,26 +70,49 @@ std::unique_ptr CreateNetEqController( return controller_factory.CreateNetEqController(config); } -AudioFrame::SpeechType ToSpeechType(NetEqImpl::OutputType type) { +void SetAudioFrameActivityAndType(bool vad_enabled, + NetEqImpl::OutputType type, + AudioFrame::VADActivity last_vad_activity, + AudioFrame* audio_frame) { switch (type) { case NetEqImpl::OutputType::kNormalSpeech: { - return AudioFrame::kNormalSpeech; + audio_frame->speech_type_ = AudioFrame::kNormalSpeech; + audio_frame->vad_activity_ = AudioFrame::kVadActive; + break; + } + case NetEqImpl::OutputType::kVadPassive: { + // This should only be reached if the VAD is enabled. 
+ RTC_DCHECK(vad_enabled); + audio_frame->speech_type_ = AudioFrame::kNormalSpeech; + audio_frame->vad_activity_ = AudioFrame::kVadPassive; + break; } case NetEqImpl::OutputType::kCNG: { - return AudioFrame::kCNG; + audio_frame->speech_type_ = AudioFrame::kCNG; + audio_frame->vad_activity_ = AudioFrame::kVadPassive; + break; } case NetEqImpl::OutputType::kPLC: { - return AudioFrame::kPLC; + audio_frame->speech_type_ = AudioFrame::kPLC; + audio_frame->vad_activity_ = last_vad_activity; + break; } case NetEqImpl::OutputType::kPLCCNG: { - return AudioFrame::kPLCCNG; + audio_frame->speech_type_ = AudioFrame::kPLCCNG; + audio_frame->vad_activity_ = AudioFrame::kVadPassive; + break; } case NetEqImpl::OutputType::kCodecPLC: { - return AudioFrame::kCodecPLC; + audio_frame->speech_type_ = AudioFrame::kCodecPLC; + audio_frame->vad_activity_ = last_vad_activity; + break; } default: RTC_DCHECK_NOTREACHED(); - return AudioFrame::kUndefined; + } + if (!vad_enabled) { + // Always set kVadUnknown when receive VAD is inactive. + audio_frame->vad_activity_ = AudioFrame::kVadUnknown; } } @@ -145,6 +169,7 @@ NetEqImpl::NetEqImpl(const NetEq::Config& config, packet_buffer_(std::move(deps.packet_buffer)), red_payload_splitter_(std::move(deps.red_payload_splitter)), timestamp_scaler_(std::move(deps.timestamp_scaler)), + vad_(new PostDecodeVad()), expand_factory_(std::move(deps.expand_factory)), accelerate_factory_(std::move(deps.accelerate_factory)), preemptive_expand_factory_(std::move(deps.preemptive_expand_factory)), @@ -186,6 +211,10 @@ NetEqImpl::NetEqImpl(const NetEq::Config& config, if (create_components) { SetSampleRateAndChannels(fs, 1); // Default is 1 channel. 
} + RTC_DCHECK(!vad_->enabled()); + if (config.enable_post_decode_vad) { + vad_->Enable(); + } } NetEqImpl::~NetEqImpl() = default; @@ -223,7 +252,9 @@ int NetEqImpl::GetAudio(AudioFrame* audio_frame, audio_frame->sample_rate_hz_, rtc::dchecked_cast(audio_frame->samples_per_channel_ * 100)); RTC_DCHECK_EQ(*muted, audio_frame->muted()); - audio_frame->speech_type_ = ToSpeechType(LastOutputType()); + SetAudioFrameActivityAndType(vad_->enabled(), LastOutputType(), + last_vad_activity_, audio_frame); + last_vad_activity_ = audio_frame->vad_activity_; last_output_sample_rate_hz_ = audio_frame->sample_rate_hz_; RTC_DCHECK(last_output_sample_rate_hz_ == 8000 || last_output_sample_rate_hz_ == 16000 || @@ -367,6 +398,18 @@ NetEqOperationsAndState NetEqImpl::GetOperationsAndState() const { return result; } +void NetEqImpl::EnableVad() { + MutexLock lock(&mutex_); + RTC_DCHECK(vad_.get()); + vad_->Enable(); +} + +void NetEqImpl::DisableVad() { + MutexLock lock(&mutex_); + RTC_DCHECK(vad_.get()); + vad_->Disable(); +} + absl::optional NetEqImpl::GetPlayoutTimestamp() const { MutexLock lock(&mutex_); if (first_packet_ || last_mode_ == Mode::kRfc3389Cng || @@ -815,8 +858,11 @@ int NetEqImpl::GetAudioInternal(AudioFrame* audio_frame, last_decoded_type_ = speech_type; } + RTC_DCHECK(vad_.get()); bool sid_frame_available = (operation == Operation::kRfc3389Cng && !packet_list.empty()); + vad_->Update(decoded_buffer_.get(), static_cast(length), speech_type, + sid_frame_available, fs_hz_); // This is the criterion that we did decode some data through the speech // decoder, and the operation resulted in comfort noise. 
@@ -966,7 +1012,7 @@ int NetEqImpl::GetAudioInternal(AudioFrame* audio_frame, (last_mode_ == Mode::kPreemptiveExpandFail) || (last_mode_ == Mode::kRfc3389Cng) || (last_mode_ == Mode::kCodecInternalCng)) { - background_noise_->Update(*sync_buffer_); + background_noise_->Update(*sync_buffer_, *vad_.get()); } if (operation == Operation::kDtmf) { @@ -2042,6 +2088,10 @@ void NetEqImpl::SetSampleRateAndChannels(int fs_hz, size_t channels) { if (cng_decoder) cng_decoder->Reset(); + // Reinit post-decode VAD with new sample rate. + RTC_DCHECK(vad_.get()); // Cannot be NULL here. + vad_->Init(); + // Delete algorithm buffer and create a new one. algorithm_buffer_.reset(new AudioMultiVector(channels)); @@ -2082,6 +2132,7 @@ void NetEqImpl::SetSampleRateAndChannels(int fs_hz, size_t channels) { } NetEqImpl::OutputType NetEqImpl::LastOutputType() { + RTC_DCHECK(vad_.get()); RTC_DCHECK(expand_.get()); if (last_mode_ == Mode::kCodecInternalCng || last_mode_ == Mode::kRfc3389Cng) { @@ -2091,6 +2142,8 @@ NetEqImpl::OutputType NetEqImpl::LastOutputType() { return OutputType::kPLCCNG; } else if (last_mode_ == Mode::kExpand) { return OutputType::kPLC; + } else if (vad_->running() && !vad_->active_speech()) { + return OutputType::kVadPassive; } else if (last_mode_ == Mode::kCodecPlc) { return OutputType::kCodecPLC; } else { diff --git a/modules/audio_coding/neteq/neteq_impl.h b/modules/audio_coding/neteq/neteq_impl.h index 6ccbde7293..f8f2b06410 100644 --- a/modules/audio_coding/neteq/neteq_impl.h +++ b/modules/audio_coding/neteq/neteq_impl.h @@ -48,6 +48,7 @@ class Merge; class NackTracker; class Normal; class RedPayloadSplitter; +class PostDecodeVad; class PreemptiveExpand; class RandomVector; class SyncBuffer; @@ -170,6 +171,13 @@ class NetEqImpl : public webrtc::NetEq { NetEqOperationsAndState GetOperationsAndState() const override; + // Enables post-decode VAD. When enabled, GetAudio() will return + // kOutputVADPassive when the signal contains no speech. 
+ void EnableVad() override; + + // Disables post-decode VAD. + void DisableVad() override; + absl::optional GetPlayoutTimestamp() const override; int last_output_sample_rate_hz() const override; @@ -348,6 +356,7 @@ class NetEqImpl : public webrtc::NetEq { RTC_GUARDED_BY(mutex_); const std::unique_ptr timestamp_scaler_ RTC_GUARDED_BY(mutex_); + const std::unique_ptr vad_ RTC_GUARDED_BY(mutex_); const std::unique_ptr expand_factory_ RTC_GUARDED_BY(mutex_); const std::unique_ptr accelerate_factory_ RTC_GUARDED_BY(mutex_); @@ -388,6 +397,8 @@ class NetEqImpl : public webrtc::NetEq { std::unique_ptr nack_ RTC_GUARDED_BY(mutex_); bool nack_enabled_ RTC_GUARDED_BY(mutex_); const bool enable_muted_state_ RTC_GUARDED_BY(mutex_); + AudioFrame::VADActivity last_vad_activity_ RTC_GUARDED_BY(mutex_) = + AudioFrame::kVadPassive; std::unique_ptr generated_noise_stopwatch_ RTC_GUARDED_BY(mutex_); std::vector last_decoded_packet_infos_ RTC_GUARDED_BY(mutex_); diff --git a/modules/audio_coding/neteq/post_decode_vad.cc b/modules/audio_coding/neteq/post_decode_vad.cc new file mode 100644 index 0000000000..9999d6764b --- /dev/null +++ b/modules/audio_coding/neteq/post_decode_vad.cc @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/audio_coding/neteq/post_decode_vad.h" + +namespace webrtc { + +PostDecodeVad::~PostDecodeVad() { + if (vad_instance_) + WebRtcVad_Free(vad_instance_); +} + +void PostDecodeVad::Enable() { + if (!vad_instance_) { + // Create the instance. + vad_instance_ = WebRtcVad_Create(); + if (vad_instance_ == nullptr) { + // Failed to create instance. 
+ Disable(); + return; + } + } + Init(); + enabled_ = true; +} + +void PostDecodeVad::Disable() { + enabled_ = false; + running_ = false; +} + +void PostDecodeVad::Init() { + running_ = false; + if (vad_instance_) { + WebRtcVad_Init(vad_instance_); + WebRtcVad_set_mode(vad_instance_, kVadMode); + running_ = true; + } +} + +void PostDecodeVad::Update(int16_t* signal, + size_t length, + AudioDecoder::SpeechType speech_type, + bool sid_frame, + int fs_hz) { + if (!vad_instance_ || !enabled_) { + return; + } + + if (speech_type == AudioDecoder::kComfortNoise || sid_frame || + fs_hz > 16000) { + // TODO(hlundin): Remove restriction on fs_hz. + running_ = false; + active_speech_ = true; + sid_interval_counter_ = 0; + } else if (!running_) { + ++sid_interval_counter_; + } + + if (!running_ && sid_interval_counter_ >= kVadAutoEnable) { + Init(); // One-shot: Init() sets `running_`, so the VAD is not reset again. + } + + if (length > 0 && running_) { + size_t vad_sample_index = 0; + active_speech_ = false; + // Loop through frame sizes 30, 20, and 10 ms. + for (int vad_frame_size_ms = 30; vad_frame_size_ms >= 10; + vad_frame_size_ms -= 10) { + size_t vad_frame_size_samples = + static_cast<size_t>(vad_frame_size_ms * fs_hz / 1000); + while (length - vad_sample_index >= vad_frame_size_samples) { + int vad_return = + WebRtcVad_Process(vad_instance_, fs_hz, &signal[vad_sample_index], + vad_frame_size_samples); + active_speech_ |= (vad_return == 1); + vad_sample_index += vad_frame_size_samples; + } + } + } +} + +} // namespace webrtc diff --git a/modules/audio_coding/neteq/post_decode_vad.h b/modules/audio_coding/neteq/post_decode_vad.h new file mode 100644 index 0000000000..3bd91b9edb --- /dev/null +++ b/modules/audio_coding/neteq/post_decode_vad.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree.
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_CODING_NETEQ_POST_DECODE_VAD_H_ +#define MODULES_AUDIO_CODING_NETEQ_POST_DECODE_VAD_H_ + +#include <stddef.h> +#include <stdint.h> + +#include "api/audio_codecs/audio_decoder.h" +#include "common_audio/vad/include/webrtc_vad.h" + +namespace webrtc { + +class PostDecodeVad { + public: + PostDecodeVad() + : enabled_(false), + running_(false), + active_speech_(true), + sid_interval_counter_(0), + vad_instance_(NULL) {} + + virtual ~PostDecodeVad(); + + PostDecodeVad(const PostDecodeVad&) = delete; + PostDecodeVad& operator=(const PostDecodeVad&) = delete; + + // Enables post-decode VAD. + void Enable(); + + // Disables post-decode VAD. + void Disable(); + + // Initializes post-decode VAD. + void Init(); + + // Updates post-decode VAD with the audio data in `signal` having `length` + // samples. The data is of type `speech_type`, at the sample rate `fs_hz`. + void Update(int16_t* signal, + size_t length, + AudioDecoder::SpeechType speech_type, + bool sid_frame, + int fs_hz); + + // Accessors. + bool enabled() const { return enabled_; } + bool running() const { return running_; } + bool active_speech() const { return active_speech_; } + + private: + static const int kVadMode = 0; // Sets aggressiveness to "Normal". + // Number of Update() calls without CNG/SID before re-enabling VAD. 
+ static const int kVadAutoEnable = 3000; + + bool enabled_; + bool running_; + bool active_speech_; + int sid_interval_counter_; + ::VadInst* vad_instance_; +}; + +} // namespace webrtc +#endif // MODULES_AUDIO_CODING_NETEQ_POST_DECODE_VAD_H_ diff --git a/modules/audio_coding/neteq/post_decode_vad_unittest.cc b/modules/audio_coding/neteq/post_decode_vad_unittest.cc new file mode 100644 index 0000000000..da3e4e864e --- /dev/null +++ b/modules/audio_coding/neteq/post_decode_vad_unittest.cc @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Unit tests for PostDecodeVad class. + +#include "modules/audio_coding/neteq/post_decode_vad.h" + +#include "test/gtest.h" + +namespace webrtc { + +TEST(PostDecodeVad, CreateAndDestroy) { + PostDecodeVad vad; +} + +// TODO(hlundin): Write more tests. + +} // namespace webrtc diff --git a/test/fuzzers/neteq_signal_fuzzer.cc b/test/fuzzers/neteq_signal_fuzzer.cc index 3b1f70cdb4..485c38085e 100644 --- a/test/fuzzers/neteq_signal_fuzzer.cc +++ b/test/fuzzers/neteq_signal_fuzzer.cc @@ -179,6 +179,7 @@ void FuzzOneInputTest(const uint8_t* data, size_t size) { // Configure NetEq and the NetEqTest object. NetEqTest::Callbacks callbacks; NetEq::Config config; + config.enable_post_decode_vad = true; config.enable_fast_accelerate = true; auto codecs = NetEqTest::StandardDecoderMap(); // rate_types contains the payload types that will be used for encoding.