From a1351271e6cfedecd9d16258a21b2cf097ea2242 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Per=20=C3=85hgren?=
Date: Thu, 15 Aug 2019 12:15:46 +0200
Subject: [PATCH] Remove all AudioBuffer code that is not related to storing audio data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This CL moves/removes all code from the AudioBuffer that:
-Is not directly handling audio data (e.g., keytaps, VAD decisions).
-Is caching aggregated versions of the rest of the audio data.
-Is not used (or only used in testing).

Bug: webrtc:10882
Change-Id: I737deb3f692748eff30f46ad806b2c6f6292802c
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/149072
Reviewed-by: Gustaf Ullberg
Commit-Queue: Per Åhgren
Cr-Commit-Position: refs/heads/master@{#28866}
---
 modules/audio_processing/audio_buffer.cc      | 156 +-----------------
 modules/audio_processing/audio_buffer.h       |  39 +----
 .../audio_processing/audio_buffer_unittest.cc |   4 -
 .../audio_processing/audio_processing_impl.cc |  46 ++++--
 .../audio_processing/audio_processing_impl.h  |   6 +
 .../echo_control_mobile_impl.cc               |  22 ++-
 .../echo_control_mobile_impl.h                |   3 +
 modules/audio_processing/gain_control_impl.cc |  24 ++-
 .../audio_processing/voice_detection_impl.cc  |  40 +++--
 .../audio_processing/voice_detection_impl.h   |   9 +-
 10 files changed, 125 insertions(+), 224 deletions(-)

diff --git a/modules/audio_processing/audio_buffer.cc b/modules/audio_processing/audio_buffer.cc
index 1a9946394a..584111c29a 100644
--- a/modules/audio_processing/audio_buffer.cc
+++ b/modules/audio_processing/audio_buffer.cc
@@ -27,15 +27,6 @@ const size_t kSamplesPer16kHzChannel = 160;
 const size_t kSamplesPer32kHzChannel = 320;
 const size_t kSamplesPer48kHzChannel = 480;
 
-int KeyboardChannelIndex(const StreamConfig& stream_config) {
-  if (!stream_config.has_keyboard()) {
-    RTC_NOTREACHED();
-    return 0;
-  }
-
-  return stream_config.num_channels();
-}
-
 size_t NumBandsFromSamplesPerChannel(size_t num_frames) {
   size_t num_bands = 1;
   if (num_frames == kSamplesPer32kHzChannel ||
@@ -60,10 +51,6 @@ AudioBuffer::AudioBuffer(size_t input_num_frames,
       num_channels_(num_process_channels),
       num_bands_(NumBandsFromSamplesPerChannel(proc_num_frames_)),
       num_split_frames_(rtc::CheckedDivExact(proc_num_frames_, num_bands_)),
-      mixed_low_pass_valid_(false),
-      reference_copied_(false),
-      activity_(AudioFrame::kVadUnknown),
-      keyboard_data_(NULL),
       data_(new IFChannelBuffer(proc_num_frames_, num_proc_channels_)),
       output_buffer_(new IFChannelBuffer(output_num_frames_, num_channels_)) {
   RTC_DCHECK_GT(input_num_frames_, 0);
@@ -118,10 +105,6 @@ void AudioBuffer::CopyFrom(const float* const* data,
         new IFChannelBuffer(input_num_frames_, num_proc_channels_));
   }
 
-  if (stream_config.has_keyboard()) {
-    keyboard_data_ = data[KeyboardChannelIndex(stream_config)];
-  }
-
   // Downmix.
const float* const* data_ptr = data; if (need_to_downmix) { @@ -179,10 +162,6 @@ void AudioBuffer::CopyTo(const StreamConfig& stream_config, } void AudioBuffer::InitForNewData() { - keyboard_data_ = NULL; - mixed_low_pass_valid_ = false; - reference_copied_ = false; - activity_ = AudioFrame::kVadUnknown; num_channels_ = num_proc_channels_; data_->set_num_channels(num_proc_channels_); if (split_data_.get()) { @@ -195,7 +174,6 @@ const int16_t* const* AudioBuffer::channels_const() const { } int16_t* const* AudioBuffer::channels() { - mixed_low_pass_valid_ = false; return data_->ibuf()->channels(); } @@ -205,7 +183,6 @@ const int16_t* const* AudioBuffer::split_bands_const(size_t channel) const { } int16_t* const* AudioBuffer::split_bands(size_t channel) { - mixed_low_pass_valid_ = false; return split_data_.get() ? split_data_->ibuf()->bands(channel) : data_->ibuf()->bands(channel); } @@ -218,39 +195,11 @@ const int16_t* const* AudioBuffer::split_channels_const(Band band) const { } } -int16_t* const* AudioBuffer::split_channels(Band band) { - mixed_low_pass_valid_ = false; - if (split_data_.get()) { - return split_data_->ibuf()->channels(band); - } else { - return band == kBand0To8kHz ? data_->ibuf()->channels() : nullptr; - } -} - -ChannelBuffer* AudioBuffer::data() { - mixed_low_pass_valid_ = false; - return data_->ibuf(); -} - -const ChannelBuffer* AudioBuffer::data() const { - return data_->ibuf_const(); -} - -ChannelBuffer* AudioBuffer::split_data() { - mixed_low_pass_valid_ = false; - return split_data_.get() ? split_data_->ibuf() : data_->ibuf(); -} - -const ChannelBuffer* AudioBuffer::split_data() const { - return split_data_.get() ? split_data_->ibuf_const() : data_->ibuf_const(); -} - const float* const* AudioBuffer::channels_const_f() const { return data_->fbuf_const()->channels(); } float* const* AudioBuffer::channels_f() { - mixed_low_pass_valid_ = false; return data_->fbuf()->channels(); } @@ -260,85 +209,10 @@ const float* const* AudioBuffer::split_bands_const_f(size_t channel) const { } float* const* AudioBuffer::split_bands_f(size_t channel) { - mixed_low_pass_valid_ = false; return split_data_.get() ? split_data_->fbuf()->bands(channel) : data_->fbuf()->bands(channel); } -const float* const* AudioBuffer::split_channels_const_f(Band band) const { - if (split_data_.get()) { - return split_data_->fbuf_const()->channels(band); - } else { - return band == kBand0To8kHz ? data_->fbuf_const()->channels() : nullptr; - } -} - -float* const* AudioBuffer::split_channels_f(Band band) { - mixed_low_pass_valid_ = false; - if (split_data_.get()) { - return split_data_->fbuf()->channels(band); - } else { - return band == kBand0To8kHz ? data_->fbuf()->channels() : nullptr; - } -} - -ChannelBuffer* AudioBuffer::data_f() { - mixed_low_pass_valid_ = false; - return data_->fbuf(); -} - -const ChannelBuffer* AudioBuffer::data_f() const { - return data_->fbuf_const(); -} - -ChannelBuffer* AudioBuffer::split_data_f() { - mixed_low_pass_valid_ = false; - return split_data_.get() ? split_data_->fbuf() : data_->fbuf(); -} - -const ChannelBuffer* AudioBuffer::split_data_f() const { - return split_data_.get() ? 
split_data_->fbuf_const() : data_->fbuf_const(); -} - -const int16_t* AudioBuffer::mixed_low_pass_data() { - if (num_proc_channels_ == 1) { - return split_bands_const(0)[kBand0To8kHz]; - } - - if (!mixed_low_pass_valid_) { - if (!mixed_low_pass_channels_.get()) { - mixed_low_pass_channels_.reset( - new ChannelBuffer(num_split_frames_, 1)); - } - - DownmixToMono(split_channels_const(kBand0To8kHz), - num_split_frames_, num_channels_, - mixed_low_pass_channels_->channels()[0]); - mixed_low_pass_valid_ = true; - } - return mixed_low_pass_channels_->channels()[0]; -} - -const int16_t* AudioBuffer::low_pass_reference(int channel) const { - if (!reference_copied_) { - return NULL; - } - - return low_pass_reference_channels_->channels()[channel]; -} - -const float* AudioBuffer::keyboard_data() const { - return keyboard_data_; -} - -void AudioBuffer::set_activity(AudioFrame::VADActivity activity) { - activity_ = activity; -} - -AudioFrame::VADActivity AudioBuffer::activity() const { - return activity_; -} - size_t AudioBuffer::num_channels() const { return num_channels_; } @@ -359,17 +233,12 @@ size_t AudioBuffer::num_frames_per_band() const { return num_split_frames_; } -size_t AudioBuffer::num_keyboard_frames() const { - // We don't resample the keyboard channel. - return input_num_frames_; -} - size_t AudioBuffer::num_bands() const { return num_bands_; } // The resampler is only for supporting 48kHz to 16kHz in the reverse stream. -void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) { +void AudioBuffer::DeinterleaveFrom(const AudioFrame* frame) { RTC_DCHECK_EQ(frame->num_channels_, num_input_channels_); RTC_DCHECK_EQ(frame->samples_per_channel_, input_num_frames_); InitForNewData(); @@ -378,7 +247,6 @@ void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) { input_buffer_.reset( new IFChannelBuffer(input_num_frames_, num_proc_channels_)); } - activity_ = frame->vad_activity_; int16_t* const* deinterleaved; if (input_num_frames_ == proc_num_frames_) { @@ -407,12 +275,7 @@ void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) { } } -void AudioBuffer::InterleaveTo(AudioFrame* frame, bool data_changed) const { - frame->vad_activity_ = activity_; - if (!data_changed) { - return; - } - +void AudioBuffer::InterleaveTo(AudioFrame* frame) const { RTC_DCHECK(frame->num_channels_ == num_channels_ || num_channels_ == 1); RTC_DCHECK_EQ(frame->samples_per_channel_, output_num_frames_); @@ -437,21 +300,6 @@ void AudioBuffer::InterleaveTo(AudioFrame* frame, bool data_changed) const { } } -void AudioBuffer::CopyLowPassToReference() { - reference_copied_ = true; - if (!low_pass_reference_channels_.get() || - low_pass_reference_channels_->num_channels() != num_channels_) { - low_pass_reference_channels_.reset( - new ChannelBuffer(num_split_frames_, num_proc_channels_)); - } - for (size_t i = 0; i < num_proc_channels_; i++) { - memcpy(low_pass_reference_channels_->channels()[i], - split_bands_const(i)[kBand0To8kHz], - low_pass_reference_channels_->num_frames_per_band() * - sizeof(split_bands_const(i)[kBand0To8kHz][0])); - } -} - void AudioBuffer::SplitIntoFrequencyBands() { splitting_filter_->Analysis(data_.get(), split_data_.get()); } diff --git a/modules/audio_processing/audio_buffer.h b/modules/audio_processing/audio_buffer.h index 8fba9f918a..c1bfb63673 100644 --- a/modules/audio_processing/audio_buffer.h +++ b/modules/audio_processing/audio_buffer.h @@ -40,10 +40,10 @@ class AudioBuffer { virtual ~AudioBuffer(); size_t num_channels() const; + size_t num_proc_channels() const { return num_proc_channels_; } 
void set_num_channels(size_t num_channels); size_t num_frames() const; size_t num_frames_per_band() const; - size_t num_keyboard_frames() const; size_t num_bands() const; // Returns a pointer array to the full-band channels. @@ -76,44 +76,17 @@ class AudioBuffer { // 0 <= band < |num_bands_| // 0 <= channel < |num_proc_channels_| // 0 <= sample < |num_split_frames_| - int16_t* const* split_channels(Band band); const int16_t* const* split_channels_const(Band band) const; - float* const* split_channels_f(Band band); - const float* const* split_channels_const_f(Band band) const; - - // Returns a pointer to the ChannelBuffer that encapsulates the full-band - // data. - ChannelBuffer* data(); - const ChannelBuffer* data() const; - ChannelBuffer* data_f(); - const ChannelBuffer* data_f() const; - - // Returns a pointer to the ChannelBuffer that encapsulates the split data. - ChannelBuffer* split_data(); - const ChannelBuffer* split_data() const; - ChannelBuffer* split_data_f(); - const ChannelBuffer* split_data_f() const; - - // Returns a pointer to the low-pass data downmixed to mono. If this data - // isn't already available it re-calculates it. - const int16_t* mixed_low_pass_data(); - const int16_t* low_pass_reference(int channel) const; - - const float* keyboard_data() const; - - void set_activity(AudioFrame::VADActivity activity); - AudioFrame::VADActivity activity() const; // Use for int16 interleaved data. - void DeinterleaveFrom(AudioFrame* audioFrame); + void DeinterleaveFrom(const AudioFrame* audioFrame); // If |data_changed| is false, only the non-audio data members will be copied // to |frame|. - void InterleaveTo(AudioFrame* frame, bool data_changed) const; + void InterleaveTo(AudioFrame* frame) const; // Use for float deinterleaved data. void CopyFrom(const float* const* data, const StreamConfig& stream_config); void CopyTo(const StreamConfig& stream_config, float* const* data); - void CopyLowPassToReference(); // Splits the signal into different bands. 
void SplitIntoFrequencyBands(); @@ -142,16 +115,10 @@ class AudioBuffer { size_t num_bands_; size_t num_split_frames_; - bool mixed_low_pass_valid_; - bool reference_copied_; - AudioFrame::VADActivity activity_; - const float* keyboard_data_; std::unique_ptr data_; std::unique_ptr split_data_; std::unique_ptr splitting_filter_; - std::unique_ptr> mixed_low_pass_channels_; - std::unique_ptr> low_pass_reference_channels_; std::unique_ptr input_buffer_; std::unique_ptr output_buffer_; std::unique_ptr> process_buffer_; diff --git a/modules/audio_processing/audio_buffer_unittest.cc b/modules/audio_processing/audio_buffer_unittest.cc index 5c231598b6..b8847999dc 100644 --- a/modules/audio_processing/audio_buffer_unittest.cc +++ b/modules/audio_processing/audio_buffer_unittest.cc @@ -21,10 +21,6 @@ const size_t kStereo = 2u; const size_t kMono = 1u; void ExpectNumChannels(const AudioBuffer& ab, size_t num_channels) { - EXPECT_EQ(ab.data()->num_channels(), num_channels); - EXPECT_EQ(ab.data_f()->num_channels(), num_channels); - EXPECT_EQ(ab.split_data()->num_channels(), num_channels); - EXPECT_EQ(ab.split_data_f()->num_channels(), num_channels); EXPECT_EQ(ab.num_channels(), num_channels); } diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc index 9b4ae81390..804802f7a7 100644 --- a/modules/audio_processing/audio_processing_impl.cc +++ b/modules/audio_processing/audio_processing_impl.cc @@ -949,6 +949,7 @@ int AudioProcessingImpl::ProcessStream(const float* const* src, RecordUnprocessedCaptureStream(src); } + capture_.keyboard_info.Extract(src, formats_.api_format.input_stream()); capture_.capture_audio->CopyFrom(src, formats_.api_format.input_stream()); RETURN_ON_ERR(ProcessCaptureStreamLocked()); capture_.capture_audio->CopyTo(formats_.api_format.output_stream(), dest); @@ -1243,11 +1244,14 @@ int AudioProcessingImpl::ProcessStream(AudioFrame* frame) { RecordUnprocessedCaptureStream(*frame); } + capture_.vad_activity = frame->vad_activity_; capture_.capture_audio->DeinterleaveFrom(frame); RETURN_ON_ERR(ProcessCaptureStreamLocked()); - capture_.capture_audio->InterleaveTo( - frame, submodule_states_.CaptureMultiBandProcessingActive() || - submodule_states_.CaptureFullBandProcessingActive()); + if (submodule_states_.CaptureMultiBandProcessingActive() || + submodule_states_.CaptureFullBandProcessingActive()) { + capture_.capture_audio->InterleaveTo(frame); + } + frame->vad_activity_ = capture_.vad_activity; if (aec_dump_) { RecordProcessedCaptureStream(*frame); @@ -1361,7 +1365,8 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() { } if (public_submodules_->noise_suppression->is_enabled()) { - capture_buffer->CopyLowPassToReference(); + private_submodules_->echo_control_mobile->CopyLowPassReference( + capture_buffer); } public_submodules_->noise_suppression->ProcessCaptureAudio(capture_buffer); @@ -1393,7 +1398,15 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() { public_submodules_->noise_suppression->ProcessCaptureAudio(capture_buffer); } - public_submodules_->voice_detection->ProcessCaptureAudio(capture_buffer); + if (public_submodules_->voice_detection->is_enabled() && + !public_submodules_->voice_detection->using_external_vad()) { + bool voice_active = + public_submodules_->voice_detection->ProcessCaptureAudio( + capture_buffer); + capture_.vad_activity = + voice_active ? 
AudioFrame::kVadActive : AudioFrame::kVadPassive; + } + if (config_.voice_detection.enabled) { private_submodules_->voice_detector->ProcessCaptureAudio(capture_buffer); capture_.stats.voice_detected = @@ -1440,8 +1453,9 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() { capture_buffer->channels_f()[0], capture_buffer->num_frames(), capture_buffer->num_channels(), capture_buffer->split_bands_const_f(0)[kBand0To8kHz], - capture_buffer->num_frames_per_band(), capture_buffer->keyboard_data(), - capture_buffer->num_keyboard_frames(), voice_probability, + capture_buffer->num_frames_per_band(), + capture_.keyboard_info.keyboard_data, + capture_.keyboard_info.num_keyboard_frames, voice_probability, capture_.key_pressed); } @@ -1598,9 +1612,10 @@ int AudioProcessingImpl::ProcessReverseStream(AudioFrame* frame) { render_.render_audio->DeinterleaveFrom(frame); RETURN_ON_ERR(ProcessRenderStreamLocked()); - render_.render_audio->InterleaveTo( - frame, submodule_states_.RenderMultiBandProcessingActive() || - submodule_states_.RenderFullBandProcessingActive()); + if (submodule_states_.RenderMultiBandProcessingActive() || + submodule_states_.RenderFullBandProcessingActive()) { + render_.render_audio->InterleaveTo(frame); + } return kNoError; } @@ -2117,6 +2132,17 @@ AudioProcessingImpl::ApmCaptureState::ApmCaptureState( AudioProcessingImpl::ApmCaptureState::~ApmCaptureState() = default; +void AudioProcessingImpl::ApmCaptureState::KeyboardInfo::Extract( + const float* const* data, + const StreamConfig& stream_config) { + if (stream_config.has_keyboard()) { + keyboard_data = data[stream_config.num_channels()]; + } else { + keyboard_data = NULL; + } + num_keyboard_frames = stream_config.num_frames(); +} + AudioProcessingImpl::ApmRenderState::ApmRenderState() = default; AudioProcessingImpl::ApmRenderState::~ApmRenderState() = default; diff --git a/modules/audio_processing/audio_processing_impl.h b/modules/audio_processing/audio_processing_impl.h index 05dbb50007..1539cd582a 100644 --- a/modules/audio_processing/audio_processing_impl.h +++ b/modules/audio_processing/audio_processing_impl.h @@ -394,6 +394,12 @@ class AudioProcessingImpl : public AudioProcessing { int playout_volume; int prev_playout_volume; AudioProcessingStats stats; + struct KeyboardInfo { + void Extract(const float* const* data, const StreamConfig& stream_config); + size_t num_keyboard_frames = 0; + const float* keyboard_data = nullptr; + } keyboard_info; + AudioFrame::VADActivity vad_activity = AudioFrame::kVadUnknown; } capture_ RTC_GUARDED_BY(crit_capture_); struct ApmCaptureNonLockedState { diff --git a/modules/audio_processing/echo_control_mobile_impl.cc b/modules/audio_processing/echo_control_mobile_impl.cc index 69dfafe4ac..c8084ea39b 100644 --- a/modules/audio_processing/echo_control_mobile_impl.cc +++ b/modules/audio_processing/echo_control_mobile_impl.cc @@ -101,7 +101,10 @@ class EchoControlMobileImpl::Canceller { }; EchoControlMobileImpl::EchoControlMobileImpl() - : routing_mode_(kSpeakerphone), comfort_noise_enabled_(false) {} + : routing_mode_(kSpeakerphone), comfort_noise_enabled_(false) { + low_pass_reference_[0].fill(0); + low_pass_reference_[1].fill(0); +} EchoControlMobileImpl::~EchoControlMobileImpl() {} @@ -168,7 +171,9 @@ int EchoControlMobileImpl::ProcessCaptureAudio(AudioBuffer* audio, for (size_t capture = 0; capture < audio->num_channels(); ++capture) { // TODO(ajm): improve how this works, possibly inside AECM. // This is kind of hacked up. 
- const int16_t* noisy = audio->low_pass_reference(capture); + RTC_DCHECK_LT(capture, low_pass_reference_.size()); + const int16_t* noisy = + reference_copied_ ? low_pass_reference_[capture].data() : nullptr; const int16_t* clean = audio->split_bands_const(capture)[kBand0To8kHz]; if (noisy == NULL) { noisy = clean; @@ -195,6 +200,16 @@ int EchoControlMobileImpl::ProcessCaptureAudio(AudioBuffer* audio, return AudioProcessing::kNoError; } +void EchoControlMobileImpl::CopyLowPassReference(AudioBuffer* audio) { + RTC_DCHECK_LE(audio->num_channels(), low_pass_reference_.size()); + reference_copied_ = true; + for (size_t capture = 0; capture < audio->num_channels(); ++capture) { + memcpy(low_pass_reference_[capture].data(), + audio->split_bands_const(capture)[kBand0To8kHz], + audio->num_frames_per_band() * sizeof(int16_t)); + } +} + int EchoControlMobileImpl::set_routing_mode(RoutingMode mode) { if (MapSetting(mode) == -1) { return AudioProcessing::kBadParameterError; @@ -219,6 +234,9 @@ bool EchoControlMobileImpl::is_comfort_noise_enabled() const { void EchoControlMobileImpl::Initialize(int sample_rate_hz, size_t num_reverse_channels, size_t num_output_channels) { + low_pass_reference_[0].fill(0); + low_pass_reference_[1].fill(0); + stream_properties_.reset(new StreamProperties( sample_rate_hz, num_reverse_channels, num_output_channels)); diff --git a/modules/audio_processing/echo_control_mobile_impl.h b/modules/audio_processing/echo_control_mobile_impl.h index d84a15ef05..718819d2d4 100644 --- a/modules/audio_processing/echo_control_mobile_impl.h +++ b/modules/audio_processing/echo_control_mobile_impl.h @@ -54,6 +54,7 @@ class EchoControlMobileImpl { void ProcessRenderAudio(rtc::ArrayView packed_render_audio); int ProcessCaptureAudio(AudioBuffer* audio, int stream_delay_ms); + void CopyLowPassReference(AudioBuffer* audio); void Initialize(int sample_rate_hz, size_t num_reverse_channels, @@ -78,6 +79,8 @@ class EchoControlMobileImpl { std::vector> cancellers_; std::unique_ptr stream_properties_; + std::array, 2> low_pass_reference_; + bool reference_copied_ = false; }; } // namespace webrtc diff --git a/modules/audio_processing/gain_control_impl.cc b/modules/audio_processing/gain_control_impl.cc index 2ca522cda3..58559430a1 100644 --- a/modules/audio_processing/gain_control_impl.cc +++ b/modules/audio_processing/gain_control_impl.cc @@ -120,10 +120,28 @@ void GainControlImpl::PackRenderAudioBuffer( std::vector* packed_buffer) { RTC_DCHECK_GE(160, audio->num_frames_per_band()); + std::array mixed_low_pass_data; + rtc::ArrayView mixed_low_pass; + if (audio->num_proc_channels() == 1) { + mixed_low_pass = + rtc::ArrayView(audio->split_bands_const(0)[kBand0To8kHz], + audio->num_frames_per_band()); + } else { + const int num_channels = static_cast(audio->num_channels()); + for (size_t i = 0; i < audio->num_frames_per_band(); ++i) { + int32_t value = audio->split_channels_const(kBand0To8kHz)[0][i]; + for (int j = 1; j < num_channels; ++j) { + value += audio->split_channels_const(kBand0To8kHz)[j][i]; + } + mixed_low_pass_data[i] = value / num_channels; + } + mixed_low_pass = rtc::ArrayView( + mixed_low_pass_data.data(), audio->num_frames_per_band()); + } + packed_buffer->clear(); - packed_buffer->insert( - packed_buffer->end(), audio->mixed_low_pass_data(), - (audio->mixed_low_pass_data() + audio->num_frames_per_band())); + packed_buffer->insert(packed_buffer->end(), mixed_low_pass.data(), + (mixed_low_pass.data() + audio->num_frames_per_band())); } int GainControlImpl::AnalyzeCaptureAudio(AudioBuffer* 
audio) { diff --git a/modules/audio_processing/voice_detection_impl.cc b/modules/audio_processing/voice_detection_impl.cc index 7bf6c4a29c..0263de4651 100644 --- a/modules/audio_processing/voice_detection_impl.cc +++ b/modules/audio_processing/voice_detection_impl.cc @@ -54,30 +54,42 @@ void VoiceDetectionImpl::Initialize(int sample_rate_hz) { set_likelihood(likelihood_); } -void VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) { +bool VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) { rtc::CritScope cs(crit_); - if (!enabled_) { - return; - } - if (using_external_vad_) { - using_external_vad_ = false; - return; - } + RTC_DCHECK(enabled_); RTC_DCHECK_GE(160, audio->num_frames_per_band()); - // TODO(ajm): concatenate data in frame buffer here. - int vad_ret = - WebRtcVad_Process(vad_->state(), sample_rate_hz_, - audio->mixed_low_pass_data(), frame_size_samples_); + std::array mixed_low_pass_data; + rtc::ArrayView mixed_low_pass; + if (audio->num_proc_channels() == 1) { + mixed_low_pass = + rtc::ArrayView(audio->split_bands_const(0)[kBand0To8kHz], + audio->num_frames_per_band()); + } else { + const int num_channels = static_cast(audio->num_channels()); + for (size_t i = 0; i < audio->num_frames_per_band(); ++i) { + int32_t value = audio->split_channels_const(kBand0To8kHz)[0][i]; + for (int j = 1; j < num_channels; ++j) { + value += audio->split_channels_const(kBand0To8kHz)[j][i]; + } + mixed_low_pass_data[i] = value / num_channels; + } + mixed_low_pass = rtc::ArrayView( + mixed_low_pass_data.data(), audio->num_frames_per_band()); + } + + int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_, + mixed_low_pass.data(), frame_size_samples_); if (vad_ret == 0) { stream_has_voice_ = false; - audio->set_activity(AudioFrame::kVadPassive); + return false; } else if (vad_ret == 1) { stream_has_voice_ = true; - audio->set_activity(AudioFrame::kVadActive); } else { RTC_NOTREACHED(); } + + return stream_has_voice_; } int VoiceDetectionImpl::Enable(bool enable) { diff --git a/modules/audio_processing/voice_detection_impl.h b/modules/audio_processing/voice_detection_impl.h index 4007f6713b..7ee303ff9f 100644 --- a/modules/audio_processing/voice_detection_impl.h +++ b/modules/audio_processing/voice_detection_impl.h @@ -31,7 +31,14 @@ class VoiceDetectionImpl : public VoiceDetection { // TODO(peah): Fold into ctor, once public API is removed. void Initialize(int sample_rate_hz); - void ProcessCaptureAudio(AudioBuffer* audio); + + // Returns the VAD activity. + bool ProcessCaptureAudio(AudioBuffer* audio); + + bool using_external_vad() const { + rtc::CritScope cs(crit_); + return using_external_vad_; + } // VoiceDetection implementation. int Enable(bool enable) override;
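
Note: the same per-sample mono downmix of the 0-8 kHz band now appears in both GainControlImpl::PackRenderAudioBuffer and VoiceDetectionImpl::ProcessCaptureAudio, replacing the removed AudioBuffer::mixed_low_pass_data(). The standalone sketch below illustrates only that averaging step; the function name MixLowPassToMono and its std::vector-based interface are hypothetical and not part of the WebRTC API.

    // Illustrative sketch only: mirrors the averaging done by the new
    // in-module downmix loops. Names and interface are hypothetical.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Averages the low-pass (0-8 kHz) band of all channels into mono,
    // accumulating in 32 bits to avoid int16_t overflow, as the gain
    // control and voice detection submodules now do locally.
    std::vector<int16_t> MixLowPassToMono(
        const std::vector<std::vector<int16_t>>& low_pass_channels) {
      const size_t num_channels = low_pass_channels.size();
      const size_t num_frames = low_pass_channels[0].size();
      std::vector<int16_t> mono(num_frames);
      for (size_t i = 0; i < num_frames; ++i) {
        int32_t value = low_pass_channels[0][i];
        for (size_t ch = 1; ch < num_channels; ++ch) {
          value += low_pass_channels[ch][i];
        }
        mono[i] =
            static_cast<int16_t>(value / static_cast<int32_t>(num_channels));
      }
      return mono;
    }

    int main() {
      // Two-channel toy example; each inner vector is one channel's band.
      std::vector<std::vector<int16_t>> channels = {{100, -200, 300},
                                                    {300, -400, 500}};
      for (int16_t sample : MixLowPassToMono(channels)) {
        std::printf("%d ", sample);  // Prints: 200 -300 400
      }
      std::printf("\n");
      return 0;
    }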