From ed083d407941c2b81e51e9e959653ae1c39e197c Mon Sep 17 00:00:00 2001
From: "andrew@webrtc.org" <andrew@webrtc.org>
Date: Mon, 19 Sep 2011 15:28:51 +0000
Subject: [PATCH] Modify the _vadActivity member of the AudioFrame passed to
 AudioProcessing.

This saves the user from having to explicitly check stream_has_voice(). It
will allow typing detection to function, which relies on this behaviour.

Review URL: http://webrtc-codereview.appspot.com/144004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@621 4adac7df-926f-26a2-2b94-8c16560cd09d
---
 .../main/interface/audio_processing.h   |   5 +
 .../main/source/audio_buffer.cc         | 106 ++++++++++--------
 .../main/source/audio_buffer.h          |  53 ++++-----
 .../main/source/voice_detection_impl.cc |  16 +--
 .../main/test/unit_test/unit_test.cc    |  42 +++----
 5 files changed, 122 insertions(+), 100 deletions(-)

diff --git a/src/modules/audio_processing/main/interface/audio_processing.h b/src/modules/audio_processing/main/interface/audio_processing.h
index 350ef8207a..c8c87127ad 100644
--- a/src/modules/audio_processing/main/interface/audio_processing.h
+++ b/src/modules/audio_processing/main/interface/audio_processing.h
@@ -486,6 +486,7 @@ class HighPassFilter {
 };
 
 // An estimation component used to retrieve level metrics.
+// NOTE: currently unavailable. All methods return errors.
 class LevelEstimator {
  public:
   virtual int Enable(bool enable) = 0;
@@ -539,6 +540,10 @@ class NoiseSuppression {
 // The voice activity detection (VAD) component analyzes the stream to
 // determine if voice is present. A facility is also provided to pass in an
 // external VAD decision.
+//
+// In addition to |stream_has_voice()| the VAD decision is provided through the
+// |AudioFrame| passed to |ProcessStream()|. The |_vadActivity| member will be
+// modified to reflect the current decision.
 class VoiceDetection {
  public:
   virtual int Enable(bool enable) = 0;
diff --git a/src/modules/audio_processing/main/source/audio_buffer.cc b/src/modules/audio_processing/main/source/audio_buffer.cc
index 6b20fcecee..f7c55b480b 100644
--- a/src/modules/audio_processing/main/source/audio_buffer.cc
+++ b/src/modules/audio_processing/main/source/audio_buffer.cc
@@ -10,8 +10,6 @@
 
 #include "audio_buffer.h"
 
-#include "module_common_types.h"
-
 namespace webrtc {
 namespace {
 
@@ -64,21 +62,22 @@ struct SplitAudioChannel {
   WebRtc_Word32 synthesis_filter_state2[6];
 };
 
-// TODO(am): check range of input parameters?
-AudioBuffer::AudioBuffer(WebRtc_Word32 max_num_channels,
-                         WebRtc_Word32 samples_per_channel)
-    : max_num_channels_(max_num_channels),
-      num_channels_(0),
-      num_mixed_channels_(0),
-      num_mixed_low_pass_channels_(0),
-      samples_per_channel_(samples_per_channel),
-      samples_per_split_channel_(samples_per_channel),
-      reference_copied_(false),
-      data_(NULL),
-      channels_(NULL),
-      split_channels_(NULL),
-      mixed_low_pass_channels_(NULL),
-      low_pass_reference_channels_(NULL) {
+// TODO(andrew): check range of input parameters?
+AudioBuffer::AudioBuffer(int max_num_channels,
+                         int samples_per_channel)
+    : max_num_channels_(max_num_channels),
+      num_channels_(0),
+      num_mixed_channels_(0),
+      num_mixed_low_pass_channels_(0),
+      samples_per_channel_(samples_per_channel),
+      samples_per_split_channel_(samples_per_channel),
+      reference_copied_(false),
+      activity_(AudioFrame::kVadUnknown),
+      data_(NULL),
+      channels_(NULL),
+      split_channels_(NULL),
+      mixed_low_pass_channels_(NULL),
+      low_pass_reference_channels_(NULL) {
   if (max_num_channels_ > 1) {
     channels_ = new AudioChannel[max_num_channels_];
     mixed_low_pass_channels_ = new AudioChannel[max_num_channels_];
@@ -109,7 +108,7 @@ AudioBuffer::~AudioBuffer() {
   }
 }
 
-WebRtc_Word16* AudioBuffer::data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::data(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   if (data_ != NULL) {
     return data_;
@@ -118,7 +117,7 @@ WebRtc_Word16* AudioBuffer::data(WebRtc_Word32 channel) const {
   return channels_[channel].data;
 }
 
-WebRtc_Word16* AudioBuffer::low_pass_split_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::low_pass_split_data(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   if (split_channels_ == NULL) {
     return data(channel);
@@ -127,7 +126,7 @@ WebRtc_Word16* AudioBuffer::low_pass_split_data(WebRtc_Word32 channel) const {
   return split_channels_[channel].low_pass_data;
 }
 
-WebRtc_Word16* AudioBuffer::high_pass_split_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::high_pass_split_data(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   if (split_channels_ == NULL) {
     return NULL;
@@ -136,13 +135,13 @@ WebRtc_Word16* AudioBuffer::high_pass_split_data(WebRtc_Word32 channel) const {
   return split_channels_[channel].high_pass_data;
 }
 
-WebRtc_Word16* AudioBuffer::mixed_low_pass_data(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::mixed_low_pass_data(int channel) const {
   assert(channel >= 0 && channel < num_mixed_low_pass_channels_);
 
   return mixed_low_pass_channels_[channel].data;
 }
 
-WebRtc_Word16* AudioBuffer::low_pass_reference(WebRtc_Word32 channel) const {
+WebRtc_Word16* AudioBuffer::low_pass_reference(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   if (!reference_copied_) {
     return NULL;
@@ -151,58 +150,67 @@ WebRtc_Word16* AudioBuffer::low_pass_reference(WebRtc_Word32 channel) const {
   return low_pass_reference_channels_[channel].data;
 }
 
-WebRtc_Word32* AudioBuffer::analysis_filter_state1(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::analysis_filter_state1(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   return split_channels_[channel].analysis_filter_state1;
 }
 
-WebRtc_Word32* AudioBuffer::analysis_filter_state2(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::analysis_filter_state2(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   return split_channels_[channel].analysis_filter_state2;
 }
 
-WebRtc_Word32* AudioBuffer::synthesis_filter_state1(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::synthesis_filter_state1(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   return split_channels_[channel].synthesis_filter_state1;
 }
 
-WebRtc_Word32* AudioBuffer::synthesis_filter_state2(WebRtc_Word32 channel) const {
+WebRtc_Word32* AudioBuffer::synthesis_filter_state2(int channel) const {
   assert(channel >= 0 && channel < num_channels_);
   return split_channels_[channel].synthesis_filter_state2;
 }
 
-WebRtc_Word32 AudioBuffer::num_channels() const {
+void AudioBuffer::set_activity(AudioFrame::VADActivity activity) {
+  activity_ = activity;
+}
+
+AudioFrame::VADActivity AudioBuffer::activity() {
+  return activity_;
+}
+
+int AudioBuffer::num_channels() const {
   return num_channels_;
 }
 
-WebRtc_Word32 AudioBuffer::samples_per_channel() const {
+int AudioBuffer::samples_per_channel() const {
   return samples_per_channel_;
 }
 
-WebRtc_Word32 AudioBuffer::samples_per_split_channel() const {
+int AudioBuffer::samples_per_split_channel() const {
   return samples_per_split_channel_;
 }
 
-// TODO(ajm): Do deinterleaving and mixing in one step?
-void AudioBuffer::DeinterleaveFrom(AudioFrame* audioFrame) {
-  assert(audioFrame->_audioChannel <= max_num_channels_);
-  assert(audioFrame->_payloadDataLengthInSamples == samples_per_channel_);
+// TODO(andrew): Do deinterleaving and mixing in one step?
+void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) {
+  assert(frame->_audioChannel <= max_num_channels_);
+  assert(frame->_payloadDataLengthInSamples == samples_per_channel_);
 
-  num_channels_ = audioFrame->_audioChannel;
+  num_channels_ = frame->_audioChannel;
   num_mixed_channels_ = 0;
   num_mixed_low_pass_channels_ = 0;
   reference_copied_ = false;
+  activity_ = frame->_vadActivity;
 
   if (num_channels_ == 1) {
     // We can get away with a pointer assignment in this case.
-    data_ = audioFrame->_payloadData;
+    data_ = frame->_payloadData;
     return;
   }
 
+  WebRtc_Word16* interleaved = frame->_payloadData;
   for (int i = 0; i < num_channels_; i++) {
     WebRtc_Word16* deinterleaved = channels_[i].data;
-    WebRtc_Word16* interleaved = audioFrame->_payloadData;
-    WebRtc_Word32 interleaved_idx = i;
+    int interleaved_idx = i;
     for (int j = 0; j < samples_per_channel_; j++) {
       deinterleaved[j] = interleaved[interleaved_idx];
       interleaved_idx += num_channels_;
@@ -210,27 +218,28 @@ void AudioBuffer::DeinterleaveFrom(AudioFrame* audioFrame) {
   }
 }
 
-void AudioBuffer::InterleaveTo(AudioFrame* audioFrame) const {
-  assert(audioFrame->_audioChannel == num_channels_);
-  assert(audioFrame->_payloadDataLengthInSamples == samples_per_channel_);
+void AudioBuffer::InterleaveTo(AudioFrame* frame) const {
+  assert(frame->_audioChannel == num_channels_);
+  assert(frame->_payloadDataLengthInSamples == samples_per_channel_);
+  frame->_vadActivity = activity_;
 
   if (num_channels_ == 1) {
     if (num_mixed_channels_ == 1) {
-      memcpy(audioFrame->_payloadData,
+      memcpy(frame->_payloadData,
              channels_[0].data,
              sizeof(WebRtc_Word16) * samples_per_channel_);
     } else {
       // These should point to the same buffer in this case.
-      assert(data_ == audioFrame->_payloadData);
+      assert(data_ == frame->_payloadData);
     }
 
     return;
   }
 
+  WebRtc_Word16* interleaved = frame->_payloadData;
   for (int i = 0; i < num_channels_; i++) {
     WebRtc_Word16* deinterleaved = channels_[i].data;
-    WebRtc_Word16* interleaved = audioFrame->_payloadData;
-    WebRtc_Word32 interleaved_idx = i;
+    int interleaved_idx = i;
     for (int j = 0; j < samples_per_channel_; j++) {
       interleaved[interleaved_idx] = deinterleaved[j];
       interleaved_idx += num_channels_;
@@ -238,9 +247,10 @@ void AudioBuffer::InterleaveTo(AudioFrame* audioFrame) const {
   }
 }
 
-// TODO(ajm): would be good to support the no-mix case with pointer assignment.
-// TODO(ajm): handle mixing to multiple channels?
-void AudioBuffer::Mix(WebRtc_Word32 num_mixed_channels) {
+// TODO(andrew): would be good to support the no-mix case with pointer
+// assignment.
+// TODO(andrew): handle mixing to multiple channels?
+void AudioBuffer::Mix(int num_mixed_channels) {
   // We currently only support the stereo to mono case.
   assert(num_channels_ == 2);
   assert(num_mixed_channels == 1);
@@ -254,7 +264,7 @@ void AudioBuffer::Mix(WebRtc_Word32 num_mixed_channels) {
   num_mixed_channels_ = num_mixed_channels;
 }
 
-void AudioBuffer::CopyAndMixLowPass(WebRtc_Word32 num_mixed_channels) {
+void AudioBuffer::CopyAndMixLowPass(int num_mixed_channels) {
   // We currently only support the stereo to mono case.
   assert(num_channels_ == 2);
   assert(num_mixed_channels == 1);
diff --git a/src/modules/audio_processing/main/source/audio_buffer.h b/src/modules/audio_processing/main/source/audio_buffer.h
index 15f850b67b..1bdd3c709c 100644
--- a/src/modules/audio_processing/main/source/audio_buffer.h
+++ b/src/modules/audio_processing/main/source/audio_buffer.h
@@ -11,55 +11,58 @@
 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_SOURCE_AUDIO_BUFFER_H_
 #define WEBRTC_MODULES_AUDIO_PROCESSING_MAIN_SOURCE_AUDIO_BUFFER_H_
 
+#include "module_common_types.h"
 #include "typedefs.h"
 
-
 namespace webrtc {
 
 struct AudioChannel;
 struct SplitAudioChannel;
-class AudioFrame;
 
 class AudioBuffer {
  public:
-  AudioBuffer(WebRtc_Word32 max_num_channels, WebRtc_Word32 samples_per_channel);
+  AudioBuffer(int max_num_channels, int samples_per_channel);
   virtual ~AudioBuffer();
 
-  WebRtc_Word32 num_channels() const;
-  WebRtc_Word32 samples_per_channel() const;
-  WebRtc_Word32 samples_per_split_channel() const;
+  int num_channels() const;
+  int samples_per_channel() const;
+  int samples_per_split_channel() const;
 
-  WebRtc_Word16* data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* low_pass_split_data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* high_pass_split_data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* mixed_low_pass_data(WebRtc_Word32 channel) const;
-  WebRtc_Word16* low_pass_reference(WebRtc_Word32 channel) const;
+  WebRtc_Word16* data(int channel) const;
+  WebRtc_Word16* low_pass_split_data(int channel) const;
+  WebRtc_Word16* high_pass_split_data(int channel) const;
+  WebRtc_Word16* mixed_low_pass_data(int channel) const;
+  WebRtc_Word16* low_pass_reference(int channel) const;
 
-  WebRtc_Word32* analysis_filter_state1(WebRtc_Word32 channel) const;
-  WebRtc_Word32* analysis_filter_state2(WebRtc_Word32 channel) const;
-  WebRtc_Word32* synthesis_filter_state1(WebRtc_Word32 channel) const;
-  WebRtc_Word32* synthesis_filter_state2(WebRtc_Word32 channel) const;
+  WebRtc_Word32* analysis_filter_state1(int channel) const;
+  WebRtc_Word32* analysis_filter_state2(int channel) const;
+  WebRtc_Word32* synthesis_filter_state1(int channel) const;
+  WebRtc_Word32* synthesis_filter_state2(int channel) const;
+
+  void set_activity(AudioFrame::VADActivity activity);
+  AudioFrame::VADActivity activity();
 
   void DeinterleaveFrom(AudioFrame* audioFrame);
   void InterleaveTo(AudioFrame* audioFrame) const;
-  void Mix(WebRtc_Word32 num_mixed_channels);
-  void CopyAndMixLowPass(WebRtc_Word32 num_mixed_channels);
+  void Mix(int num_mixed_channels);
+  void CopyAndMixLowPass(int num_mixed_channels);
   void CopyLowPassToReference();
 
  private:
-  const WebRtc_Word32 max_num_channels_;
-  WebRtc_Word32 num_channels_;
-  WebRtc_Word32 num_mixed_channels_;
-  WebRtc_Word32 num_mixed_low_pass_channels_;
-  const WebRtc_Word32 samples_per_channel_;
-  WebRtc_Word32 samples_per_split_channel_;
+  const int max_num_channels_;
+  int num_channels_;
+  int num_mixed_channels_;
+  int num_mixed_low_pass_channels_;
+  const int samples_per_channel_;
+  int samples_per_split_channel_;
   bool reference_copied_;
+  AudioFrame::VADActivity activity_;
 
   WebRtc_Word16* data_;
-  // TODO(ajm): Prefer to make these vectors if permitted...
+  // TODO(andrew): use vectors here.
   AudioChannel* channels_;
   SplitAudioChannel* split_channels_;
-  // TODO(ajm): improve this, we don't need the full 32 kHz space here.
+  // TODO(andrew): improve this, we don't need the full 32 kHz space here.
   AudioChannel* mixed_low_pass_channels_;
   AudioChannel* low_pass_reference_channels_;
 };
diff --git a/src/modules/audio_processing/main/source/voice_detection_impl.cc b/src/modules/audio_processing/main/source/voice_detection_impl.cc
index 3eb446e911..49aac2e674 100644
--- a/src/modules/audio_processing/main/source/voice_detection_impl.cc
+++ b/src/modules/audio_processing/main/source/voice_detection_impl.cc
@@ -74,16 +74,16 @@ int VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
 
   // TODO(ajm): concatenate data in frame buffer here.
 
-  int vad_ret_val;
-  vad_ret_val = WebRtcVad_Process(static_cast<VadInst*>(handle(0)),
-                                  apm_->split_sample_rate_hz(),
-                                  mixed_data,
-                                  frame_size_samples_);
-
-  if (vad_ret_val == 0) {
+  int vad_ret = WebRtcVad_Process(static_cast<VadInst*>(handle(0)),
+                                  apm_->split_sample_rate_hz(),
+                                  mixed_data,
+                                  frame_size_samples_);
+  if (vad_ret == 0) {
     stream_has_voice_ = false;
-  } else if (vad_ret_val == 1) {
+    audio->set_activity(AudioFrame::kVadPassive);
+  } else if (vad_ret == 1) {
     stream_has_voice_ = true;
+    audio->set_activity(AudioFrame::kVadActive);
   } else {
     return apm_->kUnspecifiedError;
   }
diff --git a/src/modules/audio_processing/main/test/unit_test/unit_test.cc b/src/modules/audio_processing/main/test/unit_test/unit_test.cc
index 0563fdfe49..5c9f5afd3e 100644
--- a/src/modules/audio_processing/main/test/unit_test/unit_test.cc
+++ b/src/modules/audio_processing/main/test/unit_test/unit_test.cc
@@ -555,6 +555,7 @@ TEST_F(ApmTest, Process) {
              &temp_data[0],
              sizeof(WebRtc_Word16) * read_count);
     }
+    frame_->_vadActivity = AudioFrame::kVadUnknown;
 
     EXPECT_EQ(apm_->kNoError,
               apm_->ProcessStream(frame_));
@@ -571,6 +572,9 @@ TEST_F(ApmTest, Process) {
     }
     if (apm_->voice_detection()->stream_has_voice()) {
       has_voice_count++;
+      EXPECT_EQ(AudioFrame::kVadActive, frame_->_vadActivity);
+    } else {
+      EXPECT_EQ(AudioFrame::kVadPassive, frame_->_vadActivity);
     }
 
     frame_count++;
@@ -966,27 +970,27 @@ TEST_F(ApmTest, VoiceDetection) {
   EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
   EXPECT_FALSE(apm_->voice_detection()->is_enabled());
 
+  // Test that AudioFrame activity is maintained when VAD is disabled.
+  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
+  AudioFrame::VADActivity activity[] = {
+      AudioFrame::kVadActive,
+      AudioFrame::kVadPassive,
+      AudioFrame::kVadUnknown
+  };
+  for (size_t i = 0; i < sizeof(activity)/sizeof(*activity); i++) {
+    frame_->_vadActivity = activity[i];
+    EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
+    EXPECT_EQ(activity[i], frame_->_vadActivity);
+  }
+
+  // Test that AudioFrame activity is set when VAD is enabled.
+  EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
+  frame_->_vadActivity = AudioFrame::kVadUnknown;
+  EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
+  EXPECT_NE(AudioFrame::kVadUnknown, frame_->_vadActivity);
+
   // TODO(bjornv): Add tests for streamed voice; stream_has_voice()
 }
-
-// Below are some ideas for tests from VPM.
-
-/*TEST_F(VideoProcessingModuleTest, GetVersionTest)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, HandleNullBuffer)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, HandleBadSize)
-{
-}
-
-TEST_F(VideoProcessingModuleTest, IdenticalResultsAfterReset)
-{
-}
-*/
 }  // namespace
 
 int main(int argc, char** argv) {
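
For illustration, a minimal usage sketch of the behaviour this change enables (not part of the patch; the helper ClassifyFrame is hypothetical, and creation/configuration of the AudioProcessing instance is assumed to happen elsewhere, e.g. via AudioProcessing::Create()):

#include "audio_processing.h"     // webrtc::AudioProcessing, VoiceDetection.
#include "module_common_types.h"  // webrtc::AudioFrame.

using webrtc::AudioFrame;
using webrtc::AudioProcessing;

// Hypothetical helper: runs one capture frame through APM and reads the VAD
// decision directly from the frame.
void ClassifyFrame(AudioProcessing* apm, AudioFrame* frame) {
  // Voice detection is assumed to have been enabled during setup:
  //   apm->voice_detection()->Enable(true);
  if (apm->ProcessStream(frame) != AudioProcessing::kNoError) {
    return;  // Handle the error as appropriate.
  }
  // With this patch, ProcessStream() writes the decision back into
  // |_vadActivity|, so explicitly checking stream_has_voice() is no
  // longer required.
  if (frame->_vadActivity == AudioFrame::kVadActive) {
    // Voice present: e.g. a typing-detection heuristic can key off this flag.
  }
}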