From 4259fd725c14049447f4d5e1b1708f289bbabc47 Mon Sep 17 00:00:00 2001 From: "bjornv@webrtc.org" Date: Tue, 17 Jan 2012 14:37:59 +0000 Subject: [PATCH] Refactor VAD: Code restructure - Tests added for vad_core. - Replaced two macros with constants. - Made an internal function static. - Replaced replicated code with function call. Review URL: https://webrtc-codereview.appspot.com/354001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1444 4adac7df-926f-26a2-2b94-8c16560cd09d --- src/common_audio/vad/vad.gypi | 3 +- src/common_audio/vad/vad_core.c | 501 ++++++++++------------ src/common_audio/vad/vad_core.h | 26 +- src/common_audio/vad/vad_core_unittest.cc | 99 +++++ src/common_audio/vad/vad_defines.h | 2 - 5 files changed, 326 insertions(+), 305 deletions(-) create mode 100644 src/common_audio/vad/vad_core_unittest.cc diff --git a/src/common_audio/vad/vad.gypi b/src/common_audio/vad/vad.gypi index 0890605e9d..f15e65143c 100644 --- a/src/common_audio/vad/vad.gypi +++ b/src/common_audio/vad/vad.gypi @@ -49,10 +49,11 @@ '<(webrtc_root)/../testing/gtest.gyp:gtest', ], 'sources': [ - 'vad_unittest.cc', + 'vad_core_unittest.cc', 'vad_filterbank_unittest.cc', 'vad_gmm_unittest.cc', 'vad_sp_unittest.cc', + 'vad_unittest.cc', 'vad_unittest.h', ], }, # vad_unittests diff --git a/src/common_audio/vad/vad_core.c b/src/common_audio/vad/vad_core.c index 6e0dc545f6..24392d360c 100644 --- a/src/common_audio/vad/vad_core.c +++ b/src/common_audio/vad/vad_core.c @@ -8,12 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - -/* - * This file includes the implementation of the core functionality in VAD. - * For function description, see vad_core.h. - */ - #include "vad_core.h" #include "signal_processing_library.h" @@ -60,275 +54,27 @@ static const WebRtc_Word16 kNoiseDataStds[12] = { static const WebRtc_Word16 kSpeechDataStds[12] = { 555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 }; +// Constants used in GmmProbability(). 
+// +// Maximum number of counted speech (VAD = 1) frames in a row. +static const int16_t kMaxSpeechFrames = 6; +// Minimum standard deviation for both speech and noise. +static const int16_t kMinStd = 384; + static const int kInitCheck = 42; -// Initialize VAD -int WebRtcVad_InitCore(VadInstT *inst, short mode) -{ - int i; - - // Initialization of struct - inst->vad = 1; - inst->frame_counter = 0; - inst->over_hang = 0; - inst->num_of_speech = 0; - - // Initialization of downsampling filter state - inst->downsampling_filter_states[0] = 0; - inst->downsampling_filter_states[1] = 0; - inst->downsampling_filter_states[2] = 0; - inst->downsampling_filter_states[3] = 0; - - // Read initial PDF parameters - for (i = 0; i < NUM_TABLE_VALUES; i++) - { - inst->noise_means[i] = kNoiseDataMeans[i]; - inst->speech_means[i] = kSpeechDataMeans[i]; - inst->noise_stds[i] = kNoiseDataStds[i]; - inst->speech_stds[i] = kSpeechDataStds[i]; - } - - // Index and Minimum value vectors are initialized - for (i = 0; i < 16 * NUM_CHANNELS; i++) - { - inst->low_value_vector[i] = 10000; - inst->index_vector[i] = 0; - } - - for (i = 0; i < 5; i++) - { - inst->upper_state[i] = 0; - inst->lower_state[i] = 0; - } - - for (i = 0; i < 4; i++) - { - inst->hp_filter_state[i] = 0; - } - - // Init mean value memory, for FindMin function - inst->mean_value[0] = 1600; - inst->mean_value[1] = 1600; - inst->mean_value[2] = 1600; - inst->mean_value[3] = 1600; - inst->mean_value[4] = 1600; - inst->mean_value[5] = 1600; - - if (mode == 0) - { - // Quality mode - inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst - inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst - inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst - inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst - inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst - inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst - - 
inst->individual[0] = INDIVIDUAL_10MS_Q; - inst->individual[1] = INDIVIDUAL_20MS_Q; - inst->individual[2] = INDIVIDUAL_30MS_Q; - - inst->total[0] = TOTAL_10MS_Q; - inst->total[1] = TOTAL_20MS_Q; - inst->total[2] = TOTAL_30MS_Q; - } else if (mode == 1) - { - // Low bitrate mode - inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst - inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst - inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst - inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst - inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst - inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst - - inst->individual[0] = INDIVIDUAL_10MS_LBR; - inst->individual[1] = INDIVIDUAL_20MS_LBR; - inst->individual[2] = INDIVIDUAL_30MS_LBR; - - inst->total[0] = TOTAL_10MS_LBR; - inst->total[1] = TOTAL_20MS_LBR; - inst->total[2] = TOTAL_30MS_LBR; - } else if (mode == 2) - { - // Aggressive mode - inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst - inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst - inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst - inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst - inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst - inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst - - inst->individual[0] = INDIVIDUAL_10MS_AGG; - inst->individual[1] = INDIVIDUAL_20MS_AGG; - inst->individual[2] = INDIVIDUAL_30MS_AGG; - - inst->total[0] = TOTAL_10MS_AGG; - inst->total[1] = TOTAL_20MS_AGG; - inst->total[2] = TOTAL_30MS_AGG; - } else - { - // Very aggressive mode - inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst - inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst - inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech 
burst - inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst - inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst - inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst - - inst->individual[0] = INDIVIDUAL_10MS_VAG; - inst->individual[1] = INDIVIDUAL_20MS_VAG; - inst->individual[2] = INDIVIDUAL_30MS_VAG; - - inst->total[0] = TOTAL_10MS_VAG; - inst->total[1] = TOTAL_20MS_VAG; - inst->total[2] = TOTAL_30MS_VAG; - } - - inst->init_flag = kInitCheck; - - return 0; -} - -// Set aggressiveness mode -int WebRtcVad_set_mode_core(VadInstT *inst, short mode) -{ - - if (mode == 0) - { - // Quality mode - inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst - inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst - inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst - inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst - inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst - inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst - - inst->individual[0] = INDIVIDUAL_10MS_Q; - inst->individual[1] = INDIVIDUAL_20MS_Q; - inst->individual[2] = INDIVIDUAL_30MS_Q; - - inst->total[0] = TOTAL_10MS_Q; - inst->total[1] = TOTAL_20MS_Q; - inst->total[2] = TOTAL_30MS_Q; - } else if (mode == 1) - { - // Low bitrate mode - inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst - inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst - inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst - inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst - inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst - inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst - - inst->individual[0] = INDIVIDUAL_10MS_LBR; - inst->individual[1] = INDIVIDUAL_20MS_LBR; - inst->individual[2] = INDIVIDUAL_30MS_LBR; - - 
inst->total[0] = TOTAL_10MS_LBR; - inst->total[1] = TOTAL_20MS_LBR; - inst->total[2] = TOTAL_30MS_LBR; - } else if (mode == 2) - { - // Aggressive mode - inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst - inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst - inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst - inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst - inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst - inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst - - inst->individual[0] = INDIVIDUAL_10MS_AGG; - inst->individual[1] = INDIVIDUAL_20MS_AGG; - inst->individual[2] = INDIVIDUAL_30MS_AGG; - - inst->total[0] = TOTAL_10MS_AGG; - inst->total[1] = TOTAL_20MS_AGG; - inst->total[2] = TOTAL_30MS_AGG; - } else if (mode == 3) - { - // Very aggressive mode - inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst - inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst - inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst - inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst - inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst - inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst - - inst->individual[0] = INDIVIDUAL_10MS_VAG; - inst->individual[1] = INDIVIDUAL_20MS_VAG; - inst->individual[2] = INDIVIDUAL_30MS_VAG; - - inst->total[0] = TOTAL_10MS_VAG; - inst->total[1] = TOTAL_20MS_VAG; - inst->total[2] = TOTAL_30MS_VAG; - } else - { - return -1; - } - - return 0; -} - -// Calculate VAD decision by first extracting feature values and then calculate -// probability for both speech and background noise. 
- -WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame, - int frame_length) -{ - WebRtc_Word16 len, vad; - WebRtc_Word16 speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB) - WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) - - - // Downsample signal 32->16->8 before doing VAD - WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]), - frame_length); - len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); - - WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len); - len = WEBRTC_SPL_RSHIFT_W16(len, 1); - - // Do VAD on an 8 kHz signal - vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); - - return vad; -} - -WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame, - int frame_length) -{ - WebRtc_Word16 len, vad; - WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) - - // Wideband: Downsample signal before doing VAD - WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states, - frame_length); - - len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); - vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); - - return vad; -} - -WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame, - int frame_length) -{ - WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power; - - // Get power in the bands - total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length, - feature_vector); - - // Make a VAD - inst->vad = WebRtcVad_GmmProbability(inst, feature_vector, total_power, frame_length); - - return inst->vad; -} - -// Calculate probability for both speech and background noise, and perform a -// hypothesis-test. -WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector, - WebRtc_Word16 total_power, int frame_length) +// Calculates the probabilities for both speech and background noise using +// Gaussian Mixture Models. 
A hypothesis-test is performed to decide which type +// of signal is most probable. +// +// - inst [i/o] : Pointer to VAD instance +// - feature_vector [i] : Feature vector = log10(energy in frequency band) +// - total_power [i] : Total power in audio frame. +// - frame_length [i] : Number of input samples +// +// - returns : the VAD decision (0 - noise, 1 - speech). +static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector, + WebRtc_Word16 total_power, int frame_length) { int n, k; WebRtc_Word16 backval; @@ -590,8 +336,8 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve tmp16 += 128; // Rounding ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8); // Division with 8 plus Q7 - if (ssk < MIN_STD) - ssk = MIN_STD; + if (ssk < kMinStd) + ssk = kMinStd; *sstd2ptr = ssk; } else { @@ -618,8 +364,8 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve tmp16 += 32; // Rounding nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6); - if (nsk < MIN_STD) - nsk = MIN_STD; + if (nsk < kMinStd) + nsk = kMinStd; *nstd2ptr = nsk; } @@ -713,12 +459,209 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve } else { inst->num_of_speech = inst->num_of_speech + 1; - if (inst->num_of_speech > NSP_MAX) + if (inst->num_of_speech > kMaxSpeechFrames) { - inst->num_of_speech = NSP_MAX; + inst->num_of_speech = kMaxSpeechFrames; inst->over_hang = overhead2; } else inst->over_hang = overhead1; } return vadflag; } + +// Initialize VAD +int WebRtcVad_InitCore(VadInstT *inst, short mode) +{ + int i; + + // Initialization of struct + inst->vad = 1; + inst->frame_counter = 0; + inst->over_hang = 0; + inst->num_of_speech = 0; + + // Initialization of downsampling filter state + inst->downsampling_filter_states[0] = 0; + inst->downsampling_filter_states[1] = 0; + inst->downsampling_filter_states[2] = 0; + inst->downsampling_filter_states[3] = 0; + + // Read initial PDF parameters + for (i = 0; i < NUM_TABLE_VALUES; 
i++) + { + inst->noise_means[i] = kNoiseDataMeans[i]; + inst->speech_means[i] = kSpeechDataMeans[i]; + inst->noise_stds[i] = kNoiseDataStds[i]; + inst->speech_stds[i] = kSpeechDataStds[i]; + } + + // Index and Minimum value vectors are initialized + for (i = 0; i < 16 * NUM_CHANNELS; i++) + { + inst->low_value_vector[i] = 10000; + inst->index_vector[i] = 0; + } + + for (i = 0; i < 5; i++) + { + inst->upper_state[i] = 0; + inst->lower_state[i] = 0; + } + + for (i = 0; i < 4; i++) + { + inst->hp_filter_state[i] = 0; + } + + // Init mean value memory, for FindMin function + inst->mean_value[0] = 1600; + inst->mean_value[1] = 1600; + inst->mean_value[2] = 1600; + inst->mean_value[3] = 1600; + inst->mean_value[4] = 1600; + inst->mean_value[5] = 1600; + + if (WebRtcVad_set_mode_core(inst, mode) != 0) { + return -1; + } + + inst->init_flag = kInitCheck; + + return 0; +} + +// Set aggressiveness mode +int WebRtcVad_set_mode_core(VadInstT *inst, short mode) +{ + + if (mode == 0) + { + // Quality mode + inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_Q; + inst->individual[1] = INDIVIDUAL_20MS_Q; + inst->individual[2] = INDIVIDUAL_30MS_Q; + + inst->total[0] = TOTAL_10MS_Q; + inst->total[1] = TOTAL_20MS_Q; + inst->total[2] = TOTAL_30MS_Q; + } else if (mode == 1) + { + // Low bitrate mode + inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst + inst->over_hang_max_2[0] = 
OHMAX2_10MS_LBR; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_LBR; + inst->individual[1] = INDIVIDUAL_20MS_LBR; + inst->individual[2] = INDIVIDUAL_30MS_LBR; + + inst->total[0] = TOTAL_10MS_LBR; + inst->total[1] = TOTAL_20MS_LBR; + inst->total[2] = TOTAL_30MS_LBR; + } else if (mode == 2) + { + // Aggressive mode + inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_AGG; + inst->individual[1] = INDIVIDUAL_20MS_AGG; + inst->individual[2] = INDIVIDUAL_30MS_AGG; + + inst->total[0] = TOTAL_10MS_AGG; + inst->total[1] = TOTAL_20MS_AGG; + inst->total[2] = TOTAL_30MS_AGG; + } else if (mode == 3) + { + // Very aggressive mode + inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_VAG; + inst->individual[1] = INDIVIDUAL_20MS_VAG; + inst->individual[2] = INDIVIDUAL_30MS_VAG; + + inst->total[0] = TOTAL_10MS_VAG; + inst->total[1] = TOTAL_20MS_VAG; + inst->total[2] = TOTAL_30MS_VAG; + } else + { + return -1; + } + + return 0; +} 
+ +// Calculate VAD decision by first extracting feature values and then calculate +// probability for both speech and background noise. + +WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame, + int frame_length) +{ + WebRtc_Word16 len, vad; + WebRtc_Word16 speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB) + WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) + + + // Downsample signal 32->16->8 before doing VAD + WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]), + frame_length); + len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); + + WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len); + len = WEBRTC_SPL_RSHIFT_W16(len, 1); + + // Do VAD on an 8 kHz signal + vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); + + return vad; +} + +WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame, + int frame_length) +{ + WebRtc_Word16 len, vad; + WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) + + // Wideband: Downsample signal before doing VAD + WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states, + frame_length); + + len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); + vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); + + return vad; +} + +WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame, + int frame_length) +{ + WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power; + + // Get power in the bands + total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length, + feature_vector); + + // Make a VAD + inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length); + + return inst->vad; +} diff --git a/src/common_audio/vad/vad_core.h b/src/common_audio/vad/vad_core.h index cad6ca4a7d..d48564e98b 100644 --- a/src/common_audio/vad/vad_core.h +++ b/src/common_audio/vad/vad_core.h @@ -13,8 +13,8 @@ * 
This header file includes the descriptions of the core VAD calls. */ -#ifndef WEBRTC_VAD_CORE_H_ -#define WEBRTC_VAD_CORE_H_ +#ifndef WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_ +#define WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_ #include "typedefs.h" #include "vad_defines.h" @@ -112,24 +112,4 @@ WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT* inst, WebRtc_Word16* speech_frame WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT* inst, WebRtc_Word16* speech_frame, int frame_length); -/**************************************************************************** - * WebRtcVad_GmmProbability(...) - * - * This function calculates the probabilities for background noise and - * speech using Gaussian Mixture Models. A hypothesis-test is performed to decide - * which type of signal is most probable. - * - * Input: - * - inst : Pointer to VAD instance - * - feature_vector : Feature vector = log10(energy in frequency band) - * - total_power : Total power in frame. - * - frame_length : Number of input samples - * - * Output: - * VAD decision : 0 - noise, 1 - speech - * - */ -WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT* inst, WebRtc_Word16* feature_vector, - WebRtc_Word16 total_power, int frame_length); - -#endif // WEBRTC_VAD_CORE_H_ +#endif // WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_ diff --git a/src/common_audio/vad/vad_core_unittest.cc b/src/common_audio/vad/vad_core_unittest.cc new file mode 100644 index 0000000000..e7c5ad1c31 --- /dev/null +++ b/src/common_audio/vad/vad_core_unittest.cc @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <stdlib.h>
+
+#include "gtest/gtest.h"
+#include "typedefs.h"
+#include "vad_unittest.h"
+
+extern "C" {
+#include "vad_core.h"
+}
+
+namespace {
+
+TEST_F(VadTest, InitCore) {
+  VadInstT* self = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));
+
+  // TODO(bjornv): Add NULL pointer check if we take care of it in
+  // vad_core.c
+
+  // Test WebRtcVad_InitCore().
+  // Verify return = 0 for all modes.
+  for (size_t j = 0; j < kModesSize; ++j) {
+    EXPECT_EQ(0, WebRtcVad_InitCore(self, kModes[j]));
+  }
+
+  free(self);
+}
+
+TEST_F(VadTest, set_mode_core) {
+  VadInstT* self = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));
+
+  // TODO(bjornv): Add NULL pointer check if we take care of it in
+  // vad_core.c
+
+  ASSERT_EQ(0, WebRtcVad_InitCore(self, 0));
+  // Test WebRtcVad_set_mode_core().
+  // Invalid modes should return -1.
+  EXPECT_EQ(-1, WebRtcVad_set_mode_core(self, -1));
+  EXPECT_EQ(-1, WebRtcVad_set_mode_core(self, (short) kModesSize));
+  // Valid modes should return 0.
+  for (size_t j = 0; j < kModesSize; ++j) {
+    EXPECT_EQ(0, WebRtcVad_set_mode_core(self, kModes[j]));
+  }
+
+  free(self);
+}
+
+TEST_F(VadTest, CalcVad) {
+  VadInstT* self = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));
+  int16_t speech[kMaxFrameLength];
+
+  // TODO(bjornv): Add NULL pointer check if we take care of it in
+  // vad_core.c
+
+  // Test WebRtcVad_CalcVadXXkhz()
+  // Verify that all zeros in gives VAD = 0 out.
+ memset(speech, 0, sizeof(speech)); + ASSERT_EQ(0, WebRtcVad_InitCore(self, 0)); + for (size_t j = 0; j < kFrameLengthsSize; ++j) { + if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) { + EXPECT_EQ(0, WebRtcVad_CalcVad8khz(self, speech, kFrameLengths[j])); + } + if (ValidRatesAndFrameLengths(16000, kFrameLengths[j])) { + EXPECT_EQ(0, WebRtcVad_CalcVad16khz(self, speech, kFrameLengths[j])); + } + if (ValidRatesAndFrameLengths(32000, kFrameLengths[j])) { + EXPECT_EQ(0, WebRtcVad_CalcVad32khz(self, speech, kFrameLengths[j])); + } + } + + // Construct a speech signal that will trigger the VAD in all modes. It is + // known that (i * i) will wrap around, but that doesn't matter in this case. + for (int16_t i = 0; i < kMaxFrameLength; ++i) { + speech[i] = (i * i); + } + for (size_t j = 0; j < kFrameLengthsSize; ++j) { + if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) { + EXPECT_EQ(1, WebRtcVad_CalcVad8khz(self, speech, kFrameLengths[j])); + } + if (ValidRatesAndFrameLengths(16000, kFrameLengths[j])) { + EXPECT_EQ(1, WebRtcVad_CalcVad16khz(self, speech, kFrameLengths[j])); + } + if (ValidRatesAndFrameLengths(32000, kFrameLengths[j])) { + EXPECT_EQ(1, WebRtcVad_CalcVad32khz(self, speech, kFrameLengths[j])); + } + } + + free(self); +} +} // namespace diff --git a/src/common_audio/vad/vad_defines.h b/src/common_audio/vad/vad_defines.h index b33af2ef7d..5d1539dd09 100644 --- a/src/common_audio/vad/vad_defines.h +++ b/src/common_audio/vad/vad_defines.h @@ -23,8 +23,6 @@ #define MIN_ENERGY 10 #define ALPHA1 6553 // 0.2 in Q15 #define ALPHA2 32439 // 0.99 in Q15 -#define NSP_MAX 6 // Maximum number of VAD=1 frames in a row counted -#define MIN_STD 384 // Minimum standard deviation // Mode 0, Quality thresholds - Different thresholds for the different frame lengths #define INDIVIDUAL_10MS_Q 24 #define INDIVIDUAL_20MS_Q 21 // (log10(2)*66)<<2 ~=16