Refactor VAD: Code restructure
- Tests added for vad_core. - Replaced two macros with constants. - Made an internal function static. - Replaced replicated code with function call. Review URL: https://webrtc-codereview.appspot.com/354001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1444 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
38e0a771d2
commit
4259fd725c
@ -49,10 +49,11 @@
|
||||
'<(webrtc_root)/../testing/gtest.gyp:gtest',
|
||||
],
|
||||
'sources': [
|
||||
'vad_unittest.cc',
|
||||
'vad_core_unittest.cc',
|
||||
'vad_filterbank_unittest.cc',
|
||||
'vad_gmm_unittest.cc',
|
||||
'vad_sp_unittest.cc',
|
||||
'vad_unittest.cc',
|
||||
'vad_unittest.h',
|
||||
],
|
||||
}, # vad_unittests
|
||||
|
||||
@ -8,12 +8,6 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* This file includes the implementation of the core functionality in VAD.
|
||||
* For function description, see vad_core.h.
|
||||
*/
|
||||
|
||||
#include "vad_core.h"
|
||||
|
||||
#include "signal_processing_library.h"
|
||||
@ -60,275 +54,27 @@ static const WebRtc_Word16 kNoiseDataStds[12] = {
|
||||
static const WebRtc_Word16 kSpeechDataStds[12] = {
|
||||
555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
|
||||
|
||||
// Constants used in GmmProbability().
|
||||
//
|
||||
// Maximum number of counted speech (VAD = 1) frames in a row.
|
||||
static const int16_t kMaxSpeechFrames = 6;
|
||||
// Minimum standard deviation for both speech and noise.
|
||||
static const int16_t kMinStd = 384;
|
||||
|
||||
static const int kInitCheck = 42;
|
||||
|
||||
// Initialize VAD
|
||||
int WebRtcVad_InitCore(VadInstT *inst, short mode)
|
||||
{
|
||||
int i;
|
||||
|
||||
// Initialization of struct
|
||||
inst->vad = 1;
|
||||
inst->frame_counter = 0;
|
||||
inst->over_hang = 0;
|
||||
inst->num_of_speech = 0;
|
||||
|
||||
// Initialization of downsampling filter state
|
||||
inst->downsampling_filter_states[0] = 0;
|
||||
inst->downsampling_filter_states[1] = 0;
|
||||
inst->downsampling_filter_states[2] = 0;
|
||||
inst->downsampling_filter_states[3] = 0;
|
||||
|
||||
// Read initial PDF parameters
|
||||
for (i = 0; i < NUM_TABLE_VALUES; i++)
|
||||
{
|
||||
inst->noise_means[i] = kNoiseDataMeans[i];
|
||||
inst->speech_means[i] = kSpeechDataMeans[i];
|
||||
inst->noise_stds[i] = kNoiseDataStds[i];
|
||||
inst->speech_stds[i] = kSpeechDataStds[i];
|
||||
}
|
||||
|
||||
// Index and Minimum value vectors are initialized
|
||||
for (i = 0; i < 16 * NUM_CHANNELS; i++)
|
||||
{
|
||||
inst->low_value_vector[i] = 10000;
|
||||
inst->index_vector[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < 5; i++)
|
||||
{
|
||||
inst->upper_state[i] = 0;
|
||||
inst->lower_state[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
inst->hp_filter_state[i] = 0;
|
||||
}
|
||||
|
||||
// Init mean value memory, for FindMin function
|
||||
inst->mean_value[0] = 1600;
|
||||
inst->mean_value[1] = 1600;
|
||||
inst->mean_value[2] = 1600;
|
||||
inst->mean_value[3] = 1600;
|
||||
inst->mean_value[4] = 1600;
|
||||
inst->mean_value[5] = 1600;
|
||||
|
||||
if (mode == 0)
|
||||
{
|
||||
// Quality mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_Q;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_Q;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_Q;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_Q;
|
||||
inst->total[1] = TOTAL_20MS_Q;
|
||||
inst->total[2] = TOTAL_30MS_Q;
|
||||
} else if (mode == 1)
|
||||
{
|
||||
// Low bitrate mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_LBR;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_LBR;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_LBR;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_LBR;
|
||||
inst->total[1] = TOTAL_20MS_LBR;
|
||||
inst->total[2] = TOTAL_30MS_LBR;
|
||||
} else if (mode == 2)
|
||||
{
|
||||
// Aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_AGG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_AGG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_AGG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_AGG;
|
||||
inst->total[1] = TOTAL_20MS_AGG;
|
||||
inst->total[2] = TOTAL_30MS_AGG;
|
||||
} else
|
||||
{
|
||||
// Very aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_VAG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_VAG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_VAG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_VAG;
|
||||
inst->total[1] = TOTAL_20MS_VAG;
|
||||
inst->total[2] = TOTAL_30MS_VAG;
|
||||
}
|
||||
|
||||
inst->init_flag = kInitCheck;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Set aggressiveness mode
|
||||
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
|
||||
{
|
||||
|
||||
if (mode == 0)
|
||||
{
|
||||
// Quality mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_Q;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_Q;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_Q;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_Q;
|
||||
inst->total[1] = TOTAL_20MS_Q;
|
||||
inst->total[2] = TOTAL_30MS_Q;
|
||||
} else if (mode == 1)
|
||||
{
|
||||
// Low bitrate mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_LBR;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_LBR;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_LBR;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_LBR;
|
||||
inst->total[1] = TOTAL_20MS_LBR;
|
||||
inst->total[2] = TOTAL_30MS_LBR;
|
||||
} else if (mode == 2)
|
||||
{
|
||||
// Aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_AGG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_AGG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_AGG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_AGG;
|
||||
inst->total[1] = TOTAL_20MS_AGG;
|
||||
inst->total[2] = TOTAL_30MS_AGG;
|
||||
} else if (mode == 3)
|
||||
{
|
||||
// Very aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_VAG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_VAG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_VAG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_VAG;
|
||||
inst->total[1] = TOTAL_20MS_VAG;
|
||||
inst->total[2] = TOTAL_30MS_VAG;
|
||||
} else
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Calculate VAD decision by first extracting feature values and then calculate
|
||||
// probability for both speech and background noise.
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
|
||||
int frame_length)
|
||||
{
|
||||
WebRtc_Word16 len, vad;
|
||||
WebRtc_Word16 speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
|
||||
WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
|
||||
|
||||
|
||||
// Downsample signal 32->16->8 before doing VAD
|
||||
WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
|
||||
frame_length);
|
||||
len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
|
||||
|
||||
WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
|
||||
len = WEBRTC_SPL_RSHIFT_W16(len, 1);
|
||||
|
||||
// Do VAD on an 8 kHz signal
|
||||
vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
|
||||
|
||||
return vad;
|
||||
}
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 length;
    WebRtc_Word16 speech_nb[240]; // Downsampled speech frame: 480 samples (30ms in WB)

    // Wideband: downsample signal 16->8 kHz before doing VAD.
    WebRtcVad_Downsampling(speech_frame, speech_nb, inst->downsampling_filter_states,
                           frame_length);
    length = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    // The VAD decision is made on the 8 kHz signal.
    return WebRtcVad_CalcVad8khz(inst, speech_nb, length);
}
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                    int frame_length)
{
    WebRtc_Word16 feature_vector[NUM_CHANNELS];
    WebRtc_Word16 total_power;

    // Split the frame into frequency bands and extract log-energy features.
    total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                              feature_vector);

    // Run the GMM hypothesis test and cache the decision on the instance.
    inst->vad = WebRtcVad_GmmProbability(inst, feature_vector, total_power,
                                         frame_length);

    return inst->vad;
}
|
||||
|
||||
// Calculate probability for both speech and background noise, and perform a
|
||||
// hypothesis-test.
|
||||
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
WebRtc_Word16 total_power, int frame_length)
|
||||
// Calculates the probabilities for both speech and background noise using
|
||||
// Gaussian Mixture Models. A hypothesis-test is performed to decide which type
|
||||
// of signal is most probable.
|
||||
//
|
||||
// - inst [i/o] : Pointer to VAD instance
|
||||
// - feature_vector [i] : Feature vector = log10(energy in frequency band)
|
||||
// - total_power [i] : Total power in audio frame.
|
||||
// - frame_length [i] : Number of input samples
|
||||
//
|
||||
// - returns : the VAD decision (0 - noise, 1 - speech).
|
||||
static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
WebRtc_Word16 total_power, int frame_length)
|
||||
{
|
||||
int n, k;
|
||||
WebRtc_Word16 backval;
|
||||
@ -590,8 +336,8 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve
|
||||
tmp16 += 128; // Rounding
|
||||
ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8);
|
||||
// Division with 8 plus Q7
|
||||
if (ssk < MIN_STD)
|
||||
ssk = MIN_STD;
|
||||
if (ssk < kMinStd)
|
||||
ssk = kMinStd;
|
||||
*sstd2ptr = ssk;
|
||||
} else
|
||||
{
|
||||
@ -618,8 +364,8 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve
|
||||
tmp16 += 32; // Rounding
|
||||
nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6);
|
||||
|
||||
if (nsk < MIN_STD)
|
||||
nsk = MIN_STD;
|
||||
if (nsk < kMinStd)
|
||||
nsk = kMinStd;
|
||||
|
||||
*nstd2ptr = nsk;
|
||||
}
|
||||
@ -713,12 +459,209 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve
|
||||
} else
|
||||
{
|
||||
inst->num_of_speech = inst->num_of_speech + 1;
|
||||
if (inst->num_of_speech > NSP_MAX)
|
||||
if (inst->num_of_speech > kMaxSpeechFrames)
|
||||
{
|
||||
inst->num_of_speech = NSP_MAX;
|
||||
inst->num_of_speech = kMaxSpeechFrames;
|
||||
inst->over_hang = overhead2;
|
||||
} else
|
||||
inst->over_hang = overhead1;
|
||||
}
|
||||
return vadflag;
|
||||
}
|
||||
|
||||
// Initialize VAD
|
||||
int WebRtcVad_InitCore(VadInstT *inst, short mode)
|
||||
{
|
||||
int i;
|
||||
|
||||
// Initialization of struct
|
||||
inst->vad = 1;
|
||||
inst->frame_counter = 0;
|
||||
inst->over_hang = 0;
|
||||
inst->num_of_speech = 0;
|
||||
|
||||
// Initialization of downsampling filter state
|
||||
inst->downsampling_filter_states[0] = 0;
|
||||
inst->downsampling_filter_states[1] = 0;
|
||||
inst->downsampling_filter_states[2] = 0;
|
||||
inst->downsampling_filter_states[3] = 0;
|
||||
|
||||
// Read initial PDF parameters
|
||||
for (i = 0; i < NUM_TABLE_VALUES; i++)
|
||||
{
|
||||
inst->noise_means[i] = kNoiseDataMeans[i];
|
||||
inst->speech_means[i] = kSpeechDataMeans[i];
|
||||
inst->noise_stds[i] = kNoiseDataStds[i];
|
||||
inst->speech_stds[i] = kSpeechDataStds[i];
|
||||
}
|
||||
|
||||
// Index and Minimum value vectors are initialized
|
||||
for (i = 0; i < 16 * NUM_CHANNELS; i++)
|
||||
{
|
||||
inst->low_value_vector[i] = 10000;
|
||||
inst->index_vector[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < 5; i++)
|
||||
{
|
||||
inst->upper_state[i] = 0;
|
||||
inst->lower_state[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
inst->hp_filter_state[i] = 0;
|
||||
}
|
||||
|
||||
// Init mean value memory, for FindMin function
|
||||
inst->mean_value[0] = 1600;
|
||||
inst->mean_value[1] = 1600;
|
||||
inst->mean_value[2] = 1600;
|
||||
inst->mean_value[3] = 1600;
|
||||
inst->mean_value[4] = 1600;
|
||||
inst->mean_value[5] = 1600;
|
||||
|
||||
if (WebRtcVad_set_mode_core(inst, mode) != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
inst->init_flag = kInitCheck;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Set aggressiveness mode
|
||||
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
|
||||
{
|
||||
|
||||
if (mode == 0)
|
||||
{
|
||||
// Quality mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_Q;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_Q;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_Q;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_Q;
|
||||
inst->total[1] = TOTAL_20MS_Q;
|
||||
inst->total[2] = TOTAL_30MS_Q;
|
||||
} else if (mode == 1)
|
||||
{
|
||||
// Low bitrate mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_LBR;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_LBR;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_LBR;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_LBR;
|
||||
inst->total[1] = TOTAL_20MS_LBR;
|
||||
inst->total[2] = TOTAL_30MS_LBR;
|
||||
} else if (mode == 2)
|
||||
{
|
||||
// Aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_AGG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_AGG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_AGG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_AGG;
|
||||
inst->total[1] = TOTAL_20MS_AGG;
|
||||
inst->total[2] = TOTAL_30MS_AGG;
|
||||
} else if (mode == 3)
|
||||
{
|
||||
// Very aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_VAG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_VAG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_VAG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_VAG;
|
||||
inst->total[1] = TOTAL_20MS_VAG;
|
||||
inst->total[2] = TOTAL_30MS_VAG;
|
||||
} else
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Calculate VAD decision by first extracting feature values and then calculate
|
||||
// probability for both speech and background noise.
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
|
||||
int frame_length)
|
||||
{
|
||||
WebRtc_Word16 len, vad;
|
||||
WebRtc_Word16 speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
|
||||
WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
|
||||
|
||||
|
||||
// Downsample signal 32->16->8 before doing VAD
|
||||
WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
|
||||
frame_length);
|
||||
len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
|
||||
|
||||
WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
|
||||
len = WEBRTC_SPL_RSHIFT_W16(len, 1);
|
||||
|
||||
// Do VAD on an 8 kHz signal
|
||||
vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
|
||||
|
||||
return vad;
|
||||
}
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 length;
    WebRtc_Word16 speech_nb[240]; // Downsampled speech frame: 480 samples (30ms in WB)

    // Wideband: downsample signal 16->8 kHz before doing VAD.
    WebRtcVad_Downsampling(speech_frame, speech_nb, inst->downsampling_filter_states,
                           frame_length);
    length = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    // The VAD decision is made on the 8 kHz signal.
    return WebRtcVad_CalcVad8khz(inst, speech_nb, length);
}
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                    int frame_length)
{
    WebRtc_Word16 feature_vector[NUM_CHANNELS];
    WebRtc_Word16 total_power;

    // Split the frame into frequency bands and extract log-energy features.
    total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                              feature_vector);

    // Run the GMM hypothesis test and cache the decision on the instance.
    inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);

    return inst->vad;
}
|
||||
|
||||
@ -13,8 +13,8 @@
|
||||
* This header file includes the descriptions of the core VAD calls.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_VAD_CORE_H_
|
||||
#define WEBRTC_VAD_CORE_H_
|
||||
#ifndef WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
||||
#define WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
||||
|
||||
#include "typedefs.h"
|
||||
#include "vad_defines.h"
|
||||
@ -112,24 +112,4 @@ WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT* inst, WebRtc_Word16* speech_frame
|
||||
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT* inst, WebRtc_Word16* speech_frame,
|
||||
int frame_length);
|
||||
|
||||
/****************************************************************************
|
||||
* WebRtcVad_GmmProbability(...)
|
||||
*
|
||||
* This function calculates the probabilities for background noise and
|
||||
* speech using Gaussian Mixture Models. A hypothesis-test is performed to decide
|
||||
* which type of signal is most probable.
|
||||
*
|
||||
* Input:
|
||||
* - inst : Pointer to VAD instance
|
||||
* - feature_vector : Feature vector = log10(energy in frequency band)
|
||||
* - total_power : Total power in frame.
|
||||
* - frame_length : Number of input samples
|
||||
*
|
||||
* Output:
|
||||
* VAD decision : 0 - noise, 1 - speech
|
||||
*
|
||||
*/
|
||||
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT* inst, WebRtc_Word16* feature_vector,
|
||||
WebRtc_Word16 total_power, int frame_length);
|
||||
|
||||
#endif // WEBRTC_VAD_CORE_H_
|
||||
#endif // WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
||||
|
||||
99
src/common_audio/vad/vad_core_unittest.cc
Normal file
99
src/common_audio/vad/vad_core_unittest.cc
Normal file
@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "typedefs.h"
|
||||
#include "vad_unittest.h"
|
||||
|
||||
extern "C" {
|
||||
#include "vad_core.h"
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
TEST_F(VadTest, InitCore) {
  VadInstT* inst = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));

  // TODO(bjornv): Add NULL pointer check if we take care of it in
  // vad_core.c

  // WebRtcVad_InitCore() should succeed for every supported mode.
  for (size_t i = 0; i < kModesSize; ++i) {
    EXPECT_EQ(0, WebRtcVad_InitCore(inst, kModes[i]));
  }

  free(inst);
}
|
||||
|
||||
TEST_F(VadTest, set_mode_core) {
  VadInstT* inst = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));

  // TODO(bjornv): Add NULL pointer check if we take care of it in
  // vad_core.c

  ASSERT_EQ(0, WebRtcVad_InitCore(inst, 0));

  // WebRtcVad_set_mode_core() should reject out-of-range modes...
  EXPECT_EQ(-1, WebRtcVad_set_mode_core(inst, -1));
  EXPECT_EQ(-1, WebRtcVad_set_mode_core(inst, (short) kModesSize));
  // ...and accept every supported mode.
  for (size_t i = 0; i < kModesSize; ++i) {
    EXPECT_EQ(0, WebRtcVad_set_mode_core(inst, kModes[i]));
  }

  free(inst);
}
|
||||
|
||||
TEST_F(VadTest, CalcVad) {
  VadInstT* inst = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));
  int16_t speech[kMaxFrameLength];

  // TODO(bjornv): Add NULL pointer check if we take care of it in
  // vad_core.c

  // An all-zero frame must give VAD = 0 at every valid rate/length combo.
  memset(speech, 0, sizeof(speech));
  ASSERT_EQ(0, WebRtcVad_InitCore(inst, 0));
  for (size_t i = 0; i < kFrameLengthsSize; ++i) {
    if (ValidRatesAndFrameLengths(8000, kFrameLengths[i])) {
      EXPECT_EQ(0, WebRtcVad_CalcVad8khz(inst, speech, kFrameLengths[i]));
    }
    if (ValidRatesAndFrameLengths(16000, kFrameLengths[i])) {
      EXPECT_EQ(0, WebRtcVad_CalcVad16khz(inst, speech, kFrameLengths[i]));
    }
    if (ValidRatesAndFrameLengths(32000, kFrameLengths[i])) {
      EXPECT_EQ(0, WebRtcVad_CalcVad32khz(inst, speech, kFrameLengths[i]));
    }
  }

  // Construct a speech signal that will trigger the VAD in all modes. It is
  // known that (i * i) will wrap around, but that doesn't matter in this case.
  for (int16_t i = 0; i < kMaxFrameLength; ++i) {
    speech[i] = (i * i);
  }
  for (size_t i = 0; i < kFrameLengthsSize; ++i) {
    if (ValidRatesAndFrameLengths(8000, kFrameLengths[i])) {
      EXPECT_EQ(1, WebRtcVad_CalcVad8khz(inst, speech, kFrameLengths[i]));
    }
    if (ValidRatesAndFrameLengths(16000, kFrameLengths[i])) {
      EXPECT_EQ(1, WebRtcVad_CalcVad16khz(inst, speech, kFrameLengths[i]));
    }
    if (ValidRatesAndFrameLengths(32000, kFrameLengths[i])) {
      EXPECT_EQ(1, WebRtcVad_CalcVad32khz(inst, speech, kFrameLengths[i]));
    }
  }

  free(inst);
}
|
||||
} // namespace
|
||||
@ -23,8 +23,6 @@
|
||||
#define MIN_ENERGY 10
|
||||
#define ALPHA1 6553 // 0.2 in Q15
|
||||
#define ALPHA2 32439 // 0.99 in Q15
|
||||
#define NSP_MAX 6 // Maximum number of VAD=1 frames in a row counted
|
||||
#define MIN_STD 384 // Minimum standard deviation
|
||||
// Mode 0, Quality thresholds - Different thresholds for the different frame lengths
|
||||
#define INDIVIDUAL_10MS_Q 24
|
||||
#define INDIVIDUAL_20MS_Q 21 // (log10(2)*66)<<2 ~=16
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user