Refactor VAD: Code restructure

- Tests added for vad_core.
- Replaced two macros with constants.
- Made an internal function static.
- Replaced replicated code with function call.
Review URL: https://webrtc-codereview.appspot.com/354001

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1444 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
bjornv@webrtc.org 2012-01-17 14:37:59 +00:00
parent 38e0a771d2
commit 4259fd725c
5 changed files with 326 additions and 305 deletions

View File

@ -49,10 +49,11 @@
'<(webrtc_root)/../testing/gtest.gyp:gtest',
],
'sources': [
'vad_unittest.cc',
'vad_core_unittest.cc',
'vad_filterbank_unittest.cc',
'vad_gmm_unittest.cc',
'vad_sp_unittest.cc',
'vad_unittest.cc',
'vad_unittest.h',
],
}, # vad_unittests

View File

@ -8,12 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* This file includes the implementation of the core functionality in VAD.
* For function description, see vad_core.h.
*/
#include "vad_core.h"
#include "signal_processing_library.h"
@ -60,275 +54,27 @@ static const WebRtc_Word16 kNoiseDataStds[12] = {
static const WebRtc_Word16 kSpeechDataStds[12] = {
555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = 6;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = 384;
static const int kInitCheck = 42;
// Resets all state of the VAD instance |inst| to its start-up values and loads
// the thresholds for the requested aggressiveness |mode|. Modes 0, 1 and 2
// select quality, low bitrate and aggressive settings; any other value selects
// the very aggressive settings. Always returns 0.
int WebRtcVad_InitCore(VadInstT *inst, short mode)
{
    int k;

    // Decision state.
    inst->vad = 1;
    inst->frame_counter = 0;
    inst->over_hang = 0;
    inst->num_of_speech = 0;

    // Downsampling filter state.
    for (k = 0; k < 4; k++)
    {
        inst->downsampling_filter_states[k] = 0;
    }

    // Initial PDF parameters, copied from the constant tables.
    for (k = 0; k < NUM_TABLE_VALUES; k++)
    {
        inst->noise_means[k] = kNoiseDataMeans[k];
        inst->speech_means[k] = kSpeechDataMeans[k];
        inst->noise_stds[k] = kNoiseDataStds[k];
        inst->speech_stds[k] = kSpeechDataStds[k];
    }

    // Index and minimum value vectors.
    for (k = 0; k < 16 * NUM_CHANNELS; k++)
    {
        inst->low_value_vector[k] = 10000;
        inst->index_vector[k] = 0;
    }

    for (k = 0; k < 5; k++)
    {
        inst->upper_state[k] = 0;
        inst->lower_state[k] = 0;
    }

    for (k = 0; k < 4; k++)
    {
        inst->hp_filter_state[k] = 0;
    }

    // Mean value memory, for the FindMin function.
    for (k = 0; k < 6; k++)
    {
        inst->mean_value[k] = 1600;
    }

    // over_hang_max_1 is the overhang for short speech bursts, over_hang_max_2
    // the overhang for long speech bursts.
    switch (mode)
    {
        case 0:
            // Quality mode
            inst->over_hang_max_1[0] = OHMAX1_10MS_Q;
            inst->over_hang_max_1[1] = OHMAX1_20MS_Q;
            inst->over_hang_max_1[2] = OHMAX1_30MS_Q;
            inst->over_hang_max_2[0] = OHMAX2_10MS_Q;
            inst->over_hang_max_2[1] = OHMAX2_20MS_Q;
            inst->over_hang_max_2[2] = OHMAX2_30MS_Q;
            inst->individual[0] = INDIVIDUAL_10MS_Q;
            inst->individual[1] = INDIVIDUAL_20MS_Q;
            inst->individual[2] = INDIVIDUAL_30MS_Q;
            inst->total[0] = TOTAL_10MS_Q;
            inst->total[1] = TOTAL_20MS_Q;
            inst->total[2] = TOTAL_30MS_Q;
            break;
        case 1:
            // Low bitrate mode
            inst->over_hang_max_1[0] = OHMAX1_10MS_LBR;
            inst->over_hang_max_1[1] = OHMAX1_20MS_LBR;
            inst->over_hang_max_1[2] = OHMAX1_30MS_LBR;
            inst->over_hang_max_2[0] = OHMAX2_10MS_LBR;
            inst->over_hang_max_2[1] = OHMAX2_20MS_LBR;
            inst->over_hang_max_2[2] = OHMAX2_30MS_LBR;
            inst->individual[0] = INDIVIDUAL_10MS_LBR;
            inst->individual[1] = INDIVIDUAL_20MS_LBR;
            inst->individual[2] = INDIVIDUAL_30MS_LBR;
            inst->total[0] = TOTAL_10MS_LBR;
            inst->total[1] = TOTAL_20MS_LBR;
            inst->total[2] = TOTAL_30MS_LBR;
            break;
        case 2:
            // Aggressive mode
            inst->over_hang_max_1[0] = OHMAX1_10MS_AGG;
            inst->over_hang_max_1[1] = OHMAX1_20MS_AGG;
            inst->over_hang_max_1[2] = OHMAX1_30MS_AGG;
            inst->over_hang_max_2[0] = OHMAX2_10MS_AGG;
            inst->over_hang_max_2[1] = OHMAX2_20MS_AGG;
            inst->over_hang_max_2[2] = OHMAX2_30MS_AGG;
            inst->individual[0] = INDIVIDUAL_10MS_AGG;
            inst->individual[1] = INDIVIDUAL_20MS_AGG;
            inst->individual[2] = INDIVIDUAL_30MS_AGG;
            inst->total[0] = TOTAL_10MS_AGG;
            inst->total[1] = TOTAL_20MS_AGG;
            inst->total[2] = TOTAL_30MS_AGG;
            break;
        default:
            // Very aggressive mode (also used for every unrecognized mode)
            inst->over_hang_max_1[0] = OHMAX1_10MS_VAG;
            inst->over_hang_max_1[1] = OHMAX1_20MS_VAG;
            inst->over_hang_max_1[2] = OHMAX1_30MS_VAG;
            inst->over_hang_max_2[0] = OHMAX2_10MS_VAG;
            inst->over_hang_max_2[1] = OHMAX2_20MS_VAG;
            inst->over_hang_max_2[2] = OHMAX2_30MS_VAG;
            inst->individual[0] = INDIVIDUAL_10MS_VAG;
            inst->individual[1] = INDIVIDUAL_20MS_VAG;
            inst->individual[2] = INDIVIDUAL_30MS_VAG;
            inst->total[0] = TOTAL_10MS_VAG;
            inst->total[1] = TOTAL_20MS_VAG;
            inst->total[2] = TOTAL_30MS_VAG;
            break;
    }

    // Mark the instance as initialized.
    inst->init_flag = kInitCheck;

    return 0;
}
// Loads the overhang limits and decision thresholds for aggressiveness |mode|
// into |inst|. Returns 0 for the modes 0..3 and -1 (leaving |inst| untouched)
// for any other value.
//
// over_hang_max_1 is the overhang for short speech bursts, over_hang_max_2 the
// overhang for long speech bursts.
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
{
    switch (mode)
    {
        case 0:
            // Quality mode
            inst->over_hang_max_1[0] = OHMAX1_10MS_Q;
            inst->over_hang_max_1[1] = OHMAX1_20MS_Q;
            inst->over_hang_max_1[2] = OHMAX1_30MS_Q;
            inst->over_hang_max_2[0] = OHMAX2_10MS_Q;
            inst->over_hang_max_2[1] = OHMAX2_20MS_Q;
            inst->over_hang_max_2[2] = OHMAX2_30MS_Q;
            inst->individual[0] = INDIVIDUAL_10MS_Q;
            inst->individual[1] = INDIVIDUAL_20MS_Q;
            inst->individual[2] = INDIVIDUAL_30MS_Q;
            inst->total[0] = TOTAL_10MS_Q;
            inst->total[1] = TOTAL_20MS_Q;
            inst->total[2] = TOTAL_30MS_Q;
            break;
        case 1:
            // Low bitrate mode
            inst->over_hang_max_1[0] = OHMAX1_10MS_LBR;
            inst->over_hang_max_1[1] = OHMAX1_20MS_LBR;
            inst->over_hang_max_1[2] = OHMAX1_30MS_LBR;
            inst->over_hang_max_2[0] = OHMAX2_10MS_LBR;
            inst->over_hang_max_2[1] = OHMAX2_20MS_LBR;
            inst->over_hang_max_2[2] = OHMAX2_30MS_LBR;
            inst->individual[0] = INDIVIDUAL_10MS_LBR;
            inst->individual[1] = INDIVIDUAL_20MS_LBR;
            inst->individual[2] = INDIVIDUAL_30MS_LBR;
            inst->total[0] = TOTAL_10MS_LBR;
            inst->total[1] = TOTAL_20MS_LBR;
            inst->total[2] = TOTAL_30MS_LBR;
            break;
        case 2:
            // Aggressive mode
            inst->over_hang_max_1[0] = OHMAX1_10MS_AGG;
            inst->over_hang_max_1[1] = OHMAX1_20MS_AGG;
            inst->over_hang_max_1[2] = OHMAX1_30MS_AGG;
            inst->over_hang_max_2[0] = OHMAX2_10MS_AGG;
            inst->over_hang_max_2[1] = OHMAX2_20MS_AGG;
            inst->over_hang_max_2[2] = OHMAX2_30MS_AGG;
            inst->individual[0] = INDIVIDUAL_10MS_AGG;
            inst->individual[1] = INDIVIDUAL_20MS_AGG;
            inst->individual[2] = INDIVIDUAL_30MS_AGG;
            inst->total[0] = TOTAL_10MS_AGG;
            inst->total[1] = TOTAL_20MS_AGG;
            inst->total[2] = TOTAL_30MS_AGG;
            break;
        case 3:
            // Very aggressive mode
            inst->over_hang_max_1[0] = OHMAX1_10MS_VAG;
            inst->over_hang_max_1[1] = OHMAX1_20MS_VAG;
            inst->over_hang_max_1[2] = OHMAX1_30MS_VAG;
            inst->over_hang_max_2[0] = OHMAX2_10MS_VAG;
            inst->over_hang_max_2[1] = OHMAX2_20MS_VAG;
            inst->over_hang_max_2[2] = OHMAX2_30MS_VAG;
            inst->individual[0] = INDIVIDUAL_10MS_VAG;
            inst->individual[1] = INDIVIDUAL_20MS_VAG;
            inst->individual[2] = INDIVIDUAL_30MS_VAG;
            inst->total[0] = TOTAL_10MS_VAG;
            inst->total[1] = TOTAL_20MS_VAG;
            inst->total[2] = TOTAL_30MS_VAG;
            break;
        default:
            // Unknown mode: report failure without modifying the instance.
            return -1;
    }

    return 0;
}
// Computes the VAD decision for a 32 kHz frame. The frame is downsampled in
// two stages (32 -> 16 -> 8 kHz, the length being halved after each stage) and
// the result is passed on to WebRtcVad_CalcVad8khz(), whose decision is
// returned.
//
// NOTE(review): |frame_length| is not range checked here; the local buffers
// hold 480 and 240 samples, i.e. room for a 960 sample input - confirm that
// callers validate the length.
WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 wideband[480];    // Output of the first stage (16 kHz).
    WebRtc_Word16 narrowband[240];  // Output of the second stage (8 kHz).
    WebRtc_Word16 wideband_length;
    WebRtc_Word16 narrowband_length;

    // First stage uses the upper half of the downsampling filter states.
    WebRtcVad_Downsampling(speech_frame, wideband,
                           &(inst->downsampling_filter_states[2]), frame_length);
    wideband_length = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    // Second stage uses the lower half of the downsampling filter states.
    WebRtcVad_Downsampling(wideband, narrowband, inst->downsampling_filter_states,
                           wideband_length);
    narrowband_length = WEBRTC_SPL_RSHIFT_W16(wideband_length, 1);

    // Do VAD on the 8 kHz signal.
    return WebRtcVad_CalcVad8khz(inst, narrowband, narrowband_length);
}
// Computes the VAD decision for a 16 kHz (wideband) frame. The frame is
// downsampled once, its length is halved, and the result is passed on to
// WebRtcVad_CalcVad8khz(), whose decision is returned.
//
// NOTE(review): |frame_length| is not range checked here; the local buffer
// holds 240 samples, i.e. room for a 480 sample input - confirm that callers
// validate the length.
WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 narrowband[240];  // Downsampled frame.
    WebRtc_Word16 narrowband_length;

    WebRtcVad_Downsampling(speech_frame, narrowband, inst->downsampling_filter_states,
                           frame_length);
    narrowband_length = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    return WebRtcVad_CalcVad8khz(inst, narrowband, narrowband_length);
}
// Computes the VAD decision for an 8 kHz frame: extracts the per-band features
// and the total power, runs the GMM based hypothesis test on them, stores the
// outcome in |inst->vad| and returns it.
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                    int frame_length)
{
    WebRtc_Word16 band_features[NUM_CHANNELS];
    WebRtc_Word16 frame_power;

    // Power in the bands; the return value is the total power.
    frame_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                              band_features);

    // The decision is kept in the instance as well as returned.
    inst->vad = WebRtcVad_GmmProbability(inst, band_features, frame_power,
                                         frame_length);

    return inst->vad;
}
// Calculate probability for both speech and background noise, and perform a
// hypothesis-test.
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
WebRtc_Word16 total_power, int frame_length)
// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models. A hypothesis-test is performed to decide which type
// of signal is most probable.
//
// - inst [i/o] : Pointer to VAD instance
// - feature_vector [i] : Feature vector = log10(energy in frequency band)
// - total_power [i] : Total power in audio frame.
// - frame_length [i] : Number of input samples
//
// - returns : the VAD decision (0 - noise, 1 - speech).
static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
WebRtc_Word16 total_power, int frame_length)
{
int n, k;
WebRtc_Word16 backval;
@ -590,8 +336,8 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve
tmp16 += 128; // Rounding
ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8);
// Division with 8 plus Q7
if (ssk < MIN_STD)
ssk = MIN_STD;
if (ssk < kMinStd)
ssk = kMinStd;
*sstd2ptr = ssk;
} else
{
@ -618,8 +364,8 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve
tmp16 += 32; // Rounding
nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6);
if (nsk < MIN_STD)
nsk = MIN_STD;
if (nsk < kMinStd)
nsk = kMinStd;
*nstd2ptr = nsk;
}
@ -713,12 +459,209 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve
} else
{
inst->num_of_speech = inst->num_of_speech + 1;
if (inst->num_of_speech > NSP_MAX)
if (inst->num_of_speech > kMaxSpeechFrames)
{
inst->num_of_speech = NSP_MAX;
inst->num_of_speech = kMaxSpeechFrames;
inst->over_hang = overhead2;
} else
inst->over_hang = overhead1;
}
return vadflag;
}
// Resets all state of the VAD instance |inst| to its start-up values and then
// applies the aggressiveness |mode| through WebRtcVad_set_mode_core().
//
// Returns 0 on success. Returns -1 if WebRtcVad_set_mode_core() rejects
// |mode|; in that case the state has been reset but |init_flag| is not set.
int WebRtcVad_InitCore(VadInstT *inst, short mode)
{
    int k;

    // Decision state.
    inst->vad = 1;
    inst->frame_counter = 0;
    inst->over_hang = 0;
    inst->num_of_speech = 0;

    // Downsampling filter state.
    for (k = 0; k < 4; k++)
    {
        inst->downsampling_filter_states[k] = 0;
    }

    // Initial PDF parameters, copied from the constant tables.
    for (k = 0; k < NUM_TABLE_VALUES; k++)
    {
        inst->noise_means[k] = kNoiseDataMeans[k];
        inst->speech_means[k] = kSpeechDataMeans[k];
        inst->noise_stds[k] = kNoiseDataStds[k];
        inst->speech_stds[k] = kSpeechDataStds[k];
    }

    // Index and minimum value vectors.
    for (k = 0; k < 16 * NUM_CHANNELS; k++)
    {
        inst->low_value_vector[k] = 10000;
        inst->index_vector[k] = 0;
    }

    for (k = 0; k < 5; k++)
    {
        inst->upper_state[k] = 0;
        inst->lower_state[k] = 0;
    }

    for (k = 0; k < 4; k++)
    {
        inst->hp_filter_state[k] = 0;
    }

    // Mean value memory, for the FindMin function.
    for (k = 0; k < 6; k++)
    {
        inst->mean_value[k] = 1600;
    }

    // Mode dependent thresholds; an invalid mode aborts the initialization.
    if (WebRtcVad_set_mode_core(inst, mode) != 0)
    {
        return -1;
    }

    // Mark the instance as initialized.
    inst->init_flag = kInitCheck;

    return 0;
}
// Loads the overhang limits and decision thresholds for aggressiveness |mode|
// into |inst|. Returns 0 for the modes 0..3 and -1 (leaving |inst| untouched)
// for any other value.
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
{
    // Each table has one row per mode (0 = quality, 1 = low bitrate,
    // 2 = aggressive, 3 = very aggressive) and one column per entry of the
    // corresponding instance array (the 10MS, 20MS and 30MS constants).

    // Overhang, short speech burst.
    static const int kOverhangMax1[4][3] = {
        { OHMAX1_10MS_Q, OHMAX1_20MS_Q, OHMAX1_30MS_Q },
        { OHMAX1_10MS_LBR, OHMAX1_20MS_LBR, OHMAX1_30MS_LBR },
        { OHMAX1_10MS_AGG, OHMAX1_20MS_AGG, OHMAX1_30MS_AGG },
        { OHMAX1_10MS_VAG, OHMAX1_20MS_VAG, OHMAX1_30MS_VAG }
    };
    // Overhang, long speech burst.
    static const int kOverhangMax2[4][3] = {
        { OHMAX2_10MS_Q, OHMAX2_20MS_Q, OHMAX2_30MS_Q },
        { OHMAX2_10MS_LBR, OHMAX2_20MS_LBR, OHMAX2_30MS_LBR },
        { OHMAX2_10MS_AGG, OHMAX2_20MS_AGG, OHMAX2_30MS_AGG },
        { OHMAX2_10MS_VAG, OHMAX2_20MS_VAG, OHMAX2_30MS_VAG }
    };
    static const int kIndividual[4][3] = {
        { INDIVIDUAL_10MS_Q, INDIVIDUAL_20MS_Q, INDIVIDUAL_30MS_Q },
        { INDIVIDUAL_10MS_LBR, INDIVIDUAL_20MS_LBR, INDIVIDUAL_30MS_LBR },
        { INDIVIDUAL_10MS_AGG, INDIVIDUAL_20MS_AGG, INDIVIDUAL_30MS_AGG },
        { INDIVIDUAL_10MS_VAG, INDIVIDUAL_20MS_VAG, INDIVIDUAL_30MS_VAG }
    };
    static const int kTotal[4][3] = {
        { TOTAL_10MS_Q, TOTAL_20MS_Q, TOTAL_30MS_Q },
        { TOTAL_10MS_LBR, TOTAL_20MS_LBR, TOTAL_30MS_LBR },
        { TOTAL_10MS_AGG, TOTAL_20MS_AGG, TOTAL_30MS_AGG },
        { TOTAL_10MS_VAG, TOTAL_20MS_VAG, TOTAL_30MS_VAG }
    };
    int k;

    // Only the modes 0..3 exist; reject everything else before writing.
    if (mode < 0 || mode > 3)
    {
        return -1;
    }

    for (k = 0; k < 3; k++)
    {
        inst->over_hang_max_1[k] = kOverhangMax1[mode][k];
        inst->over_hang_max_2[k] = kOverhangMax2[mode][k];
        inst->individual[k] = kIndividual[mode][k];
        inst->total[k] = kTotal[mode][k];
    }

    return 0;
}
// Computes the VAD decision for a 32 kHz frame. The frame is downsampled in
// two stages (32 -> 16 -> 8 kHz, the length being halved after each stage) and
// the result is passed on to WebRtcVad_CalcVad8khz(), whose decision is
// returned.
//
// NOTE(review): |frame_length| is not range checked here; the local buffers
// hold 480 and 240 samples, i.e. room for a 960 sample input - confirm that
// callers validate the length.
WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 wideband[480];    // Output of the first stage (16 kHz).
    WebRtc_Word16 narrowband[240];  // Output of the second stage (8 kHz).
    WebRtc_Word16 wideband_length;
    WebRtc_Word16 narrowband_length;

    // First stage uses the upper half of the downsampling filter states.
    WebRtcVad_Downsampling(speech_frame, wideband,
                           &(inst->downsampling_filter_states[2]), frame_length);
    wideband_length = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    // Second stage uses the lower half of the downsampling filter states.
    WebRtcVad_Downsampling(wideband, narrowband, inst->downsampling_filter_states,
                           wideband_length);
    narrowband_length = WEBRTC_SPL_RSHIFT_W16(wideband_length, 1);

    // Do VAD on the 8 kHz signal.
    return WebRtcVad_CalcVad8khz(inst, narrowband, narrowband_length);
}
// Computes the VAD decision for a 16 kHz (wideband) frame. The frame is
// downsampled once, its length is halved, and the result is passed on to
// WebRtcVad_CalcVad8khz(), whose decision is returned.
//
// NOTE(review): |frame_length| is not range checked here; the local buffer
// holds 240 samples, i.e. room for a 480 sample input - confirm that callers
// validate the length.
WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 narrowband[240];  // Downsampled frame.
    WebRtc_Word16 narrowband_length;

    WebRtcVad_Downsampling(speech_frame, narrowband, inst->downsampling_filter_states,
                           frame_length);
    narrowband_length = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    return WebRtcVad_CalcVad8khz(inst, narrowband, narrowband_length);
}
// Computes the VAD decision for an 8 kHz frame: extracts the per-band features
// and the total power, runs the GMM based hypothesis test on them, stores the
// outcome in |inst->vad| and returns it.
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                    int frame_length)
{
    WebRtc_Word16 band_features[NUM_CHANNELS];
    WebRtc_Word16 frame_power;

    // Power in the bands; the return value is the total power.
    frame_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                              band_features);

    // The decision is kept in the instance as well as returned.
    inst->vad = GmmProbability(inst, band_features, frame_power, frame_length);

    return inst->vad;
}

View File

@ -13,8 +13,8 @@
* This header file includes the descriptions of the core VAD calls.
*/
#ifndef WEBRTC_VAD_CORE_H_
#define WEBRTC_VAD_CORE_H_
#ifndef WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
#define WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
#include "typedefs.h"
#include "vad_defines.h"
@ -112,24 +112,4 @@ WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT* inst, WebRtc_Word16* speech_frame
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT* inst, WebRtc_Word16* speech_frame,
int frame_length);
/****************************************************************************
* WebRtcVad_GmmProbability(...)
*
* This function calculates the probabilities for background noise and
* speech using Gaussian Mixture Models. A hypothesis-test is performed to decide
* which type of signal is most probable.
*
* Input:
* - inst : Pointer to VAD instance
* - feature_vector : Feature vector = log10(energy in frequency band)
* - total_power : Total power in frame.
* - frame_length : Number of input samples
*
* Output:
* VAD decision : 0 - noise, 1 - speech
*
*/
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT* inst, WebRtc_Word16* feature_vector,
WebRtc_Word16 total_power, int frame_length);
#endif // WEBRTC_VAD_CORE_H_
#endif // WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_

View File

@ -0,0 +1,99 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include <string.h>

#include "gtest/gtest.h"
#include "typedefs.h"
#include "vad_unittest.h"

extern "C" {
#include "vad_core.h"
}
namespace {
// Verifies that WebRtcVad_InitCore() returns 0 for every mode in kModes.
TEST_F(VadTest, InitCore) {
  VadInstT* self = static_cast<VadInstT*>(malloc(sizeof(VadInstT)));
  // WebRtcVad_InitCore() writes through the pointer, so stop the test here if
  // the allocation failed.
  ASSERT_TRUE(self != NULL);

  // TODO(bjornv): Add NULL pointer check if we take care of it in
  // vad_core.c

  // Test WebRtcVad_InitCore().
  // Verify return = 0 for all modes.
  for (size_t j = 0; j < kModesSize; ++j) {
    EXPECT_EQ(0, WebRtcVad_InitCore(self, kModes[j]));
  }

  free(self);
}
// Verifies that WebRtcVad_set_mode_core() returns -1 for modes just outside
// the valid range and 0 for every mode in kModes.
TEST_F(VadTest, set_mode_core) {
  VadInstT* self = static_cast<VadInstT*>(malloc(sizeof(VadInstT)));
  // The calls below write through the pointer, so stop the test here if the
  // allocation failed.
  ASSERT_TRUE(self != NULL);

  // TODO(bjornv): Add NULL pointer check if we take care of it in
  // vad_core.c

  ASSERT_EQ(0, WebRtcVad_InitCore(self, 0));

  // Test WebRtcVad_set_mode_core().
  // Invalid modes should return -1.
  EXPECT_EQ(-1, WebRtcVad_set_mode_core(self, -1));
  EXPECT_EQ(-1, WebRtcVad_set_mode_core(self, static_cast<short>(kModesSize)));

  // Valid modes should return 0.
  for (size_t j = 0; j < kModesSize; ++j) {
    EXPECT_EQ(0, WebRtcVad_set_mode_core(self, kModes[j]));
  }

  free(self);
}
// Verifies the WebRtcVad_CalcVadXXkhz() functions for every valid combination
// of sample rate and frame length: an all-zero frame must give VAD = 0 and a
// constructed non-zero frame must give VAD = 1.
TEST_F(VadTest, CalcVad) {
  VadInstT* self = static_cast<VadInstT*>(malloc(sizeof(VadInstT)));
  // The calls below write through the pointer, so stop the test here if the
  // allocation failed.
  ASSERT_TRUE(self != NULL);
  int16_t speech[kMaxFrameLength];

  // TODO(bjornv): Add NULL pointer check if we take care of it in
  // vad_core.c

  // Test WebRtcVad_CalcVadXXkhz()
  // Verify that an all-zero input gives VAD = 0.
  memset(speech, 0, sizeof(speech));
  ASSERT_EQ(0, WebRtcVad_InitCore(self, 0));
  for (size_t j = 0; j < kFrameLengthsSize; ++j) {
    if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) {
      EXPECT_EQ(0, WebRtcVad_CalcVad8khz(self, speech, kFrameLengths[j]));
    }
    if (ValidRatesAndFrameLengths(16000, kFrameLengths[j])) {
      EXPECT_EQ(0, WebRtcVad_CalcVad16khz(self, speech, kFrameLengths[j]));
    }
    if (ValidRatesAndFrameLengths(32000, kFrameLengths[j])) {
      EXPECT_EQ(0, WebRtcVad_CalcVad32khz(self, speech, kFrameLengths[j]));
    }
  }

  // Construct a speech signal that will trigger the VAD in all modes. It is
  // known that (i * i) will wrap around when stored as int16_t, but that
  // doesn't matter in this case; the cast makes the narrowing explicit.
  for (int16_t i = 0; i < kMaxFrameLength; ++i) {
    speech[i] = static_cast<int16_t>(i * i);
  }
  // The instance is deliberately not re-initialized between the two passes.
  for (size_t j = 0; j < kFrameLengthsSize; ++j) {
    if (ValidRatesAndFrameLengths(8000, kFrameLengths[j])) {
      EXPECT_EQ(1, WebRtcVad_CalcVad8khz(self, speech, kFrameLengths[j]));
    }
    if (ValidRatesAndFrameLengths(16000, kFrameLengths[j])) {
      EXPECT_EQ(1, WebRtcVad_CalcVad16khz(self, speech, kFrameLengths[j]));
    }
    if (ValidRatesAndFrameLengths(32000, kFrameLengths[j])) {
      EXPECT_EQ(1, WebRtcVad_CalcVad32khz(self, speech, kFrameLengths[j]));
    }
  }

  free(self);
}
} // namespace

View File

@ -23,8 +23,6 @@
#define MIN_ENERGY 10
#define ALPHA1 6553 // 0.2 in Q15
#define ALPHA2 32439 // 0.99 in Q15
#define NSP_MAX 6 // Maximum number of VAD=1 frames in a row counted
#define MIN_STD 384 // Minimum standard deviation
// Mode 0, Quality thresholds - Different thresholds for the different frame lengths
#define INDIVIDUAL_10MS_Q 24
#define INDIVIDUAL_20MS_Q 21 // (log10(2)*66)<<2 ~=16