Refactor VAD: Code restructure
- Tests added for vad_core. - Replaced two macros with constants. - Made an internal function static. - Replaced replicated code with function call. Review URL: https://webrtc-codereview.appspot.com/354001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1444 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
38e0a771d2
commit
4259fd725c
@ -49,10 +49,11 @@
|
||||
'<(webrtc_root)/../testing/gtest.gyp:gtest',
|
||||
],
|
||||
'sources': [
|
||||
'vad_unittest.cc',
|
||||
'vad_core_unittest.cc',
|
||||
'vad_filterbank_unittest.cc',
|
||||
'vad_gmm_unittest.cc',
|
||||
'vad_sp_unittest.cc',
|
||||
'vad_unittest.cc',
|
||||
'vad_unittest.h',
|
||||
],
|
||||
}, # vad_unittests
|
||||
|
||||
@ -8,12 +8,6 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* This file includes the implementation of the core functionality in VAD.
|
||||
* For function description, see vad_core.h.
|
||||
*/
|
||||
|
||||
#include "vad_core.h"
|
||||
|
||||
#include "signal_processing_library.h"
|
||||
@ -60,275 +54,27 @@ static const WebRtc_Word16 kNoiseDataStds[12] = {
|
||||
static const WebRtc_Word16 kSpeechDataStds[12] = {
|
||||
555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
|
||||
|
||||
// Constants used in GmmProbability().
|
||||
//
|
||||
// Maximum number of counted speech (VAD = 1) frames in a row.
|
||||
static const int16_t kMaxSpeechFrames = 6;
|
||||
// Minimum standard deviation for both speech and noise.
|
||||
static const int16_t kMinStd = 384;
|
||||
|
||||
static const int kInitCheck = 42;
|
||||
|
||||
// Initialize VAD
|
||||
int WebRtcVad_InitCore(VadInstT *inst, short mode)
|
||||
{
|
||||
int i;
|
||||
|
||||
// Initialization of struct
|
||||
inst->vad = 1;
|
||||
inst->frame_counter = 0;
|
||||
inst->over_hang = 0;
|
||||
inst->num_of_speech = 0;
|
||||
|
||||
// Initialization of downsampling filter state
|
||||
inst->downsampling_filter_states[0] = 0;
|
||||
inst->downsampling_filter_states[1] = 0;
|
||||
inst->downsampling_filter_states[2] = 0;
|
||||
inst->downsampling_filter_states[3] = 0;
|
||||
|
||||
// Read initial PDF parameters
|
||||
for (i = 0; i < NUM_TABLE_VALUES; i++)
|
||||
{
|
||||
inst->noise_means[i] = kNoiseDataMeans[i];
|
||||
inst->speech_means[i] = kSpeechDataMeans[i];
|
||||
inst->noise_stds[i] = kNoiseDataStds[i];
|
||||
inst->speech_stds[i] = kSpeechDataStds[i];
|
||||
}
|
||||
|
||||
// Index and Minimum value vectors are initialized
|
||||
for (i = 0; i < 16 * NUM_CHANNELS; i++)
|
||||
{
|
||||
inst->low_value_vector[i] = 10000;
|
||||
inst->index_vector[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < 5; i++)
|
||||
{
|
||||
inst->upper_state[i] = 0;
|
||||
inst->lower_state[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
inst->hp_filter_state[i] = 0;
|
||||
}
|
||||
|
||||
// Init mean value memory, for FindMin function
|
||||
inst->mean_value[0] = 1600;
|
||||
inst->mean_value[1] = 1600;
|
||||
inst->mean_value[2] = 1600;
|
||||
inst->mean_value[3] = 1600;
|
||||
inst->mean_value[4] = 1600;
|
||||
inst->mean_value[5] = 1600;
|
||||
|
||||
if (mode == 0)
|
||||
{
|
||||
// Quality mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_Q;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_Q;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_Q;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_Q;
|
||||
inst->total[1] = TOTAL_20MS_Q;
|
||||
inst->total[2] = TOTAL_30MS_Q;
|
||||
} else if (mode == 1)
|
||||
{
|
||||
// Low bitrate mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_LBR;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_LBR;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_LBR;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_LBR;
|
||||
inst->total[1] = TOTAL_20MS_LBR;
|
||||
inst->total[2] = TOTAL_30MS_LBR;
|
||||
} else if (mode == 2)
|
||||
{
|
||||
// Aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_AGG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_AGG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_AGG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_AGG;
|
||||
inst->total[1] = TOTAL_20MS_AGG;
|
||||
inst->total[2] = TOTAL_30MS_AGG;
|
||||
} else
|
||||
{
|
||||
// Very aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_VAG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_VAG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_VAG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_VAG;
|
||||
inst->total[1] = TOTAL_20MS_VAG;
|
||||
inst->total[2] = TOTAL_30MS_VAG;
|
||||
}
|
||||
|
||||
inst->init_flag = kInitCheck;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Set aggressiveness mode
|
||||
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
|
||||
{
|
||||
|
||||
if (mode == 0)
|
||||
{
|
||||
// Quality mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_Q;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_Q;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_Q;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_Q;
|
||||
inst->total[1] = TOTAL_20MS_Q;
|
||||
inst->total[2] = TOTAL_30MS_Q;
|
||||
} else if (mode == 1)
|
||||
{
|
||||
// Low bitrate mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_LBR;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_LBR;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_LBR;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_LBR;
|
||||
inst->total[1] = TOTAL_20MS_LBR;
|
||||
inst->total[2] = TOTAL_30MS_LBR;
|
||||
} else if (mode == 2)
|
||||
{
|
||||
// Aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_AGG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_AGG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_AGG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_AGG;
|
||||
inst->total[1] = TOTAL_20MS_AGG;
|
||||
inst->total[2] = TOTAL_30MS_AGG;
|
||||
} else if (mode == 3)
|
||||
{
|
||||
// Very aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_VAG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_VAG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_VAG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_VAG;
|
||||
inst->total[1] = TOTAL_20MS_VAG;
|
||||
inst->total[2] = TOTAL_30MS_VAG;
|
||||
} else
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Calculate VAD decision by first extracting feature values and then calculate
|
||||
// probability for both speech and background noise.
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
|
||||
int frame_length)
|
||||
{
|
||||
WebRtc_Word16 len, vad;
|
||||
WebRtc_Word16 speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
|
||||
WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
|
||||
|
||||
|
||||
// Downsample signal 32->16->8 before doing VAD
|
||||
WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
|
||||
frame_length);
|
||||
len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
|
||||
|
||||
WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
|
||||
len = WEBRTC_SPL_RSHIFT_W16(len, 1);
|
||||
|
||||
// Do VAD on an 8 kHz signal
|
||||
vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
|
||||
|
||||
return vad;
|
||||
}
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 length;
    WebRtc_Word16 speech_nb[240]; // Downsampled speech frame: 480 samples (30ms in WB)

    // Wideband: downsample signal 16->8 kHz before doing VAD.
    WebRtcVad_Downsampling(speech_frame, speech_nb, inst->downsampling_filter_states,
                           frame_length);
    length = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    // The VAD decision is made on the 8 kHz signal.
    return WebRtcVad_CalcVad8khz(inst, speech_nb, length);
}
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                    int frame_length)
{
    WebRtc_Word16 feature_vector[NUM_CHANNELS];
    WebRtc_Word16 total_power;

    // Split the frame into frequency bands and extract log-energy features.
    total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                              feature_vector);

    // Run the GMM hypothesis test and cache the decision on the instance.
    inst->vad = WebRtcVad_GmmProbability(inst, feature_vector, total_power,
                                         frame_length);

    return inst->vad;
}
|
||||
|
||||
// Calculate probability for both speech and background noise, and perform a
|
||||
// hypothesis-test.
|
||||
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
WebRtc_Word16 total_power, int frame_length)
|
||||
// Calculates the probabilities for both speech and background noise using
|
||||
// Gaussian Mixture Models. A hypothesis-test is performed to decide which type
|
||||
// of signal is most probable.
|
||||
//
|
||||
// - inst [i/o] : Pointer to VAD instance
|
||||
// - feature_vector [i] : Feature vector = log10(energy in frequency band)
|
||||
// - total_power [i] : Total power in audio frame.
|
||||
// - frame_length [i] : Number of input samples
|
||||
//
|
||||
// - returns : the VAD decision (0 - noise, 1 - speech).
|
||||
static int16_t GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
|
||||
WebRtc_Word16 total_power, int frame_length)
|
||||
{
|
||||
int n, k;
|
||||
WebRtc_Word16 backval;
|
||||
@ -590,8 +336,8 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve
|
||||
tmp16 += 128; // Rounding
|
||||
ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8);
|
||||
// Division with 8 plus Q7
|
||||
if (ssk < MIN_STD)
|
||||
ssk = MIN_STD;
|
||||
if (ssk < kMinStd)
|
||||
ssk = kMinStd;
|
||||
*sstd2ptr = ssk;
|
||||
} else
|
||||
{
|
||||
@ -618,8 +364,8 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve
|
||||
tmp16 += 32; // Rounding
|
||||
nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6);
|
||||
|
||||
if (nsk < MIN_STD)
|
||||
nsk = MIN_STD;
|
||||
if (nsk < kMinStd)
|
||||
nsk = kMinStd;
|
||||
|
||||
*nstd2ptr = nsk;
|
||||
}
|
||||
@ -713,12 +459,209 @@ WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_ve
|
||||
} else
|
||||
{
|
||||
inst->num_of_speech = inst->num_of_speech + 1;
|
||||
if (inst->num_of_speech > NSP_MAX)
|
||||
if (inst->num_of_speech > kMaxSpeechFrames)
|
||||
{
|
||||
inst->num_of_speech = NSP_MAX;
|
||||
inst->num_of_speech = kMaxSpeechFrames;
|
||||
inst->over_hang = overhead2;
|
||||
} else
|
||||
inst->over_hang = overhead1;
|
||||
}
|
||||
return vadflag;
|
||||
}
|
||||
|
||||
// Initialize VAD
|
||||
int WebRtcVad_InitCore(VadInstT *inst, short mode)
|
||||
{
|
||||
int i;
|
||||
|
||||
// Initialization of struct
|
||||
inst->vad = 1;
|
||||
inst->frame_counter = 0;
|
||||
inst->over_hang = 0;
|
||||
inst->num_of_speech = 0;
|
||||
|
||||
// Initialization of downsampling filter state
|
||||
inst->downsampling_filter_states[0] = 0;
|
||||
inst->downsampling_filter_states[1] = 0;
|
||||
inst->downsampling_filter_states[2] = 0;
|
||||
inst->downsampling_filter_states[3] = 0;
|
||||
|
||||
// Read initial PDF parameters
|
||||
for (i = 0; i < NUM_TABLE_VALUES; i++)
|
||||
{
|
||||
inst->noise_means[i] = kNoiseDataMeans[i];
|
||||
inst->speech_means[i] = kSpeechDataMeans[i];
|
||||
inst->noise_stds[i] = kNoiseDataStds[i];
|
||||
inst->speech_stds[i] = kSpeechDataStds[i];
|
||||
}
|
||||
|
||||
// Index and Minimum value vectors are initialized
|
||||
for (i = 0; i < 16 * NUM_CHANNELS; i++)
|
||||
{
|
||||
inst->low_value_vector[i] = 10000;
|
||||
inst->index_vector[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < 5; i++)
|
||||
{
|
||||
inst->upper_state[i] = 0;
|
||||
inst->lower_state[i] = 0;
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
inst->hp_filter_state[i] = 0;
|
||||
}
|
||||
|
||||
// Init mean value memory, for FindMin function
|
||||
inst->mean_value[0] = 1600;
|
||||
inst->mean_value[1] = 1600;
|
||||
inst->mean_value[2] = 1600;
|
||||
inst->mean_value[3] = 1600;
|
||||
inst->mean_value[4] = 1600;
|
||||
inst->mean_value[5] = 1600;
|
||||
|
||||
if (WebRtcVad_set_mode_core(inst, mode) != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
inst->init_flag = kInitCheck;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Set aggressiveness mode
|
||||
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
|
||||
{
|
||||
|
||||
if (mode == 0)
|
||||
{
|
||||
// Quality mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_Q;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_Q;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_Q;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_Q;
|
||||
inst->total[1] = TOTAL_20MS_Q;
|
||||
inst->total[2] = TOTAL_30MS_Q;
|
||||
} else if (mode == 1)
|
||||
{
|
||||
// Low bitrate mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_LBR;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_LBR;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_LBR;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_LBR;
|
||||
inst->total[1] = TOTAL_20MS_LBR;
|
||||
inst->total[2] = TOTAL_30MS_LBR;
|
||||
} else if (mode == 2)
|
||||
{
|
||||
// Aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_AGG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_AGG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_AGG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_AGG;
|
||||
inst->total[1] = TOTAL_20MS_AGG;
|
||||
inst->total[2] = TOTAL_30MS_AGG;
|
||||
} else if (mode == 3)
|
||||
{
|
||||
// Very aggressive mode
|
||||
inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
|
||||
inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
|
||||
inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst
|
||||
|
||||
inst->individual[0] = INDIVIDUAL_10MS_VAG;
|
||||
inst->individual[1] = INDIVIDUAL_20MS_VAG;
|
||||
inst->individual[2] = INDIVIDUAL_30MS_VAG;
|
||||
|
||||
inst->total[0] = TOTAL_10MS_VAG;
|
||||
inst->total[1] = TOTAL_20MS_VAG;
|
||||
inst->total[2] = TOTAL_30MS_VAG;
|
||||
} else
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Calculate VAD decision by first extracting feature values and then calculate
|
||||
// probability for both speech and background noise.
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
|
||||
int frame_length)
|
||||
{
|
||||
WebRtc_Word16 len, vad;
|
||||
WebRtc_Word16 speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
|
||||
WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
|
||||
|
||||
|
||||
// Downsample signal 32->16->8 before doing VAD
|
||||
WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
|
||||
frame_length);
|
||||
len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
|
||||
|
||||
WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
|
||||
len = WEBRTC_SPL_RSHIFT_W16(len, 1);
|
||||
|
||||
// Do VAD on an 8 kHz signal
|
||||
vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);
|
||||
|
||||
return vad;
|
||||
}
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 length;
    WebRtc_Word16 speech_nb[240]; // Downsampled speech frame: 480 samples (30ms in WB)

    // Wideband: downsample signal 16->8 kHz before doing VAD.
    WebRtcVad_Downsampling(speech_frame, speech_nb, inst->downsampling_filter_states,
                           frame_length);
    length = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    // The VAD decision is made on the 8 kHz signal.
    return WebRtcVad_CalcVad8khz(inst, speech_nb, length);
}
|
||||
|
||||
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                    int frame_length)
{
    WebRtc_Word16 feature_vector[NUM_CHANNELS];
    WebRtc_Word16 total_power;

    // Split the frame into frequency bands and extract log-energy features.
    total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                              feature_vector);

    // Run the GMM hypothesis test and cache the decision on the instance.
    inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);

    return inst->vad;
}
|
||||
|
||||
@ -13,8 +13,8 @@
|
||||
* This header file includes the descriptions of the core VAD calls.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_VAD_CORE_H_
|
||||
#define WEBRTC_VAD_CORE_H_
|
||||
#ifndef WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
||||
#define WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
||||
|
||||
#include "typedefs.h"
|
||||
#include "vad_defines.h"
|
||||
@ -112,24 +112,4 @@ WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT* inst, WebRtc_Word16* speech_frame
|
||||
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT* inst, WebRtc_Word16* speech_frame,
|
||||
int frame_length);
|
||||
|
||||
/****************************************************************************
|
||||
* WebRtcVad_GmmProbability(...)
|
||||
*
|
||||
* This function calculates the probabilities for background noise and
|
||||
* speech using Gaussian Mixture Models. A hypothesis-test is performed to decide
|
||||
* which type of signal is most probable.
|
||||
*
|
||||
* Input:
|
||||
* - inst : Pointer to VAD instance
|
||||
* - feature_vector : Feature vector = log10(energy in frequency band)
|
||||
* - total_power : Total power in frame.
|
||||
* - frame_length : Number of input samples
|
||||
*
|
||||
* Output:
|
||||
* VAD decision : 0 - noise, 1 - speech
|
||||
*
|
||||
*/
|
||||
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT* inst, WebRtc_Word16* feature_vector,
|
||||
WebRtc_Word16 total_power, int frame_length);
|
||||
|
||||
#endif // WEBRTC_VAD_CORE_H_
|
||||
#endif // WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
||||
|
||||
99
src/common_audio/vad/vad_core_unittest.cc
Normal file
99
src/common_audio/vad/vad_core_unittest.cc
Normal file
@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "typedefs.h"
|
||||
#include "vad_unittest.h"
|
||||
|
||||
extern "C" {
|
||||
#include "vad_core.h"
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
TEST_F(VadTest, InitCore) {
  VadInstT* inst = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));

  // TODO(bjornv): Add NULL pointer check if we take care of it in
  // vad_core.c

  // WebRtcVad_InitCore() should succeed for every supported mode.
  for (size_t i = 0; i < kModesSize; ++i) {
    EXPECT_EQ(0, WebRtcVad_InitCore(inst, kModes[i]));
  }

  free(inst);
}
|
||||
|
||||
TEST_F(VadTest, set_mode_core) {
  VadInstT* inst = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));

  // TODO(bjornv): Add NULL pointer check if we take care of it in
  // vad_core.c

  ASSERT_EQ(0, WebRtcVad_InitCore(inst, 0));

  // WebRtcVad_set_mode_core() should reject out-of-range modes...
  EXPECT_EQ(-1, WebRtcVad_set_mode_core(inst, -1));
  EXPECT_EQ(-1, WebRtcVad_set_mode_core(inst, (short) kModesSize));
  // ...and accept every supported mode.
  for (size_t i = 0; i < kModesSize; ++i) {
    EXPECT_EQ(0, WebRtcVad_set_mode_core(inst, kModes[i]));
  }

  free(inst);
}
|
||||
|
||||
TEST_F(VadTest, CalcVad) {
  VadInstT* inst = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));
  int16_t speech[kMaxFrameLength];

  // TODO(bjornv): Add NULL pointer check if we take care of it in
  // vad_core.c

  // An all-zero frame must give VAD = 0 at every valid rate/length combo.
  memset(speech, 0, sizeof(speech));
  ASSERT_EQ(0, WebRtcVad_InitCore(inst, 0));
  for (size_t i = 0; i < kFrameLengthsSize; ++i) {
    if (ValidRatesAndFrameLengths(8000, kFrameLengths[i])) {
      EXPECT_EQ(0, WebRtcVad_CalcVad8khz(inst, speech, kFrameLengths[i]));
    }
    if (ValidRatesAndFrameLengths(16000, kFrameLengths[i])) {
      EXPECT_EQ(0, WebRtcVad_CalcVad16khz(inst, speech, kFrameLengths[i]));
    }
    if (ValidRatesAndFrameLengths(32000, kFrameLengths[i])) {
      EXPECT_EQ(0, WebRtcVad_CalcVad32khz(inst, speech, kFrameLengths[i]));
    }
  }

  // Construct a speech signal that will trigger the VAD in all modes. It is
  // known that (i * i) will wrap around, but that doesn't matter in this case.
  for (int16_t i = 0; i < kMaxFrameLength; ++i) {
    speech[i] = (i * i);
  }
  for (size_t i = 0; i < kFrameLengthsSize; ++i) {
    if (ValidRatesAndFrameLengths(8000, kFrameLengths[i])) {
      EXPECT_EQ(1, WebRtcVad_CalcVad8khz(inst, speech, kFrameLengths[i]));
    }
    if (ValidRatesAndFrameLengths(16000, kFrameLengths[i])) {
      EXPECT_EQ(1, WebRtcVad_CalcVad16khz(inst, speech, kFrameLengths[i]));
    }
    if (ValidRatesAndFrameLengths(32000, kFrameLengths[i])) {
      EXPECT_EQ(1, WebRtcVad_CalcVad32khz(inst, speech, kFrameLengths[i]));
    }
  }

  free(inst);
}
|
||||
} // namespace
|
||||
@ -23,8 +23,6 @@
|
||||
#define MIN_ENERGY 10
|
||||
#define ALPHA1 6553 // 0.2 in Q15
|
||||
#define ALPHA2 32439 // 0.99 in Q15
|
||||
#define NSP_MAX 6 // Maximum number of VAD=1 frames in a row counted
|
||||
#define MIN_STD 384 // Minimum standard deviation
|
||||
// Mode 0, Quality thresholds - Different thresholds for the different frame lengths
|
||||
#define INDIVIDUAL_10MS_Q 24
|
||||
#define INDIVIDUAL_20MS_Q 21 // (log10(2)*66)<<2 ~=16
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user