From 8d13c4fe1a3e05e65ff1f8637cc7a097f42dde8c Mon Sep 17 00:00:00 2001 From: peah Date: Sat, 7 May 2016 15:03:47 -0700 Subject: [PATCH] Changed the AEC SubbandCoherence function to not use the full AEC state This CL is a step towards simplifying the AEC code, making it more modifiable and modular. The changes should be bitexact. BUG=webrtc:5201, webrtc:5298 Review-Url: https://codereview.webrtc.org/1936173002 Cr-Commit-Position: refs/heads/master@{#12652} --- .../modules/audio_processing/aec/aec_core.cc | 87 +++++++------ .../audio_processing/aec/aec_core_internal.h | 16 ++- .../audio_processing/aec/aec_core_neon.cc | 106 +++++++++------- .../audio_processing/aec/aec_core_sse2.cc | 120 ++++++++++-------- 4 files changed, 190 insertions(+), 139 deletions(-) diff --git a/webrtc/modules/audio_processing/aec/aec_core.cc b/webrtc/modules/audio_processing/aec/aec_core.cc index 955bc9a4a6..a01c1785c5 100644 --- a/webrtc/modules/audio_processing/aec/aec_core.cc +++ b/webrtc/modules/audio_processing/aec/aec_core.cc @@ -404,53 +404,60 @@ const float WebRtcAec_kMinFarendPSD = 15; // // In addition to updating the PSDs, also the filter diverge state is // determined. -static void SmoothedPSD(AecCore* aec, +static void SmoothedPSD(int mult, + bool extended_filter_enabled, float efw[2][PART_LEN1], float dfw[2][PART_LEN1], float xfw[2][PART_LEN1], + CoherenceState* coherence_state, + short* filter_divergence_state, int* extreme_filter_divergence) { // Power estimate smoothing coefficients. const float* ptrGCoh = - aec->extended_filter_enabled - ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] - : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; + extended_filter_enabled + ?
WebRtcAec_kExtendedSmoothingCoefficients[mult - 1] + : WebRtcAec_kNormalSmoothingCoefficients[mult - 1]; int i; float sdSum = 0, seSum = 0; for (i = 0; i < PART_LEN1; i++) { - aec->sd[i] = ptrGCoh[0] * aec->sd[i] + - ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); - aec->se[i] = ptrGCoh[0] * aec->se[i] + - ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); + coherence_state->sd[i] = + ptrGCoh[0] * coherence_state->sd[i] + + ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); + coherence_state->se[i] = + ptrGCoh[0] * coherence_state->se[i] + + ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); // We threshold here to protect against the ill-effects of a zero farend. // The threshold is not arbitrarily chosen, but balances protection and // adverse interaction with the algorithm's tuning. // TODO(bjornv): investigate further why this is so sensitive. - aec->sx[i] = ptrGCoh[0] * aec->sx[i] + - ptrGCoh[1] * WEBRTC_SPL_MAX( - xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], - WebRtcAec_kMinFarendPSD); + coherence_state->sx[i] = + ptrGCoh[0] * coherence_state->sx[i] + + ptrGCoh[1] * + WEBRTC_SPL_MAX(xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], + WebRtcAec_kMinFarendPSD); - aec->sde[i][0] = - ptrGCoh[0] * aec->sde[i][0] + + coherence_state->sde[i][0] = + ptrGCoh[0] * coherence_state->sde[i][0] + ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); - aec->sde[i][1] = - ptrGCoh[0] * aec->sde[i][1] + + coherence_state->sde[i][1] = + ptrGCoh[0] * coherence_state->sde[i][1] + ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); - aec->sxd[i][0] = - ptrGCoh[0] * aec->sxd[i][0] + + coherence_state->sxd[i][0] = + ptrGCoh[0] * coherence_state->sxd[i][0] + ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); - aec->sxd[i][1] = - ptrGCoh[0] * aec->sxd[i][1] + + coherence_state->sxd[i][1] = + ptrGCoh[0] * coherence_state->sxd[i][1] + ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); - sdSum += 
aec->sd[i]; - seSum += aec->se[i]; + sdSum += coherence_state->sd[i]; + seSum += coherence_state->se[i]; } // Divergent filter safeguard update. - aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; + *filter_divergence_state = + (*filter_divergence_state ? 1.05f : 1.0f) * seSum > sdSum; // Signal extreme filter divergence if the error is significantly larger // than the nearend (13 dB). @@ -481,26 +488,30 @@ __inline static void StoreAsComplex(const float* data, data_complex[1][PART_LEN] = 0; } -static void SubbandCoherence(AecCore* aec, +static void SubbandCoherence(int mult, + bool extended_filter_enabled, float efw[2][PART_LEN1], float dfw[2][PART_LEN1], float xfw[2][PART_LEN1], float* fft, float* cohde, float* cohxd, + CoherenceState* coherence_state, + short* filter_divergence_state, int* extreme_filter_divergence) { int i; - SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); + SmoothedPSD(mult, extended_filter_enabled, efw, dfw, xfw, coherence_state, + filter_divergence_state, extreme_filter_divergence); // Subband coherence for (i = 0; i < PART_LEN1; i++) { - cohde[i] = - (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / - (aec->sd[i] * aec->se[i] + 1e-10f); - cohxd[i] = - (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / - (aec->sx[i] * aec->sd[i] + 1e-10f); + cohde[i] = (coherence_state->sde[i][0] * coherence_state->sde[i][0] + + coherence_state->sde[i][1] * coherence_state->sde[i][1]) / + (coherence_state->sd[i] * coherence_state->se[i] + 1e-10f); + cohxd[i] = (coherence_state->sxd[i][0] * coherence_state->sxd[i][0] + + coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) / + (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f); } } @@ -1050,7 +1061,9 @@ static void EchoSuppression(AecCore* aec, memcpy(xfw, aec->xfwBuf + aec->delayIdx * PART_LEN1, sizeof(xfw[0][0]) * 2 * PART_LEN1); - WebRtcAec_SubbandCoherence(aec, efw, dfw, xfw, fft, cohde, cohxd, + 
WebRtcAec_SubbandCoherence(aec->mult, aec->extended_filter_enabled == 1, efw, + dfw, xfw, fft, cohde, cohxd, &aec->coherence_state, + &aec->divergeState, &aec->extreme_filter_divergence); // Select the microphone signal as output if the filter is deemed to have @@ -1666,18 +1679,18 @@ int WebRtcAec_InitAec(AecCore* aec, int sampFreq) { // doesn't change the output at all and yields 0.4% overall speedup. memset(aec->xfBuf, 0, sizeof(complex_t) * kExtendedNumPartitions * PART_LEN1); memset(aec->wfBuf, 0, sizeof(complex_t) * kExtendedNumPartitions * PART_LEN1); - memset(aec->sde, 0, sizeof(complex_t) * PART_LEN1); - memset(aec->sxd, 0, sizeof(complex_t) * PART_LEN1); + memset(aec->coherence_state.sde, 0, sizeof(complex_t) * PART_LEN1); + memset(aec->coherence_state.sxd, 0, sizeof(complex_t) * PART_LEN1); memset(aec->xfwBuf, 0, sizeof(complex_t) * kExtendedNumPartitions * PART_LEN1); - memset(aec->se, 0, sizeof(float) * PART_LEN1); + memset(aec->coherence_state.se, 0, sizeof(float) * PART_LEN1); // To prevent numerical instability in the first block. 
for (i = 0; i < PART_LEN1; i++) { - aec->sd[i] = 1; + aec->coherence_state.sd[i] = 1; } for (i = 0; i < PART_LEN1; i++) { - aec->sx[i] = 1; + aec->coherence_state.sx[i] = 1; } memset(aec->hNs, 0, sizeof(aec->hNs)); diff --git a/webrtc/modules/audio_processing/aec/aec_core_internal.h b/webrtc/modules/audio_processing/aec/aec_core_internal.h index 78dd1870bf..c8f545289b 100644 --- a/webrtc/modules/audio_processing/aec/aec_core_internal.h +++ b/webrtc/modules/audio_processing/aec/aec_core_internal.h @@ -72,12 +72,20 @@ class DivergentFilterFraction { RTC_DISALLOW_COPY_AND_ASSIGN(DivergentFilterFraction); }; +typedef struct CoherenceState { + complex_t sde[PART_LEN1]; // cross-psd of nearend and error + complex_t sxd[PART_LEN1]; // cross-psd of farend and nearend + float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1]; // far, near, error psd +} CoherenceState; + struct AecCore { explicit AecCore(int instance_index); ~AecCore(); std::unique_ptr data_dumper; + CoherenceState coherence_state; + int farBufWritePos, farBufReadPos; int knownDelay; @@ -103,12 +111,9 @@ struct AecCore { float xfBuf[2][kExtendedNumPartitions * PART_LEN1]; // farend fft buffer float wfBuf[2][kExtendedNumPartitions * PART_LEN1]; // filter fft - complex_t sde[PART_LEN1]; // cross-psd of nearend and error - complex_t sxd[PART_LEN1]; // cross-psd of farend and nearend // Farend windowed fft buffer. 
complex_t xfwBuf[kExtendedNumPartitions * PART_LEN1]; - float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1]; // far, near, error psd float hNs[PART_LEN1]; float hNlFbMin, hNlFbLocalMin; float hNlXdAvgMin; @@ -223,13 +228,16 @@ typedef void (*WebRtcAecComfortNoise)(AecCore* aec, const float* lambda); extern WebRtcAecComfortNoise WebRtcAec_ComfortNoise; -typedef void (*WebRtcAecSubBandCoherence)(AecCore* aec, +typedef void (*WebRtcAecSubBandCoherence)(int mult, + bool extended_filter_enabled, float efw[2][PART_LEN1], float dfw[2][PART_LEN1], float xfw[2][PART_LEN1], float* fft, float* cohde, float* cohxd, + CoherenceState* coherence_state, + short* filter_divergence_state, int* extreme_filter_divergence); extern WebRtcAecSubBandCoherence WebRtcAec_SubbandCoherence; diff --git a/webrtc/modules/audio_processing/aec/aec_core_neon.cc b/webrtc/modules/audio_processing/aec/aec_core_neon.cc index 80c52012b2..01e6ce71b7 100644 --- a/webrtc/modules/audio_processing/aec/aec_core_neon.cc +++ b/webrtc/modules/audio_processing/aec/aec_core_neon.cc @@ -502,16 +502,19 @@ static int PartitionDelayNEON(const AecCore* aec) { // // In addition to updating the PSDs, also the filter diverge state is determined // upon actions are taken. -static void SmoothedPSD(AecCore* aec, +static void SmoothedPSD(int mult, + bool extended_filter_enabled, float efw[2][PART_LEN1], float dfw[2][PART_LEN1], float xfw[2][PART_LEN1], + CoherenceState* coherence_state, + short* filter_divergence_state, int* extreme_filter_divergence) { // Power estimate smoothing coefficients. const float* ptrGCoh = - aec->extended_filter_enabled - ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] - : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; + extended_filter_enabled + ? 
WebRtcAec_kExtendedSmoothingCoefficients[mult - 1] + : WebRtcAec_kNormalSmoothingCoefficients[mult - 1]; int i; float sdSum = 0, seSum = 0; const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD); @@ -525,9 +528,12 @@ static void SmoothedPSD(AecCore* aec, const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]); const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]); const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]); - float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]); - float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]); - float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]); + float32x4_t vec_sd = + vmulq_n_f32(vld1q_f32(&coherence_state->sd[i]), ptrGCoh[0]); + float32x4_t vec_se = + vmulq_n_f32(vld1q_f32(&coherence_state->se[i]), ptrGCoh[0]); + float32x4_t vec_sx = + vmulq_n_f32(vld1q_f32(&coherence_state->sx[i]), ptrGCoh[0]); float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0); float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0); float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0); @@ -540,12 +546,12 @@ static void SmoothedPSD(AecCore* aec, vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]); vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]); - vst1q_f32(&aec->sd[i], vec_sd); - vst1q_f32(&aec->se[i], vec_se); - vst1q_f32(&aec->sx[i], vec_sx); + vst1q_f32(&coherence_state->sd[i], vec_sd); + vst1q_f32(&coherence_state->se[i], vec_se); + vst1q_f32(&coherence_state->sx[i], vec_sx); { - float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); + float32x4x2_t vec_sde = vld2q_f32(&coherence_state->sde[i][0]); float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0); float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1); vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]); @@ -554,11 +560,11 @@ static void SmoothedPSD(AecCore* aec, vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0); vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]); vec_sde.val[1] = 
vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]); - vst2q_f32(&aec->sde[i][0], vec_sde); + vst2q_f32(&coherence_state->sde[i][0], vec_sde); } { - float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); + float32x4x2_t vec_sxd = vld2q_f32(&coherence_state->sxd[i][0]); float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0); float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1); vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]); @@ -567,7 +573,7 @@ static void SmoothedPSD(AecCore* aec, vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0); vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]); vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]); - vst2q_f32(&aec->sxd[i][0], vec_sxd); + vst2q_f32(&coherence_state->sxd[i][0], vec_sxd); } vec_sdSum = vaddq_f32(vec_sdSum, vec_sd); @@ -591,39 +597,43 @@ static void SmoothedPSD(AecCore* aec, // scalar code for the remaining items. for (; i < PART_LEN1; i++) { - aec->sd[i] = ptrGCoh[0] * aec->sd[i] + - ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); - aec->se[i] = ptrGCoh[0] * aec->se[i] + - ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); + coherence_state->sd[i] = + ptrGCoh[0] * coherence_state->sd[i] + + ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); + coherence_state->se[i] = + ptrGCoh[0] * coherence_state->se[i] + + ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); // We threshold here to protect against the ill-effects of a zero farend. // The threshold is not arbitrarily chosen, but balances protection and // adverse interaction with the algorithm's tuning. // TODO(bjornv): investigate further why this is so sensitive. 
- aec->sx[i] = ptrGCoh[0] * aec->sx[i] + - ptrGCoh[1] * WEBRTC_SPL_MAX( - xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], - WebRtcAec_kMinFarendPSD); + coherence_state->sx[i] = + ptrGCoh[0] * coherence_state->sx[i] + + ptrGCoh[1] * + WEBRTC_SPL_MAX(xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], + WebRtcAec_kMinFarendPSD); - aec->sde[i][0] = - ptrGCoh[0] * aec->sde[i][0] + + coherence_state->sde[i][0] = + ptrGCoh[0] * coherence_state->sde[i][0] + ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); - aec->sde[i][1] = - ptrGCoh[0] * aec->sde[i][1] + + coherence_state->sde[i][1] = + ptrGCoh[0] * coherence_state->sde[i][1] + ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); - aec->sxd[i][0] = - ptrGCoh[0] * aec->sxd[i][0] + + coherence_state->sxd[i][0] = + ptrGCoh[0] * coherence_state->sxd[i][0] + ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); - aec->sxd[i][1] = - ptrGCoh[0] * aec->sxd[i][1] + + coherence_state->sxd[i][1] = + ptrGCoh[0] * coherence_state->sxd[i][1] + ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); - sdSum += aec->sd[i]; - seSum += aec->se[i]; + sdSum += coherence_state->sd[i]; + seSum += coherence_state->se[i]; } // Divergent filter safeguard update. - aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; + *filter_divergence_state = + (*filter_divergence_state ? 1.05f : 1.0f) * seSum > sdSum; // Signal extreme filter divergence if the error is significantly larger // than the nearend (13 dB). 
@@ -667,30 +677,34 @@ static void StoreAsComplexNEON(const float* data, data_complex[0][PART_LEN] = data[1]; } -static void SubbandCoherenceNEON(AecCore* aec, +static void SubbandCoherenceNEON(int mult, + bool extended_filter_enabled, float efw[2][PART_LEN1], float dfw[2][PART_LEN1], float xfw[2][PART_LEN1], float* fft, float* cohde, float* cohxd, + CoherenceState* coherence_state, + short* filter_divergence_state, int* extreme_filter_divergence) { int i; - SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); + SmoothedPSD(mult, extended_filter_enabled, efw, dfw, xfw, coherence_state, + filter_divergence_state, extreme_filter_divergence); { const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f); // Subband coherence for (i = 0; i + 3 < PART_LEN1; i += 4) { - const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]); - const float32x4_t vec_se = vld1q_f32(&aec->se[i]); - const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]); + const float32x4_t vec_sd = vld1q_f32(&coherence_state->sd[i]); + const float32x4_t vec_se = vld1q_f32(&coherence_state->se[i]); + const float32x4_t vec_sx = vld1q_f32(&coherence_state->sx[i]); const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se); const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx); - float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]); - float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]); + float32x4x2_t vec_sde = vld2q_f32(&coherence_state->sde[i][0]); + float32x4x2_t vec_sxd = vld2q_f32(&coherence_state->sxd[i][0]); float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]); float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]); vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]); @@ -704,12 +718,12 @@ static void SubbandCoherenceNEON(AecCore* aec, } // scalar code for the remaining items. 
for (; i < PART_LEN1; i++) { - cohde[i] = - (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / - (aec->sd[i] * aec->se[i] + 1e-10f); - cohxd[i] = - (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / - (aec->sx[i] * aec->sd[i] + 1e-10f); + cohde[i] = (coherence_state->sde[i][0] * coherence_state->sde[i][0] + + coherence_state->sde[i][1] * coherence_state->sde[i][1]) / + (coherence_state->sd[i] * coherence_state->se[i] + 1e-10f); + cohxd[i] = (coherence_state->sxd[i][0] * coherence_state->sxd[i][0] + + coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) / + (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f); } } diff --git a/webrtc/modules/audio_processing/aec/aec_core_sse2.cc b/webrtc/modules/audio_processing/aec/aec_core_sse2.cc index 9a64616b0d..91d98b9773 100644 --- a/webrtc/modules/audio_processing/aec/aec_core_sse2.cc +++ b/webrtc/modules/audio_processing/aec/aec_core_sse2.cc @@ -495,16 +495,19 @@ static int PartitionDelaySSE2(const AecCore* aec) { // // In addition to updating the PSDs, also the filter diverge state is determined // upon actions are taken. -static void SmoothedPSD(AecCore* aec, +static void SmoothedPSD(int mult, + bool extended_filter_enabled, float efw[2][PART_LEN1], float dfw[2][PART_LEN1], float xfw[2][PART_LEN1], + CoherenceState* coherence_state, + short* filter_divergence_state, int* extreme_filter_divergence) { // Power estimate smoothing coefficients. const float* ptrGCoh = - aec->extended_filter_enabled - ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] - : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; + extended_filter_enabled + ? 
WebRtcAec_kExtendedSmoothingCoefficients[mult - 1] + : WebRtcAec_kNormalSmoothingCoefficients[mult - 1]; int i; float sdSum = 0, seSum = 0; const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); @@ -520,9 +523,12 @@ static void SmoothedPSD(AecCore* aec, const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); - __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0); - __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0); - __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0); + __m128 vec_sd = + _mm_mul_ps(_mm_loadu_ps(&coherence_state->sd[i]), vec_GCoh0); + __m128 vec_se = + _mm_mul_ps(_mm_loadu_ps(&coherence_state->se[i]), vec_GCoh0); + __m128 vec_sx = + _mm_mul_ps(_mm_loadu_ps(&coherence_state->sx[i]), vec_GCoh0); __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); @@ -533,13 +539,13 @@ static void SmoothedPSD(AecCore* aec, vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1)); vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); - _mm_storeu_ps(&aec->sd[i], vec_sd); - _mm_storeu_ps(&aec->se[i], vec_se); - _mm_storeu_ps(&aec->sx[i], vec_sx); + _mm_storeu_ps(&coherence_state->sd[i], vec_sd); + _mm_storeu_ps(&coherence_state->se[i], vec_se); + _mm_storeu_ps(&coherence_state->sx[i], vec_sx); { - const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); - const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); + const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]); + const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]); __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); __m128 vec_b = @@ -554,13 +560,14 @@ static void SmoothedPSD(AecCore* aec, _mm_sub_ps(vec_dfwefw0110, 
_mm_mul_ps(vec_dfw1, vec_efw0)); vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); - _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); - _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); + _mm_storeu_ps(&coherence_state->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); + _mm_storeu_ps(&coherence_state->sde[i + 2][0], + _mm_unpackhi_ps(vec_a, vec_b)); } { - const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); - const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); + const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]); + const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]); __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); __m128 vec_b = @@ -575,8 +582,9 @@ static void SmoothedPSD(AecCore* aec, _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); - _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); - _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); + _mm_storeu_ps(&coherence_state->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); + _mm_storeu_ps(&coherence_state->sxd[i + 2][0], + _mm_unpackhi_ps(vec_a, vec_b)); } vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); @@ -587,39 +595,43 @@ static void SmoothedPSD(AecCore* aec, _mm_add_ps_4x1(vec_seSum, &seSum); for (; i < PART_LEN1; i++) { - aec->sd[i] = ptrGCoh[0] * aec->sd[i] + - ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); - aec->se[i] = ptrGCoh[0] * aec->se[i] + - ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); + coherence_state->sd[i] = + ptrGCoh[0] * coherence_state->sd[i] + + ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); + coherence_state->se[i] = + ptrGCoh[0] * coherence_state->se[i] + + ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); 
// We threshold here to protect against the ill-effects of a zero farend. // The threshold is not arbitrarily chosen, but balances protection and // adverse interaction with the algorithm's tuning. // TODO(bjornv): investigate further why this is so sensitive. - aec->sx[i] = ptrGCoh[0] * aec->sx[i] + - ptrGCoh[1] * WEBRTC_SPL_MAX( - xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], - WebRtcAec_kMinFarendPSD); + coherence_state->sx[i] = + ptrGCoh[0] * coherence_state->sx[i] + + ptrGCoh[1] * + WEBRTC_SPL_MAX(xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], + WebRtcAec_kMinFarendPSD); - aec->sde[i][0] = - ptrGCoh[0] * aec->sde[i][0] + + coherence_state->sde[i][0] = + ptrGCoh[0] * coherence_state->sde[i][0] + ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); - aec->sde[i][1] = - ptrGCoh[0] * aec->sde[i][1] + + coherence_state->sde[i][1] = + ptrGCoh[0] * coherence_state->sde[i][1] + ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); - aec->sxd[i][0] = - ptrGCoh[0] * aec->sxd[i][0] + + coherence_state->sxd[i][0] = + ptrGCoh[0] * coherence_state->sxd[i][0] + ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); - aec->sxd[i][1] = - ptrGCoh[0] * aec->sxd[i][1] + + coherence_state->sxd[i][1] = + ptrGCoh[0] * coherence_state->sxd[i][1] + ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); - sdSum += aec->sd[i]; - seSum += aec->se[i]; + sdSum += coherence_state->sd[i]; + seSum += coherence_state->se[i]; } // Divergent filter safeguard update. - aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; + *filter_divergence_state = + (*filter_divergence_state ? 1.05f : 1.0f) * seSum > sdSum; // Signal extreme filter divergence if the error is significantly larger // than the nearend (13 dB). 
@@ -666,34 +678,38 @@ static void StoreAsComplexSSE2(const float* data, data_complex[0][PART_LEN] = data[1]; } -static void SubbandCoherenceSSE2(AecCore* aec, +static void SubbandCoherenceSSE2(int mult, + bool extended_filter_enabled, float efw[2][PART_LEN1], float dfw[2][PART_LEN1], float xfw[2][PART_LEN1], float* fft, float* cohde, float* cohxd, + CoherenceState* coherence_state, + short* filter_divergence_state, int* extreme_filter_divergence) { int i; - SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence); + SmoothedPSD(mult, extended_filter_enabled, efw, dfw, xfw, coherence_state, + filter_divergence_state, extreme_filter_divergence); { const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f); // Subband coherence for (i = 0; i + 3 < PART_LEN1; i += 4) { - const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]); - const __m128 vec_se = _mm_loadu_ps(&aec->se[i]); - const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]); + const __m128 vec_sd = _mm_loadu_ps(&coherence_state->sd[i]); + const __m128 vec_se = _mm_loadu_ps(&coherence_state->se[i]); + const __m128 vec_sx = _mm_loadu_ps(&coherence_state->sx[i]); const __m128 vec_sdse = _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se)); const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx)); - const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]); - const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); - const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]); - const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); + const __m128 vec_sde_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]); + const __m128 vec_sde_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]); + const __m128 vec_sxd_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]); + const __m128 vec_sxd_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]); const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0)); const __m128 vec_sde_1 = @@ -714,12 +730,12 @@ static void 
SubbandCoherenceSSE2(AecCore* aec, // scalar code for the remaining items. for (; i < PART_LEN1; i++) { - cohde[i] = - (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) / - (aec->sd[i] * aec->se[i] + 1e-10f); - cohxd[i] = - (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) / - (aec->sx[i] * aec->sd[i] + 1e-10f); + cohde[i] = (coherence_state->sde[i][0] * coherence_state->sde[i][0] + + coherence_state->sde[i][1] * coherence_state->sde[i][1]) / + (coherence_state->sd[i] * coherence_state->se[i] + 1e-10f); + cohxd[i] = (coherence_state->sxd[i][0] * coherence_state->sxd[i][0] + + coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) / + (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f); } } }