From 5af7a804ea711c457e5fc77a1f35cb674464995c Mon Sep 17 00:00:00 2001 From: "cduvivier@google.com" Date: Mon, 13 Jun 2011 18:56:48 +0000 Subject: [PATCH] Optimization of "overdrive and suppress": * float accuracy pow function, vectorized pow approximation, general vectorization. * 10.2% AEC overall speedup for the straight C path. * 16.1% AEC overall speedup for the SSE2 path. Review URL: http://webrtc-codereview.appspot.com/24016 git-svn-id: http://webrtc.googlecode.com/svn/trunk@72 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../aec/main/source/aec_core.c | 99 ++++---- .../aec/main/source/aec_core.h | 3 + .../aec/main/source/aec_core_sse2.c | 216 ++++++++++++++++++ 3 files changed, 273 insertions(+), 45 deletions(-) diff --git a/modules/audio_processing/aec/main/source/aec_core.c b/modules/audio_processing/aec/main/source/aec_core.c index 3f8a088fd9..3303a8a886 100644 --- a/modules/audio_processing/aec/main/source/aec_core.c +++ b/modules/audio_processing/aec/main/source/aec_core.c @@ -82,7 +82,7 @@ static const float sqrtHanning[65] = { weightCurve = [0 ; 0.3 * sqrt(linspace(0,1,64))' + 0.1]; fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', weightCurve); */ -static const float weightCurve[65] = { +const float WebRtcAec_weightCurve[65] = { 0.0000f, 0.1000f, 0.1378f, 0.1535f, 0.1655f, 0.1756f, 0.1845f, 0.1926f, 0.2000f, 0.2069f, 0.2134f, 0.2195f, 0.2254f, 0.2309f, 0.2363f, 0.2414f, 0.2464f, 0.2512f, @@ -100,7 +100,7 @@ static const float weightCurve[65] = { overDriveCurve = [sqrt(linspace(0,1,65))' + 1]; fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', overDriveCurve); */ -static const float overDriveCurve[65] = { +const float WebRtcAec_overDriveCurve[65] = { 1.0000f, 1.1250f, 1.1768f, 1.2165f, 1.2500f, 1.2795f, 1.3062f, 1.3307f, 1.3536f, 1.3750f, 1.3953f, 1.4146f, 1.4330f, 1.4507f, 1.4677f, 1.4841f, 1.5000f, 1.5154f, @@ -128,7 +128,7 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, static void GetHighbandGain(const float 
*lambda, float *nlpGainHband); // Comfort_noise also computes noise for H band returned in comfortNoiseHband -static void ComfortNoise(aec_t *aec, complex_t *efw, +static void ComfortNoise(aec_t *aec, float efw[2][PART_LEN1], complex_t *comfortNoiseHband, const float *noisePow, const float *lambda); @@ -314,9 +314,32 @@ static void FilterAdaptation(aec_t *aec, float *fft, float ef[2][PART_LEN1], } } +static void OverdriveAndSuppress(aec_t *aec, float hNl[PART_LEN1], + const float hNlFb, + float efw[2][PART_LEN1]) { + int i; + for (i = 0; i < PART_LEN1; i++) { + // Weight subbands + if (hNl[i] > hNlFb) { + hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + + (1 - WebRtcAec_weightCurve[i]) * hNl[i]; + } + hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]); + + // Suppress error signal + efw[0][i] *= hNl[i]; + efw[1][i] *= hNl[i]; + + // Ooura fft returns incorrect sign on imaginary component. It matters here + // because we are making an additive change with comfort noise. + efw[1][i] *= -1; + } +} + WebRtcAec_FilterFar_t WebRtcAec_FilterFar; WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal; WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation; +WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress; int WebRtcAec_InitAec(aec_t *aec, int sampFreq) { @@ -444,6 +467,7 @@ int WebRtcAec_InitAec(aec_t *aec, int sampFreq) WebRtcAec_FilterFar = FilterFar; WebRtcAec_ScaleErrorSignal = ScaleErrorSignal; WebRtcAec_FilterAdaptation = FilterAdaptation; + WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress; if (WebRtc_GetCPUInfo(kSSE2)) { #if defined(__SSE2__) WebRtcAec_InitAec_SSE2(); @@ -753,7 +777,8 @@ static void ProcessBlock(aec_t *aec, const short *farend, static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, short *outputH) { - complex_t dfw[PART_LEN1], efw[PART_LEN1], xfw[PART_LEN1]; + float efw[2][PART_LEN1], dfw[2][PART_LEN1]; + complex_t xfw[PART_LEN1]; complex_t comfortNoiseHband[PART_LEN1]; float 
fft[PART_LEN2]; float scale, dtmp; @@ -841,13 +866,13 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, } rdft(PART_LEN2, 1, fft, ip, wfft); - dfw[0][1] = 0; - dfw[PART_LEN][1] = 0; + dfw[1][0] = 0; + dfw[1][PART_LEN] = 0; dfw[0][0] = fft[0]; - dfw[PART_LEN][0] = fft[1]; + dfw[0][PART_LEN] = fft[1]; for (i = 1; i < PART_LEN; i++) { - dfw[i][0] = fft[2 * i]; - dfw[i][1] = fft[2 * i + 1]; + dfw[0][i] = fft[2 * i]; + dfw[1][i] = fft[2 * i + 1]; } // Windowed error fft @@ -856,21 +881,21 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, fft[PART_LEN + i] = aec->eBuf[PART_LEN + i] * sqrtHanning[PART_LEN - i]; } rdft(PART_LEN2, 1, fft, ip, wfft); - efw[0][1] = 0; - efw[PART_LEN][1] = 0; + efw[1][0] = 0; + efw[1][PART_LEN] = 0; efw[0][0] = fft[0]; - efw[PART_LEN][0] = fft[1]; + efw[0][PART_LEN] = fft[1]; for (i = 1; i < PART_LEN; i++) { - efw[i][0] = fft[2 * i]; - efw[i][1] = fft[2 * i + 1]; + efw[0][i] = fft[2 * i]; + efw[1][i] = fft[2 * i + 1]; } // Smoothed PSD for (i = 0; i < PART_LEN1; i++) { aec->sd[i] = ptrGCoh[0] * aec->sd[i] + ptrGCoh[1] * - (dfw[i][0] * dfw[i][0] + dfw[i][1] * dfw[i][1]); + (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); aec->se[i] = ptrGCoh[0] * aec->se[i] + ptrGCoh[1] * - (efw[i][0] * efw[i][0] + efw[i][1] * efw[i][1]); + (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); // We threshold here to protect against the ill-effects of a zero farend. // The threshold is not arbitrarily chosen, but balances protection and // adverse interaction with the algorithm's tuning. 
@@ -879,14 +904,14 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, WEBRTC_SPL_MAX(xfw[i][0] * xfw[i][0] + xfw[i][1] * xfw[i][1], 15); aec->sde[i][0] = ptrGCoh[0] * aec->sde[i][0] + ptrGCoh[1] * - (dfw[i][0] * efw[i][0] + dfw[i][1] * efw[i][1]); + (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); aec->sde[i][1] = ptrGCoh[0] * aec->sde[i][1] + ptrGCoh[1] * - (dfw[i][0] * efw[i][1] - dfw[i][1] * efw[i][0]); + (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); aec->sxd[i][0] = ptrGCoh[0] * aec->sxd[i][0] + ptrGCoh[1] * - (dfw[i][0] * xfw[i][0] + dfw[i][1] * xfw[i][1]); + (dfw[0][i] * xfw[i][0] + dfw[1][i] * xfw[i][1]); aec->sxd[i][1] = ptrGCoh[0] * aec->sxd[i][1] + ptrGCoh[1] * - (dfw[i][0] * xfw[i][1] - dfw[i][1] * xfw[i][0]); + (dfw[0][i] * xfw[i][1] - dfw[1][i] * xfw[i][0]); sdSum += aec->sd[i]; seSum += aec->se[i]; @@ -1007,29 +1032,13 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, // Smooth the overdrive. if (aec->overDrive < aec->overDriveSm) { - aec->overDriveSm = 0.99f * aec->overDriveSm + 0.01f * aec->overDrive; + aec->overDriveSm = 0.99f * aec->overDriveSm + 0.01f * aec->overDrive; } else { - aec->overDriveSm = 0.9f * aec->overDriveSm + 0.1f * aec->overDrive; - } - - for (i = 0; i < PART_LEN1; i++) { - // Weight subbands - if (hNl[i] > hNlFb) { - hNl[i] = weightCurve[i] * hNlFb + (1 - weightCurve[i]) * hNl[i]; - } - - hNl[i] = (float)pow(hNl[i], aec->overDriveSm * overDriveCurve[i]); - - // Suppress error signal - efw[i][0] *= hNl[i]; - efw[i][1] *= hNl[i]; - - // Ooura fft returns incorrect sign on imaginary component. - // It matters here because we are making an additive change with comfort noise. 
- efw[i][1] *= -1; + aec->overDriveSm = 0.9f * aec->overDriveSm + 0.1f * aec->overDrive; } + WebRtcAec_OverdriveAndSuppress(aec, hNl, hNlFb, efw); #ifdef G167 if (aec->cnToggle) { @@ -1042,11 +1051,11 @@ static void NonLinearProcessing(aec_t *aec, int *ip, float *wfft, short *output, // Inverse error fft. fft[0] = efw[0][0]; - fft[1] = efw[PART_LEN][0]; + fft[1] = efw[0][PART_LEN]; for (i = 1; i < PART_LEN; i++) { - fft[2*i] = efw[i][0]; + fft[2*i] = efw[0][i]; // Sign change required by Ooura fft. - fft[2*i + 1] = -efw[i][1]; + fft[2*i + 1] = -efw[1][i]; } rdft(PART_LEN2, -1, fft, ip, wfft); @@ -1126,7 +1135,7 @@ static void GetHighbandGain(const float *lambda, float *nlpGainHband) nlpGainHband[0] /= (float)(PART_LEN1 - 1 - freqAvgIc); } -static void ComfortNoise(aec_t *aec, complex_t *efw, +static void ComfortNoise(aec_t *aec, float efw[2][PART_LEN1], complex_t *comfortNoiseHband, const float *noisePow, const float *lambda) { int i, num; @@ -1159,8 +1168,8 @@ static void ComfortNoise(aec_t *aec, complex_t *efw, // This is the proper weighting to match the background noise power tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0)); //tmp = 1 - lambda[i]; - efw[i][0] += tmp * u[i][0]; - efw[i][1] += tmp * u[i][1]; + efw[0][i] += tmp * u[i][0]; + efw[1][i] += tmp * u[i][1]; } // For H band comfort noise diff --git a/modules/audio_processing/aec/main/source/aec_core.h b/modules/audio_processing/aec/main/source/aec_core.h index 80d492f14a..ae043038ac 100644 --- a/modules/audio_processing/aec/main/source/aec_core.h +++ b/modules/audio_processing/aec/main/source/aec_core.h @@ -176,6 +176,9 @@ typedef void (*WebRtcAec_FilterAdaptation_t) (aec_t *aec, float *fft, float ef[2][PART_LEN1], int ip[IP_LEN], float wfft[W_LEN]); extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation; +typedef void (*WebRtcAec_OverdriveAndSuppress_t) + (aec_t *aec, float hNl[PART_LEN1], const float hNlFb, float efw[2][PART_LEN1]); +extern WebRtcAec_OverdriveAndSuppress_t 
WebRtcAec_OverdriveAndSuppress; int WebRtcAec_CreateAec(aec_t **aec); int WebRtcAec_FreeAec(aec_t *aec); diff --git a/modules/audio_processing/aec/main/source/aec_core_sse2.c b/modules/audio_processing/aec/main/source/aec_core_sse2.c index 8dfd118710..d7f30b58cd 100644 --- a/modules/audio_processing/aec/main/source/aec_core_sse2.c +++ b/modules/audio_processing/aec/main/source/aec_core_sse2.c @@ -210,10 +210,226 @@ static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1], } } +#ifdef _MSC_VER /* visual c++ */ +# define ALIGN16_BEG __declspec(align(16)) +# define ALIGN16_END +#else /* gcc or icc */ +# define ALIGN16_BEG +# define ALIGN16_END __attribute__((aligned(16))) +#endif + +static __m128 mm_pow_ps(__m128 a, __m128 b) +{ + // a^b = exp2(b * log2(a)) + // exp2(x) and log2(x) are calculated using polynomial approximations. + __m128 log2_a, b_log2_a, a_exp_b; + + // Calculate log2(x), x = a. + { + // To calculate log2(x), we decompose x like this: + // x = y * 2^n + // n is an integer + // y is in the [1.0, 2.0) range + // + // log2(x) = log2(y) + n + // n can be evaluated by playing with float representation. + // log2(y) in a small range can be approximated, this code uses an order + // five polynomial approximation. The coefficients have been + // estimated with the Remez algorithm and the resulting + // polynomial has a maximum relative error of 0.00086%. + + // Compute n. + // This is done by masking the exponent, shifting it into the top bit of + // the mantissa, putting eight into the biased exponent (to shift/ + // compensate the fact that the exponent has been shifted in the top/ + // fractional part) and finally getting rid of the implicit leading one + // from the mantissa by subtracting it out. 
+ static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = + {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; + static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = + {0x43800000, 0x43800000, 0x43800000, 0x43800000}; + static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = + {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000}; + static const int shift_exponent_into_top_mantissa = 8; + const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask)); + const __m128 n_1 = (__m128)_mm_srli_epi32((__m128i)two_n, + shift_exponent_into_top_mantissa); + const __m128 n_0 = _mm_or_ps( + (__m128)n_1, *((__m128 *)eight_biased_exponent)); + const __m128 n = _mm_sub_ps(n_0, *((__m128 *)implicit_leading_one)); + + // Compute y. + static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = + {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; + static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = + {0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; + const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask)); + const __m128 y = _mm_or_ps( + mantissa, *((__m128 *)zero_biased_exponent_is_one)); + + // Approximate log2(y) ~= (y - 1) * pol5(y). 
+ // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 + static const ALIGN16_BEG float ALIGN16_END C5[4] = + {-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; + static const ALIGN16_BEG float ALIGN16_END C4[4] = + {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; + static const ALIGN16_BEG float ALIGN16_END C3[4] = + {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; + static const ALIGN16_BEG float ALIGN16_END C2[4] = + {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f}; + static const ALIGN16_BEG float ALIGN16_END C1[4] = + {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; + static const ALIGN16_BEG float ALIGN16_END C0[4] = + {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f}; + const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128 *)C5)); + const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4)); + const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); + const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3)); + const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); + const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2)); + const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); + const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1)); + const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); + const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128 *)C0)); + const __m128 y_minus_one = _mm_sub_ps( + y, *((__m128 *)zero_biased_exponent_is_one)); + const __m128 log2_y = _mm_mul_ps(y_minus_one , pol5_y); + + // Combine parts. + log2_a = _mm_add_ps(n, log2_y); + } + + // b * log2(a) + b_log2_a = _mm_mul_ps(b, log2_a); + + // Calculate exp2(x), x = b * log2(a). + { + // To calculate 2^x, we decompose x like this: + // x = n + y + // n is an integer, the value of x - 0.5 rounded to nearest, therefore + // y is in the [0.0, 1.0] range + // + // 2^x = 2^n * 2^y + // 2^n can be evaluated by playing with float representation. + // 2^y in a small range can be approximated, this code uses an order two + // polynomial approximation. 
The coefficients have been estimated + // with the Remez algorithm and the resulting polynomial has a + // maximum relative error of 0.17%. + + // To avoid over/underflow, we reduce the range of input to ]-127, 129]. + static const ALIGN16_BEG float max_input[4] ALIGN16_END = + {129.f, 129.f, 129.f, 129.f}; + static const ALIGN16_BEG float min_input[4] ALIGN16_END = + {-126.99999f, -126.99999f, -126.99999f, -126.99999f}; + const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input)); + const __m128 x_max = _mm_max_ps(x_min, *((__m128 *)min_input)); + // Compute n. + static const ALIGN16_BEG float half[4] ALIGN16_END = + {0.5f, 0.5f, 0.5f, 0.5f}; + const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half)); + const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); + // Compute 2^n. + static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = + {127, 127, 127, 127}; + static const int float_exponent_shift = 23; + const __m128i two_n_exponent = _mm_add_epi32( + x_minus_half_floor, *((__m128i *)float_exponent_bias)); + const __m128 two_n = (__m128)_mm_slli_epi32( + two_n_exponent, float_exponent_shift); + // Compute y. + const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor)); + // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. + static const ALIGN16_BEG float C2[4] ALIGN16_END = + {3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f}; + static const ALIGN16_BEG float C1[4] ALIGN16_END = + {6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f}; + static const ALIGN16_BEG float C0[4] ALIGN16_END = + {1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f}; + const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128 *)C2)); + const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1)); + const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); + const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128 *)C0)); + + // Combine parts. 
+ a_exp_b = _mm_mul_ps(exp2_y, two_n); + } + return a_exp_b; +} + +extern const float WebRtcAec_weightCurve[65]; +extern const float WebRtcAec_overDriveCurve[65]; + +static void OverdriveAndSuppressSSE2(aec_t *aec, float hNl[PART_LEN1], + const float hNlFb, + float efw[2][PART_LEN1]) { + int i; + const __m128 vec_hNlFb = _mm_set1_ps(hNlFb); + const __m128 vec_one = _mm_set1_ps(1.0f); + const __m128 vec_minus_one = _mm_set1_ps(-1.0f); + const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm); + // vectorized code (four at once) + for (i = 0; i + 3 < PART_LEN1; i+=4) { + // Weight subbands + __m128 vec_hNl = _mm_loadu_ps(&hNl[i]); + const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]); + const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb); + const __m128 vec_weightCurve_hNlFb = _mm_mul_ps( + vec_weightCurve, vec_hNlFb); + const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve); + const __m128 vec_one_weightCurve_hNl = _mm_mul_ps( + vec_one_weightCurve, vec_hNl); + const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl); + const __m128 vec_if1 = _mm_and_ps( + bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl)); + vec_hNl = _mm_or_ps(vec_if0, vec_if1); + + { + const __m128 vec_overDriveCurve = _mm_loadu_ps( + &WebRtcAec_overDriveCurve[i]); + const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps( + vec_overDriveSm, vec_overDriveCurve); + vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve); + _mm_storeu_ps(&hNl[i], vec_hNl); + } + + // Suppress error signal + { + __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]); + __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]); + vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl); + vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl); + + // Ooura fft returns incorrect sign on imaginary component. It matters + // here because we are making an additive change with comfort noise. 
+ vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one); + _mm_storeu_ps(&efw[0][i], vec_efw_re); + _mm_storeu_ps(&efw[1][i], vec_efw_im); + } + } + // scalar code for the remaining items. + for (; i < PART_LEN1; i++) { + // Weight subbands + if (hNl[i] > hNlFb) { + hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + + (1 - WebRtcAec_weightCurve[i]) * hNl[i]; + } + hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]); + + // Suppress error signal + efw[0][i] *= hNl[i]; + efw[1][i] *= hNl[i]; + + // Ooura fft returns incorrect sign on imaginary component. It matters + // here because we are making an additive change with comfort noise. + efw[1][i] *= -1; + } +} + void WebRtcAec_InitAec_SSE2(void) { WebRtcAec_FilterFar = FilterFarSSE2; WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2; WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; + WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; } #endif //__SSE2__