diff --git a/webrtc/modules/audio_processing/aec/aec_core.c b/webrtc/modules/audio_processing/aec/aec_core.c index 91b55b6acd..da5c26b683 100644 --- a/webrtc/modules/audio_processing/aec/aec_core.c +++ b/webrtc/modules/audio_processing/aec/aec_core.c @@ -151,30 +151,31 @@ static int CmpFloat(const void* a, const void* b) { return (*da > *db) - (*da < *db); } -static void FilterFar(int num_partitions, - int x_fft_buffer_block_pos, - float x_fft_buffer[2][kExtendedNumPartitions * PART_LEN1], - float h_fft_buffer[2][kExtendedNumPartitions * PART_LEN1], - float y_fft[2][PART_LEN1]) { +static void FilterFar( + int num_partitions, + int x_fft_buf_block_pos, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float y_fft[2][PART_LEN1]) { int i; for (i = 0; i < num_partitions; i++) { int j; - int x_pos = (i + x_fft_buffer_block_pos) * PART_LEN1; + int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; int pos = i * PART_LEN1; - // Check for wrapped buffer. - if (i + x_fft_buffer_block_pos >= num_partitions) { - x_pos -= num_partitions * (PART_LEN1); + // Check for wrap + if (i + x_fft_buf_block_pos >= num_partitions) { + xPos -= num_partitions * (PART_LEN1); } for (j = 0; j < PART_LEN1; j++) { - y_fft[0][j] += MulRe(x_fft_buffer[0][x_pos + j], - x_fft_buffer[1][x_pos + j], - h_fft_buffer[0][pos + j], - h_fft_buffer[1][pos + j]); - y_fft[1][j] += MulIm(x_fft_buffer[0][x_pos + j], - x_fft_buffer[1][x_pos + j], - h_fft_buffer[0][pos + j], - h_fft_buffer[1][pos + j]); + y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], + x_fft_buf[1][xPos + j], + h_fft_buf[0][pos + j], + h_fft_buf[1][pos + j]); + y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], + x_fft_buf[1][xPos + j], + h_fft_buf[0][pos + j], + h_fft_buf[1][pos + j]); } } } @@ -182,7 +183,7 @@ static void FilterFar(int num_partitions, static void ScaleErrorSignal(int extended_filter_enabled, float normal_mu, float normal_error_threshold, - float *x_pow, + float x_pow[PART_LEN1], float ef[2][PART_LEN1]) { const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; const float error_threshold = extended_filter_enabled @@ -207,59 +208,40 @@ static void ScaleErrorSignal(int extended_filter_enabled, } } -// Time-unconstrined filter adaptation. -// TODO(andrew): consider for a low-complexity mode. -// static void FilterAdaptationUnconstrained(AecCore* aec, float *fft, -// float ef[2][PART_LEN1]) { -// int i, j; -// for (i = 0; i < aec->num_partitions; i++) { -// int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1); -// int pos; -// // Check for wrap -// if (i + aec->xfBufBlockPos >= aec->num_partitions) { -// xPos -= aec->num_partitions * PART_LEN1; -// } -// -// pos = i * PART_LEN1; -// -// for (j = 0; j < PART_LEN1; j++) { -// aec->wfBuf[0][pos + j] += MulRe(aec->xfBuf[0][xPos + j], -// -aec->xfBuf[1][xPos + j], -// ef[0][j], ef[1][j]); -// aec->wfBuf[1][pos + j] += MulIm(aec->xfBuf[0][xPos + j], -// -aec->xfBuf[1][xPos + j], -// ef[0][j], ef[1][j]); -// } -// } -//} -static void FilterAdaptation(AecCore* aec, float* fft, float ef[2][PART_LEN1]) { +static void FilterAdaptation( + int num_partitions, + int x_fft_buf_block_pos, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float e_fft[2][PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { int i, j; - for (i = 0; i < aec->num_partitions; i++) { - int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1); + float fft[PART_LEN2]; + for (i = 0; i < num_partitions; i++) { + int xPos = (i + x_fft_buf_block_pos) * (PART_LEN1); int pos; // Check for wrap - if (i + aec->xfBufBlockPos >= aec->num_partitions) { - xPos -= aec->num_partitions * PART_LEN1; + if (i + x_fft_buf_block_pos >= num_partitions) { + xPos -= num_partitions * PART_LEN1; } pos = i * PART_LEN1; for (j = 0; j < PART_LEN; j++) { - fft[2 * j] = MulRe(aec->xfBuf[0][xPos + j], - -aec->xfBuf[1][xPos + j], - ef[0][j], - ef[1][j]); - fft[2 * j + 1] = MulIm(aec->xfBuf[0][xPos + j], - -aec->xfBuf[1][xPos + j], - ef[0][j], - ef[1][j]); + fft[2 * j] = MulRe(x_fft_buf[0][xPos + j], + -x_fft_buf[1][xPos + j], + e_fft[0][j], + e_fft[1][j]); + fft[2 * j + 1] = MulIm(x_fft_buf[0][xPos + j], + -x_fft_buf[1][xPos + j], + e_fft[0][j], + e_fft[1][j]); } - fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN], - -aec->xfBuf[1][xPos + PART_LEN], - ef[0][PART_LEN], - ef[1][PART_LEN]); + fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], + -x_fft_buf[1][xPos + PART_LEN], + e_fft[0][PART_LEN], + e_fft[1][PART_LEN]); aec_rdft_inverse_128(fft); memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); @@ -273,12 +255,12 @@ static void FilterAdaptation(AecCore* aec, float* fft, float ef[2][PART_LEN1]) { } aec_rdft_forward_128(fft); - aec->wfBuf[0][pos] += fft[0]; - aec->wfBuf[0][pos + PART_LEN] += fft[1]; + h_fft_buf[0][pos] += fft[0]; + h_fft_buf[0][pos + PART_LEN] += fft[1]; for (j = 1; j < PART_LEN; j++) { - aec->wfBuf[0][pos + j] += fft[2 * j]; - aec->wfBuf[1][pos + j] += fft[2 * j + 1]; + h_fft_buf[0][pos + j] += fft[2 * j]; + h_fft_buf[1][pos + j] += fft[2 * j + 1]; } } } @@ -845,34 +827,26 @@ static void UpdateDelayMetrics(AecCore* self) { return; } -static void FrequencyToTime(float freq_data[2][PART_LEN1], - float time_data[PART_LEN2]) { +static void InverseFft(float freq_data[2][PART_LEN1], + float time_data[PART_LEN2]) { int i; - time_data[0] = freq_data[0][0]; - time_data[1] = freq_data[0][PART_LEN]; + const float scale = 1.0f / PART_LEN2; + time_data[0] = freq_data[0][0] * scale; + time_data[1] = freq_data[0][PART_LEN] * scale; for (i = 1; i < PART_LEN; i++) { - time_data[2 * i] = freq_data[0][i]; - time_data[2 * i + 1] = freq_data[1][i]; + time_data[2 * i] = freq_data[0][i] * scale; + time_data[2 * i + 1] = freq_data[1][i] * scale; } aec_rdft_inverse_128(time_data); } -static void TimeToFrequency(float time_data[PART_LEN2], - float freq_data[2][PART_LEN1], - int window) { - int i = 0; - - // TODO(bjornv): Should we have a different function/wrapper for windowed FFT? - if (window) { - for (i = 0; i < PART_LEN; i++) { - time_data[i] *= WebRtcAec_sqrtHanning[i]; - time_data[PART_LEN + i] *= WebRtcAec_sqrtHanning[PART_LEN - i]; - } - } - +static void Fft(float time_data[PART_LEN2], + float freq_data[2][PART_LEN1]) { + int i; aec_rdft_forward_128(time_data); - // Reorder. + + // Reorder fft output data. freq_data[1][0] = 0; freq_data[1][PART_LEN] = 0; freq_data[0][0] = time_data[0]; @@ -963,61 +937,76 @@ static int SignalBasedDelayCorrection(AecCore* self) { return delay_correction; } -static void EchoSubtraction(AecCore* aec, - float* nearend_ptr) { - float yf[2][PART_LEN1]; - float fft[PART_LEN2]; - float y[PART_LEN]; +static void EchoSubtraction( + AecCore* aec, + int num_partitions, + int x_fft_buf_block_pos, + int metrics_mode, + int extended_filter_enabled, + float normal_mu, + float normal_error_threshold, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float* const y, + float x_pow[PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + PowerLevel* linout_level, + float echo_subtractor_output[PART_LEN]) { + float s_fft[2][PART_LEN1]; + float e_extended[PART_LEN2]; + float s_extended[PART_LEN2]; + float *s; float e[PART_LEN]; - float ef[2][PART_LEN1]; - float scale; + float e_fft[2][PART_LEN1]; int i; - memset(yf, 0, sizeof(yf)); + memset(s_fft, 0, sizeof(s_fft)); - // Produce frequency domain echo estimate. - WebRtcAec_FilterFar(aec->num_partitions, - aec->xfBufBlockPos, - aec->xfBuf, - aec->wfBuf, - yf); + // Produce echo estimate s_fft. + WebRtcAec_FilterFar(num_partitions, + x_fft_buf_block_pos, + x_fft_buf, + h_fft_buf, + s_fft); - // Inverse fft to obtain echo estimate and error. - FrequencyToTime(yf, fft); - - // Extract the output signal and compute the time-domain error. - scale = 2.0f / PART_LEN2; + // Compute the time-domain echo estimate s. + InverseFft(s_fft, s_extended); + s = &s_extended[PART_LEN]; for (i = 0; i < PART_LEN; ++i) { - y[i] = fft[PART_LEN + i] * scale; // fft scaling. - e[i] = nearend_ptr[i] - y[i]; + s[i] *= 2.0f; } - // Error fft - memcpy(aec->eBuf + PART_LEN, e, sizeof(float) * PART_LEN); - memset(fft, 0, sizeof(float) * PART_LEN); - memcpy(fft + PART_LEN, e, sizeof(float) * PART_LEN); - TimeToFrequency(fft, ef, 0); + // Compute the time-domain echo prediction error. + for (i = 0; i < PART_LEN; ++i) { + e[i] = y[i] - s[i]; + } + + // Compute the frequency domain echo prediction error. + memset(e_extended, 0, sizeof(float) * PART_LEN); + memcpy(e_extended + PART_LEN, e, sizeof(float) * PART_LEN); + Fft(e_extended, e_fft); RTC_AEC_DEBUG_RAW_WRITE(aec->e_fft_file, - &ef[0][0], - sizeof(ef[0][0]) * PART_LEN1 * 2); + &e_fft[0][0], + sizeof(e_fft[0][0]) * PART_LEN1 * 2); - if (aec->metricsMode == 1) { + if (metrics_mode == 1) { // Note that the first PART_LEN samples in fft (before transformation) are // zero. Hence, the scaling by two in UpdateLevel() should not be // performed. That scaling is taken care of in UpdateMetrics() instead. - UpdateLevel(&aec->linoutlevel, ef); + UpdateLevel(linout_level, e_fft); } // Scale error signal inversely with far power. - WebRtcAec_ScaleErrorSignal(aec->extended_filter_enabled, - aec->normal_mu, - aec->normal_error_threshold, - aec->xPow, - ef); - WebRtcAec_FilterAdaptation(aec, fft, ef); - - - RTC_AEC_DEBUG_WAV_WRITE(aec->outLinearFile, e, PART_LEN); + WebRtcAec_ScaleErrorSignal(extended_filter_enabled, + normal_mu, + normal_error_threshold, + x_pow, + e_fft); + WebRtcAec_FilterAdaptation(num_partitions, + x_fft_buf_block_pos, + x_fft_buf, + e_fft, + h_fft_buf); + memcpy(echo_subtractor_output, e, sizeof(float) * PART_LEN); } @@ -1274,6 +1263,7 @@ static void ProcessBlock(AecCore* aec) { const float gInitNoise[2] = {0.999f, 0.001f}; float nearend[PART_LEN]; + float echo_subtractor_output[PART_LEN]; float* nearend_ptr = NULL; float output[PART_LEN]; float outputH[NUM_HIGH_BANDS_MAX][PART_LEN]; @@ -1313,7 +1303,7 @@ static void ProcessBlock(AecCore* aec) { // Near fft memcpy(fft, aec->dBuf, sizeof(float) * PART_LEN2); - TimeToFrequency(fft, df, 0); + Fft(fft, df); // Power smoothing for (i = 0; i < PART_LEN1; i++) { @@ -1392,9 +1382,26 @@ static void ProcessBlock(AecCore* aec) { sizeof(float) * PART_LEN1); // Perform echo subtraction. - EchoSubtraction(aec, nearend_ptr); + EchoSubtraction(aec, + aec->num_partitions, + aec->xfBufBlockPos, + aec->metricsMode, + aec->extended_filter_enabled, + aec->normal_mu, + aec->normal_error_threshold, + aec->xfBuf, + nearend_ptr, + aec->xPow, + aec->wfBuf, + &aec->linoutlevel, + echo_subtractor_output); + + RTC_AEC_DEBUG_WAV_WRITE(aec->outLinearFile, echo_subtractor_output, PART_LEN); // Perform echo suppression. + memcpy(aec->eBuf + PART_LEN, + echo_subtractor_output, + sizeof(float) * PART_LEN); EchoSuppression(aec, output, outputH_ptr); if (aec->metricsMode == 1) { @@ -1737,12 +1744,12 @@ void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend) { } // Convert far-end partition to the frequency domain without windowing. memcpy(fft, farend, sizeof(float) * PART_LEN2); - TimeToFrequency(fft, xf, 0); + Fft(fft, xf); WebRtc_WriteBuffer(aec->far_buf, &xf[0][0], 1); // Convert far-end partition to the frequency domain with windowing. - memcpy(fft, farend, sizeof(float) * PART_LEN2); - TimeToFrequency(fft, xf, 1); + WindowData(fft, farend); + Fft(fft, xf); WebRtc_WriteBuffer(aec->far_buf_windowed, &xf[0][0], 1); } diff --git a/webrtc/modules/audio_processing/aec/aec_core_internal.h b/webrtc/modules/audio_processing/aec/aec_core_internal.h index 5d660deac4..881cac6d47 100644 --- a/webrtc/modules/audio_processing/aec/aec_core_internal.h +++ b/webrtc/modules/audio_processing/aec/aec_core_internal.h @@ -172,20 +172,23 @@ struct AecCore { typedef void (*WebRtcAecFilterFar)( int num_partitions, - int x_fft_buffer_block_pos, - float x_fft_buffer[2][kExtendedNumPartitions * PART_LEN1], - float h_fft_buffer[2][kExtendedNumPartitions * PART_LEN1], + int x_fft_buf_block_pos, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], float y_fft[2][PART_LEN1]); extern WebRtcAecFilterFar WebRtcAec_FilterFar; typedef void (*WebRtcAecScaleErrorSignal)(int extended_filter_enabled, float normal_mu, float normal_error_threshold, - float* xPow, + float x_pow[PART_LEN1], float ef[2][PART_LEN1]); extern WebRtcAecScaleErrorSignal WebRtcAec_ScaleErrorSignal; -typedef void (*WebRtcAecFilterAdaptation)(AecCore* aec, - float* fft, - float ef[2][PART_LEN1]); +typedef void (*WebRtcAecFilterAdaptation)( + int num_partitions, + int x_fft_buf_block_pos, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float e_fft[2][PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]); extern WebRtcAecFilterAdaptation WebRtcAec_FilterAdaptation; typedef void (*WebRtcAecOverdriveAndSuppress)(AecCore* aec, float hNl[PART_LEN1], diff --git a/webrtc/modules/audio_processing/aec/aec_core_mips.c b/webrtc/modules/audio_processing/aec/aec_core_mips.c index 58e471f6b9..dfd4dc606a 100644 --- a/webrtc/modules/audio_processing/aec/aec_core_mips.c +++ b/webrtc/modules/audio_processing/aec/aec_core_mips.c @@ -322,24 +322,24 @@ void WebRtcAec_ComfortNoise_mips(AecCore* aec, void WebRtcAec_FilterFar_mips( int num_partitions, - int xfBufBlockPos, - float xfBuf[2][kExtendedNumPartitions * PART_LEN1], - float wfBuf[2][kExtendedNumPartitions * PART_LEN1], - float yf[2][PART_LEN1]) { + int x_fft_buf_block_pos, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float y_fft[2][PART_LEN1]) { int i; for (i = 0; i < num_partitions; i++) { - int xPos = (i + xfBufBlockPos) * PART_LEN1; + int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; int pos = i * PART_LEN1; // Check for wrap - if (i + xfBufBlockPos >= num_partitions) { + if (i + x_fft_buf_block_pos >= num_partitions) { xPos -= num_partitions * (PART_LEN1); } - float* yf0 = yf[0]; - float* yf1 = yf[1]; - float* aRe = xfBuf[0] + xPos; - float* aIm = xfBuf[1] + xPos; - float* bRe = wfBuf[0] + pos; - float* bIm = wfBuf[1] + pos; + float* yf0 = y_fft[0]; + float* yf1 = y_fft[1]; + float* aRe = x_fft_buf[0] + xPos; + float* aIm = x_fft_buf[1] + xPos; + float* bRe = h_fft_buf[0] + pos; + float* bIm = h_fft_buf[1] + pos; float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13; int len = PART_LEN1 >> 1; @@ -437,23 +437,27 @@ void WebRtcAec_FilterFar_mips( } } -void WebRtcAec_FilterAdaptation_mips(AecCore* aec, - float* fft, - float ef[2][PART_LEN1]) { +void WebRtcAec_FilterAdaptation_mips( + int num_partitions, + int x_fft_buf_block_pos, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float e_fft[2][PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { + float fft[PART_LEN2]; int i; - for (i = 0; i < aec->num_partitions; i++) { - int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1); + for (i = 0; i < num_partitions; i++) { + int xPos = (i + x_fft_buf_block_pos)*(PART_LEN1); int pos; // Check for wrap - if (i + aec->xfBufBlockPos >= aec->num_partitions) { - xPos -= aec->num_partitions * PART_LEN1; + if (i + x_fft_buf_block_pos >= num_partitions) { + xPos -= num_partitions * PART_LEN1; } pos = i * PART_LEN1; - float* aRe = aec->xfBuf[0] + xPos; - float* aIm = aec->xfBuf[1] + xPos; - float* bRe = ef[0]; - float* bIm = ef[1]; + float* aRe = x_fft_buf[0] + xPos; + float* aIm = x_fft_buf[1] + xPos; + float* bRe = e_fft[0]; + float* bIm = e_fft[1]; float* fft_tmp; float f0, f1, f2, f3, f4, f5, f6 ,f7, f8, f9, f10, f11, f12; @@ -578,8 +582,8 @@ void WebRtcAec_FilterAdaptation_mips(AecCore* aec, ); } aec_rdft_forward_128(fft); - aRe = aec->wfBuf[0] + pos; - aIm = aec->wfBuf[1] + pos; + aRe = h_fft_buf[0] + pos; + aIm = h_fft_buf[1] + pos; __asm __volatile ( ".set push \n\t" ".set noreorder \n\t" @@ -707,7 +711,7 @@ void WebRtcAec_OverdriveAndSuppress_mips(AecCore* aec, void WebRtcAec_ScaleErrorSignal_mips(int extended_filter_enabled, float normal_mu, float normal_error_threshold, - float *x_pow, + float x_pow[PART_LEN1], float ef[2][PART_LEN1]) { const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; const float error_threshold = extended_filter_enabled diff --git a/webrtc/modules/audio_processing/aec/aec_core_neon.c b/webrtc/modules/audio_processing/aec/aec_core_neon.c index ba74ebed80..6c94a2e0a7 100644 --- a/webrtc/modules/audio_processing/aec/aec_core_neon.c +++ b/webrtc/modules/audio_processing/aec/aec_core_neon.c @@ -34,48 +34,49 @@ __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { return aRe * bIm + aIm * bRe; } -static void FilterFarNEON(int num_partitions, - int xfBufBlockPos, - float xfBuf[2][kExtendedNumPartitions * PART_LEN1], - float wfBuf[2][kExtendedNumPartitions * PART_LEN1], - float yf[2][PART_LEN1]) { +static void FilterFarNEON( + int num_partitions, + int x_fft_buf_block_pos, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float y_fft[2][PART_LEN1]) { int i; for (i = 0; i < num_partitions; i++) { int j; - int xPos = (i + xfBufBlockPos) * PART_LEN1; + int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; int pos = i * PART_LEN1; // Check for wrap - if (i + xfBufBlockPos >= num_partitions) { + if (i + x_fft_buf_block_pos >= num_partitions) { xPos -= num_partitions * PART_LEN1; } // vectorized code (four at once) for (j = 0; j + 3 < PART_LEN1; j += 4) { - const float32x4_t xfBuf_re = vld1q_f32(&xfBuf[0][xPos + j]); - const float32x4_t xfBuf_im = vld1q_f32(&xfBuf[1][xPos + j]); - const float32x4_t wfBuf_re = vld1q_f32(&wfBuf[0][pos + j]); - const float32x4_t wfBuf_im = vld1q_f32(&wfBuf[1][pos + j]); - const float32x4_t yf_re = vld1q_f32(&yf[0][j]); - const float32x4_t yf_im = vld1q_f32(&yf[1][j]); - const float32x4_t a = vmulq_f32(xfBuf_re, wfBuf_re); - const float32x4_t e = vmlsq_f32(a, xfBuf_im, wfBuf_im); - const float32x4_t c = vmulq_f32(xfBuf_re, wfBuf_im); - const float32x4_t f = vmlaq_f32(c, xfBuf_im, wfBuf_re); - const float32x4_t g = vaddq_f32(yf_re, e); - const float32x4_t h = vaddq_f32(yf_im, f); - vst1q_f32(&yf[0][j], g); - vst1q_f32(&yf[1][j], h); + const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]); + const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]); + const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]); + const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]); + const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]); + const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]); + const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re); + const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im); + const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im); + const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re); + const float32x4_t g = vaddq_f32(y_fft_re, e); + const float32x4_t h = vaddq_f32(y_fft_im, f); + vst1q_f32(&y_fft[0][j], g); + vst1q_f32(&y_fft[1][j], h); } // scalar code for the remaining items. for (; j < PART_LEN1; j++) { - yf[0][j] += MulRe(xfBuf[0][xPos + j], - xfBuf[1][xPos + j], - wfBuf[0][pos + j], - wfBuf[1][pos + j]); - yf[1][j] += MulIm(xfBuf[0][xPos + j], - xfBuf[1][xPos + j], - wfBuf[0][pos + j], - wfBuf[1][pos + j]); + y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], + x_fft_buf[1][xPos + j], + h_fft_buf[0][pos + j], + h_fft_buf[1][pos + j]); + y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], + x_fft_buf[1][xPos + j], + h_fft_buf[0][pos + j], + h_fft_buf[1][pos + j]); } } } @@ -128,7 +129,7 @@ static float32x4_t vsqrtq_f32(float32x4_t s) { static void ScaleErrorSignalNEON(int extended_filter_enabled, float normal_mu, float normal_error_threshold, - float *x_pow, + float x_pow[PART_LEN1], float ef[2][PART_LEN1]) { const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; const float error_threshold = extended_filter_enabled ? @@ -185,34 +186,37 @@ static void ScaleErrorSignalNEON(int extended_filter_enabled, } } -static void FilterAdaptationNEON(AecCore* aec, - float* fft, - float ef[2][PART_LEN1]) { +static void FilterAdaptationNEON( + int num_partitions, + int x_fft_buf_block_pos, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float e_fft[2][PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { + float fft[PART_LEN2]; int i; - const int num_partitions = aec->num_partitions; for (i = 0; i < num_partitions; i++) { - int xPos = (i + aec->xfBufBlockPos) * PART_LEN1; + int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; int pos = i * PART_LEN1; int j; // Check for wrap - if (i + aec->xfBufBlockPos >= num_partitions) { + if (i + x_fft_buf_block_pos >= num_partitions) { xPos -= num_partitions * PART_LEN1; } // Process the whole array... for (j = 0; j < PART_LEN; j += 4) { - // Load xfBuf and ef. - const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]); - const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]); - const float32x4_t ef_re = vld1q_f32(&ef[0][j]); - const float32x4_t ef_im = vld1q_f32(&ef[1][j]); - // Calculate the product of conjugate(xfBuf) by ef. + // Load x_fft_buf and e_fft. + const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]); + const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]); + const float32x4_t e_fft_re = vld1q_f32(&e_fft[0][j]); + const float32x4_t e_fft_im = vld1q_f32(&e_fft[1][j]); + // Calculate the product of conjugate(x_fft_buf) by e_fft. // re(conjugate(a) * b) = aRe * bRe + aIm * bIm // im(conjugate(a) * b)= aRe * bIm - aIm * bRe - const float32x4_t a = vmulq_f32(xfBuf_re, ef_re); - const float32x4_t e = vmlaq_f32(a, xfBuf_im, ef_im); - const float32x4_t c = vmulq_f32(xfBuf_re, ef_im); - const float32x4_t f = vmlsq_f32(c, xfBuf_im, ef_re); + const float32x4_t a = vmulq_f32(x_fft_buf_re, e_fft_re); + const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im); + const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im); + const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re); // Interleave real and imaginary parts. const float32x4x2_t g_n_h = vzipq_f32(e, f); // Store @@ -220,10 +224,10 @@ static void FilterAdaptationNEON(AecCore* aec, vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]); } // ... and fixup the first imaginary entry. - fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN], - -aec->xfBuf[1][xPos + PART_LEN], - ef[0][PART_LEN], - ef[1][PART_LEN]); + fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], + -x_fft_buf[1][xPos + PART_LEN], + e_fft[0][PART_LEN], + e_fft[1][PART_LEN]); aec_rdft_inverse_128(fft); memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); @@ -241,21 +245,21 @@ static void FilterAdaptationNEON(AecCore* aec, aec_rdft_forward_128(fft); { - const float wt1 = aec->wfBuf[1][pos]; - aec->wfBuf[0][pos + PART_LEN] += fft[1]; + const float wt1 = h_fft_buf[1][pos]; + h_fft_buf[0][pos + PART_LEN] += fft[1]; for (j = 0; j < PART_LEN; j += 4) { - float32x4_t wtBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]); - float32x4_t wtBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]); + float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]); + float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]); const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]); const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]); const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4); wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]); wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]); - vst1q_f32(&aec->wfBuf[0][pos + j], wtBuf_re); - vst1q_f32(&aec->wfBuf[1][pos + j], wtBuf_im); + vst1q_f32(&h_fft_buf[0][pos + j], wtBuf_re); + vst1q_f32(&h_fft_buf[1][pos + j], wtBuf_im); } - aec->wfBuf[1][pos] = wt1; + h_fft_buf[1][pos] = wt1; } } } diff --git a/webrtc/modules/audio_processing/aec/aec_core_sse2.c b/webrtc/modules/audio_processing/aec/aec_core_sse2.c index b874c98f63..5b950ade05 100644 --- a/webrtc/modules/audio_processing/aec/aec_core_sse2.c +++ b/webrtc/modules/audio_processing/aec/aec_core_sse2.c @@ -29,51 +29,52 @@ __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) { return aRe * bIm + aIm * bRe; } -static void FilterFarSSE2(int num_partitions, - int xfBufBlockPos, - float xfBuf[2][kExtendedNumPartitions * PART_LEN1], - float wfBuf[2][kExtendedNumPartitions * PART_LEN1], - float yf[2][PART_LEN1]) { +static void FilterFarSSE2( + int num_partitions, + int x_fft_buf_block_pos, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float y_fft[2][PART_LEN1]) { int i; for (i = 0; i < num_partitions; i++) { int j; - int xPos = (i + xfBufBlockPos) * PART_LEN1; + int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; int pos = i * PART_LEN1; // Check for wrap - if (i + xfBufBlockPos >= num_partitions) { + if (i + x_fft_buf_block_pos >= num_partitions) { xPos -= num_partitions * (PART_LEN1); } // vectorized code (four at once) for (j = 0; j + 3 < PART_LEN1; j += 4) { - const __m128 xfBuf_re = _mm_loadu_ps(&xfBuf[0][xPos + j]); - const __m128 xfBuf_im = _mm_loadu_ps(&xfBuf[1][xPos + j]); - const __m128 wfBuf_re = _mm_loadu_ps(&wfBuf[0][pos + j]); - const __m128 wfBuf_im = _mm_loadu_ps(&wfBuf[1][pos + j]); - const __m128 yf_re = _mm_loadu_ps(&yf[0][j]); - const __m128 yf_im = _mm_loadu_ps(&yf[1][j]); - const __m128 a = _mm_mul_ps(xfBuf_re, wfBuf_re); - const __m128 b = _mm_mul_ps(xfBuf_im, wfBuf_im); - const __m128 c = _mm_mul_ps(xfBuf_re, wfBuf_im); - const __m128 d = _mm_mul_ps(xfBuf_im, wfBuf_re); + const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]); + const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]); + const __m128 h_fft_buf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]); + const __m128 h_fft_buf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]); + const __m128 y_fft_re = _mm_loadu_ps(&y_fft[0][j]); + const __m128 y_fft_im = _mm_loadu_ps(&y_fft[1][j]); + const __m128 a = _mm_mul_ps(x_fft_buf_re, h_fft_buf_re); + const __m128 b = _mm_mul_ps(x_fft_buf_im, h_fft_buf_im); + const __m128 c = _mm_mul_ps(x_fft_buf_re, h_fft_buf_im); + const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re); const __m128 e = _mm_sub_ps(a, b); const __m128 f = _mm_add_ps(c, d); - const __m128 g = _mm_add_ps(yf_re, e); - const __m128 h = _mm_add_ps(yf_im, f); - _mm_storeu_ps(&yf[0][j], g); - _mm_storeu_ps(&yf[1][j], h); + const __m128 g = _mm_add_ps(y_fft_re, e); + const __m128 h = _mm_add_ps(y_fft_im, f); + _mm_storeu_ps(&y_fft[0][j], g); + _mm_storeu_ps(&y_fft[1][j], h); } // scalar code for the remaining items. for (; j < PART_LEN1; j++) { - yf[0][j] += MulRe(xfBuf[0][xPos + j], - xfBuf[1][xPos + j], - wfBuf[0][pos + j], - wfBuf[1][pos + j]); - yf[1][j] += MulIm(xfBuf[0][xPos + j], - xfBuf[1][xPos + j], - wfBuf[0][pos + j], - wfBuf[1][pos + j]); + y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], + x_fft_buf[1][xPos + j], + h_fft_buf[0][pos + j], + h_fft_buf[1][pos + j]); + y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], + x_fft_buf[1][xPos + j], + h_fft_buf[0][pos + j], + h_fft_buf[1][pos + j]); } } } @@ -81,7 +82,7 @@ static void FilterFarSSE2(int num_partitions, static void ScaleErrorSignalSSE2(int extended_filter_enabled, float normal_mu, float normal_error_threshold, - float *x_pow, + float x_pow[PART_LEN1], float ef[2][PART_LEN1]) { const __m128 k1e_10f = _mm_set1_ps(1e-10f); const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu) @@ -147,33 +148,36 @@ static void ScaleErrorSignalSSE2(int extended_filter_enabled, } } -static void FilterAdaptationSSE2(AecCore* aec, - float* fft, - float ef[2][PART_LEN1]) { +static void FilterAdaptationSSE2( + int num_partitions, + int x_fft_buf_block_pos, + float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], + float e_fft[2][PART_LEN1], + float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { + float fft[PART_LEN2]; int i, j; - const int num_partitions = aec->num_partitions; for (i = 0; i < num_partitions; i++) { - int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1); + int xPos = (i + x_fft_buf_block_pos) * (PART_LEN1); int pos = i * PART_LEN1; // Check for wrap - if (i + aec->xfBufBlockPos >= num_partitions) { + if (i + x_fft_buf_block_pos >= num_partitions) { xPos -= num_partitions * PART_LEN1; } // Process the whole array... for (j = 0; j < PART_LEN; j += 4) { - // Load xfBuf and ef. - const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]); - const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]); - const __m128 ef_re = _mm_loadu_ps(&ef[0][j]); - const __m128 ef_im = _mm_loadu_ps(&ef[1][j]); - // Calculate the product of conjugate(xfBuf) by ef. + // Load x_fft_buf and e_fft. + const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]); + const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]); + const __m128 e_fft_re = _mm_loadu_ps(&e_fft[0][j]); + const __m128 e_fft_im = _mm_loadu_ps(&e_fft[1][j]); + // Calculate the product of conjugate(x_fft_buf) by e_fft. // re(conjugate(a) * b) = aRe * bRe + aIm * bIm // im(conjugate(a) * b)= aRe * bIm - aIm * bRe - const __m128 a = _mm_mul_ps(xfBuf_re, ef_re); - const __m128 b = _mm_mul_ps(xfBuf_im, ef_im); - const __m128 c = _mm_mul_ps(xfBuf_re, ef_im); - const __m128 d = _mm_mul_ps(xfBuf_im, ef_re); + const __m128 a = _mm_mul_ps(x_fft_buf_re, e_fft_re); + const __m128 b = _mm_mul_ps(x_fft_buf_im, e_fft_im); + const __m128 c = _mm_mul_ps(x_fft_buf_re, e_fft_im); + const __m128 d = _mm_mul_ps(x_fft_buf_im, e_fft_re); const __m128 e = _mm_add_ps(a, b); const __m128 f = _mm_sub_ps(c, d); // Interleave real and imaginary parts. @@ -184,10 +188,10 @@ static void FilterAdaptationSSE2(AecCore* aec, _mm_storeu_ps(&fft[2 * j + 4], h); } // ... and fixup the first imaginary entry. - fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN], - -aec->xfBuf[1][xPos + PART_LEN], - ef[0][PART_LEN], - ef[1][PART_LEN]); + fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], + -x_fft_buf[1][xPos + PART_LEN], + e_fft[0][PART_LEN], + e_fft[1][PART_LEN]); aec_rdft_inverse_128(fft); memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); @@ -205,11 +209,11 @@ static void FilterAdaptationSSE2(AecCore* aec, aec_rdft_forward_128(fft); { - float wt1 = aec->wfBuf[1][pos]; - aec->wfBuf[0][pos + PART_LEN] += fft[1]; + float wt1 = h_fft_buf[1][pos]; + h_fft_buf[0][pos + PART_LEN] += fft[1]; for (j = 0; j < PART_LEN; j += 4) { - __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); - __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); + __m128 wtBuf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]); + __m128 wtBuf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]); const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]); const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]); const __m128 fft_re = @@ -218,10 +222,10 @@ static void FilterAdaptationSSE2(AecCore* aec, _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1)); wtBuf_re = _mm_add_ps(wtBuf_re, fft_re); wtBuf_im = _mm_add_ps(wtBuf_im, fft_im); - _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re); - _mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im); + _mm_storeu_ps(&h_fft_buf[0][pos + j], wtBuf_re); + _mm_storeu_ps(&h_fft_buf[1][pos + j], wtBuf_im); } - aec->wfBuf[1][pos] = wt1; + h_fft_buf[1][pos] = wt1; } } }