From 8dd7466b5279bf4dd302eef2c7d2408317cb8eb8 Mon Sep 17 00:00:00 2001 From: "kma@google.com" Date: Tue, 16 Aug 2011 03:28:28 +0000 Subject: [PATCH] 2nd check in Review URL: http://webrtc-codereview.appspot.com/112002 git-svn-id: http://webrtc.googlecode.com/svn/trunk@372 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../aecm/main/source/aecm_core.c | 522 +++++++++--------- .../aecm/main/source/aecm_core.h | 28 +- .../aecm/main/source/aecm_core_neon.c | 243 +++++--- 3 files changed, 472 insertions(+), 321 deletions(-) diff --git a/src/modules/audio_processing/aecm/main/source/aecm_core.c b/src/modules/audio_processing/aecm/main/source/aecm_core.c index f3e8a83624..cc49e91d6a 100644 --- a/src/modules/audio_processing/aecm/main/source/aecm_core.c +++ b/src/modules/audio_processing/aecm/main/source/aecm_core.c @@ -28,6 +28,14 @@ FILE *dfile; FILE *testfile; #endif +#ifdef _MSC_VER // visual c++ +#define ALIGN8_BEG __declspec(align(8)) +#define ALIGN8_END +#else // gcc or icc +#define ALIGN8_BEG +#define ALIGN8_END __attribute__((aligned(8))) +#endif + #ifdef AECM_SHORT // Square root of Hanning window in Q14 @@ -43,7 +51,7 @@ const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] = #else // Square root of Hanning window in Q14 -const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] = +const ALIGN8_BEG WebRtc_Word16 WebRtcAecm_kSqrtHanning[] ALIGN8_END = { 0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172, 3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364, @@ -97,12 +105,13 @@ static const WebRtc_Word16 kChannelStored16kHz[PART_LEN1] = { static const WebRtc_Word16 kNoiseEstQDomain = 15; static const WebRtc_Word16 kNoiseEstIncCount = 5; -static void ComfortNoise(AecmCore_t * aecm, +static void ComfortNoise(AecmCore_t* aecm, const WebRtc_UWord16* dfa, - WebRtc_Word16* outReal, - WebRtc_Word16* outImag, + complex16_t* out, const WebRtc_Word16* lambda); +static WebRtc_Word16 CalcSuppressionGain(AecmCore_t * const aecm); + #ifdef ARM_WINM_LOG HANDLE logFile = NULL; 
#endif @@ -151,10 +160,11 @@ int WebRtcAecm_CreateCore(AecmCore_t **aecmInst) return -1; } - // Init some aecm pointers. 16-byte alignment is only necessary for Neon code currently. - aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 15) & ~ 15); - aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 15) & ~ 15); - aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 15) & ~ 15); + // Init some aecm pointers. 16 and 32 byte alignment is only necessary + // for Neon code currently. + aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 31) & ~ 31); + aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 31) & ~ 31); + aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 31) & ~ 31); aecm->outBuf = (WebRtc_Word16*) (((uintptr_t)aecm->outBuf_buf + 15) & ~ 15); aecm->channelStored = (WebRtc_Word16*) (((uintptr_t) aecm->channelStored_buf + 15) & ~ 15); @@ -345,7 +355,9 @@ int WebRtcAecm_ProcessFrame(AecmCore_t * aecm, WebRtc_Word16 farBlock[PART_LEN]; WebRtc_Word16 nearNoisyBlock[PART_LEN]; WebRtc_Word16 nearCleanBlock[PART_LEN]; - WebRtc_Word16 outBlock[PART_LEN]; + WebRtc_Word16 outBlock_buf[PART_LEN + 8]; // Align buffer to 8-byte boundary. + WebRtc_Word16* outBlock = (WebRtc_Word16*) (((uintptr_t) outBlock_buf + 15) & ~ 15); + WebRtc_Word16 farFrame[FRAME_LEN]; int size = 0; @@ -892,7 +904,7 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm, // END: Determine if we should store or reset channel estimate. } -// WebRtcAecm_CalcSuppressionGain(...) +// CalcSuppressionGain(...) // // This function calculates the suppression gain that is used in the Wiener filter. // @@ -902,7 +914,7 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm, // level (Q14). 
// // -WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t* aecm) +static WebRtc_Word16 CalcSuppressionGain(AecmCore_t * const aecm) { WebRtc_Word32 tmp32no1; @@ -985,8 +997,7 @@ WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t* aecm) // return value The Q-domain of current frequency values // static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, - WebRtc_Word16* freq_signal_real, - WebRtc_Word16* freq_signal_imag, + complex16_t* freq_signal, WebRtc_UWord16* freq_signal_abs, WebRtc_UWord32* freq_signal_sum_abs) { @@ -998,9 +1009,9 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, WebRtc_Word32 tmp32no1; WebRtc_Word32 tmp32no2; - // In fft_buf, +8 for 16-byte alignment, and +2 to make some loops safe. - WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 8]; - WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 15) & ~15); + // In fft_buf, +16 for 32-byte alignment. + WebRtc_Word16 fft_buf[PART_LEN4 + 16]; + WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 31) & ~31); WebRtc_Word16 tmp16no1; WebRtc_Word16 tmp16no2; @@ -1016,46 +1027,30 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, time_signal_scaling = WebRtcSpl_NormW16(tmp16no1); #endif - WebRtcAecm_PrepareFft(fft, time_signal, time_signal_scaling); - - // Fourier transformation of time domain signal. 
- // The result is scaled with 1/PART_LEN2, that is, the result is in Q(-6) - // for PART_LEN = 32 - - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - ret = WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); - - // Take only the first PART_LEN2 samples - for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2) - { - freq_signal_real[i] = fft[j]; - - // The imaginary part has to switch sign - freq_signal_imag[i] = - fft[j+1]; - } + WebRtcAecm_WindowAndFFT(fft, time_signal, freq_signal, time_signal_scaling); // Extract imaginary and real part, calculate the magnitude for all frequency bins - freq_signal_imag[0] = 0; - freq_signal_imag[PART_LEN] = 0; - freq_signal_real[PART_LEN] = fft[PART_LEN2]; + freq_signal[0].imag = 0; + freq_signal[PART_LEN].imag = 0; + freq_signal[PART_LEN].real = fft[PART_LEN2]; freq_signal_abs[0] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16( - freq_signal_real[0]); + freq_signal[0].real); freq_signal_abs[PART_LEN] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16( - freq_signal_real[PART_LEN]); + freq_signal[PART_LEN].real); (*freq_signal_sum_abs) = (WebRtc_UWord32)(freq_signal_abs[0]) + (WebRtc_UWord32)(freq_signal_abs[PART_LEN]); for (i = 1; i < PART_LEN; i++) { - if (freq_signal_real[i] == 0) + if (freq_signal[i].real == 0) { freq_signal_abs[i] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16( - freq_signal_imag[i]); + freq_signal[i].imag); } - else if (freq_signal_imag[i] == 0) + else if (freq_signal[i].imag == 0) { freq_signal_abs[i] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16( - freq_signal_real[i]); + freq_signal[i].real); } else { @@ -1066,8 +1061,8 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, // The parameters alpha and beta are stored in Q15 #ifdef AECM_WITH_ABS_APPROX - tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal_real[i]); - tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal_imag[i]); + tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real); + tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag); if(tmp16no1 > tmp16no2) { @@ -1103,13 +1098,13 @@ static int 
TimeToFrequencyDomain(const WebRtc_Word16* time_signal, (WebRtc_UWord16)tmp16no2; #else #ifdef WEBRTC_ARCH_ARM_V7A - __asm__("smulbb %0, %1, %2" : "=r"(tmp32no1) : "r"(freq_signal_real[i]), - "r"(freq_signal_real[i])); - __asm__("smlabb %0, %1, %2, %3" :: "r"(tmp32no2), "r"(freq_signal_imag[i]), - "r"(freq_signal_imag[i]), "r"(tmp32no1)); + __asm__("smulbb %0, %1, %2" : "=r"(tmp32no1) : "r"(freq_signal[i].real), + "r"(freq_signal[i].real)); + __asm__("smlabb %0, %1, %2, %3" :: "r"(tmp32no2), "r"(freq_signal[i].imag), + "r"(freq_signal[i].imag), "r"(tmp32no1)); #else - tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal_real[i]); - tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal_imag[i]); + tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real); + tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag); tmp32no1 = WEBRTC_SPL_MUL_16_16(tmp16no1, tmp16no1); tmp32no2 = WEBRTC_SPL_MUL_16_16(tmp16no2, tmp16no2); tmp32no2 = WEBRTC_SPL_ADD_SAT_W32(tmp32no1, tmp32no2); @@ -1125,7 +1120,8 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, return time_signal_scaling; } -int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, +int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, + const WebRtc_Word16 * farend, const WebRtc_Word16 * nearendNoisy, const WebRtc_Word16 * nearendClean, WebRtc_Word16 * output) @@ -1140,10 +1136,6 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, WebRtc_Word32 tmp32no1; - // +8 for 32-byte alignment. 
- WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8]; - WebRtc_Word32 *echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31); - WebRtc_UWord16 xfa[PART_LEN1]; WebRtc_UWord16 dfaNoisy[PART_LEN1]; WebRtc_UWord16 dfaClean[PART_LEN1]; @@ -1151,11 +1143,18 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, const WebRtc_UWord16* far_spectrum_ptr = NULL; int outCFFT; - WebRtc_Word16 fft[PART_LEN4]; - WebRtc_Word16 dfwReal[PART_LEN1]; - WebRtc_Word16 dfwImag[PART_LEN1]; - WebRtc_Word16 efwReal[PART_LEN1]; - WebRtc_Word16 efwImag[PART_LEN1]; + // 32 byte aligned buffers (with +8 or +16). + // TODO (kma): define fft with complex16_t. + WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 16]; // +2 to make a loop safe. + WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8]; + WebRtc_Word32 dfw_buf[PART_LEN1 + 8]; + WebRtc_Word32 efw_buf[PART_LEN1 + 8]; + + WebRtc_Word16* fft = (WebRtc_Word16*) (((uintptr_t) fft_buf + 31) & ~ 31); + WebRtc_Word32* echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31); + complex16_t* dfw = (complex16_t*) (((uintptr_t) dfw_buf + 31) & ~ 31); + complex16_t* efw = (complex16_t*) (((uintptr_t) efw_buf + 31) & ~ 31); + WebRtc_Word16 hnl[PART_LEN1]; WebRtc_Word16 numPosCoef = 0; WebRtc_Word16 nlpGain = ONE_Q14; @@ -1206,15 +1205,13 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, // Transform far end signal from time domain to frequency domain. zerosXBuf = TimeToFrequencyDomain(aecm->xBuf, - dfwReal, - dfwImag, + dfw, xfa, &xfaSum); // Transform noisy near end signal from time domain to frequency domain. zerosDBufNoisy = TimeToFrequencyDomain(aecm->dBufNoisy, - dfwReal, - dfwImag, + dfw, dfaNoisy, &dfaNoisySum); aecm->dfaNoisyQDomainOld = aecm->dfaNoisyQDomain; @@ -1231,8 +1228,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, { // Transform clean near end signal from time domain to frequency domain. 
zerosDBufClean = TimeToFrequencyDomain(aecm->dBufClean, - dfwReal, - dfwImag, + dfw, dfaClean, &dfaCleanSum); aecm->dfaCleanQDomainOld = aecm->dfaCleanQDomain; @@ -1300,7 +1296,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, // This is the channel estimation algorithm. // It is base on NLMS but has a variable step length, which was calculated above. WebRtcAecm_UpdateChannel(aecm, far_spectrum_ptr, zerosXBuf, dfaNoisy, mu, echoEst32); - supGain = WebRtcAecm_CalcSuppressionGain(aecm); + supGain = CalcSuppressionGain(aecm); #ifdef ARM_WINM_LOG_ // measure tick end @@ -1483,9 +1479,9 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, } // multiply with Wiener coefficients - efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i], + efw[i].real = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real, hnl[i], 14)); - efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i], + efw[i].imag = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag, hnl[i], 14)); } } @@ -1494,16 +1490,16 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, // multiply with Wiener coefficients for (i = 0; i < PART_LEN1; i++) { - efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i], + efw[i].real = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real, hnl[i], 14)); - efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i], + efw[i].imag = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag, hnl[i], 14)); } } if (aecm->cngMode == AecmTrue) { - ComfortNoise(aecm, ptrDfaClean, efwReal, efwImag, hnl); + ComfortNoise(aecm, ptrDfaClean, efw, hnl); } #ifdef ARM_WINM_LOG_ @@ -1516,177 +1512,11 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend, QueryPerformanceCounter((LARGE_INTEGER*)&start); #endif - // Synthesis - for (i = 1; i < PART_LEN; i++) - { - j = 
WEBRTC_SPL_LSHIFT_W32(i, 1); - fft[j] = efwReal[i]; - - // mirrored data, even - fft[PART_LEN4 - j] = efwReal[i]; - fft[j + 1] = -efwImag[i]; - - //mirrored data, odd - fft[PART_LEN4 - (j - 1)] = efwImag[i]; - } - fft[0] = efwReal[0]; - fft[1] = -efwImag[0]; - - fft[PART_LEN2] = efwReal[PART_LEN]; - fft[PART_LEN2 + 1] = -efwImag[PART_LEN]; - - // inverse FFT, result should be scaled with outCFFT - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); - - //take only the real values and scale with outCFFT - for (i = 0; i < PART_LEN2; i++) - { - j = WEBRTC_SPL_LSHIFT_W32(i, 1); - fft[i] = fft[j]; - } - - for (i = 0; i < PART_LEN; i++) - { - fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - fft[i], - WebRtcAecm_kSqrtHanning[i], - 14); - tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], - outCFFT - aecm->dfaCleanQDomain); - fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, - tmp32no1 + aecm->outBuf[i], - WEBRTC_SPL_WORD16_MIN); - output[i] = fft[i]; - - tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( - fft[PART_LEN + i], - WebRtcAecm_kSqrtHanning[PART_LEN - i], - 14); - tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, - outCFFT - aecm->dfaCleanQDomain); - aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( - WEBRTC_SPL_WORD16_MAX, - tmp32no1, - WEBRTC_SPL_WORD16_MIN); - } - -#ifdef ARM_WINM_LOG_ - // measure tick end - QueryPerformanceCounter((LARGE_INTEGER*)&end); - diff__ = ((end - start) * 1000) / (freq/1000); - milliseconds = (unsigned int)(diff__ & 0xffffffff); - WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL); -#endif - // Copy the current block to the old position (aecm->outBuf is shifted elsewhere) - memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); - memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); - if (nearendClean != NULL) - { - memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * 
PART_LEN); - } + WebRtcAecm_InverseFFTAndWindow(aecm, fft, efw, output, nearendClean); return 0; } -#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) - -void WebRtcAecm_PrepareFft(WebRtc_Word16* fft, - const WebRtc_Word16* time_signal, - int time_signal_scaling) -{ - int i, j; - - memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4); - // FFT of signal - for (i = 0, j = 0; i < PART_LEN; i++, j += 2) - { - // Window time domain signal and insert into real part of - // transformation array |fft| - fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - (time_signal[i] << time_signal_scaling), - WebRtcAecm_kSqrtHanning[i], - 14); - fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - (time_signal[PART_LEN + i] << time_signal_scaling), - WebRtcAecm_kSqrtHanning[PART_LEN - i], - 14); - // Inserting zeros in imaginary parts not necessary since we - // initialized the array with all zeros - } -} - -void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, - const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echo_est, - WebRtc_UWord32* far_energy, - WebRtc_UWord32* echo_energy_adapt, - WebRtc_UWord32* echo_energy_stored) -{ - int i; - - // Get energy for the delayed far end signal and estimated - // echo using both stored and adapted channels. - for (i = 0; i < PART_LEN1; i++) - { - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); - (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]); - (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], - far_spectrum[i]); - (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i]; - } -} - -void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, - const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echo_est) -{ - int i; - - // During startup we store the channel every block. 
- memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1); - // Recalculate echo estimate - for (i = 0; i < PART_LEN; i += 4) - { - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); - echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], - far_spectrum[i + 1]); - echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], - far_spectrum[i + 2]); - echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], - far_spectrum[i + 3]); - } - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); -} - -void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm) -{ - int i; - - // The stored channel has a significantly lower MSE than the adaptive one for - // two consecutive calculations. Reset the adaptive channel. - memcpy(aecm->channelAdapt16, aecm->channelStored, - sizeof(WebRtc_Word16) * PART_LEN1); - // Restore the W32 channel - for (i = 0; i < PART_LEN; i += 4) - { - aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i], 16); - aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 1], 16); - aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 2], 16); - aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 3], 16); - } - aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); -} - -#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) - // Generate comfort noise and add to output signal. // @@ -1696,10 +1526,9 @@ void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm) // \param[in,out] outImag Imaginary part of the output signal (Q[aecm->dfaQDomain]). // \param[in] lambda Suppression gain with which to scale the noise level (Q14). 
// -static void ComfortNoise(AecmCore_t * aecm, +static void ComfortNoise(AecmCore_t* aecm, const WebRtc_UWord16* dfa, - WebRtc_Word16* outReal, - WebRtc_Word16* outImag, + complex16_t* out, const WebRtc_Word16* lambda) { WebRtc_Word16 i; @@ -1827,22 +1656,22 @@ static void ComfortNoise(AecmCore_t * aecm, #if (!defined ARM_WINM) && (!defined ARM9E_GCC) && (!defined ANDROID_AECOPT) for (i = 0; i < PART_LEN1; i++) { - outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]); - outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]); + out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]); + out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]); } #else for (i = 0; i < PART_LEN1 -1; ) { - outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]); - outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]); + out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]); + out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]); i++; - outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]); - outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]); + out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]); + out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]); i++; } - outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]); - outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]); + out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]); + out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]); #endif } @@ -1906,3 +1735,196 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far sizeof(WebRtc_Word16) * readLen); aecm->farBufReadPos += readLen; } + +#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) + +void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft, + const WebRtc_Word16* time_signal, + complex16_t* freq_signal, + int time_signal_scaling) +{ + int i, j; + + memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4); + // FFT of signal + for (i = 0, j = 0; i < PART_LEN; i++, j += 2) + { + 
// Window time domain signal and insert into real part of + // transformation array |fft| + fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + (time_signal[i] << time_signal_scaling), + WebRtcAecm_kSqrtHanning[i], + 14); + fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + (time_signal[i + PART_LEN] << time_signal_scaling), + WebRtcAecm_kSqrtHanning[PART_LEN - i], + 14); + // Inserting zeros in imaginary parts not necessary since we + // initialized the array with all zeros + } + + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); + + // Take only the first PART_LEN2 samples + for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2) + { + freq_signal[i].real = fft[j]; + + // The imaginary part has to switch sign + freq_signal[i].imag = - fft[j+1]; + } +} + +void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm, + WebRtc_Word16* fft, + complex16_t* efw, + WebRtc_Word16* output, + const WebRtc_Word16* nearendClean) +{ + int i, j, outCFFT; + WebRtc_Word32 tmp32no1; + + // Synthesis + for (i = 1; i < PART_LEN; i++) + { + j = WEBRTC_SPL_LSHIFT_W32(i, 1); + fft[j] = efw[i].real; + + // mirrored data, even + fft[PART_LEN4 - j] = efw[i].real; + fft[j + 1] = -efw[i].imag; + + //mirrored data, odd + fft[PART_LEN4 - (j - 1)] = efw[i].imag; + } + fft[0] = efw[0].real; + fft[1] = -efw[0].imag; + + fft[PART_LEN2] = efw[PART_LEN].real; + fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; + + // inverse FFT, result should be scaled with outCFFT + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); + + //take only the real values and scale with outCFFT + for (i = 0; i < PART_LEN2; i++) + { + j = WEBRTC_SPL_LSHIFT_W32(i, 1); + fft[i] = fft[j]; + } + + for (i = 0; i < PART_LEN; i++) + { + fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + fft[i], + WebRtcAecm_kSqrtHanning[i], + 14); + tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], + outCFFT - 
aecm->dfaCleanQDomain); + fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, + tmp32no1 + aecm->outBuf[i], + WEBRTC_SPL_WORD16_MIN); + output[i] = fft[i]; + + tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( + fft[PART_LEN + i], + WebRtcAecm_kSqrtHanning[PART_LEN - i], + 14); + tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, + outCFFT - aecm->dfaCleanQDomain); + aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( + WEBRTC_SPL_WORD16_MAX, + tmp32no1, + WEBRTC_SPL_WORD16_MIN); + } + +#ifdef ARM_WINM_LOG_ + // measure tick end + QueryPerformanceCounter((LARGE_INTEGER*)&end); + diff__ = ((end - start) * 1000) / (freq/1000); + milliseconds = (unsigned int)(diff__ & 0xffffffff); + WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL); +#endif + + // Copy the current block to the old position (aecm->outBuf is shifted elsewhere) + memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); + memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); + if (nearendClean != NULL) + { + memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); + } +} + +void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est, + WebRtc_UWord32* far_energy, + WebRtc_UWord32* echo_energy_adapt, + WebRtc_UWord32* echo_energy_stored) +{ + int i; + + // Get energy for the delayed far end signal and estimated + // echo using both stored and adapted channels. 
+ for (i = 0; i < PART_LEN1; i++) + { + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); + (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]); + (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], + far_spectrum[i]); + (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i]; + } +} + +void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est) +{ + int i; + + // During startup we store the channel every block. + memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1); + // Recalculate echo estimate + for (i = 0; i < PART_LEN; i += 4) + { + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); + echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], + far_spectrum[i + 1]); + echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], + far_spectrum[i + 2]); + echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], + far_spectrum[i + 3]); + } + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); +} + +void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm) +{ + int i; + + // The stored channel has a significantly lower MSE than the adaptive one for + // two consecutive calculations. Reset the adaptive channel. 
+ memcpy(aecm->channelAdapt16, aecm->channelStored, + sizeof(WebRtc_Word16) * PART_LEN1); + // Restore the W32 channel + for (i = 0; i < PART_LEN; i += 4) + { + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i], 16); + aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 1], 16); + aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 2], 16); + aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 3], 16); + } + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); +} + +#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) + diff --git a/src/modules/audio_processing/aecm/main/source/aecm_core.h b/src/modules/audio_processing/aecm/main/source/aecm_core.h index 1050dee16b..e431c71af0 100644 --- a/src/modules/audio_processing/aecm/main/source/aecm_core.h +++ b/src/modules/audio_processing/aecm/main/source/aecm_core.h @@ -99,6 +99,11 @@ extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[]; +typedef struct { + WebRtc_Word16 real; + WebRtc_Word16 imag; +} complex16_t; + typedef struct { int farBufWritePos; @@ -142,9 +147,9 @@ typedef struct WebRtc_Word16 channelStored_buf[PART_LEN1 + 8]; WebRtc_Word16 channelAdapt16_buf[PART_LEN1 + 8]; WebRtc_Word32 channelAdapt32_buf[PART_LEN1 + 8]; - WebRtc_Word16 xBuf_buf[PART_LEN2 + 8]; // farend - WebRtc_Word16 dBufClean_buf[PART_LEN2 + 8]; // nearend - WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 8]; // nearend + WebRtc_Word16 xBuf_buf[PART_LEN2 + 16]; // farend + WebRtc_Word16 dBufClean_buf[PART_LEN2 + 16]; // nearend + WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 16]; // nearend WebRtc_Word16 outBuf_buf[PART_LEN + 8]; // Pointers to the above buffers @@ -326,9 +331,7 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far // Some internal functions shared by ARM NEON and generic C code: // 
-WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t * aecm); - -void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, +void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm, const WebRtc_UWord16* far_spectrum, WebRtc_Word32* echoEst, WebRtc_UWord32* far_energy, @@ -341,8 +344,15 @@ void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm); -void WebRtcAecm_PrepareFft(WebRtc_Word16* fft, - const WebRtc_Word16* time_signal, - int time_signal_scaling); +void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft, + const WebRtc_Word16* time_signal, + complex16_t* freq_signal, + int time_signal_scaling); + +void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm, + WebRtc_Word16* fft, + complex16_t* efw, + WebRtc_Word16* output, + const WebRtc_Word16* nearendClean); #endif diff --git a/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c b/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c index cfac49ad67..86ced1ed3b 100644 --- a/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c +++ b/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c @@ -13,14 +13,9 @@ #include #include -#include -#include "aecm_delay_estimator.h" -#include "echo_control_mobile.h" -#include "ring_buffer.h" -#include "typedefs.h" -// Square root of Hanning window in Q14 +// Square root of Hanning window in Q14. 
static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = { 16384, 16373, 16354, 16325, 16286, 16237, 16179, 16111, @@ -40,9 +35,172 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) 1594, 1196, 798, 399 }; -void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, +void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft, + const WebRtc_Word16* time_signal, + complex16_t* freq_signal, + int time_signal_scaling) +{ + int i, j; + + int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); + __asm__("vmov.i16 d21, #0" ::: "d21"); + + for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8) + { + int16x4_t tmp16x4_0; + int16x4_t tmp16x4_1; + int32x4_t tmp32x4_0; + + /* Window near end */ + // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i] + // << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i])); + tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); + + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); + tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); + + __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); + __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10"); + + // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + // (time_signal[PART_LEN + i] << time_signal_scaling), + // WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN])); + tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); + + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); + tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); + + __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); + __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10"); + } + + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); + + // Take only the 
first PART_LEN2 samples, and switch the sign of the imaginary part. vld2 de-interleaves: q10 = real parts, q11 = imaginary parts. + for(i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) + { + __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); + __asm__("vneg.s16 d22, d22" : : : "q11"); + __asm__("vneg.s16 d23, d23" : : : "q11"); + __asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&freq_signal[i].real): "q10", "q11"); + } +} + +void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm, + WebRtc_Word16* fft, + complex16_t* efw, + WebRtc_Word16* output, + const WebRtc_Word16* nearendClean) +{ + int i, j, outCFFT; + WebRtc_Word32 tmp32no1; + + // Synthesis + for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8) + { + // We overwrite two more elements in fft[], but it's ok. + __asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10"); + __asm__("vmov q11, q10" : : : "q10", "q11"); + + __asm__("vneg.s16 d23, d23" : : : "q11"); + __asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11"); + + __asm__("vrev64.16 q10, q10" : : : "q10"); + __asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10"); + } + + fft[PART_LEN2] = efw[PART_LEN].real; + fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; + + // Inverse FFT, result should be scaled with outCFFT. + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); + + // Take only the real values and scale with outCFFT. 
+ for (i = 0, j = 0; i < PART_LEN2; i += 8, j+= 16) + { + __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); + __asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10"); + } + + int32x4_t tmp32x4_2; + __asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32) + (outCFFT - aecm->dfaCleanQDomain))); + for (i = 0; i < PART_LEN; i += 4) + { + int16x4_t tmp16x4_0; + int16x4_t tmp16x4_1; + int32x4_t tmp32x4_0; + int32x4_t tmp32x4_1; + + // fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + // fft[i], WebRtcAecm_kSqrtHanning[i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i])); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); + __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); + __asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); + + // tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], + // outCFFT - aecm->dfaCleanQDomain); + __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); + + // fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, + // tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN); + // output[i] = fft[i]; + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); + __asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); + __asm__("vadd.i32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_1)); + __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); + __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i])); + __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); + + // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( + // fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i])); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); + __asm__("vmull.s16 %q0, %P1, %P2" : 
"=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); + __asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); + + // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain); + __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); + // outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( + // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); + __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); + __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); + } + + // Copy the current block to the old position (outBuf is shifted elsewhere). Each vld1/vst1 pair moves 16 samples through q10-q11. + for (i = 0; i < PART_LEN; i += 16) + { + __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->xBuf[i + PART_LEN]) : "q10", "q11"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10", "q11"); + } + for (i = 0; i < PART_LEN; i += 16) + { + __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10", "q11"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->dBufNoisy[i]): "q10", "q11"); + } + if (nearendClean != NULL) { + for (i = 0; i < PART_LEN; i += 16) + { + __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->dBufClean[i + PART_LEN]) : "q10", "q11"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->dBufClean[i]): "q10", "q11"); + } + } +} + +void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm, const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echoEst, + WebRtc_Word32* echo_est, WebRtc_UWord32* far_energy, WebRtc_UWord32* echo_energy_adapt, WebRtc_UWord32* echo_energy_stored) @@ -54,29 +212,31 @@ void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, register WebRtc_UWord32 echo_energy_adapt_r; uint32x4_t tmp32x4_0; - __asm__("vmov.i32 q14, #0" : : : "q14"); //far_energy - __asm__("vmov.i32 q8, #0" : : : "q8"); //echo_energy_stored - 
__asm__("vmov.i32 q9, #0" : : : "q9"); //echo_energy_adapt + __asm__("vmov.i32 q14, #0" : : : 
"q14"); // far_energy + __asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored + __asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt for(i = 0; i < PART_LEN -7; i += 8) { - //far_energy += (WebRtc_UWord32)(far_spectrum[i]); + // far_energy += (WebRtc_UWord32)(far_spectrum[i]); __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); - // Get estimated echo energies for adaptive channel and stored channel - //echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + // Get estimated echo energies for adaptive channel and stored channel. + // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echoEst[i]): "q10", "q11"); + __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]): + "q10", "q11"); - //echo_energy_stored += (WebRtc_UWord32)echoEst[i]; + // echo_energy_stored += (WebRtc_UWord32)echoEst[i]; __asm__("vadd.u32 q8, q10" : : : "q10", "q8"); __asm__("vadd.u32 q8, q11" : : : "q11", "q8"); - //echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], far_spectrum[i]); + // echo_energy_adapt += WEBRTC_SPL_UMUL_16_16( + // aecm->channelAdapt16[i], far_spectrum[i]); __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); @@ -96,9 +256,9 @@ void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm, __asm__("vpadd.u32 d16, d16" : : : "q8"); __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); - // Get estimated echo energies for 
adaptive channel and stored channel - echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echoEst[i]; + // Get estimated echo energies for adaptive channel and stored channel. + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i]; *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]); *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16( aecm->channelAdapt16[i], far_spectrum[i]); @@ -128,7 +288,7 @@ void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); } -void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm) +void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm) { int i; @@ -151,45 +311,4 @@ void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm) (WebRtc_Word32)aecm->channelStored[i], 16); } -void WebRtcAecm_PrepareFft(WebRtc_Word16* fft, - const WebRtc_Word16* time_signal, - int time_signal_scaling) -{ - int i, j; - int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); - __asm__("vmov.i16 d21, #0" ::: "d21"); - - for(i = 0, j = 0; i < PART_LEN-3; i += 4, j += 8) - { - int16x4_t tmp16x4_0; - int16x4_t tmp16x4_1; - int32x4_t tmp32x4_0; - - /* Window near end */ - // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i] - // << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i])); - tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); - - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); - tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); - - __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); - __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10"); - - // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - // 
(time_signal[PART_LEN + i] << time_signal_scaling), - // WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[PART_LEN + i])); - tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); - - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); - tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); - - __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); - __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10"); - } -} - #endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON) -