From f9e6cc2e277ae29d80cfc1a22209589576ed3cb6 Mon Sep 17 00:00:00 2001 From: "kma@webrtc.org" Date: Fri, 21 Sep 2012 18:51:12 +0000 Subject: [PATCH] Framework for using real FFT in ARMv7 and Neon platforms. Review URL: https://webrtc-codereview.appspot.com/785004 git-svn-id: http://webrtc.googlecode.com/svn/trunk@2803 4adac7df-926f-26a2-2b94-8c16560cd09d --- src/common_audio/signal_processing/Android.mk | 1 + .../signal_processing/include/real_fft.h | 61 +++- src/common_audio/signal_processing/real_fft.c | 38 ++- .../signal_processing/real_fft_unittest.cc | 46 +-- src/common_audio/signal_processing/spl_init.c | 8 +- src/modules/audio_processing/aecm/aecm_core.c | 101 +++--- src/modules/audio_processing/aecm/aecm_core.h | 6 +- .../audio_processing/aecm/aecm_core_neon.c | 307 ++++++++++-------- .../audio_processing/ns/noise_suppression_x.c | 17 +- src/modules/audio_processing/ns/nsx_core.c | 59 ++-- src/modules/audio_processing/ns/nsx_core.h | 1 + 11 files changed, 399 insertions(+), 246 deletions(-) diff --git a/src/common_audio/signal_processing/Android.mk b/src/common_audio/signal_processing/Android.mk index 1bbcc7aa3d..a0ebd6d63b 100644 --- a/src/common_audio/signal_processing/Android.mk +++ b/src/common_audio/signal_processing/Android.mk @@ -35,6 +35,7 @@ LOCAL_SRC_FILES := \ lpc_to_refl_coef.c \ min_max_operations.c \ randomization_functions.c \ + real_fft.c \ refl_coef_to_lpc.c \ resample.c \ resample_48khz.c \ diff --git a/src/common_audio/signal_processing/include/real_fft.h b/src/common_audio/signal_processing/include/real_fft.h index 129eb8c21d..4028b41416 100644 --- a/src/common_audio/signal_processing/include/real_fft.h +++ b/src/common_audio/signal_processing/include/real_fft.h @@ -19,16 +19,65 @@ struct RealFFT; extern "C" { #endif -// TODO(andrew): documentation. +typedef int (*RealForwardFFT)(struct RealFFT* self, + const int16_t* data_in, + int16_t* data_out); +typedef int (*RealInverseFFT)(struct RealFFT* self, + const int16_t* data_in, + int16_t* data_out); + +extern RealForwardFFT WebRtcSpl_RealForwardFFT; +extern RealInverseFFT WebRtcSpl_RealInverseFFT; + struct RealFFT* WebRtcSpl_CreateRealFFT(int order); void WebRtcSpl_FreeRealFFT(struct RealFFT* self); -// TODO(andrew): This currently functions exactly the same as ComplexFFT(). -// Manage the surrounding operations (ComplexBitReverse etc) here instead. +// TODO(kma): Implement FFT functions for real signals. + +// Compute the forward FFT for a complex signal of length 2^order. +// Input Arguments: +// self - pointer to preallocated and initialized FFT specification structure. +// data_in - the input signal. // -// data must be of length 2^(order + 1) to hold the complex output. -int WebRtcSpl_RealForwardFFT(struct RealFFT* self, int16_t* data); -int WebRtcSpl_RealInverseFFT(struct RealFFT* self, int16_t* data); +// Output Arguments: +// data_out - the output signal; must be different to data_in. +// +// Return Value: +// 0 - FFT calculation is successful. +// -1 - Error +// +int WebRtcSpl_RealForwardFFTC(struct RealFFT* self, + const int16_t* data_in, + int16_t* data_out); + +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +int WebRtcSpl_RealForwardFFTNeon(struct RealFFT* self, + const int16_t* data_in, + int16_t* data_out); +#endif + +// Compute the inverse FFT for a complex signal of length 2^order. +// Input Arguments: +// self - pointer to preallocated and initialized FFT specification structure. +// data_in - the input signal. +// +// Output Arguments: +// data_out - the output signal; must be different to data_in. +// +// Return Value: +// 0 or a positive number - a value that the elements in the |data_out| should +// be shifted left with in order to get correct +// physical values. +// -1 - Error +int WebRtcSpl_RealInverseFFTC(struct RealFFT* self, + const int16_t* data_in, + int16_t* data_out); + +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +int WebRtcSpl_RealInverseFFTNeon(struct RealFFT* self, + const int16_t* data_in, + int16_t* data_out); +#endif #ifdef __cplusplus } diff --git a/src/common_audio/signal_processing/real_fft.c b/src/common_audio/signal_processing/real_fft.c index e87654e1d1..8f32418d6c 100644 --- a/src/common_audio/signal_processing/real_fft.c +++ b/src/common_audio/signal_processing/real_fft.c @@ -20,12 +20,15 @@ struct RealFFT { struct RealFFT* WebRtcSpl_CreateRealFFT(int order) { struct RealFFT* self = NULL; + // This constraint comes from ComplexFFT(). if (order > 10 || order < 0) { return NULL; } + self = malloc(sizeof(struct RealFFT)); self->order = order; + return self; } @@ -33,10 +36,37 @@ void WebRtcSpl_FreeRealFFT(struct RealFFT* self) { free(self); } -int WebRtcSpl_RealForwardFFT(struct RealFFT* self, int16_t* data) { - return WebRtcSpl_ComplexFFT(data, self->order, 1); +// WebRtcSpl_ComplexFFT and WebRtcSpl_ComplexIFFT use in-place algorithm, +// so copy data from data_in to data_out in the next two functions. + +int WebRtcSpl_RealForwardFFTC(struct RealFFT* self, + const int16_t* data_in, + int16_t* data_out) { + memcpy(data_out, data_in, sizeof(int16_t) * (1 << (self->order + 1))); + WebRtcSpl_ComplexBitReverse(data_out, self->order); + return WebRtcSpl_ComplexFFT(data_out, self->order, 1); } -int WebRtcSpl_RealInverseFFT(struct RealFFT* self, int16_t* data) { - return WebRtcSpl_ComplexIFFT(data, self->order, 1); +int WebRtcSpl_RealInverseFFTC(struct RealFFT* self, + const int16_t* data_in, + int16_t* data_out) { + memcpy(data_out, data_in, sizeof(int16_t) * (1 << (self->order + 1))); + WebRtcSpl_ComplexBitReverse(data_out, self->order); + return WebRtcSpl_ComplexIFFT(data_out, self->order, 1); } + +#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON) +// TODO(kma): Replace the following function bodies into optimized functions +// for ARM Neon. +int WebRtcSpl_RealForwardFFTNeon(struct RealFFT* self, + const int16_t* data_in, + int16_t* data_out) { + return WebRtcSpl_RealForwardFFTC(self, data_in, data_out); +} + +int WebRtcSpl_RealInverseFFTNeon(struct RealFFT* self, + const int16_t* data_in, + int16_t* data_out) { + return WebRtcSpl_RealInverseFFTC(self, data_in, data_out); +} +#endif diff --git a/src/common_audio/signal_processing/real_fft_unittest.cc b/src/common_audio/signal_processing/real_fft_unittest.cc index d25079702d..a37e732b39 100644 --- a/src/common_audio/signal_processing/real_fft_unittest.cc +++ b/src/common_audio/signal_processing/real_fft_unittest.cc @@ -17,11 +17,13 @@ namespace webrtc { namespace { -const int kOrder = 3; +const int kOrder = 4; const int kLength = 1 << (kOrder + 1); // +1 to hold complex data. const int16_t kRefData[kLength] = { - 11739, -6848, -8688, 31980, -30295, 25242, 27085, 19410, -26299, -15607, - -10791, 11778, -23819, 14498, -25772, 10076 + 11739, 6848, -8688, 31980, -30295, 25242, 27085, 19410, + -26299, 15607, -10791, 11778, -23819, 14498, -25772, 10076, + 1173, 6848, -8688, 31980, -30295, 2522, 27085, 19410, + -2629, 5607, -3, 1178, -23819, 1498, -25772, 10076 }; class RealFFTTest : public ::testing::Test { @@ -38,49 +40,35 @@ TEST_F(RealFFTTest, CreateFailsOnBadInput) { EXPECT_TRUE(fft == NULL); } -// TODO(andrew): Look more into why this was failing. -TEST_F(RealFFTTest, DISABLED_TransformIsInvertible) { - int16_t data[kLength] = {0}; - memcpy(data, kRefData, sizeof(kRefData)); - - RealFFT* fft = NULL; - fft = WebRtcSpl_CreateRealFFT(kOrder); - EXPECT_TRUE(fft != NULL); - - EXPECT_EQ(0, WebRtcSpl_RealForwardFFT(fft, data)); - int scale = WebRtcSpl_RealInverseFFT(fft, data); - - EXPECT_GE(scale, 0); - for (int i = 0; i < kLength; i++) { - EXPECT_EQ(data[i] << scale, kRefData[i]); - } - WebRtcSpl_FreeRealFFT(fft); -} - // TODO(andrew): This won't always be the case, but verifies the current code // at least. TEST_F(RealFFTTest, RealAndComplexAreIdentical) { int16_t real_data[kLength] = {0}; + int16_t real_data_out[kLength] = {0}; int16_t complex_data[kLength] = {0}; memcpy(real_data, kRefData, sizeof(kRefData)); memcpy(complex_data, kRefData, sizeof(kRefData)); - RealFFT* fft = NULL; - fft = WebRtcSpl_CreateRealFFT(kOrder); + RealFFT* fft = WebRtcSpl_CreateRealFFT(kOrder); EXPECT_TRUE(fft != NULL); - EXPECT_EQ(0, WebRtcSpl_RealForwardFFT(fft, real_data)); + EXPECT_EQ(0, WebRtcSpl_RealForwardFFT(fft, real_data, real_data_out)); + WebRtcSpl_ComplexBitReverse(complex_data, kOrder); EXPECT_EQ(0, WebRtcSpl_ComplexFFT(complex_data, kOrder, 1)); + for (int i = 0; i < kLength; i++) { - EXPECT_EQ(real_data[i], complex_data[i]); + EXPECT_EQ(real_data_out[i], complex_data[i]); } - int real_scale = WebRtcSpl_RealInverseFFT(fft, real_data); - int complex_scale = WebRtcSpl_ComplexIFFT(complex_data, kOrder, 1); + memcpy(complex_data, kRefData, sizeof(kRefData)); + + int real_scale = WebRtcSpl_RealInverseFFT(fft, real_data, real_data_out); EXPECT_GE(real_scale, 0); + WebRtcSpl_ComplexBitReverse(complex_data, kOrder); + int complex_scale = WebRtcSpl_ComplexIFFT(complex_data, kOrder, 1); EXPECT_EQ(real_scale, complex_scale); for (int i = 0; i < kLength; i++) { - EXPECT_EQ(real_data[i], complex_data[i]); + EXPECT_EQ(real_data_out[i], complex_data[i]); } WebRtcSpl_FreeRealFFT(fft); } diff --git a/src/common_audio/signal_processing/spl_init.c b/src/common_audio/signal_processing/spl_init.c index b5c7709799..1021d1b298 100644 --- a/src/common_audio/signal_processing/spl_init.c +++ b/src/common_audio/signal_processing/spl_init.c @@ -18,6 +18,7 @@ * (AEC, NS, codecs etc.). */ +#include "common_audio/signal_processing/include/real_fft.h" #include "common_audio/signal_processing/include/signal_processing_library.h" #include "system_wrappers/interface/cpu_features_wrapper.h" @@ -31,6 +32,8 @@ MinValueW32 WebRtcSpl_MinValueW32; CrossCorrelation WebRtcSpl_CrossCorrelation; DownsampleFast WebRtcSpl_DownsampleFast; ScaleAndAddVectorsWithRound WebRtcSpl_ScaleAndAddVectorsWithRound; +RealForwardFFT WebRtcSpl_RealForwardFFT; +RealInverseFFT WebRtcSpl_RealInverseFFT; /* Initialize function pointers to the generic C version. */ static void InitPointersToC() { @@ -44,6 +47,8 @@ static void InitPointersToC() { WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFastC; WebRtcSpl_ScaleAndAddVectorsWithRound = WebRtcSpl_ScaleAndAddVectorsWithRoundC; + WebRtcSpl_RealForwardFFT = WebRtcSpl_RealForwardFFTC; + WebRtcSpl_RealInverseFFT = WebRtcSpl_RealInverseFFTC; } #if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON) @@ -59,6 +64,8 @@ static void InitPointersToNeon() { WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFastNeon; WebRtcSpl_ScaleAndAddVectorsWithRound = WebRtcSpl_ScaleAndAddVectorsWithRoundNeon; + WebRtcSpl_RealForwardFFT = WebRtcSpl_RealForwardFFTNeon; + WebRtcSpl_RealInverseFFT = WebRtcSpl_RealInverseFFTNeon; } #endif @@ -76,7 +83,6 @@ static void InitFunctionPointers(void) { #endif /* WEBRTC_DETECT_ARM_NEON */ } - #if defined(WEBRTC_POSIX) #include diff --git a/src/modules/audio_processing/aecm/aecm_core.c b/src/modules/audio_processing/aecm/aecm_core.c index d5c1b48ee2..6559e481d2 100644 --- a/src/modules/audio_processing/aecm/aecm_core.c +++ b/src/modules/audio_processing/aecm/aecm_core.c @@ -14,6 +14,7 @@ #include #include +#include "common_audio/signal_processing/include/real_fft.h" #include "cpu_features_wrapper.h" #include "delay_estimator_wrapper.h" #include "echo_control_mobile.h" @@ -313,6 +314,13 @@ int WebRtcAecm_CreateCore(AecmCore_t **aecmInst) return -1; } + aecm->real_fft = WebRtcSpl_CreateRealFFT(PART_LEN_SHIFT); + if (aecm->real_fft == NULL) { + WebRtcAecm_FreeCore(aecm); + aecm = NULL; + return -1; + } + // Init some aecm pointers. 16 and 32 byte alignment is only necessary // for Neon code currently. aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 31) & ~ 31); @@ -350,7 +358,8 @@ void WebRtcAecm_InitEchoPathCore(AecmCore_t* aecm, const WebRtc_Word16* echo_pat aecm->mseChannelCount = 0; } -static void WindowAndFFTC(WebRtc_Word16* fft, +static void WindowAndFFTC(AecmCore_t* aecm, + WebRtc_Word16* fft, const WebRtc_Word16* time_signal, complex16_t* freq_signal, int time_signal_scaling) @@ -375,31 +384,14 @@ static void WindowAndFFTC(WebRtc_Word16* fft, // initialized the array with all zeros } - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); - - // Take only the first PART_LEN2 samples - for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2) - { - freq_signal[i].real = fft[j]; - - // The imaginary part has to switch sign - freq_signal[i].imag = - fft[j+1]; + // Do forward FFT, then take only the first PART_LEN complex samples, + // and change signs of the imaginary parts. + WebRtcSpl_RealForwardFFT(aecm->real_fft, fft, (int16_t*)freq_signal); + for (i = 0; i < PART_LEN; i++) { + freq_signal[i].imag = -freq_signal[i].imag; } } -// Initialize function pointers for ARM Neon platform. -#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON) -static void WebRtcAecm_InitNeon(void) -{ - WebRtcAecm_WindowAndFFT = WebRtcAecm_WindowAndFFTNeon; - WebRtcAecm_InverseFFTAndWindow = WebRtcAecm_InverseFFTAndWindowNeon; - WebRtcAecm_CalcLinearEnergies = WebRtcAecm_CalcLinearEnergiesNeon; - WebRtcAecm_StoreAdaptiveChannel = WebRtcAecm_StoreAdaptiveChannelNeon; - WebRtcAecm_ResetAdaptiveChannel = WebRtcAecm_ResetAdaptiveChannelNeon; -} -#endif - static void InverseFFTAndWindowC(AecmCore_t* aecm, WebRtc_Word16* fft, complex16_t* efw, @@ -428,32 +420,23 @@ static void InverseFFTAndWindowC(AecmCore_t* aecm, fft[PART_LEN2] = efw[PART_LEN].real; fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; - // inverse FFT, result should be scaled with outCFFT - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); - - //take only the real values and scale with outCFFT - for (i = 0; i < PART_LEN2; i++) - { - j = WEBRTC_SPL_LSHIFT_W32(i, 1); - fft[i] = fft[j]; - } - - for (i = 0; i < PART_LEN; i++) - { - fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - fft[i], + // Inverse FFT. Then take only the real values, and keep outCFFT + // to scale the samples in the next block. + outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw); + for (i = 0; i < PART_LEN; i++) { + efw[i].real = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + efw[i].real, WebRtcAecm_kSqrtHanning[i], 14); - tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], + tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)efw[i].real, outCFFT - aecm->dfaCleanQDomain); - fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, + efw[i].real = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, tmp32no1 + aecm->outBuf[i], WEBRTC_SPL_WORD16_MIN); - output[i] = fft[i]; + output[i] = efw[i].real; tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( - fft[PART_LEN + i], + efw[PART_LEN + i].real, WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, @@ -542,6 +525,19 @@ static void ResetAdaptiveChannelC(AecmCore_t* aecm) aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); } +// Initialize function pointers for ARM Neon platform. +#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON) +static void WebRtcAecm_InitNeon(void) +{ + // TODO(kma): Check why WebRtcAecm_InverseFFTAndWindowNeon() doesn't work. + WebRtcAecm_WindowAndFFT = WebRtcAecm_WindowAndFFTNeon; + WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC; + WebRtcAecm_StoreAdaptiveChannel = WebRtcAecm_StoreAdaptiveChannelNeon; + WebRtcAecm_ResetAdaptiveChannel = WebRtcAecm_ResetAdaptiveChannelNeon; + WebRtcAecm_CalcLinearEnergies = WebRtcAecm_CalcLinearEnergiesNeon; +} +#endif + // WebRtcAecm_InitCore(...) // // This function initializes the AECM instant created with WebRtcAecm_CreateCore(...) @@ -704,6 +700,8 @@ int WebRtcAecm_FreeCore(AecmCore_t *aecm) WebRtc_FreeBuffer(aecm->outFrameBuf); WebRtc_FreeDelayEstimator(aecm->delay_estimator); + WebRtcSpl_FreeRealFFT(aecm->real_fft); + free(aecm); return 0; @@ -1376,7 +1374,8 @@ static WebRtc_Word16 CalcSuppressionGain(AecmCore_t * const aecm) // the frequency domain array // return value The Q-domain of current frequency values // -static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, +static int TimeToFrequencyDomain(AecmCore_t* aecm, + const WebRtc_Word16* time_signal, complex16_t* freq_signal, WebRtc_UWord16* freq_signal_abs, WebRtc_UWord32* freq_signal_sum_abs) @@ -1407,12 +1406,11 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal, time_signal_scaling = WebRtcSpl_NormW16(tmp16no1); #endif - WebRtcAecm_WindowAndFFT(fft, time_signal, freq_signal, time_signal_scaling); + WebRtcAecm_WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling); // Extract imaginary and real part, calculate the magnitude for all frequency bins freq_signal[0].imag = 0; freq_signal[PART_LEN].imag = 0; - freq_signal[PART_LEN].real = fft[PART_LEN2]; freq_signal_abs[0] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16( freq_signal[0].real); freq_signal_abs[PART_LEN] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16( @@ -1530,8 +1528,8 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, // TODO (kma): define fft with complex16_t. WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 16]; // +2 to make a loop safe. WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8]; - WebRtc_Word32 dfw_buf[PART_LEN1 + 8]; - WebRtc_Word32 efw_buf[PART_LEN1 + 8]; + WebRtc_Word32 dfw_buf[PART_LEN2 + 8]; + WebRtc_Word32 efw_buf[PART_LEN2 + 8]; WebRtc_Word16* fft = (WebRtc_Word16*) (((uintptr_t) fft_buf + 31) & ~ 31); WebRtc_Word32* echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31); @@ -1575,13 +1573,15 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, } // Transform far end signal from time domain to frequency domain. - far_q = TimeToFrequencyDomain(aecm->xBuf, + far_q = TimeToFrequencyDomain(aecm, + aecm->xBuf, dfw, xfa, &xfaSum); // Transform noisy near end signal from time domain to frequency domain. - zerosDBufNoisy = TimeToFrequencyDomain(aecm->dBufNoisy, + zerosDBufNoisy = TimeToFrequencyDomain(aecm, + aecm->dBufNoisy, dfw, dfaNoisy, &dfaNoisySum); @@ -1598,7 +1598,8 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, } else { // Transform clean near end signal from time domain to frequency domain. - zerosDBufClean = TimeToFrequencyDomain(aecm->dBufClean, + zerosDBufClean = TimeToFrequencyDomain(aecm, + aecm->dBufClean, dfw, dfaClean, &dfaCleanSum); diff --git a/src/modules/audio_processing/aecm/aecm_core.h b/src/modules/audio_processing/aecm/aecm_core.h index 8161a8cbdc..d2b77da3ac 100644 --- a/src/modules/audio_processing/aecm/aecm_core.h +++ b/src/modules/audio_processing/aecm/aecm_core.h @@ -117,6 +117,8 @@ typedef struct WebRtc_Word16 supGainErrParamDiffAB; WebRtc_Word16 supGainErrParamDiffBD; + struct RealFFT* real_fft; + #ifdef AEC_DEBUG FILE *farFile; FILE *nearFile; @@ -276,6 +278,7 @@ typedef void (*ResetAdaptiveChannel)(AecmCore_t* aecm); extern ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel; typedef void (*WindowAndFFT)( + AecmCore_t* aecm, WebRtc_Word16* fft, const WebRtc_Word16* time_signal, complex16_t* freq_signal, @@ -293,7 +296,8 @@ extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow; // and defined as static in file aecm_core.c, while those for ARM Neon platforms // are declared below and defined in file aecm_core_neon.s. #if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON) -void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft, +void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm, + WebRtc_Word16* fft, const WebRtc_Word16* time_signal, complex16_t* freq_signal, int time_signal_scaling); diff --git a/src/modules/audio_processing/aecm/aecm_core_neon.c b/src/modules/audio_processing/aecm/aecm_core_neon.c index c06a678f5b..b870d2ad57 100644 --- a/src/modules/audio_processing/aecm/aecm_core_neon.c +++ b/src/modules/audio_processing/aecm/aecm_core_neon.c @@ -13,6 +13,10 @@ #include #include +#include "common_audio/signal_processing/include/real_fft.h" + +// TODO(kma): Re-write the corresponding assembly file, the offset +// generating script and makefile, to replace these C functions. // Square root of Hanning window in Q14. static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = { @@ -34,55 +38,74 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = 1594, 1196, 798, 399 }; -void WebRtcAecm_WindowAndFFTNeon(WebRtc_Word16* fft, +void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm, + WebRtc_Word16* fft, const WebRtc_Word16* time_signal, complex16_t* freq_signal, int time_signal_scaling) { - int i, j; + int i = 0; + const int16_t* p_time_signal = time_signal; + const int16_t* p_time_signal_offset = &time_signal[PART_LEN]; + const int16_t* p_hanning = WebRtcAecm_kSqrtHanning; + const int16_t* p_hanning_reversed = kSqrtHanningReversed; + int16_t* p_fft = fft; + int16_t* p_fft_offset = &fft[PART_LEN2]; - int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); - __asm__("vmov.i16 d21, #0" ::: "d21"); + assert((uintptr_t)p_time_signal % 8 == 0); + assert((uintptr_t)freq_signal % 32 == 0); + assert((uintptr_t)p_hanning % 8 == 0); + assert((uintptr_t)p_fft % 16 == 0); - for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) { - int16x4_t tmp16x4_0; - int16x4_t tmp16x4_1; - int32x4_t tmp32x4_0; + __asm __volatile( + "vdup.16 d16, %0\n\t" + "vmov.i16 d21, #0\n\t" + "vmov.i16 d27, #0\n\t" + : + :"r"(time_signal_scaling) + : "d16", "d21", "d27" + ); - /* Window near end */ - // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i] - // << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i])); - tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); - - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); - tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); - - __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); - __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10"); - - // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - // (time_signal[PART_LEN + i] << time_signal_scaling), - // WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN])); - tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); - - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); - tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); - - __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); - __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10"); + for (i = 0; i < PART_LEN; i += 4) { + __asm __volatile( + "vld1.16 d0, [%[p_time_signal], :64]!\n\t" + "vld1.16 d22, [%[p_time_signal_offset], :64]!\n\t" + "vld1.16 d17, [%[p_hanning], :64]!\n\t" + "vld1.16 d23, [%[p_hanning_reversed], :64]!\n\t" + "vshl.s16 d18, d0, d16\n\t" + "vshl.s16 d22, d22, d16\n\t" + "vmull.s16 q9, d18, d17\n\t" + "vmull.s16 q12, d22, d23\n\t" + "vshrn.i32 d20, q9, #14\n\t" + "vshrn.i32 d26, q12, #14\n\t" + "vst2.16 {d20, d21}, [%[p_fft], :128]!\n\t" + "vst2.16 {d26, d27}, [%[p_fft_offset], :128]!\n\t" + :[p_time_signal]"+r"(p_time_signal), + [p_time_signal_offset]"+r"(p_time_signal_offset), + [p_hanning]"+r"(p_hanning), + [p_hanning_reversed]"+r"(p_hanning_reversed), + [p_fft]"+r"(p_fft), + [p_fft_offset]"+r"(p_fft_offset) + : + :"d0", "d16", "d17", "d18", "d19", "d20", "d21", + "d22", "d23", "d24", "d25", "d26", "d27" + ); } - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); + // Do forward FFT, then take only the first PART_LEN complex samples, + // and change signs of the imaginary parts. + WebRtcSpl_RealForwardFFT(aecm->real_fft, (int16_t*)fft, + (int16_t*)freq_signal); - // Take only the first PART_LEN2 samples, and switch the sign of the imaginary part. - for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) { - __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); - __asm__("vneg.s16 d22, d22" : : : "q10"); - __asm__("vneg.s16 d23, d23" : : : "q11"); - __asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : : - "r"(&freq_signal[i].real): "q10", "q11"); + for (i = 0; i < PART_LEN; i += 8) { + __asm __volatile( + "vld2.16 {d20, d21, d22, d23}, [%[freq_signal], :256]\n\t" + "vneg.s16 d22, d22\n\t" + "vneg.s16 d23, d23\n\t" + "vst2.16 {d20, d21, d22, d23}, [%[freq_signal], :256]!\n\t" + :[freq_signal]"+r"(freq_signal) + : + : "d20", "d21", "d22", "d23" + ); } } @@ -93,34 +116,47 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, const WebRtc_Word16* nearendClean) { int i, j, outCFFT; + assert((uintptr_t)efw % 32 == 0); + assert((uintptr_t)fft % 16 == 0); + assert((uintptr_t)output% 8 == 0); + assert((uintptr_t)WebRtcAecm_kSqrtHanning % 8 == 0); + assert((uintptr_t)kSqrtHanningReversed % 8 == 0); + assert((uintptr_t)(aecm->outBuf) % 8 == 0); + assert((uintptr_t)(aecm->xBuf) % 32 == 0); + assert((uintptr_t)(aecm->dBufNoisy) % 32 == 0); + assert((uintptr_t)(aecm->dBufClean) % 32 == 0); + // Synthesis + complex16_t* p_efw = efw; + int16_t* p_fft = fft; + int16_t* p_fft_offset = &fft[PART_LEN4 - 6]; + for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) { // We overwrite two more elements in fft[], but it's ok. - __asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10"); - __asm__("vmov q11, q10" : : : "q10", "q11"); - - __asm__("vneg.s16 d23, d23" : : : "q11"); - __asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11"); - - __asm__("vrev64.16 q10, q10" : : : "q10"); - __asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10"); + __asm __volatile( + "vld2.16 {q10}, [%[p_efw], :128]!\n\t" + "vmov q11, q10\n\t" + "vneg.s16 d23, d23\n\t" + "vst2.16 {d22, d23}, [%[p_fft], :128]!\n\t" + "vrev64.16 q10, q10\n\t" + "vst2.16 {q10}, [%[p_fft_offset], %[offset]]\n\t" + :[p_efw]"+r"(p_efw), + [p_fft]"+r"(p_fft), + [p_fft_offset]"+r"(p_fft_offset) + :[offset]"r"(-16) + :"d20", "d21", "d22", "d23" + ); } fft[PART_LEN2] = efw[PART_LEN].real; fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; - // Inverse FFT, result should be scaled with outCFFT. - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); - - // Take only the real values and scale with outCFFT. - for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) { - __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); - __asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10"); - } + // Inverse FFT. Then take only the real values, and keep outCFFT + // to scale the samples. + outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw); int32x4_t tmp32x4_2; - __asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32) + __asm __volatile("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32) (outCFFT - aecm->dfaCleanQDomain))); for (i = 0; i < PART_LEN; i += 4) { int16x4_t tmp16x4_0; @@ -128,59 +164,59 @@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, int32x4_t tmp32x4_0; int32x4_t tmp32x4_1; - // fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - // fft[i], WebRtcAecm_kSqrtHanning[i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i])); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); - __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); - __asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); + //efw[i].real = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + // efw[i].real, WebRtcAecm_kSqrtHanning[i], 14); + __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&efw[i].real)); + __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); + __asm __volatile("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); + __asm __volatile("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); - // tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], + //tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)efw[i].real, // outCFFT - aecm->dfaCleanQDomain); - __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); + __asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); - // fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, - // tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN); - // output[i] = fft[i]; - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); - __asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); - __asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1)); - __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); - __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i])); - __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); + //efw[i].real = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, + // tmp32no1 + aecm->outBuf[i], WEBRTC_SPL_WORD16_MIN); + // output[i] = efw[i].real; + __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); + __asm __volatile("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); + __asm __volatile("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1)); + __asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); + __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&efw[i].real)); + __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( - // fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i])); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); - __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); - __asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); + // efw[PART_LEN + i].real, WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); + __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&efw[PART_LEN + i].real)); + __asm __volatile("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); + __asm __volatile("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); + __asm __volatile("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain); - __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); - // outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( - // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); - __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); - __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); + __asm __volatile("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); + // aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( + // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); + __asm __volatile("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); + __asm __volatile("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); } // Copy the current block to the old position (outBuf is shifted elsewhere). for (i = 0; i < PART_LEN; i += 16) { - __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i + PART_LEN]) : "q10"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10"); + __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10"); } for (i = 0; i < PART_LEN; i += 16) { - __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->dBufNoisy[i]): "q10"); } if (nearendClean != NULL) { for (i = 0; i < PART_LEN; i += 16) { - __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm __volatile("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->dBufClean[i + PART_LEN]) : "q10"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->dBufClean[i]): "q10"); } } @@ -198,48 +234,54 @@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, register WebRtc_UWord32 echo_energy_stored_r; register WebRtc_UWord32 echo_energy_adapt_r; - __asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy - __asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored - __asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt + assert((uintptr_t)echo_est % 32 == 0); + assert((uintptr_t)(aecm->channelStored) % 16 == 0); + assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); + assert((uintptr_t)(aecm->channelStored) % 16 == 0); + assert((uintptr_t)(aecm->channelStored) % 16 == 0); + + __asm __volatile("vmov.i32 q14, #0" : : : "q14"); // far_energy + __asm __volatile("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored + __asm __volatile("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt for (i = 0; i < PART_LEN - 7; i += 8) { // far_energy += (WebRtc_UWord32)(far_spectrum[i]); - __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); - __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); - __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); + __asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); + __asm __volatile("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); + __asm __volatile("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); // Get estimated echo energies for adaptive channel and stored channel. // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); - __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]): + __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); + __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm __volatile("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]): "q10", "q11"); // echo_energy_stored += (WebRtc_UWord32)echoEst[i]; - __asm__("vadd.u32 q8, q10" : : : "q10", "q8"); - __asm__("vadd.u32 q8, q11" : : : "q11", "q8"); + __asm __volatile("vadd.u32 q8, q10" : : : "q10", "q8"); + __asm __volatile("vadd.u32 q8, q11" : : : "q11", "q8"); // echo_energy_adapt += WEBRTC_SPL_UMUL_16_16( // aecm->channelAdapt16[i], far_spectrum[i]); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); - __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm__("vadd.u32 q9, q10" : : : "q9", "q15"); - __asm__("vadd.u32 q9, q11" : : : "q9", "q11"); + __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm __volatile("vadd.u32 q9, q10" : : : "q9", "q15"); + __asm __volatile("vadd.u32 q9, q11" : : : "q9", "q11"); } - __asm__("vadd.u32 d28, d29" : : : "q14"); - __asm__("vpadd.u32 d28, d28" : : : "q14"); - __asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); + __asm __volatile("vadd.u32 d28, d29" : : : "q14"); + __asm __volatile("vpadd.u32 d28, d28" : : : "q14"); + __asm __volatile("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); - __asm__("vadd.u32 d18, d19" : : : "q9"); - __asm__("vpadd.u32 d18, d18" : : : "q9"); - __asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); + __asm __volatile("vadd.u32 d18, d19" : : : "q9"); + __asm __volatile("vpadd.u32 d18, d18" : : : "q9"); + __asm __volatile("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); - __asm__("vadd.u32 d16, d17" : : : "q8"); - __asm__("vpadd.u32 d16, d16" : : : "q8"); - __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); + __asm __volatile("vadd.u32 d16, d17" : : : "q8"); + __asm __volatile("vpadd.u32 d16, d16" : : : "q8"); + __asm __volatile("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); // Get estimated echo energies for adaptive channel and stored channel. echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); @@ -254,17 +296,21 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm, WebRtc_Word32* echo_est) { int i; + assert((uintptr_t)echo_est % 32 == 0); + assert((uintptr_t)(aecm->channelStored) % 16 == 0); + assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); + // During startup we store the channel every block. // Recalculate echo estimate. for (i = 0; i < PART_LEN - 7; i += 8) { // aecm->channelStored[i] = acem->channelAdapt16[i]; // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); - __asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); - __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm __volatile("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); + __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm __volatile("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); + __asm __volatile("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm __volatile("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]) : "q10", "q11"); } aecm->channelStored[i] = aecm->channelAdapt16[i]; @@ -274,21 +320,24 @@ void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore_t* aecm, void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore_t* aecm) { int i; + assert((uintptr_t)(aecm->channelStored) % 16 == 0); + assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); + assert((uintptr_t)(aecm->channelAdapt32) % 32 == 0); + for (i = 0; i < PART_LEN - 7; i += 8) { // aecm->channelAdapt16[i] = aecm->channelStored[i]; // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32) // aecm->channelStored[i], 16); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : + __asm __volatile("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); - __asm__("vst1.16 {d24, d25}, [%0, :128]" : : + __asm __volatile("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); - __asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10"); - __asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm __volatile("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10"); + __asm __volatile("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11"); + __asm __volatile("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->channelAdapt32[i]): "q10", "q11"); } aecm->channelAdapt16[i] = aecm->channelStored[i]; aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( (WebRtc_Word32)aecm->channelStored[i], 16); } - diff --git a/src/modules/audio_processing/ns/noise_suppression_x.c b/src/modules/audio_processing/ns/noise_suppression_x.c index ca46389c7d..64ab28785d 100644 --- a/src/modules/audio_processing/ns/noise_suppression_x.c +++ b/src/modules/audio_processing/ns/noise_suppression_x.c @@ -8,18 +8,22 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include -#include - #include "noise_suppression_x.h" + +#include + +#include "common_audio/signal_processing/include/real_fft.h" #include "nsx_core.h" #include "nsx_defines.h" int WebRtcNsx_Create(NsxHandle** nsxInst) { - *nsxInst = (NsxHandle*)malloc(sizeof(NsxInst_t)); - if (*nsxInst != NULL) { + NsxInst_t* self = malloc(sizeof(NsxInst_t)); + *nsxInst = (NsxHandle*)self; + + if (self != NULL) { WebRtcSpl_Init(); - (*(NsxInst_t**)nsxInst)->initFlag = 0; + self->real_fft = NULL; + self->initFlag = 0; return 0; } else { return -1; @@ -28,6 +32,7 @@ int WebRtcNsx_Create(NsxHandle** nsxInst) { } int WebRtcNsx_Free(NsxHandle* nsxInst) { + WebRtcSpl_FreeRealFFT(((NsxInst_t*)nsxInst)->real_fft); free(nsxInst); return 0; } diff --git a/src/modules/audio_processing/ns/nsx_core.c b/src/modules/audio_processing/ns/nsx_core.c index 214b807bae..2894070978 100644 --- a/src/modules/audio_processing/ns/nsx_core.c +++ b/src/modules/audio_processing/ns/nsx_core.c @@ -16,6 +16,7 @@ #include #include +#include "common_audio/signal_processing/include/real_fft.h" #include "cpu_features_wrapper.h" #include "nsx_core.h" @@ -794,6 +795,14 @@ WebRtc_Word32 WebRtcNsx_InitCore(NsxInst_t* inst, WebRtc_UWord32 fs) { inst->anaLen2 = WEBRTC_SPL_RSHIFT_W16(inst->anaLen, 1); inst->magnLen = inst->anaLen2 + 1; + if (inst->real_fft != NULL) { + WebRtcSpl_FreeRealFFT(inst->real_fft); + } + inst->real_fft = WebRtcSpl_CreateRealFFT(inst->stages); + if (inst->real_fft == NULL) { + return -1; + } + WebRtcSpl_ZerosArrayW16(inst->analysisBuffer, ANAL_BLOCKL_MAX); WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer, ANAL_BLOCKL_MAX); @@ -1548,8 +1557,7 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16* WebRtc_Word16 frac = 0; WebRtc_Word16 log2 = 0; WebRtc_Word16 matrix_determinant = 0; - WebRtc_Word16 winData[ANAL_BLOCKL_MAX], maxWinData; - WebRtc_Word16 realImag[ANAL_BLOCKL_MAX << 1]; + WebRtc_Word16 maxWinData; int i, j; int zeros; @@ -1557,6 +1565,13 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16* int right_shifts_in_magnU16 = 0; int right_shifts_in_initMagnEst = 0; + int16_t winData_buff[ANAL_BLOCKL_MAX * 2 + 16]; + int16_t realImag_buff[ANAL_BLOCKL_MAX * 2 + 16]; + + // Align the structures to 32-byte boundary for the FFT function. + int16_t* winData = (int16_t*) (((uintptr_t)winData_buff + 31) & ~31); + int16_t* realImag = (int16_t*) (((uintptr_t) realImag_buff + 31) & ~31); + // Update analysis buffer for lower band, and window data before FFT. WebRtcNsx_AnalysisUpdate(inst, winData, speechFrame); @@ -1585,14 +1600,13 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16* // create realImag as winData interleaved with zeros (= imag. part), normalize it WebRtcNsx_CreateComplexBuffer(inst, winData, realImag); - // bit-reverse position of elements in array and FFT the array - WebRtcSpl_ComplexBitReverse(realImag, inst->stages); // Q(normData-stages) - WebRtcSpl_ComplexFFT(realImag, inst->stages, 1); + // FFT output will be in winData[]. + WebRtcSpl_RealForwardFFT(inst->real_fft, realImag, winData); inst->imag[0] = 0; // Q(normData-stages) inst->imag[inst->anaLen2] = 0; - inst->real[0] = realImag[0]; // Q(normData-stages) - inst->real[inst->anaLen2] = realImag[inst->anaLen]; + inst->real[0] = winData[0]; // Q(normData-stages) + inst->real[inst->anaLen2] = winData[inst->anaLen]; // Q(2*(normData-stages)) inst->magnEnergy = (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(inst->real[0], inst->real[0]); inst->magnEnergy += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(inst->real[inst->anaLen2], @@ -1604,12 +1618,12 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16* if (inst->blockIndex >= END_STARTUP_SHORT) { for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { - inst->real[i] = realImag[j]; - inst->imag[i] = -realImag[j + 1]; + inst->real[i] = winData[j]; + inst->imag[i] = -winData[j + 1]; // magnitude spectrum // energy in Q(2*(normData-stages)) - tmpU32no1 = (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(realImag[j], realImag[j]); - tmpU32no1 += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(realImag[j + 1], realImag[j + 1]); + tmpU32no1 = (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(winData[j], winData[j]); + tmpU32no1 += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(winData[j + 1], winData[j + 1]); inst->magnEnergy += tmpU32no1; // Q(2*(normData-stages)) magnU16[i] = (WebRtc_UWord16)WebRtcSpl_SqrtFloor(tmpU32no1); // Q(normData-stages) @@ -1653,12 +1667,12 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16* sum_log_i_log_magn = (WEBRTC_SPL_MUL_16_16(kLogIndex[inst->anaLen2], log2) >> 3); for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { - inst->real[i] = realImag[j]; - inst->imag[i] = -realImag[j + 1]; + inst->real[i] = winData[j]; + inst->imag[i] = -winData[j + 1]; // magnitude spectrum // energy in Q(2*(normData-stages)) - tmpU32no1 = (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(realImag[j], realImag[j]); - tmpU32no1 += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(realImag[j + 1], realImag[j + 1]); + tmpU32no1 = (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(winData[j], winData[j]); + tmpU32no1 += (WebRtc_UWord32)WEBRTC_SPL_MUL_16_16(winData[j + 1], winData[j + 1]); inst->magnEnergy += tmpU32no1; // Q(2*(normData-stages)) magnU16[i] = (WebRtc_UWord16)WebRtcSpl_SqrtFloor(tmpU32no1); // Q(normData-stages) @@ -1780,7 +1794,13 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, WebRtc_UWord16* void WebRtcNsx_DataSynthesis(NsxInst_t* inst, short* outFrame) { WebRtc_Word32 energyOut; - WebRtc_Word16 realImag[ANAL_BLOCKL_MAX << 1]; + int16_t realImag_buff[ANAL_BLOCKL_MAX * 2 + 16]; + int16_t rfft_out_buff[ANAL_BLOCKL_MAX * 2 + 16]; + + // Align the structures to 32-byte boundary for the FFT function. + int16_t* realImag = (int16_t*) (((uintptr_t)realImag_buff + 31) & ~31); + int16_t* rfft_out = (int16_t*) (((uintptr_t) rfft_out_buff + 31) & ~31); + WebRtc_Word16 tmp16no1, tmp16no2; WebRtc_Word16 energyRatio; WebRtc_Word16 gainFactor, gainFactor1, gainFactor2; @@ -1807,12 +1827,11 @@ void WebRtcNsx_DataSynthesis(NsxInst_t* inst, short* outFrame) { // Filter the data in the frequency domain, and create spectrum. WebRtcNsx_PrepareSpectrum(inst, realImag); - // bit-reverse position of elements in array and IFFT it - WebRtcSpl_ComplexBitReverse(realImag, inst->stages); - outCIFFT = WebRtcSpl_ComplexIFFT(realImag, inst->stages, 1); + // Inverse FFT output will be in rfft_out[]. + outCIFFT = WebRtcSpl_RealInverseFFT(inst->real_fft, realImag, rfft_out); // Denormalize. - WebRtcNsx_Denormalize(inst, realImag, outCIFFT); + WebRtcNsx_Denormalize(inst, rfft_out, outCIFFT); //scale factor: only do it after END_STARTUP_LONG time gainFactor = 8192; // 8192 = Q13(1.0) diff --git a/src/modules/audio_processing/ns/nsx_core.h b/src/modules/audio_processing/ns/nsx_core.h index 4740c0610d..503c6fb373 100644 --- a/src/modules/audio_processing/ns/nsx_core.h +++ b/src/modules/audio_processing/ns/nsx_core.h @@ -99,6 +99,7 @@ typedef struct NsxInst_t_ { int scaleEnergyIn; int normData; + struct RealFFT* real_fft; } NsxInst_t; #ifdef __cplusplus