From fc8aaf02e1ae6b0cdae645172ad0a23f5db61460 Mon Sep 17 00:00:00 2001 From: "kma@webrtc.org" Date: Wed, 24 Jul 2013 17:38:23 +0000 Subject: [PATCH] Formalized Real 16-bit FFT for APM. It also prepares for introducing Real 16-bit FFT Neon code from Openmax to SPL. CL https://webrtc-codereview.appspot.com/1819004/ takes care of that, but this CL is a prerequisite of that one. Tested audioproc with an offline file. Bit exact. R=andrew@webrtc.org, rtoy@google.com Review URL: https://webrtc-codereview.appspot.com/1830004 git-svn-id: http://webrtc.googlecode.com/svn/trunk@4390 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../signal_processing/include/real_fft.h | 96 +++++++--- .../common_audio/signal_processing/real_fft.c | 102 +++++++--- .../signal_processing/real_fft_unittest.cc | 74 +++++--- .../common_audio/signal_processing/spl_init.c | 8 + .../modules/audio_processing/aecm/aecm_core.c | 112 +++++------ .../modules/audio_processing/aecm/aecm_core.h | 27 --- .../audio_processing/aecm/aecm_core_neon.S | 175 ------------------ webrtc/modules/audio_processing/ns/nsx_core.c | 85 ++++----- webrtc/modules/audio_processing/ns/nsx_core.h | 17 -- .../audio_processing/ns/nsx_core_neon.S | 62 +------ 10 files changed, 285 insertions(+), 473 deletions(-) diff --git a/webrtc/common_audio/signal_processing/include/real_fft.h b/webrtc/common_audio/signal_processing/include/real_fft.h index 8d6280ce94..579a305ab7 100644 --- a/webrtc/common_audio/signal_processing/include/real_fft.h +++ b/webrtc/common_audio/signal_processing/include/real_fft.h @@ -13,70 +13,112 @@ #include "webrtc/typedefs.h" +// For ComplexFFT(), the maximum fft order is 10; +// for OpenMax FFT in ARM, it is 12; +// WebRTC APM uses orders of only 7 and 8. 
+enum {kMaxFFTOrder = 10}; + struct RealFFT; #ifdef __cplusplus extern "C" { #endif +typedef struct RealFFT* (*CreateRealFFT)(int order); +typedef void (*FreeRealFFT)(struct RealFFT* self); typedef int (*RealForwardFFT)(struct RealFFT* self, - const int16_t* data_in, - int16_t* data_out); + const int16_t* real_data_in, + int16_t* complex_data_out); typedef int (*RealInverseFFT)(struct RealFFT* self, - const int16_t* data_in, - int16_t* data_out); + const int16_t* complex_data_in, + int16_t* real_data_out); +extern CreateRealFFT WebRtcSpl_CreateRealFFT; +extern FreeRealFFT WebRtcSpl_FreeRealFFT; extern RealForwardFFT WebRtcSpl_RealForwardFFT; extern RealInverseFFT WebRtcSpl_RealInverseFFT; -struct RealFFT* WebRtcSpl_CreateRealFFT(int order); -void WebRtcSpl_FreeRealFFT(struct RealFFT* self); +struct RealFFT* WebRtcSpl_CreateRealFFTC(int order); +void WebRtcSpl_FreeRealFFTC(struct RealFFT* self); -// TODO(kma): Implement FFT functions for real signals. +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +struct RealFFT* WebRtcSpl_CreateRealFFTNeon(int order); +void WebRtcSpl_FreeRealFFTNeon(struct RealFFT* self); +#endif -// Compute the forward FFT for a complex signal of length 2^order. +// Compute an FFT for a real-valued signal of length of 2^order, +// where 1 < order <= kMaxFFTOrder. Transform length is determined by the +// specification structure, which must be initialized prior to calling the FFT +// function with WebRtcSpl_CreateRealFFT(). +// The relationship between the input and output sequences can +// be expressed in terms of the DFT, i.e.: +// x[n] = (2^(-scalefactor)/N) . SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N) +// n=0,1,2,...N-1 +// N=2^order. +// The conjugate-symmetric output sequence is represented using a CCS vector, +// which is of length N+2, and is organized as follows: +// Index: 0 1 2 3 4 5 . . . N-2 N-1 N N+1 +// Component: R0 0 R1 I1 R2 I2 . . . 
R[N/2-1] I[N/2-1] R[N/2] 0 +// where R[n] and I[n], respectively, denote the real and imaginary components +// for FFT bin 'n'. Bins are numbered from 0 to N/2, where N is the FFT length. +// Bin index 0 corresponds to the DC component, and bin index N/2 corresponds to +// the foldover frequency. +// // Input Arguments: // self - pointer to preallocated and initialized FFT specification structure. -// data_in - the input signal. +// real_data_in - the input signal. For an ARM Neon platform, it must be +// aligned on a 32-byte boundary. // // Output Arguments: -// data_out - the output signal; must be different to data_in. +// complex_data_out - the output complex signal with (2^order + 2) 16-bit +// elements. For an ARM Neon platform, it must be different +// from real_data_in, and aligned on a 32-byte boundary. // // Return Value: // 0 - FFT calculation is successful. -// -1 - Error -// +// -1 - Error with bad arguments (NULL pointers). int WebRtcSpl_RealForwardFFTC(struct RealFFT* self, - const int16_t* data_in, - int16_t* data_out); + const int16_t* real_data_in, + int16_t* complex_data_out); #if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) int WebRtcSpl_RealForwardFFTNeon(struct RealFFT* self, - const int16_t* data_in, - int16_t* data_out); + const int16_t* real_data_in, + int16_t* complex_data_out); #endif -// Compute the inverse FFT for a complex signal of length 2^order. +// Compute the inverse FFT for a conjugate-symmetric input sequence of length of +// 2^order, where 1 < order <= kMaxFFTOrder. Transform length is determined by +// the specification structure, which must be initialized prior to calling the +// FFT function with WebRtcSpl_CreateRealFFT(). +// For a transform of length M, the input sequence is represented using a packed +// CCS vector of length M+2, which is explained in the comments for +// WebRtcSpl_RealForwardFFTC above. 
+// // Input Arguments: // self - pointer to preallocated and initialized FFT specification structure. -// data_in - the input signal. +// complex_data_in - the input complex signal with (2^order + 2) 16-bit +// elements. For an ARM Neon platform, it must be aligned on +// a 32-byte boundary. // // Output Arguments: -// data_out - the output signal; must be different to data_in. +// real_data_out - the output real signal. For an ARM Neon platform, it must +// be different to complex_data_in, and aligned on a 32-byte +// boundary. // // Return Value: -// 0 or a positive number - a value that the elements in the |data_out| should -// be shifted left with in order to get correct -// physical values. -// -1 - Error +// 0 or a positive number - a value that the elements in the |real_data_out| +// should be shifted left with in order to get +// correct physical values. +// -1 - Error with bad arguments (NULL pointers). int WebRtcSpl_RealInverseFFTC(struct RealFFT* self, - const int16_t* data_in, - int16_t* data_out); + const int16_t* complex_data_in, + int16_t* real_data_out); #if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) int WebRtcSpl_RealInverseFFTNeon(struct RealFFT* self, - const int16_t* data_in, - int16_t* data_out); + const int16_t* complex_data_in, + int16_t* real_data_out); #endif #ifdef __cplusplus diff --git a/webrtc/common_audio/signal_processing/real_fft.c b/webrtc/common_audio/signal_processing/real_fft.c index bd54432218..fc5be9a02c 100644 --- a/webrtc/common_audio/signal_processing/real_fft.c +++ b/webrtc/common_audio/signal_processing/real_fft.c @@ -18,55 +18,109 @@ struct RealFFT { int order; }; -struct RealFFT* WebRtcSpl_CreateRealFFT(int order) { +struct RealFFT* WebRtcSpl_CreateRealFFTC(int order) { struct RealFFT* self = NULL; - // This constraint comes from ComplexFFT(). 
- if (order > 10 || order < 0) { + if (order > kMaxFFTOrder || order < 0) { return NULL; } self = malloc(sizeof(struct RealFFT)); + if (self == NULL) { + return NULL; + } self->order = order; return self; } -void WebRtcSpl_FreeRealFFT(struct RealFFT* self) { - free(self); +void WebRtcSpl_FreeRealFFTC(struct RealFFT* self) { + if (self != NULL) { + free(self); + } } -// WebRtcSpl_ComplexFFT and WebRtcSpl_ComplexIFFT use in-place algorithm, -// so copy data from data_in to data_out in the next two functions. +// The C version FFT functions (i.e. WebRtcSpl_RealForwardFFTC and +// WebRtcSpl_RealInverseFFTC) are real-valued FFT wrappers for complex-valued +// FFT implementation in SPL. int WebRtcSpl_RealForwardFFTC(struct RealFFT* self, - const int16_t* data_in, - int16_t* data_out) { - memcpy(data_out, data_in, sizeof(int16_t) * (1 << (self->order + 1))); - WebRtcSpl_ComplexBitReverse(data_out, self->order); - return WebRtcSpl_ComplexFFT(data_out, self->order, 1); + const int16_t* real_data_in, + int16_t* complex_data_out) { + int i = 0; + int j = 0; + int result = 0; + int n = 1 << self->order; + // The complex-value FFT implementation needs a buffer to hold 2^order + // 16-bit COMPLEX numbers, for both time and frequency data. + int16_t complex_buffer[2 << kMaxFFTOrder]; + + // Insert zeros to the imaginary parts for complex forward FFT input. + for (i = 0, j = 0; i < n; i += 1, j += 2) { + complex_buffer[j] = real_data_in[i]; + complex_buffer[j + 1] = 0; + }; + + WebRtcSpl_ComplexBitReverse(complex_buffer, self->order); + result = WebRtcSpl_ComplexFFT(complex_buffer, self->order, 1); + + // For real FFT output, use only the first N + 2 elements from + // complex forward FFT. 
+ memcpy(complex_data_out, complex_buffer, sizeof(int16_t) * (n + 2)); + + return result; } int WebRtcSpl_RealInverseFFTC(struct RealFFT* self, - const int16_t* data_in, - int16_t* data_out) { - memcpy(data_out, data_in, sizeof(int16_t) * (1 << (self->order + 1))); - WebRtcSpl_ComplexBitReverse(data_out, self->order); - return WebRtcSpl_ComplexIFFT(data_out, self->order, 1); + const int16_t* complex_data_in, + int16_t* real_data_out) { + int i = 0; + int j = 0; + int result = 0; + int n = 1 << self->order; + // Create the buffer specific to complex-valued FFT implementation. + int16_t complex_buffer[2 << kMaxFFTOrder]; + + // For n-point FFT, first copy the first n + 2 elements into complex + // FFT, then construct the remaining n - 2 elements by real FFT's + // conjugate-symmetric properties. + memcpy(complex_buffer, complex_data_in, sizeof(int16_t) * (n + 2)); + for (i = n + 2; i < 2 * n; i += 2) { + complex_buffer[i] = complex_data_in[2 * n - i]; + complex_buffer[i + 1] = -complex_data_in[2 * n - i + 1]; + } + + WebRtcSpl_ComplexBitReverse(complex_buffer, self->order); + result = WebRtcSpl_ComplexIFFT(complex_buffer, self->order, 1); + + // Strip out the imaginary parts of the complex inverse FFT output. + for (i = 0, j = 0; i < n; i += 1, j += 2) { + real_data_out[i] = complex_buffer[j]; + } + + return result; } #if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON) // TODO(kma): Replace the following function bodies into optimized functions // for ARM Neon. 
+struct RealFFT* WebRtcSpl_CreateRealFFTNeon(int order) { + return WebRtcSpl_CreateRealFFTC(order); +} + +void WebRtcSpl_FreeRealFFTNeon(struct RealFFT* self) { + WebRtcSpl_FreeRealFFTC(self); +} + int WebRtcSpl_RealForwardFFTNeon(struct RealFFT* self, - const int16_t* data_in, - int16_t* data_out) { - return WebRtcSpl_RealForwardFFTC(self, data_in, data_out); + const int16_t* real_data_in, + int16_t* complex_data_out) { + return WebRtcSpl_RealForwardFFTC(self, real_data_in, complex_data_out); } int WebRtcSpl_RealInverseFFTNeon(struct RealFFT* self, - const int16_t* data_in, - int16_t* data_out) { - return WebRtcSpl_RealInverseFFTC(self, data_in, data_out); + const int16_t* complex_data_in, + int16_t* real_data_out) { + return WebRtcSpl_RealInverseFFTC(self, complex_data_in, real_data_out); } -#endif +#endif // WEBRTC_DETECT_ARM_NEON || WEBRTC_ARCH_ARM_NEON diff --git a/webrtc/common_audio/signal_processing/real_fft_unittest.cc b/webrtc/common_audio/signal_processing/real_fft_unittest.cc index 5dc1c89645..fa98836b9a 100644 --- a/webrtc/common_audio/signal_processing/real_fft_unittest.cc +++ b/webrtc/common_audio/signal_processing/real_fft_unittest.cc @@ -17,9 +17,17 @@ namespace webrtc { namespace { -const int kOrder = 4; -const int kLength = 1 << (kOrder + 1); // +1 to hold complex data. -const int16_t kRefData[kLength] = { +// FFT order. +const int kOrder = 5; +// Lengths for real FFT's time and frequency buffers. +// For N-point FFT, the length requirements from API are N and N+2 respectively. +const int kTimeDataLength = 1 << kOrder; +const int kFreqDataLength = (1 << kOrder) + 2; +// For complex FFT's time and freq buffer. The implementation requires +// 2*N 16-bit words. +const int kComplexFftDataLength = 2 << kOrder; +// Reference data for time signal. 
+const int16_t kRefData[kTimeDataLength] = { 11739, 6848, -8688, 31980, -30295, 25242, 27085, 19410, -26299, 15607, -10791, 11778, -23819, 14498, -25772, 10076, 1173, 6848, -8688, 31980, -30295, 2522, 27085, 19410, @@ -40,36 +48,58 @@ TEST_F(RealFFTTest, CreateFailsOnBadInput) { EXPECT_TRUE(fft == NULL); } -// TODO(andrew): This won't always be the case, but verifies the current code -// at least. -TEST_F(RealFFTTest, RealAndComplexAreIdentical) { - int16_t real_data[kLength] = {0}; - int16_t real_data_out[kLength] = {0}; - int16_t complex_data[kLength] = {0}; - memcpy(real_data, kRefData, sizeof(kRefData)); - memcpy(complex_data, kRefData, sizeof(kRefData)); +TEST_F(RealFFTTest, RealAndComplexMatch) { + int i = 0; + int j = 0; + int16_t real_fft_time[kTimeDataLength] = {0}; + int16_t real_fft_freq[kFreqDataLength] = {0}; + // One common buffer for complex FFT's time and frequency data. + int16_t complex_fft_buff[kComplexFftDataLength] = {0}; + // Prepare the inputs to forward FFTs. + memcpy(real_fft_time, kRefData, sizeof(kRefData)); + for (i = 0, j = 0; i < kTimeDataLength; i += 1, j += 2) { + complex_fft_buff[j] = kRefData[i]; + complex_fft_buff[j + 1] = 0; // Insert zeros into the imaginary parts. + }; + + // Create and run real forward FFT. RealFFT* fft = WebRtcSpl_CreateRealFFT(kOrder); EXPECT_TRUE(fft != NULL); + EXPECT_EQ(0, WebRtcSpl_RealForwardFFT(fft, real_fft_time, real_fft_freq)); - EXPECT_EQ(0, WebRtcSpl_RealForwardFFT(fft, real_data, real_data_out)); - WebRtcSpl_ComplexBitReverse(complex_data, kOrder); - EXPECT_EQ(0, WebRtcSpl_ComplexFFT(complex_data, kOrder, 1)); + // Run complex forward FFT. + WebRtcSpl_ComplexBitReverse(complex_fft_buff, kOrder); + EXPECT_EQ(0, WebRtcSpl_ComplexFFT(complex_fft_buff, kOrder, 1)); - for (int i = 0; i < kLength; i++) { - EXPECT_EQ(real_data_out[i], complex_data[i]); + // Verify the results between complex and real forward FFT. 
+ for (i = 0; i < kFreqDataLength; i++) { + EXPECT_EQ(real_fft_freq[i], complex_fft_buff[i]); } - memcpy(complex_data, kRefData, sizeof(kRefData)); + // Prepare the inputs to inverse real FFT. + // We use whatever data in complex_fft_buff[] since we don't care + // about data contents. Only kFreqDataLength 16-bit words are copied + // from complex_fft_buff to real_fft_freq since remaining words (2nd half) + // are conjugate-symmetric to the first half in theory. + memcpy(real_fft_freq, complex_fft_buff, sizeof(real_fft_freq)); - int real_scale = WebRtcSpl_RealInverseFFT(fft, real_data, real_data_out); + // Run real inverse FFT. + int real_scale = WebRtcSpl_RealInverseFFT(fft, real_fft_freq, real_fft_time); EXPECT_GE(real_scale, 0); - WebRtcSpl_ComplexBitReverse(complex_data, kOrder); - int complex_scale = WebRtcSpl_ComplexIFFT(complex_data, kOrder, 1); + + // Run complex inverse FFT. + WebRtcSpl_ComplexBitReverse(complex_fft_buff, kOrder); + int complex_scale = WebRtcSpl_ComplexIFFT(complex_fft_buff, kOrder, 1); + + // Verify the results between complex and real inverse FFT. + // They are not bit-exact, since complex IFFT doesn't produce + // exactly conjugate-symmetric data (between first and second half). 
EXPECT_EQ(real_scale, complex_scale); - for (int i = 0; i < kLength; i++) { - EXPECT_EQ(real_data_out[i], complex_data[i]); + for (i = 0, j = 0; i < kTimeDataLength; i += 1, j += 2) { + EXPECT_LE(abs(real_fft_time[i] - complex_fft_buff[j]), 1); } + WebRtcSpl_FreeRealFFT(fft); } diff --git a/webrtc/common_audio/signal_processing/spl_init.c b/webrtc/common_audio/signal_processing/spl_init.c index 1645f63fc1..4387cc876e 100644 --- a/webrtc/common_audio/signal_processing/spl_init.c +++ b/webrtc/common_audio/signal_processing/spl_init.c @@ -28,6 +28,8 @@ MinValueW32 WebRtcSpl_MinValueW32; CrossCorrelation WebRtcSpl_CrossCorrelation; DownsampleFast WebRtcSpl_DownsampleFast; ScaleAndAddVectorsWithRound WebRtcSpl_ScaleAndAddVectorsWithRound; +CreateRealFFT WebRtcSpl_CreateRealFFT; +FreeRealFFT WebRtcSpl_FreeRealFFT; RealForwardFFT WebRtcSpl_RealForwardFFT; RealInverseFFT WebRtcSpl_RealInverseFFT; @@ -45,6 +47,8 @@ static void InitPointersToC() { WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFastC; WebRtcSpl_ScaleAndAddVectorsWithRound = WebRtcSpl_ScaleAndAddVectorsWithRoundC; + WebRtcSpl_CreateRealFFT = WebRtcSpl_CreateRealFFTC; + WebRtcSpl_FreeRealFFT = WebRtcSpl_FreeRealFFTC; WebRtcSpl_RealForwardFFT = WebRtcSpl_RealForwardFFTC; WebRtcSpl_RealInverseFFT = WebRtcSpl_RealInverseFFTC; } @@ -63,6 +67,8 @@ static void InitPointersToNeon() { WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFastNeon; WebRtcSpl_ScaleAndAddVectorsWithRound = WebRtcSpl_ScaleAndAddVectorsWithRoundNeon; + WebRtcSpl_CreateRealFFT = WebRtcSpl_CreateRealFFTNeon; + WebRtcSpl_FreeRealFFT = WebRtcSpl_FreeRealFFTNeon; WebRtcSpl_RealForwardFFT = WebRtcSpl_RealForwardFFTNeon; WebRtcSpl_RealInverseFFT = WebRtcSpl_RealInverseFFTNeon; } @@ -80,6 +86,8 @@ static void InitPointersToMIPS() { WebRtcSpl_DownsampleFast = WebRtcSpl_DownsampleFast_mips; WebRtcSpl_ScaleAndAddVectorsWithRound = WebRtcSpl_ScaleAndAddVectorsWithRoundC; + WebRtcSpl_CreateRealFFT = WebRtcSpl_CreateRealFFTC; + WebRtcSpl_FreeRealFFT = 
WebRtcSpl_FreeRealFFTC; WebRtcSpl_RealForwardFFT = WebRtcSpl_RealForwardFFTC; WebRtcSpl_RealInverseFFT = WebRtcSpl_RealInverseFFTC; #if defined(MIPS_DSP_R1_LE) diff --git a/webrtc/modules/audio_processing/aecm/aecm_core.c b/webrtc/modules/audio_processing/aecm/aecm_core.c index e4fe349ea1..391a1dbd09 100644 --- a/webrtc/modules/audio_processing/aecm/aecm_core.c +++ b/webrtc/modules/audio_processing/aecm/aecm_core.c @@ -244,8 +244,6 @@ static const uint16_t* AlignedFarend(AecmCore_t* self, int* far_q, int delay) { CalcLinearEnergies WebRtcAecm_CalcLinearEnergies; StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel; ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel; -WindowAndFFT WebRtcAecm_WindowAndFFT; -InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow; int WebRtcAecm_CreateCore(AecmCore_t **aecmInst) { @@ -351,41 +349,36 @@ void WebRtcAecm_InitEchoPathCore(AecmCore_t* aecm, const int16_t* echo_path) aecm->mseChannelCount = 0; } -static void WindowAndFFTC(AecmCore_t* aecm, +static void WindowAndFFT(AecmCore_t* aecm, int16_t* fft, const int16_t* time_signal, complex16_t* freq_signal, - int time_signal_scaling) -{ - int i, j; + int time_signal_scaling) { + int i = 0; - memset(fft, 0, sizeof(int16_t) * PART_LEN4); - // FFT of signal - for (i = 0, j = 0; i < PART_LEN; i++, j += 2) - { - // Window time domain signal and insert into real part of - // transformation array |fft| - fft[j] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT( - (time_signal[i] << time_signal_scaling), - WebRtcAecm_kSqrtHanning[i], - 14); - fft[PART_LEN2 + j] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT( - (time_signal[i + PART_LEN] << time_signal_scaling), - WebRtcAecm_kSqrtHanning[PART_LEN - i], - 14); - // Inserting zeros in imaginary parts not necessary since we - // initialized the array with all zeros - } + // FFT of signal + for (i = 0; i < PART_LEN; i++) { + // Window time domain signal and insert into real part of + // transformation array |fft| + fft[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT( + 
(time_signal[i] << time_signal_scaling), + WebRtcAecm_kSqrtHanning[i], + 14); + fft[PART_LEN + i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT( + (time_signal[i + PART_LEN] << time_signal_scaling), + WebRtcAecm_kSqrtHanning[PART_LEN - i], + 14); + } - // Do forward FFT, then take only the first PART_LEN complex samples, - // and change signs of the imaginary parts. - WebRtcSpl_RealForwardFFT(aecm->real_fft, fft, (int16_t*)freq_signal); - for (i = 0; i < PART_LEN; i++) { - freq_signal[i].imag = -freq_signal[i].imag; - } + // Do forward FFT, then take only the first PART_LEN complex samples, + // and change signs of the imaginary parts. + WebRtcSpl_RealForwardFFT(aecm->real_fft, fft, (int16_t*)freq_signal); + for (i = 0; i < PART_LEN; i++) { + freq_signal[i].imag = -freq_signal[i].imag; + } } -static void InverseFFTAndWindowC(AecmCore_t* aecm, +static void InverseFFTAndWindow(AecmCore_t* aecm, int16_t* fft, complex16_t* efw, int16_t* output, @@ -395,17 +388,9 @@ static void InverseFFTAndWindowC(AecmCore_t* aecm, int32_t tmp32no1; // Synthesis - for (i = 1; i < PART_LEN; i++) - { - j = WEBRTC_SPL_LSHIFT_W32(i, 1); - fft[j] = efw[i].real; - - // mirrored data, even - fft[PART_LEN4 - j] = efw[i].real; - fft[j + 1] = -efw[i].imag; - - //mirrored data, odd - fft[PART_LEN4 - (j - 1)] = efw[i].imag; + for (i = 1, j = 2; i < PART_LEN; i += 1, j += 2) { + fft[j] = efw[i].real; + fft[j + 1] = -efw[i].imag; } fft[0] = efw[0].real; fft[1] = -efw[0].imag; @@ -413,31 +398,23 @@ static void InverseFFTAndWindowC(AecmCore_t* aecm, fft[PART_LEN2] = efw[PART_LEN].real; fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; - // Inverse FFT. Then take only the real values, and keep outCFFT - // to scale the samples in the next block. 
- outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw); - for (i = 0; i < PART_LEN; i++) { - efw[i].real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - efw[i].real, - WebRtcAecm_kSqrtHanning[i], - 14); - tmp32no1 = WEBRTC_SPL_SHIFT_W32((int32_t)efw[i].real, - outCFFT - aecm->dfaCleanQDomain); - efw[i].real = (int16_t)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, - tmp32no1 + aecm->outBuf[i], - WEBRTC_SPL_WORD16_MIN); - output[i] = efw[i].real; + // Inverse FFT. Keep outCFFT to scale the samples in the next block. + outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, output); - tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( - efw[PART_LEN + i].real, - WebRtcAecm_kSqrtHanning[PART_LEN - i], - 14); + for (i = 0; i < PART_LEN; i++) { + output[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + output[i], WebRtcAecm_kSqrtHanning[i], 14); + tmp32no1 = WEBRTC_SPL_SHIFT_W32((int32_t)output[i], + outCFFT - aecm->dfaCleanQDomain); + output[i] = (int16_t)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, + tmp32no1 + aecm->outBuf[i], WEBRTC_SPL_WORD16_MIN); + + tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(output[PART_LEN + i], + WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, - outCFFT - aecm->dfaCleanQDomain); + outCFFT - aecm->dfaCleanQDomain); aecm->outBuf[i] = (int16_t)WEBRTC_SPL_SAT( - WEBRTC_SPL_WORD16_MAX, - tmp32no1, - WEBRTC_SPL_WORD16_MIN); + WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); } // Copy the current block to the old position (aecm->outBuf is shifted elsewhere) @@ -522,9 +499,6 @@ static void ResetAdaptiveChannelC(AecmCore_t* aecm) #if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON) static void WebRtcAecm_InitNeon(void) { - // TODO(kma): Check why WebRtcAecm_InverseFFTAndWindowNeon() doesn't work. 
- WebRtcAecm_WindowAndFFT = WebRtcAecm_WindowAndFFTNeon; - WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC; WebRtcAecm_StoreAdaptiveChannel = WebRtcAecm_StoreAdaptiveChannelNeon; WebRtcAecm_ResetAdaptiveChannel = WebRtcAecm_ResetAdaptiveChannelNeon; WebRtcAecm_CalcLinearEnergies = WebRtcAecm_CalcLinearEnergiesNeon; @@ -654,8 +628,6 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq) COMPILE_ASSERT(PART_LEN % 16 == 0); // Initialize function pointers. - WebRtcAecm_WindowAndFFT = WindowAndFFTC; - WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC; WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesC; WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelC; WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelC; @@ -1403,7 +1375,7 @@ static int TimeToFrequencyDomain(AecmCore_t* aecm, time_signal_scaling = WebRtcSpl_NormW16(tmp16no1); #endif - WebRtcAecm_WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling); + WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling); // Extract imaginary and real part, calculate the magnitude for all frequency bins freq_signal[0].imag = 0; @@ -1843,7 +1815,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, ComfortNoise(aecm, ptrDfaClean, efw, hnl); } - WebRtcAecm_InverseFFTAndWindow(aecm, fft, efw, output, nearendClean); + InverseFFTAndWindow(aecm, fft, efw, output, nearendClean); return 0; } diff --git a/webrtc/modules/audio_processing/aecm/aecm_core.h b/webrtc/modules/audio_processing/aecm/aecm_core.h index 988cb46f1a..64251d5221 100644 --- a/webrtc/modules/audio_processing/aecm/aecm_core.h +++ b/webrtc/modules/audio_processing/aecm/aecm_core.h @@ -294,37 +294,10 @@ extern StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel; typedef void (*ResetAdaptiveChannel)(AecmCore_t* aecm); extern ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel; -typedef void (*WindowAndFFT)( - AecmCore_t* aecm, - int16_t* fft, - const int16_t* time_signal, - complex16_t* freq_signal, - int 
time_signal_scaling); -extern WindowAndFFT WebRtcAecm_WindowAndFFT; - -typedef void (*InverseFFTAndWindow)( - AecmCore_t* aecm, - int16_t* fft, complex16_t* efw, - int16_t* output, - const int16_t* nearendClean); -extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow; - // For the above function pointers, functions for generic platforms are declared // and defined as static in file aecm_core.c, while those for ARM Neon platforms // are declared below and defined in file aecm_core_neon.s. #if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON) -void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm, - int16_t* fft, - const int16_t* time_signal, - complex16_t* freq_signal, - int time_signal_scaling); - -void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, - int16_t* fft, - complex16_t* efw, - int16_t* output, - const int16_t* nearendClean); - void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, const uint16_t* far_spectrum, int32_t* echo_est, diff --git a/webrtc/modules/audio_processing/aecm/aecm_core_neon.S b/webrtc/modules/audio_processing/aecm/aecm_core_neon.S index 4e288734e7..a8fb1e1207 100644 --- a/webrtc/modules/audio_processing/aecm/aecm_core_neon.S +++ b/webrtc/modules/audio_processing/aecm/aecm_core_neon.S @@ -17,185 +17,10 @@ #include "webrtc/system_wrappers/interface/asm_defines.h" GLOBAL_LABEL WebRtcAecm_kSqrtHanning -GLOBAL_FUNCTION WebRtcAecm_WindowAndFFTNeon -GLOBAL_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon GLOBAL_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon GLOBAL_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon GLOBAL_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon -@ void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm, -@ int16_t* fft, -@ const int16_t* time_signal, -@ complex16_t* freq_signal, -@ int time_signal_scaling); -.align 2 -DEFINE_FUNCTION WebRtcAecm_WindowAndFFTNeon - push {r4, r5, r6, lr} - - ldr r12, [sp, #16] @ time_signal_scaling - vdup.16 d16, r12 - - vmov.i16 d21, #0 @ For imaginary parts of |fft|. 
- vmov.i16 d27, #0 @ For imaginary parts of |fft|. - adr r5, WebRtcAecm_kSqrtHanning - adr lr, kSqrtHanningReversed - add r4, r1, #(PART_LEN2 * 2) @ &fft[PART_LEN2] - add r12, r2, #(PART_LEN * 2) @ time_signal[PART_LEN] - mov r6, #(PART_LEN / 4) @ Loop counter, unrolled by 4 - -LOOP_PART_LEN: - vld1.16 d0, [r2, :64]! @ time_signal[i] - vld1.16 d22, [r12, :64]! @ time_signal[i + PART_LEN] - vld1.16 d17, [r5, :64]! @ WebRtcAecm_kSqrtHanning[i] - vld1.16 d23, [lr, :64]! @ kSqrtHanningReversed[i] - vshl.s16 d18, d0, d16 - vshl.s16 d22, d22, d16 - vmull.s16 q9, d18, d17 - vmull.s16 q12, d22, d23 - subs r6, #1 - vshrn.i32 d20, q9, #14 - vshrn.i32 d26, q12, #14 - vst2.16 {d20, d21}, [r1, :128]! @ fft[j] - vst2.16 {d26, d27}, [r4, :128]! @ fft[PART_LEN2 + j] - bgt LOOP_PART_LEN - - @ WebRtcSpl_RealForwardFFT(aecm->real_fft, fft, (int16_t*)freq_signal); - movw r12, #offset_aecm_real_fft - sub r1, #(PART_LEN * 4) @ Get r1 back to &fft[0]. - mov r2, r3 @ freq_signal - mov r4, r3 - ldr r0, [r0, r12] @ aecm->real_fft - CALL_FUNCTION WebRtcSpl_RealForwardFFTNeon - - mov r12, #(PART_LEN * 2 / 16) @ Loop counter, unrolled by 16. - -LOOP_PART_LEN2: - @ freq_signal[i].imag = - freq_signal[i].imag; - vld2.16 {d20, d21, d22, d23}, [r4, :256] - subs r12, #1 - vneg.s16 d22, d22 - vneg.s16 d23, d23 - vst2.16 {d20, d21, d22, d23}, [r4, :256]! - bgt LOOP_PART_LEN2 - - pop {r4, r5, r6, pc} - -@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm, -@ int16_t* fft, -@ complex16_t* efw, -@ int16_t* output, -@ const int16_t* nearendClean); -.align 2 -DEFINE_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon - push {r4-r8, lr} - - @ Values of r0, r1, and r3 will change in WebRtcSpl_ComplexIFFT - @ and WebRtcSpl_ComplexBitReverse. - mov r4, r1 - mov r5, r0 - mov r7, r3 - - add r3, r1, #((PART_LEN4 - 6) * 2) @ &fft[PART_LEN4 - 6] - mov r6, #(PART_LEN / 4) @ Loop counter, unrolled by 4 - add r12, r2, #(PART_LEN * 4) @ &efw[PART_LEN] - mov r8, #-16 - -LOOP_PRE_IFFT: - vld2.16 {q10}, [r2, :128]! 
- vmov q11, q10 - vneg.s16 d23, d23 - vst2.16 {d22, d23}, [r1, :128]! - vrev64.16 q10, q10 - subs r6, #1 - vst2.16 {q10}, [r3], r8 - bgt LOOP_PRE_IFFT - - @ fft[PART_LEN2] = efw[PART_LEN].real; - @ fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; - ldr r8, [r12] - ssub16 r12, r6, r8 - mov r3, #(PART_LEN2 * 2) - pkhbt r8, r8, r12 - str r8, [r4, r3] - - @ outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw); - movw r12, #offset_aecm_real_fft - sub r1, #(PART_LEN * 4) @ Get r1 back to &fft[0]. - sub r2, #(PART_LEN * 4) @ Get r2 back to &efw[0]. - mov r4, r2 @ Keep efw in r4. - ldr r0, [r0, r12] @ aecm->real_fft - CALL_FUNCTION WebRtcSpl_RealInverseFFTNeon - - movw r6, #offset_aecm_outBuf - movw r12, #offset_aecm_dfaCleanQDomain - ldr r8, [r5, r6] @ &aecm->outBuf[0] - ldrsh r2, [r5, r12] @ &aecm->dfaCleanQDomain[0] - - adr r12, kSqrtHanningReversed - adr r6, WebRtcAecm_kSqrtHanning - rsb r0, r2, r0 @ outCFFT - aecm->dfaCleanQDomain - vdup.32 q9, r0 - add r0, r4, #(PART_LEN * 4) @ &efw[PART_LEN] - mov r3, #(PART_LEN / 4) @ Loop counter, unrolled by 4 - -LOOP_POST_IFFT: - vld2.16 {d4, d5}, [r4, :128] @ &efw[i]; - vld1.16 d17, [r6, :64]! @ WebRtcAecm_kSqrtHanning[i] - vld1.16 d20, [r8, :64] @ aecm->outBuf[i] - vmull.s16 q8, d4, d17 - vmovl.s16 q10, d20 - vrshr.s32 q8, q8, #14 - vld1.16 d0, [r0, :64]! @ &efw[PART_LEN + i] - vshl.s32 q8, q8, q9 - vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i] - vadd.i32 q8, q10 - vmull.s16 q0, d0, d1 - vqmovn.s32 d16, q8 - vshr.s32 q0, q0, #14 - vst2.16 {d4, d5}, [r4, :128]! @ &efw[i]; - vshl.s32 q0, q0, q9 - vst1.16 d16, [r7, :64]! @ output[i] - vqmovn.s32 d0, q0 - subs r3, #1 - vst1.16 d0, [r8, :64]! 
@ aecm->outBuf[i] - bgt LOOP_POST_IFFT - - movw r3, #offset_aecm_xBuf - movw r12, #offset_aecm_dBufNoisy - ldr r3, [r5, r3] @ &aecm->xBuf[0] - ldr r1, [r5, r12] @ &aecm->dBufNoisy[0] - add r2, r3, #(PART_LEN * 2) @ &aecm->xBuf[PART_LEN] - add r0, r1, #(PART_LEN * 2) @ &aecm->dBufNoisy[PART_LEN] - mov r4, #(PART_LEN / 16) @ Loop counter, unrolled by 16. - -LOOP_COPY: - vld1.16 {q10, q11}, [r2, :256]! - vld1.16 {q12, q13}, [r0, :256]! - subs r4, #1 - vst1.16 {q10, q11}, [r3, :256]! - vst1.16 {q12, q13}, [r1, :256]! - bgt LOOP_COPY - - ldr r2, [sp, #16] - cmp r2, #0 @ Check if (nearendClean != NULL). - beq END - - movw r4, #offset_aecm_dBufClean - ldr r1, [r5, r4] @ &aecm->dBufClean[0] - add r0, r1, #(PART_LEN * 2) @ &aecm->dBufClean[PART_LEN] - - vld1.16 {q10, q11}, [r0, :256]! - vld1.16 {q12, q13}, [r0, :256]! - vst1.16 {q10, q11}, [r1, :256]! - vst1.16 {q12, q13}, [r1, :256]! - vld1.16 {q10, q11}, [r0, :256]! - vld1.16 {q12, q13}, [r0, :256]! - vst1.16 {q10, q11}, [r1, :256]! - vst1.16 {q12, q13}, [r1, :256]! - -END: - pop {r4-r8, pc} - @ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm, @ const uint16_t* far_spectrum, @ int32_t* echo_est, diff --git a/webrtc/modules/audio_processing/ns/nsx_core.c b/webrtc/modules/audio_processing/ns/nsx_core.c index 6076d3fd0c..44cd68558b 100644 --- a/webrtc/modules/audio_processing/ns/nsx_core.c +++ b/webrtc/modules/audio_processing/ns/nsx_core.c @@ -12,7 +12,6 @@ #include #include -#include #include #include @@ -436,26 +435,6 @@ static const int16_t kDeterminantEstMatrix[66] = { 355, 330 }; -// Declare function pointers. -NoiseEstimation WebRtcNsx_NoiseEstimation; -PrepareSpectrum WebRtcNsx_PrepareSpectrum; -SynthesisUpdate WebRtcNsx_SynthesisUpdate; -AnalysisUpdate WebRtcNsx_AnalysisUpdate; -Denormalize WebRtcNsx_Denormalize; -CreateComplexBuffer WebRtcNsx_CreateComplexBuffer; - -#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON) -// Initialize function pointers for ARM Neon platform. 
-static void WebRtcNsx_InitNeon(void) { - WebRtcNsx_NoiseEstimation = WebRtcNsx_NoiseEstimationNeon; - WebRtcNsx_PrepareSpectrum = WebRtcNsx_PrepareSpectrumNeon; - WebRtcNsx_SynthesisUpdate = WebRtcNsx_SynthesisUpdateNeon; - WebRtcNsx_AnalysisUpdate = WebRtcNsx_AnalysisUpdateNeon; - WebRtcNsx_Denormalize = WebRtcNsx_DenormalizeNeon; - WebRtcNsx_CreateComplexBuffer = WebRtcNsx_CreateComplexBufferNeon; -} -#endif - // Update the noise estimation information. static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { int32_t tmp32no1 = 0; @@ -614,7 +593,6 @@ static void NoiseEstimationC(NsxInst_t* inst, // Filter the data in the frequency domain, and create spectrum. static void PrepareSpectrumC(NsxInst_t* inst, int16_t* freq_buf) { int i = 0, j = 0; - int16_t tmp16 = 0; for (i = 0; i < inst->magnLen; i++) { inst->real[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i], @@ -626,22 +604,19 @@ static void PrepareSpectrumC(NsxInst_t* inst, int16_t* freq_buf) { freq_buf[0] = inst->real[0]; freq_buf[1] = -inst->imag[0]; for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { - tmp16 = (inst->anaLen << 1) - j; freq_buf[j] = inst->real[i]; freq_buf[j + 1] = -inst->imag[i]; - freq_buf[tmp16] = inst->real[i]; - freq_buf[tmp16 + 1] = inst->imag[i]; } freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; } -// Denormalize the input buffer. -static __inline void DenormalizeC(NsxInst_t* inst, int16_t* in, int factor) { - int i = 0, j = 0; +// Denormalize the real-valued signal |in|, the output from inverse FFT. 
+static __inline void Denormalize(NsxInst_t* inst, int16_t* in, int factor) { + int i = 0; int32_t tmp32 = 0; - for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) { - tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j], + for (i = 0; i < inst->anaLen; i += 1) { + tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[i], factor - inst->normData); inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0 } @@ -701,18 +676,32 @@ static void AnalysisUpdateC(NsxInst_t* inst, } } -// Create a complex number buffer (out[]) as the intput (in[]) interleaved with -// zeros, and normalize it. -static __inline void CreateComplexBufferC(NsxInst_t* inst, - int16_t* in, - int16_t* out) { - int i = 0, j = 0; - for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) { - out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData) - out[j + 1] = 0; // Insert zeros in imaginary part +// Normalize the real-valued signal |in|, the input to forward FFT. +static __inline void NormalizeRealBuffer(NsxInst_t* inst, + const int16_t* in, + int16_t* out) { + int i = 0; + for (i = 0; i < inst->anaLen; ++i) { + out[i] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData) } } +// Declare function pointers. +NoiseEstimation WebRtcNsx_NoiseEstimation; +PrepareSpectrum WebRtcNsx_PrepareSpectrum; +SynthesisUpdate WebRtcNsx_SynthesisUpdate; +AnalysisUpdate WebRtcNsx_AnalysisUpdate; + +#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON) +// Initialize function pointers for ARM Neon platform. 
+static void WebRtcNsx_InitNeon(void) { + WebRtcNsx_NoiseEstimation = WebRtcNsx_NoiseEstimationNeon; + WebRtcNsx_PrepareSpectrum = WebRtcNsx_PrepareSpectrumNeon; + WebRtcNsx_SynthesisUpdate = WebRtcNsx_SynthesisUpdateNeon; + WebRtcNsx_AnalysisUpdate = WebRtcNsx_AnalysisUpdateNeon; +} +#endif + void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst, int16_t pink_noise_exp_avg, int32_t pink_noise_num_avg, @@ -900,17 +889,14 @@ int32_t WebRtcNsx_InitCore(NsxInst_t* inst, uint32_t fs) { WebRtcNsx_PrepareSpectrum = PrepareSpectrumC; WebRtcNsx_SynthesisUpdate = SynthesisUpdateC; WebRtcNsx_AnalysisUpdate = AnalysisUpdateC; - WebRtcNsx_Denormalize = DenormalizeC; - WebRtcNsx_CreateComplexBuffer = CreateComplexBufferC; #ifdef WEBRTC_DETECT_ARM_NEON - uint64_t features = WebRtc_GetCPUFeaturesARM(); - if ((features & kCPUFeatureNEON) != 0) - { - WebRtcNsx_InitNeon(); - } + uint64_t features = WebRtc_GetCPUFeaturesARM(); + if ((features & kCPUFeatureNEON) != 0) { + WebRtcNsx_InitNeon(); + } #elif defined(WEBRTC_ARCH_ARM_NEON) - WebRtcNsx_InitNeon(); + WebRtcNsx_InitNeon(); #endif inst->initFlag = 1; @@ -1606,7 +1592,7 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, uint16_t* magnU right_shifts_in_magnU16 = WEBRTC_SPL_MAX(right_shifts_in_magnU16, 0); // create realImag as winData interleaved with zeros (= imag. part), normalize it - WebRtcNsx_CreateComplexBuffer(inst, winData, realImag); + NormalizeRealBuffer(inst, winData, realImag); // FFT output will be in winData[]. WebRtcSpl_RealForwardFFT(inst->real_fft, realImag, winData); @@ -1838,8 +1824,7 @@ void WebRtcNsx_DataSynthesis(NsxInst_t* inst, short* outFrame) { // Inverse FFT output will be in rfft_out[]. outCIFFT = WebRtcSpl_RealInverseFFT(inst->real_fft, realImag, rfft_out); - // Denormalize. 
- WebRtcNsx_Denormalize(inst, rfft_out, outCIFFT); + Denormalize(inst, rfft_out, outCIFFT); //scale factor: only do it after END_STARTUP_LONG time gainFactor = 8192; // 8192 = Q13(1.0) diff --git a/webrtc/modules/audio_processing/ns/nsx_core.h b/webrtc/modules/audio_processing/ns/nsx_core.h index f1cf43cbc8..1ad369ffbe 100644 --- a/webrtc/modules/audio_processing/ns/nsx_core.h +++ b/webrtc/modules/audio_processing/ns/nsx_core.h @@ -201,19 +201,6 @@ typedef void (*AnalysisUpdate)(NsxInst_t* inst, int16_t* new_speech); extern AnalysisUpdate WebRtcNsx_AnalysisUpdate; -// Denormalize the input buffer. -typedef void (*Denormalize)(NsxInst_t* inst, - int16_t* in, - int factor); -extern Denormalize WebRtcNsx_Denormalize; - -// Create a complex number buffer, as the intput interleaved with zeros, -// and normalize it. -typedef void (*CreateComplexBuffer)(NsxInst_t* inst, - int16_t* in, - int16_t* out); -extern CreateComplexBuffer WebRtcNsx_CreateComplexBuffer; - #if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON) // For the above function pointers, functions for generic platforms are declared // and defined as static in file nsx_core.c, while those for ARM Neon platforms @@ -222,16 +209,12 @@ void WebRtcNsx_NoiseEstimationNeon(NsxInst_t* inst, uint16_t* magn, uint32_t* noise, int16_t* q_noise); -void WebRtcNsx_CreateComplexBufferNeon(NsxInst_t* inst, - int16_t* in, - int16_t* out); void WebRtcNsx_SynthesisUpdateNeon(NsxInst_t* inst, int16_t* out_frame, int16_t gain_factor); void WebRtcNsx_AnalysisUpdateNeon(NsxInst_t* inst, int16_t* out, int16_t* new_speech); -void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor); void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buff); #endif diff --git a/webrtc/modules/audio_processing/ns/nsx_core_neon.S b/webrtc/modules/audio_processing/ns/nsx_core_neon.S index a0d4a2cdf8..7269b2820e 100644 --- a/webrtc/modules/audio_processing/ns/nsx_core_neon.S +++ 
b/webrtc/modules/audio_processing/ns/nsx_core_neon.S @@ -20,8 +20,6 @@ GLOBAL_FUNCTION WebRtcNsx_NoiseEstimationNeon GLOBAL_FUNCTION WebRtcNsx_PrepareSpectrumNeon GLOBAL_FUNCTION WebRtcNsx_SynthesisUpdateNeon GLOBAL_FUNCTION WebRtcNsx_AnalysisUpdateNeon -GLOBAL_FUNCTION WebRtcNsx_DenormalizeNeon -GLOBAL_FUNCTION WebRtcNsx_CreateComplexBufferNeon GLOBAL_LABEL WebRtcNsx_kLogTable GLOBAL_LABEL WebRtcNsx_kCounterDiv GLOBAL_LABEL WebRtcNsx_kLogTableFrac @@ -426,6 +424,7 @@ POST_LOOP_MAGNLEN: pop {r4, r5, r6, pc} +@ TODO(kma): Remove copying to 2nd half of freq_buf, for real FFT interface. @ void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf); .align 2 DEFINE_FUNCTION WebRtcNsx_PrepareSpectrumNeon @@ -542,35 +541,6 @@ LOOP_ANALEN2: pop {r4-r9} bx r14 -@ void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor); -.align 2 -DEFINE_FUNCTION WebRtcNsx_DenormalizeNeon - movw r12, #offset_nsx_normData - movw r3, #offset_nsx_real - ldr r12, [r0, r12] @ inst->normData - add r3, r0 @ &inst->real[0] - sub r2, r12 - vdup.32 q10, r2 - - movw r2, #offset_nsx_anaLen - ldrsh r2, [r0, r2] @ inst->anaLen - add r0, r3, r2, lsl #1 @ &inst->real[inst->anaLen] - -LOOP_ANALEN: - vld2.16 {d0, d1}, [r1]! @ &in[] - vld2.16 {d2, d3}, [r1]! @ &in[] - vmovl.s16 q2, d0 - vmovl.s16 q3, d2 - vshl.s32 q2, q10 - vshl.s32 q3, q10 - vqmovn.s32 d0, q2 - vqmovn.s32 d1, q3 - vst1.16 {d0, d1}, [r3]! 
@ inst->real[] - cmp r3, r0 - blt LOOP_ANALEN - - bx r14 - @ void SynthesisUpdateNeon(NsxInst_t* inst, @ int16_t* out_frame, @ int16_t gain_factor); @@ -704,33 +674,3 @@ LOOP_WINDOW_DATA: POST_LOOP_WINDOW_DATA: pop {r4-r6} bx r14 - -@ void CreateComplexBufferNeon(NsxInst_t* inst, int16_t* in, int16_t* out); -.align 2 -DEFINE_FUNCTION WebRtcNsx_CreateComplexBufferNeon - movw r3, #offset_nsx_anaLen - movw r12, #offset_nsx_normData - ldrsh r3, [r0, r3] @ inst->anaLen - ldr r12, [r0, r12] @ inst->normData - add r3, r1, r3, lsl #1 @ &in[inst->anaLen] - - vmov.i16 d7, #0 @ For writing to imaginary parts. - vmov.i16 d5, #0 @ For writing to imaginary parts. - vdup.i16 q10, r12 - -LOOP_CREATE_COMPLEX_BUFFER: @ Unrolled by 16. - vld1.16 {d0, d1, d2, d3}, [r1]! @ in[] - cmp r1, r3 - vshl.s16 q0, q10 - vshl.s16 q1, q10 - vmov d4, d1 - vmov d1, d5 - vmov d6, d3 - vmov d3, d7 - vst2.16 {d0, d1}, [r2]! - vst2.16 {d4, d5}, [r2]! - vst2.16 {d2, d3}, [r2]! - vst2.16 {d6, d7}, [r2]! - blt LOOP_CREATE_COMPLEX_BUFFER - - bx r14