From 8dd7466b5279bf4dd302eef2c7d2408317cb8eb8 Mon Sep 17 00:00:00 2001
From: "kma@google.com" <kma@google.com@4adac7df-926f-26a2-2b94-8c16560cd09d>
Date: Tue, 16 Aug 2011 03:28:28 +0000
Subject: [PATCH] 2nd check in Review URL:
 http://webrtc-codereview.appspot.com/112002

git-svn-id: http://webrtc.googlecode.com/svn/trunk@372 4adac7df-926f-26a2-2b94-8c16560cd09d
---
 .../aecm/main/source/aecm_core.c              | 522 +++++++++---------
 .../aecm/main/source/aecm_core.h              |  28 +-
 .../aecm/main/source/aecm_core_neon.c         | 243 +++++---
 3 files changed, 472 insertions(+), 321 deletions(-)

diff --git a/src/modules/audio_processing/aecm/main/source/aecm_core.c b/src/modules/audio_processing/aecm/main/source/aecm_core.c
index f3e8a83624..cc49e91d6a 100644
--- a/src/modules/audio_processing/aecm/main/source/aecm_core.c
+++ b/src/modules/audio_processing/aecm/main/source/aecm_core.c
@@ -28,6 +28,14 @@ FILE *dfile;
 FILE *testfile;
 #endif
 
+#ifdef _MSC_VER // visual c++
+#define ALIGN8_BEG __declspec(align(8))
+#define ALIGN8_END
+#else // gcc or icc
+#define ALIGN8_BEG
+#define ALIGN8_END __attribute__((aligned(8)))
+#endif
+
 #ifdef AECM_SHORT
 
 // Square root of Hanning window in Q14
@@ -43,7 +51,7 @@ const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] =
 #else
 
 // Square root of Hanning window in Q14
-const WebRtc_Word16 WebRtcAecm_kSqrtHanning[] =
+const ALIGN8_BEG WebRtc_Word16 WebRtcAecm_kSqrtHanning[] ALIGN8_END =
 {
     0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
     3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224, 6591, 6954, 7313, 7668, 8019, 8364,
@@ -97,12 +105,13 @@ static const WebRtc_Word16 kChannelStored16kHz[PART_LEN1] = {
 static const WebRtc_Word16 kNoiseEstQDomain = 15;
 static const WebRtc_Word16 kNoiseEstIncCount = 5;
 
-static void ComfortNoise(AecmCore_t * aecm,
+static void ComfortNoise(AecmCore_t* aecm,
                          const WebRtc_UWord16* dfa,
-                         WebRtc_Word16* outReal,
-                         WebRtc_Word16* outImag,
+                         complex16_t* out,
                          const WebRtc_Word16* lambda);
 
+static WebRtc_Word16 CalcSuppressionGain(AecmCore_t * const aecm);
+
 #ifdef ARM_WINM_LOG
 HANDLE logFile = NULL;
 #endif
@@ -151,10 +160,11 @@ int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
         return -1;
     }
 
-    // Init some aecm pointers. 16-byte alignment is only necessary for Neon code currently.
-    aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 15) & ~ 15);
-    aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 15) & ~ 15);
-    aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 15) & ~ 15);
+    // Init some aecm pointers. 16 and 32 byte alignment is only necessary
+    // for Neon code currently.
+    aecm->xBuf = (WebRtc_Word16*) (((uintptr_t)aecm->xBuf_buf + 31) & ~ 31);
+    aecm->dBufClean = (WebRtc_Word16*) (((uintptr_t)aecm->dBufClean_buf + 31) & ~ 31);
+    aecm->dBufNoisy = (WebRtc_Word16*) (((uintptr_t)aecm->dBufNoisy_buf + 31) & ~ 31);
     aecm->outBuf = (WebRtc_Word16*) (((uintptr_t)aecm->outBuf_buf + 15) & ~ 15);
     aecm->channelStored = (WebRtc_Word16*) (((uintptr_t)
                                              aecm->channelStored_buf + 15) & ~ 15);
@@ -345,7 +355,9 @@ int WebRtcAecm_ProcessFrame(AecmCore_t * aecm,
     WebRtc_Word16 farBlock[PART_LEN];
     WebRtc_Word16 nearNoisyBlock[PART_LEN];
     WebRtc_Word16 nearCleanBlock[PART_LEN];
-    WebRtc_Word16 outBlock[PART_LEN];
+    WebRtc_Word16 outBlock_buf[PART_LEN + 8]; // Align buffer to 8-byte boundary.
+    WebRtc_Word16* outBlock = (WebRtc_Word16*) (((uintptr_t) outBlock_buf + 15) & ~ 15);
+
     WebRtc_Word16 farFrame[FRAME_LEN];
     int size = 0;
 
@@ -892,7 +904,7 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm,
     // END: Determine if we should store or reset channel estimate.
 }
 
-// WebRtcAecm_CalcSuppressionGain(...)
+// CalcSuppressionGain(...)
 //
 // This function calculates the suppression gain that is used in the Wiener filter.
 //
@@ -902,7 +914,7 @@ void WebRtcAecm_UpdateChannel(AecmCore_t * aecm,
 //                          level (Q14).
 //
 //
-WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t* aecm)
+static WebRtc_Word16 CalcSuppressionGain(AecmCore_t * const aecm)
 {
     WebRtc_Word32 tmp32no1;
 
@@ -985,8 +997,7 @@ WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t* aecm)
 // return value                 The Q-domain of current frequency values
 //
 static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
-                                 WebRtc_Word16* freq_signal_real,
-                                 WebRtc_Word16* freq_signal_imag,
+                                 complex16_t* freq_signal,
                                  WebRtc_UWord16* freq_signal_abs,
                                  WebRtc_UWord32* freq_signal_sum_abs)
 {
@@ -998,9 +1009,9 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
     WebRtc_Word32 tmp32no1;
     WebRtc_Word32 tmp32no2;
 
-    // In fft_buf, +8 for 16-byte alignment, and +2 to make some loops safe.
-    WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 8];
-    WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 15) & ~15);
+    // In fft_buf, +16 for 32-byte alignment.
+    WebRtc_Word16 fft_buf[PART_LEN4 + 16];
+    WebRtc_Word16 *fft = (WebRtc_Word16 *) (((uintptr_t) fft_buf + 31) & ~31);
 
     WebRtc_Word16 tmp16no1;
     WebRtc_Word16 tmp16no2;
@@ -1016,46 +1027,30 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
     time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
 #endif
 
-    WebRtcAecm_PrepareFft(fft, time_signal, time_signal_scaling);
-
-    // Fourier transformation of time domain signal.
-    // The result is scaled with 1/PART_LEN2, that is, the result is in Q(-6)
-    // for PART_LEN = 32
-
-    WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
-    ret = WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
-
-    // Take only the first PART_LEN2 samples
-    for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
-    {
-        freq_signal_real[i] = fft[j];
-
-        // The imaginary part has to switch sign
-        freq_signal_imag[i] = - fft[j+1];
-    }
+    WebRtcAecm_WindowAndFFT(fft, time_signal, freq_signal, time_signal_scaling);
 
     // Extract imaginary and real part, calculate the magnitude for all frequency bins
-    freq_signal_imag[0] = 0;
-    freq_signal_imag[PART_LEN] = 0;
-    freq_signal_real[PART_LEN] = fft[PART_LEN2];
+    freq_signal[0].imag = 0;
+    freq_signal[PART_LEN].imag = 0;
+    freq_signal[PART_LEN].real = fft[PART_LEN2];
     freq_signal_abs[0] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16(
-        freq_signal_real[0]);
+        freq_signal[0].real);
     freq_signal_abs[PART_LEN] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16(
-        freq_signal_real[PART_LEN]);
+        freq_signal[PART_LEN].real);
     (*freq_signal_sum_abs) = (WebRtc_UWord32)(freq_signal_abs[0]) +
         (WebRtc_UWord32)(freq_signal_abs[PART_LEN]);
 
     for (i = 1; i < PART_LEN; i++)
     {
-        if (freq_signal_real[i] == 0)
+        if (freq_signal[i].real == 0)
         {
             freq_signal_abs[i] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16(
-                freq_signal_imag[i]);
+                freq_signal[i].imag);
         }
-        else if (freq_signal_imag[i] == 0)
+        else if (freq_signal[i].imag == 0)
         {
             freq_signal_abs[i] = (WebRtc_UWord16)WEBRTC_SPL_ABS_W16(
-                freq_signal_real[i]);
+                freq_signal[i].real);
         }
         else
         {
@@ -1066,8 +1061,8 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
             // The parameters alpha and beta are stored in Q15
 
 #ifdef AECM_WITH_ABS_APPROX
-            tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal_real[i]);
-            tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal_imag[i]);
+            tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real);
+            tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag);
 
             if(tmp16no1 > tmp16no2)
             {
@@ -1103,13 +1098,13 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
                 (WebRtc_UWord16)tmp16no2;
 #else
 #ifdef WEBRTC_ARCH_ARM_V7A
-           __asm__("smulbb %0, %1, %2" : "=r"(tmp32no1) : "r"(freq_signal_real[i]),
-                                                "r"(freq_signal_real[i]));
-           __asm__("smlabb %0, %1, %2, %3" :: "r"(tmp32no2), "r"(freq_signal_imag[i]), 
-                                                "r"(freq_signal_imag[i]), "r"(tmp32no1));
+           __asm__("smulbb %0, %1, %2" : "=r"(tmp32no1) : "r"(freq_signal[i].real),
+                                                "r"(freq_signal[i].real));
+           __asm__("smlabb %0, %1, %2, %3" :: "r"(tmp32no2), "r"(freq_signal[i].imag), 
+                                                "r"(freq_signal[i].imag), "r"(tmp32no1));
 #else
-            tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal_real[i]);
-            tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal_imag[i]);
+            tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real);
+            tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag);
             tmp32no1 = WEBRTC_SPL_MUL_16_16(tmp16no1, tmp16no1);
             tmp32no2 = WEBRTC_SPL_MUL_16_16(tmp16no2, tmp16no2);
             tmp32no2 = WEBRTC_SPL_ADD_SAT_W32(tmp32no1, tmp32no2);
@@ -1125,7 +1120,8 @@ static int TimeToFrequencyDomain(const WebRtc_Word16* time_signal,
     return time_signal_scaling;
 }
 
-int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
+int WebRtcAecm_ProcessBlock(AecmCore_t * aecm,
+                            const WebRtc_Word16 * farend,
                             const WebRtc_Word16 * nearendNoisy,
                             const WebRtc_Word16 * nearendClean,
                             WebRtc_Word16 * output)
@@ -1140,10 +1136,6 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
 
     WebRtc_Word32 tmp32no1;
 
-    // +8 for 32-byte alignment.
-    WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8];
-    WebRtc_Word32 *echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31);
-
     WebRtc_UWord16 xfa[PART_LEN1];
     WebRtc_UWord16 dfaNoisy[PART_LEN1];
     WebRtc_UWord16 dfaClean[PART_LEN1];
@@ -1151,11 +1143,18 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
     const WebRtc_UWord16* far_spectrum_ptr = NULL;
     int outCFFT;
 
-    WebRtc_Word16 fft[PART_LEN4];
-    WebRtc_Word16 dfwReal[PART_LEN1];
-    WebRtc_Word16 dfwImag[PART_LEN1];
-    WebRtc_Word16 efwReal[PART_LEN1];
-    WebRtc_Word16 efwImag[PART_LEN1];
+    // 32 byte aligned buffers (with +8 or +16).
+    // TODO (kma): define fft with complex16_t.
+    WebRtc_Word16 fft_buf[PART_LEN4 + 2 + 16]; // +2 to make a loop safe.
+    WebRtc_Word32 echoEst32_buf[PART_LEN1 + 8];
+    WebRtc_Word32 dfw_buf[PART_LEN1 + 8];
+    WebRtc_Word32 efw_buf[PART_LEN1 + 8];
+
+    WebRtc_Word16* fft = (WebRtc_Word16*) (((uintptr_t) fft_buf + 31) & ~ 31);
+    WebRtc_Word32* echoEst32 = (WebRtc_Word32*) (((uintptr_t) echoEst32_buf + 31) & ~ 31);
+    complex16_t* dfw = (complex16_t*) (((uintptr_t) dfw_buf + 31) & ~ 31);
+    complex16_t* efw = (complex16_t*) (((uintptr_t) efw_buf + 31) & ~ 31);
+
     WebRtc_Word16 hnl[PART_LEN1];
     WebRtc_Word16 numPosCoef = 0;
     WebRtc_Word16 nlpGain = ONE_Q14;
@@ -1206,15 +1205,13 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
 
     // Transform far end signal from time domain to frequency domain.
     zerosXBuf = TimeToFrequencyDomain(aecm->xBuf,
-                                      dfwReal,
-                                      dfwImag,
+                                      dfw,
                                       xfa,
                                       &xfaSum);
 
     // Transform noisy near end signal from time domain to frequency domain.
     zerosDBufNoisy = TimeToFrequencyDomain(aecm->dBufNoisy,
-                                           dfwReal,
-                                           dfwImag,
+                                           dfw,
                                            dfaNoisy,
                                            &dfaNoisySum);
     aecm->dfaNoisyQDomainOld = aecm->dfaNoisyQDomain;
@@ -1231,8 +1228,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
     {
         // Transform clean near end signal from time domain to frequency domain.
         zerosDBufClean = TimeToFrequencyDomain(aecm->dBufClean,
-                                               dfwReal,
-                                               dfwImag,
+                                               dfw,
                                                dfaClean,
                                                &dfaCleanSum);
         aecm->dfaCleanQDomainOld = aecm->dfaCleanQDomain;
@@ -1300,7 +1296,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
     // This is the channel estimation algorithm.
     // It is base on NLMS but has a variable step length, which was calculated above.
     WebRtcAecm_UpdateChannel(aecm, far_spectrum_ptr, zerosXBuf, dfaNoisy, mu, echoEst32);
-    supGain = WebRtcAecm_CalcSuppressionGain(aecm);
+    supGain = CalcSuppressionGain(aecm);
 
 #ifdef ARM_WINM_LOG_
     // measure tick end
@@ -1483,9 +1479,9 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
             }
 
             // multiply with Wiener coefficients
-            efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i],
+            efw[i].real = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real,
                                                                             hnl[i], 14));
-            efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i],
+            efw[i].imag = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag,
                                                                             hnl[i], 14));
         }
     }
@@ -1494,16 +1490,16 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
         // multiply with Wiener coefficients
         for (i = 0; i < PART_LEN1; i++)
         {
-            efwReal[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwReal[i],
+            efw[i].real = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real,
                                                                            hnl[i], 14));
-            efwImag[i] = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfwImag[i],
+            efw[i].imag = (WebRtc_Word16)(WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag,
                                                                            hnl[i], 14));
         }
     }
 
     if (aecm->cngMode == AecmTrue)
     {
-        ComfortNoise(aecm, ptrDfaClean, efwReal, efwImag, hnl);
+        ComfortNoise(aecm, ptrDfaClean, efw, hnl);
     }
 
 #ifdef ARM_WINM_LOG_
@@ -1516,177 +1512,11 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm, const WebRtc_Word16 * farend,
     QueryPerformanceCounter((LARGE_INTEGER*)&start);
 #endif
 
-    // Synthesis
-    for (i = 1; i < PART_LEN; i++)
-    {
-        j = WEBRTC_SPL_LSHIFT_W32(i, 1);
-        fft[j] = efwReal[i];
-
-        // mirrored data, even
-        fft[PART_LEN4 - j] = efwReal[i];
-        fft[j + 1] = -efwImag[i];
-
-        //mirrored data, odd
-        fft[PART_LEN4 - (j - 1)] = efwImag[i];
-    }
-    fft[0] = efwReal[0];
-    fft[1] = -efwImag[0];
-
-    fft[PART_LEN2] = efwReal[PART_LEN];
-    fft[PART_LEN2 + 1] = -efwImag[PART_LEN];
-
-    // inverse FFT, result should be scaled with outCFFT
-    WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
-    outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
-
-    //take only the real values and scale with outCFFT
-    for (i = 0; i < PART_LEN2; i++)
-    {
-        j = WEBRTC_SPL_LSHIFT_W32(i, 1);
-        fft[i] = fft[j];
-    }
-
-    for (i = 0; i < PART_LEN; i++)
-    {
-        fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
-                fft[i],
-                WebRtcAecm_kSqrtHanning[i],
-                14);
-        tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
-                outCFFT - aecm->dfaCleanQDomain);
-        fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
-                tmp32no1 + aecm->outBuf[i],
-                WEBRTC_SPL_WORD16_MIN);
-        output[i] = fft[i];
-
-        tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
-                fft[PART_LEN + i],
-                WebRtcAecm_kSqrtHanning[PART_LEN - i],
-                14);
-        tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
-                outCFFT - aecm->dfaCleanQDomain);
-        aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
-                WEBRTC_SPL_WORD16_MAX,
-                tmp32no1,
-                WEBRTC_SPL_WORD16_MIN);
-    }
-
-#ifdef ARM_WINM_LOG_
-    // measure tick end
-    QueryPerformanceCounter((LARGE_INTEGER*)&end);
-    diff__ = ((end - start) * 1000) / (freq/1000);
-    milliseconds = (unsigned int)(diff__ & 0xffffffff);
-    WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
-#endif
-    // Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
-    memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
-    memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
-    if (nearendClean != NULL)
-    {
-        memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
-    }
+    WebRtcAecm_InverseFFTAndWindow(aecm, fft, efw, output, nearendClean);
 
     return 0;
 }
 
-#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
-
-void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
-                           const WebRtc_Word16* time_signal,
-                           int time_signal_scaling)
-{
-    int i, j;
-
-    memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
-    // FFT of signal
-    for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
-    {
-        // Window time domain signal and insert into real part of
-        // transformation array |fft|
-        fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
-            (time_signal[i] << time_signal_scaling),
-            WebRtcAecm_kSqrtHanning[i],
-            14);
-        fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
-            (time_signal[PART_LEN + i] << time_signal_scaling),
-            WebRtcAecm_kSqrtHanning[PART_LEN - i],
-            14);
-        // Inserting zeros in imaginary parts not necessary since we
-        // initialized the array with all zeros
-    }
-}
-
-void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
-                                   const WebRtc_UWord16* far_spectrum,
-                                   WebRtc_Word32* echo_est,
-                                   WebRtc_UWord32* far_energy,
-                                   WebRtc_UWord32* echo_energy_adapt,
-                                   WebRtc_UWord32* echo_energy_stored)
-{
-    int i;
-
-    // Get energy for the delayed far end signal and estimated
-    // echo using both stored and adapted channels.
-    for (i = 0; i < PART_LEN1; i++)
-    {
-        echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
-                                           far_spectrum[i]);
-        (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
-        (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
-                                          far_spectrum[i]);
-        (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
-    }
-}
-
-void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
-                                     const WebRtc_UWord16* far_spectrum,
-                                     WebRtc_Word32* echo_est)
-{
-    int i;
-
-    // During startup we store the channel every block.
-    memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
-    // Recalculate echo estimate
-    for (i = 0; i < PART_LEN; i += 4)
-    {
-        echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
-                                           far_spectrum[i]);
-        echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
-                                           far_spectrum[i + 1]);
-        echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
-                                           far_spectrum[i + 2]);
-        echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
-                                           far_spectrum[i + 3]);
-    }
-    echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
-                                       far_spectrum[i]);
-}
-
-void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
-{
-    int i;
-
-    // The stored channel has a significantly lower MSE than the adaptive one for
-    // two consecutive calculations. Reset the adaptive channel.
-    memcpy(aecm->channelAdapt16, aecm->channelStored,
-           sizeof(WebRtc_Word16) * PART_LEN1);
-    // Restore the W32 channel
-    for (i = 0; i < PART_LEN; i += 4)
-    {
-        aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
-                (WebRtc_Word32)aecm->channelStored[i], 16);
-        aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
-                (WebRtc_Word32)aecm->channelStored[i + 1], 16);
-        aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
-                (WebRtc_Word32)aecm->channelStored[i + 2], 16);
-        aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
-                (WebRtc_Word32)aecm->channelStored[i + 3], 16);
-    }
-    aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
-}
-
-#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
-
 
 // Generate comfort noise and add to output signal.
 //
@@ -1696,10 +1526,9 @@ void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
 // \param[in,out] outImag Imaginary part of the output signal (Q[aecm->dfaQDomain]).
 // \param[in]     lambda  Suppression gain with which to scale the noise level (Q14).
 //
-static void ComfortNoise(AecmCore_t * aecm,
+static void ComfortNoise(AecmCore_t* aecm,
                          const WebRtc_UWord16* dfa,
-                         WebRtc_Word16* outReal,
-                         WebRtc_Word16* outImag,
+                         complex16_t* out,
                          const WebRtc_Word16* lambda)
 {
     WebRtc_Word16 i;
@@ -1827,22 +1656,22 @@ static void ComfortNoise(AecmCore_t * aecm,
 #if (!defined ARM_WINM) && (!defined ARM9E_GCC) && (!defined ANDROID_AECOPT)
     for (i = 0; i < PART_LEN1; i++)
     {
-        outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]);
-        outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]);
+        out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]);
+        out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]);
     }
 #else
     for (i = 0; i < PART_LEN1 -1; )
     {
-        outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]);
-        outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]);
+        out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]);
+        out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]);
         i++;
 
-        outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]);
-        outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]);
+        out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]);
+        out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]);
         i++;
     }
-    outReal[i] = WEBRTC_SPL_ADD_SAT_W16(outReal[i], uReal[i]);
-    outImag[i] = WEBRTC_SPL_ADD_SAT_W16(outImag[i], uImag[i]);
+    out[i].real = WEBRTC_SPL_ADD_SAT_W16(out[i].real, uReal[i]);
+    out[i].imag = WEBRTC_SPL_ADD_SAT_W16(out[i].imag, uImag[i]);
 #endif
 }
 
@@ -1906,3 +1735,196 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far
            sizeof(WebRtc_Word16) * readLen);
     aecm->farBufReadPos += readLen;
 }
+
+#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
+
+void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
+                    const WebRtc_Word16* time_signal,
+                    complex16_t* freq_signal,
+                    int time_signal_scaling)
+{
+    int i, j;
+
+    memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4);
+    // FFT of signal
+    for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
+    {
+        // Window time domain signal and insert into real part of
+        // transformation array |fft|
+        fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
+            (time_signal[i] << time_signal_scaling),
+            WebRtcAecm_kSqrtHanning[i],
+            14);
+        fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
+            (time_signal[i + PART_LEN] << time_signal_scaling),
+            WebRtcAecm_kSqrtHanning[PART_LEN - i],
+            14);
+        // Inserting zeros in imaginary parts not necessary since we
+        // initialized the array with all zeros
+    }
+
+    WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
+    WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
+
+    // Take only the first PART_LEN2 samples
+    for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2)
+    {
+        freq_signal[i].real = fft[j];
+
+        // The imaginary part has to switch sign
+        freq_signal[i].imag = - fft[j+1];
+    }
+}
+
+void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
+                        WebRtc_Word16* fft,
+                        complex16_t* efw,
+                        WebRtc_Word16* output,
+                        const WebRtc_Word16* nearendClean)
+{
+    int i, j, outCFFT;
+    WebRtc_Word32 tmp32no1;
+
+    // Synthesis
+    for (i = 1; i < PART_LEN; i++)
+    {
+        j = WEBRTC_SPL_LSHIFT_W32(i, 1);
+        fft[j] = efw[i].real;
+
+        // mirrored data, even
+        fft[PART_LEN4 - j] = efw[i].real;
+        fft[j + 1] = -efw[i].imag;
+
+        //mirrored data, odd
+        fft[PART_LEN4 - (j - 1)] = efw[i].imag;
+    }
+    fft[0] = efw[0].real;
+    fft[1] = -efw[0].imag;
+
+    fft[PART_LEN2] = efw[PART_LEN].real;
+    fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
+
+    // inverse FFT, result should be scaled with outCFFT
+    WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
+    outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
+
+    //take only the real values and scale with outCFFT
+    for (i = 0; i < PART_LEN2; i++)
+    {
+        j = WEBRTC_SPL_LSHIFT_W32(i, 1);
+        fft[i] = fft[j];
+    }
+
+    for (i = 0; i < PART_LEN; i++)
+    {
+        fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
+                fft[i],
+                WebRtcAecm_kSqrtHanning[i],
+                14);
+        tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
+                outCFFT - aecm->dfaCleanQDomain);
+        fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
+                tmp32no1 + aecm->outBuf[i],
+                WEBRTC_SPL_WORD16_MIN);
+        output[i] = fft[i];
+
+        tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
+                fft[PART_LEN + i],
+                WebRtcAecm_kSqrtHanning[PART_LEN - i],
+                14);
+        tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
+                outCFFT - aecm->dfaCleanQDomain);
+        aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
+                WEBRTC_SPL_WORD16_MAX,
+                tmp32no1,
+                WEBRTC_SPL_WORD16_MIN);
+    }
+
+#ifdef ARM_WINM_LOG_
+    // measure tick end
+    QueryPerformanceCounter((LARGE_INTEGER*)&end);
+    diff__ = ((end - start) * 1000) / (freq/1000);
+    milliseconds = (unsigned int)(diff__ & 0xffffffff);
+    WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL);
+#endif
+
+    // Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
+    memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
+    memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
+    if (nearendClean != NULL)
+    {
+        memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN);
+    }
+}
+
+void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
+                                   const WebRtc_UWord16* far_spectrum,
+                                   WebRtc_Word32* echo_est,
+                                   WebRtc_UWord32* far_energy,
+                                   WebRtc_UWord32* echo_energy_adapt,
+                                   WebRtc_UWord32* echo_energy_stored)
+{
+    int i;
+
+    // Get energy for the delayed far end signal and estimated
+    // echo using both stored and adapted channels.
+    for (i = 0; i < PART_LEN1; i++)
+    {
+        echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
+                                           far_spectrum[i]);
+        (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]);
+        (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i],
+                                          far_spectrum[i]);
+        (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i];
+    }
+}
+
+void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
+                                     const WebRtc_UWord16* far_spectrum,
+                                     WebRtc_Word32* echo_est)
+{
+    int i;
+
+    // During startup we store the channel every block.
+    memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1);
+    // Recalculate echo estimate
+    for (i = 0; i < PART_LEN; i += 4)
+    {
+        echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
+                                           far_spectrum[i]);
+        echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
+                                           far_spectrum[i + 1]);
+        echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
+                                           far_spectrum[i + 2]);
+        echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
+                                           far_spectrum[i + 3]);
+    }
+    echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
+                                       far_spectrum[i]);
+}
+
+void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
+{
+    int i;
+
+    // The stored channel has a significantly lower MSE than the adaptive one for
+    // two consecutive calculations. Reset the adaptive channel.
+    memcpy(aecm->channelAdapt16, aecm->channelStored,
+           sizeof(WebRtc_Word16) * PART_LEN1);
+    // Restore the W32 channel
+    for (i = 0; i < PART_LEN; i += 4)
+    {
+        aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
+                (WebRtc_Word32)aecm->channelStored[i], 16);
+        aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32(
+                (WebRtc_Word32)aecm->channelStored[i + 1], 16);
+        aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32(
+                (WebRtc_Word32)aecm->channelStored[i + 2], 16);
+        aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32(
+                (WebRtc_Word32)aecm->channelStored[i + 3], 16);
+    }
+    aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16);
+}
+
+#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
+
diff --git a/src/modules/audio_processing/aecm/main/source/aecm_core.h b/src/modules/audio_processing/aecm/main/source/aecm_core.h
index 1050dee16b..e431c71af0 100644
--- a/src/modules/audio_processing/aecm/main/source/aecm_core.h
+++ b/src/modules/audio_processing/aecm/main/source/aecm_core.h
@@ -99,6 +99,11 @@
 
 extern const WebRtc_Word16 WebRtcAecm_kSqrtHanning[];
 
+typedef struct {
+    WebRtc_Word16 real;
+    WebRtc_Word16 imag;
+} complex16_t;
+
 typedef struct
 {
     int farBufWritePos;
@@ -142,9 +147,9 @@ typedef struct
     WebRtc_Word16 channelStored_buf[PART_LEN1 + 8];
     WebRtc_Word16 channelAdapt16_buf[PART_LEN1 + 8];
     WebRtc_Word32 channelAdapt32_buf[PART_LEN1 + 8];
-    WebRtc_Word16 xBuf_buf[PART_LEN2 + 8]; // farend
-    WebRtc_Word16 dBufClean_buf[PART_LEN2 + 8]; // nearend
-    WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 8]; // nearend
+    WebRtc_Word16 xBuf_buf[PART_LEN2 + 16]; // farend
+    WebRtc_Word16 dBufClean_buf[PART_LEN2 + 16]; // nearend
+    WebRtc_Word16 dBufNoisy_buf[PART_LEN2 + 16]; // nearend
     WebRtc_Word16 outBuf_buf[PART_LEN + 8];
 
     // Pointers to the above buffers
@@ -326,9 +331,7 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far
 // Some internal functions shared by ARM NEON and generic C code:
 //
 
-WebRtc_Word16 WebRtcAecm_CalcSuppressionGain(AecmCore_t * aecm);
-
-void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
+void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
                                    const WebRtc_UWord16* far_spectrum,
                                    WebRtc_Word32* echoEst,
                                    WebRtc_UWord32* far_energy,
@@ -341,8 +344,15 @@ void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
 
 void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm);
 
-void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
-                           const WebRtc_Word16* time_signal,
-                           int time_signal_scaling);
+void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
+                             const WebRtc_Word16* time_signal,
+                             complex16_t* freq_signal,
+                             int time_signal_scaling);
+
+void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
+                                    WebRtc_Word16* fft,
+                                    complex16_t* efw,
+                                    WebRtc_Word16* output,
+                                    const WebRtc_Word16* nearendClean);
 
 #endif
diff --git a/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c b/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c
index cfac49ad67..86ced1ed3b 100644
--- a/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c
+++ b/src/modules/audio_processing/aecm/main/source/aecm_core_neon.c
@@ -13,14 +13,9 @@
 
 #include <arm_neon.h>
 #include <assert.h>
-#include <stdlib.h>
 
-#include "aecm_delay_estimator.h"
-#include "echo_control_mobile.h"
-#include "ring_buffer.h"
-#include "typedefs.h"
 
-// Square root of Hanning window in Q14
+// Square root of Hanning window in Q14.
 static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = {       
      16384, 16373, 16354, 16325,  
      16286, 16237, 16179, 16111,  
@@ -40,9 +35,172 @@ static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8)))
      1594,  1196,  798,   399
 };
 
-void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
+void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft,
+                             const WebRtc_Word16* time_signal,
+                             complex16_t* freq_signal,
+                             int time_signal_scaling)
+{
+    int i, j;
+
+    int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
+    __asm__("vmov.i16 d21, #0" ::: "d21");
+
+    for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
+    {
+        int16x4_t tmp16x4_0;
+        int16x4_t tmp16x4_1;
+        int32x4_t tmp32x4_0;
+
+        /* Window near end */
+        // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
+        //       << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
+        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
+        tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
+
+        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
+        tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
+
+        __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
+        __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
+
+        // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
+        //      (time_signal[PART_LEN + i] << time_signal_scaling),
+        //       WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
+        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN]));
+        tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
+
+        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
+        tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
+
+        __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
+        __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
+    }
+
+    WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
+    WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
+
+    // Take only the first PART_LEN2 samples, and switch the sign of the imaginary part.
+    for(i = 0, j = 0; j < PART_LEN2; i += 8, j += 16)
+    {
+        __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
+        __asm__("vneg.s16 d22, d22" : : : "q10");
+        __asm__("vneg.s16 d23, d23" : : : "q11");
+        __asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : : 
+            "r"(&freq_signal[i].real): "q10", "q11");
+    }
+}
+
+void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm,
+                        WebRtc_Word16* fft,
+                        complex16_t* efw,
+                        WebRtc_Word16* output,
+                        const WebRtc_Word16* nearendClean)
+{
+    int i, j, outCFFT;
+    WebRtc_Word32 tmp32no1;
+
+    // Synthesis
+    for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8)
+    {
+        // We overwrite two more elements in fft[], but it's ok.
+        __asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10");
+        __asm__("vmov q11, q10" : : : "q10", "q11");
+
+        __asm__("vneg.s16 d23, d23" : : : "q11");
+        __asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11");
+
+        __asm__("vrev64.16 q10, q10" : : : "q10");
+        __asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10");
+    }
+
+    fft[PART_LEN2] = efw[PART_LEN].real;
+    fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
+
+    // Inverse FFT, result should be scaled with outCFFT.
+    WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT);
+    outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
+
+    // Take only the real values and scale with outCFFT.
+    for (i = 0, j = 0; i < PART_LEN2; i += 8, j+= 16)
+    {
+        __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11");
+        __asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10");
+    }
+
+    int32x4_t tmp32x4_2;
+    __asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32)
+        (outCFFT - aecm->dfaCleanQDomain)));
+    for (i = 0; i < PART_LEN; i += 4)
+    {
+        int16x4_t tmp16x4_0;
+        int16x4_t tmp16x4_1;
+        int32x4_t tmp32x4_0;
+        int32x4_t tmp32x4_1;
+
+        // fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
+        //        fft[i], WebRtcAecm_kSqrtHanning[i], 14);
+        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i]));
+        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
+        __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
+        __asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
+
+        // tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i],
+        //        outCFFT - aecm->dfaCleanQDomain);
+        __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
+
+        // fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
+        //        tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN);
+        // output[i] = fft[i];
+        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i]));
+        __asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0));
+        __asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1));
+        __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
+        __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i]));
+        __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i]));
+
+        // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
+        //        fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
+        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i]));
+        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
+        __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1));
+        __asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0));
+
+        // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain);
+        __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2));
+        // outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(
+        //        WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
+        __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0));
+        __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i]));
+    }
+
+    // Copy the current block to the old position (outBuf is shifted elsewhere).
+    for (i = 0; i < PART_LEN; i += 16)
+    {
+        __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
+            "r"(&aecm->xBuf[i + PART_LEN]) : "q10");
+        __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10");
+    }
+    for (i = 0; i < PART_LEN; i += 16)
+    {
+        __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
+            "r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10");
+        __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : 
+            "r"(&aecm->dBufNoisy[i]): "q10");
+    }
+    if (nearendClean != NULL) {
+        for (i = 0; i < PART_LEN; i += 16)
+        {
+            __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : :
+                "r"(&aecm->dBufClean[i + PART_LEN]) : "q10");
+            __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : :
+                "r"(&aecm->dBufClean[i]): "q10");
+        }
+    }
+}
+
+void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm,
                                    const WebRtc_UWord16* far_spectrum,
-                                   WebRtc_Word32* echoEst,
+                                   WebRtc_Word32* echo_est,
                                    WebRtc_UWord32* far_energy,
                                    WebRtc_UWord32* echo_energy_adapt,
                                    WebRtc_UWord32* echo_energy_stored)
@@ -54,29 +212,31 @@ void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
     register WebRtc_UWord32 echo_energy_adapt_r;
     uint32x4_t tmp32x4_0;
 
-    __asm__("vmov.i32 q14, #0" : : : "q14"); //far_energy
-    __asm__("vmov.i32 q8,  #0" : : : "q8"); //echo_energy_stored
-    __asm__("vmov.i32 q9,  #0" : : : "q9"); //echo_energy_adapt
+    __asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy
+    __asm__("vmov.i32 q8,  #0" : : : "q8"); // echo_energy_stored
+    __asm__("vmov.i32 q9,  #0" : : : "q9"); // echo_energy_adapt
 
     for(i = 0; i < PART_LEN -7; i += 8)
     {
-        //far_energy += (WebRtc_UWord32)(far_spectrum[i]);
+        // far_energy += (WebRtc_UWord32)(far_spectrum[i]);
         __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13");
         __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13");
         __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13");
 
-        // Get estimated echo energies for adaptive channel and stored channel
-        //echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
+        // Get estimated echo energies for adaptive channel and stored channel.
+        // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
         __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12");
         __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
         __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
-        __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echoEst[i]): "q10", "q11");
+        __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]):
+            "q10", "q11");
 
-        //echo_energy_stored += (WebRtc_UWord32)echoEst[i];
+        // echo_energy_stored += (WebRtc_UWord32)echoEst[i];
         __asm__("vadd.u32 q8, q10" : : : "q10", "q8");
         __asm__("vadd.u32 q8, q11" : : : "q11", "q8");
 
-        //echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], far_spectrum[i]);
+        // echo_energy_adapt += WEBRTC_SPL_UMUL_16_16(
+        //     aecm->channelAdapt16[i], far_spectrum[i]);
         __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12");
         __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10");
         __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11");
@@ -96,9 +256,9 @@ void WebRtcAecm_CalcLinearEnergies(AecmCore_t *aecm,
     __asm__("vpadd.u32 d16, d16" : : : "q8");
     __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8");
 
-    // Get estimated echo energies for adaptive channel and stored channel
-    echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
-    *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echoEst[i];
+    // Get estimated echo energies for adaptive channel and stored channel.
+    echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
+    *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i];
     *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]);
     *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16(
         aecm->channelAdapt16[i], far_spectrum[i]);
@@ -128,7 +288,7 @@ void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm,
     echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
 }
 
-void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
+void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm)
 {
     int i;
 
@@ -151,45 +311,4 @@ void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm)
             (WebRtc_Word32)aecm->channelStored[i], 16);
 }
 
-void WebRtcAecm_PrepareFft(WebRtc_Word16* fft,
-                           const WebRtc_Word16* time_signal,
-                           int time_signal_scaling)
-{
-    int i, j;
-    int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling);
-    __asm__("vmov.i16 d21, #0" ::: "d21");
-
-    for(i = 0, j = 0; i < PART_LEN-3; i += 4, j += 8)
-    {
-        int16x4_t tmp16x4_0;
-        int16x4_t tmp16x4_1;
-        int32x4_t tmp32x4_0;
-
-        /* Window near end */
-        // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i]
-        //       << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14);
-        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i]));
-        tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
-
-        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i]));
-        tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
-
-        __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
-        __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10");
-
-        // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(
-        //      (time_signal[PART_LEN + i] << time_signal_scaling),
-        //       WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
-        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[PART_LEN + i]));
-        tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling);
-
-        __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i]));
-        tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1);
-
-        __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20");
-        __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10");
-    }
-}
-
 #endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)
-