From c0907eff42079cb53c4ee28cb47a8e495ab06b37 Mon Sep 17 00:00:00 2001
From: "andrew@webrtc.org"
 <andrew@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d>
Date: Fri, 21 Feb 2014 00:13:31 +0000
Subject: [PATCH] MIPS optimizations for AEC audio processing module

The output streams obtained by testing with the audioproc test application
are bit-exact with the output streams of the generic C code.

Performance gain achieved:
- mips32 ~ 17%
- mips32r2 ~ 20%
- mipsdsp & mipsdspr2 ~ 21%

R=andrew@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/7359004

Patch from Ljubomir Papuga <lpapuga@mips.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@5591 4adac7df-926f-26a2-2b94-8c16560cd09d
---
 .../modules/audio_processing/aec/aec_core.c   |    8 +-
 .../modules/audio_processing/aec/aec_core.h   |    3 +
 .../audio_processing/aec/aec_core_internal.h  |    7 +
 .../audio_processing/aec/aec_core_mips.c      |  774 +++++++++++
 .../modules/audio_processing/aec/aec_rdft.c   |   15 +-
 .../modules/audio_processing/aec/aec_rdft.h   |    7 +
 .../audio_processing/aec/aec_rdft_mips.c      | 1213 +++++++++++++++++
 .../audio_processing/audio_processing.gypi    |    8 +
 8 files changed, 2031 insertions(+), 4 deletions(-)
 create mode 100644 webrtc/modules/audio_processing/aec/aec_core_mips.c
 create mode 100644 webrtc/modules/audio_processing/aec/aec_rdft_mips.c

diff --git a/webrtc/modules/audio_processing/aec/aec_core.c b/webrtc/modules/audio_processing/aec/aec_core.c
index 4f4ce034ea..9efa00d752 100644
--- a/webrtc/modules/audio_processing/aec/aec_core.c
+++ b/webrtc/modules/audio_processing/aec/aec_core.c
@@ -419,6 +419,7 @@ WebRtcAec_FilterFar_t WebRtcAec_FilterFar;
 WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
 WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
 WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
+WebRtcAec_ComfortNoise_t WebRtcAec_ComfortNoise;
 
 int WebRtcAec_InitAec(AecCore* aec, int sampFreq) {
   int i;
@@ -568,6 +569,7 @@ int WebRtcAec_InitAec(AecCore* aec, int sampFreq) {
   WebRtcAec_ScaleErrorSignal = ScaleErrorSignal;
   WebRtcAec_FilterAdaptation = FilterAdaptation;
   WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress;
+  WebRtcAec_ComfortNoise = ComfortNoise;
 
 #if defined(WEBRTC_ARCH_X86_FAMILY)
   if (WebRtc_GetCPUInfo(kSSE2)) {
@@ -575,6 +577,10 @@ int WebRtcAec_InitAec(AecCore* aec, int sampFreq) {
   }
 #endif
 
+#if defined(MIPS_FPU_LE)
+  WebRtcAec_InitAec_mips();
+#endif
+
   aec_rdft_init();
 
   return 0;
@@ -1279,7 +1285,7 @@ static void NonLinearProcessing(AecCore* aec, short* output, short* outputH) {
   WebRtcAec_OverdriveAndSuppress(aec, hNl, hNlFb, efw);
 
   // Add comfort noise.
-  ComfortNoise(aec, efw, comfortNoiseHband, aec->noisePow, hNl);
+  WebRtcAec_ComfortNoise(aec, efw, comfortNoiseHband, aec->noisePow, hNl);
 
   // TODO(bjornv): Investigate how to take the windowing below into account if
   // needed.
diff --git a/webrtc/modules/audio_processing/aec/aec_core.h b/webrtc/modules/audio_processing/aec/aec_core.h
index d3c6d7e2b2..e1f6f903d9 100644
--- a/webrtc/modules/audio_processing/aec/aec_core.h
+++ b/webrtc/modules/audio_processing/aec/aec_core.h
@@ -65,6 +65,9 @@ int WebRtcAec_CreateAec(AecCore** aec);
 int WebRtcAec_FreeAec(AecCore* aec);
 int WebRtcAec_InitAec(AecCore* aec, int sampFreq);
 void WebRtcAec_InitAec_SSE2(void);
+#if defined(MIPS_FPU_LE)
+void WebRtcAec_InitAec_mips(void);
+#endif
 
 void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend);
 void WebRtcAec_ProcessFrame(AecCore* aec,
diff --git a/webrtc/modules/audio_processing/aec/aec_core_internal.h b/webrtc/modules/audio_processing/aec/aec_core_internal.h
index 193369382c..c6b762ab1d 100644
--- a/webrtc/modules/audio_processing/aec/aec_core_internal.h
+++ b/webrtc/modules/audio_processing/aec/aec_core_internal.h
@@ -151,4 +151,11 @@ typedef void (*WebRtcAec_OverdriveAndSuppress_t)(AecCore* aec,
                                                  float efw[2][PART_LEN1]);
 extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
 
+typedef void (*WebRtcAec_ComfortNoise_t)(AecCore* aec,  // Signature of a comfort-noise routine.
+                                         float efw[2][PART_LEN1],
+                                         complex_t* comfortNoiseHband,
+                                         const float* noisePow,
+                                         const float* lambda);
+extern WebRtcAec_ComfortNoise_t WebRtcAec_ComfortNoise;  // Assigned in WebRtcAec_InitAec().
+
 #endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
diff --git a/webrtc/modules/audio_processing/aec/aec_core_mips.c b/webrtc/modules/audio_processing/aec/aec_core_mips.c
new file mode 100644
index 0000000000..d861e10f90
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_core_mips.c
@@ -0,0 +1,774 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The core AEC algorithm, which is presented with time-aligned signals.
+ */
+
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+
+#include <math.h>
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+
+static const int flagHbandCn = 1; // 1 enables the H band comfort noise branch (also needs sampFreq == 32000)
+extern const float WebRtcAec_weightCurve[65];  // defined outside this file
+extern const float WebRtcAec_overDriveCurve[65];  // defined outside this file
+// Adds random-phase comfort noise to efw using MIPS FPU inline asm; same signature as WebRtcAec_ComfortNoise_t.
+void WebRtcAec_ComfortNoise_mips(AecCore* aec,  // uses aec->seed and aec->sampFreq
+                                 float efw[2][PART_LEN1],  // in/out: noise is added in place
+                                 complex_t* comfortNoiseHband,  // out: written only if sampFreq == 32000
+                                 const float* noisePow,  // indexed per bin; its square root scales the noise
+                                 const float* lambda) {  // indexed per bin; noise gain is sqrt(1 - lambda^2)
+  int i, num;
+  float rand[PART_LEN];  // only used by the H band branch at the end
+  float noise, noiseAvg, tmp, tmpAvg;
+  int16_t randW16[PART_LEN];  // raw random values filled by WebRtcSpl_RandUArray
+  complex_t u[PART_LEN1];  // noise spectrum: [k][0] holds the cos term, [k][1] the sin term
+
+  const float pi2 = 6.28318530717959f;
+  const float pi2t = pi2 / 32768;  // turns a raw int16 random value into a phase for cos/sin
+
+  // Generate a uniform random array on [0 1]
+  WebRtcSpl_RandUArray(randW16, PART_LEN, &aec->seed);
+
+  int16_t *randWptr = randW16;
+  float randTemp, randTemp2, randTemp3, randTemp4;
+  short tmp1s, tmp2s, tmp3s, tmp4s;
+
+  for (i = 0; i < PART_LEN; i+=4) {  // four random phases per pass, stored to u[i+1..i+4]
+    __asm __volatile (
+      ".set     push                                           \n\t"
+      ".set     noreorder                                      \n\t"
+      "lh       %[tmp1s],       0(%[randWptr])                 \n\t"  // load four int16 values
+      "lh       %[tmp2s],       2(%[randWptr])                 \n\t"
+      "lh       %[tmp3s],       4(%[randWptr])                 \n\t"
+      "lh       %[tmp4s],       6(%[randWptr])                 \n\t"
+      "mtc1     %[tmp1s],       %[randTemp]                    \n\t"  // move them to FPU registers
+      "mtc1     %[tmp2s],       %[randTemp2]                   \n\t"
+      "mtc1     %[tmp3s],       %[randTemp3]                   \n\t"
+      "mtc1     %[tmp4s],       %[randTemp4]                   \n\t"
+      "cvt.s.w  %[randTemp],    %[randTemp]                    \n\t"  // integer -> float
+      "cvt.s.w  %[randTemp2],   %[randTemp2]                   \n\t"
+      "cvt.s.w  %[randTemp3],   %[randTemp3]                   \n\t"
+      "cvt.s.w  %[randTemp4],   %[randTemp4]                   \n\t"
+      "addiu    %[randWptr],    %[randWptr],      8            \n\t"  // step to the next four values
+      "mul.s    %[randTemp],    %[randTemp],      %[pi2t]      \n\t"  // phase = value * pi2t
+      "mul.s    %[randTemp2],   %[randTemp2],     %[pi2t]      \n\t"
+      "mul.s    %[randTemp3],   %[randTemp3],     %[pi2t]      \n\t"
+      "mul.s    %[randTemp4],   %[randTemp4],     %[pi2t]      \n\t"
+      ".set     pop                                            \n\t"
+      : [randWptr] "+r" (randWptr), [randTemp] "=&f" (randTemp),
+        [randTemp2] "=&f" (randTemp2), [randTemp3] "=&f" (randTemp3),
+        [randTemp4] "=&f" (randTemp4), [tmp1s] "=&r" (tmp1s),
+        [tmp2s] "=&r" (tmp2s), [tmp3s] "=&r" (tmp3s),
+        [tmp4s] "=&r" (tmp4s)
+      : [pi2t] "f" (pi2t)
+      : "memory"
+    );
+
+    u[i+1][0] = (float)cos(randTemp);  // cos/sin of each phase, before any amplitude scaling
+    u[i+1][1] = (float)sin(randTemp);
+    u[i+2][0] = (float)cos(randTemp2);
+    u[i+2][1] = (float)sin(randTemp2);
+    u[i+3][0] = (float)cos(randTemp3);
+    u[i+3][1] = (float)sin(randTemp3);
+    u[i+4][0] = (float)cos(randTemp4);
+    u[i+4][1] = (float)sin(randTemp4);
+  }
+
+  // Reject LF noise
+  float *u_ptr = &u[1][0];  // scaling starts at bin 1; bin 0 is zeroed below
+  float noise2, noise3, noise4;
+  float tmp1f, tmp2f, tmp3f, tmp4f, tmp5f, tmp6f, tmp7f, tmp8f;
+
+  u[0][0] = 0;
+  u[0][1] = 0;
+  for (i = 1; i < PART_LEN1; i+=4) {  // four bins per pass: u[i..i+3] scaled by sqrt(noisePow[i..i+3])
+    __asm __volatile (
+      ".set     push                                            \n\t"
+      ".set     noreorder                                       \n\t"
+      "lwc1     %[noise],       4(%[noisePow])                  \n\t"  // next four noisePow values
+      "lwc1     %[noise2],      8(%[noisePow])                  \n\t"
+      "lwc1     %[noise3],      12(%[noisePow])                 \n\t"
+      "lwc1     %[noise4],      16(%[noisePow])                 \n\t"
+      "sqrt.s   %[noise],       %[noise]                        \n\t"  // take their square roots
+      "sqrt.s   %[noise2],      %[noise2]                       \n\t"
+      "sqrt.s   %[noise3],      %[noise3]                       \n\t"
+      "sqrt.s   %[noise4],      %[noise4]                       \n\t"
+      "lwc1     %[tmp1f],       0(%[u_ptr])                     \n\t"  // load four ([0], [1]) pairs of u
+      "lwc1     %[tmp2f],       4(%[u_ptr])                     \n\t"
+      "lwc1     %[tmp3f],       8(%[u_ptr])                     \n\t"
+      "lwc1     %[tmp4f],       12(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp5f],       16(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp6f],       20(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp7f],       24(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp8f],       28(%[u_ptr])                    \n\t"
+      "addiu    %[noisePow],    %[noisePow],      16            \n\t"  // advance by four floats
+      "mul.s    %[tmp1f],       %[tmp1f],         %[noise]      \n\t"  // scale both parts of each bin
+      "mul.s    %[tmp2f],       %[tmp2f],         %[noise]      \n\t"
+      "mul.s    %[tmp3f],       %[tmp3f],         %[noise2]     \n\t"
+      "mul.s    %[tmp4f],       %[tmp4f],         %[noise2]     \n\t"
+      "mul.s    %[tmp5f],       %[tmp5f],         %[noise3]     \n\t"
+      "mul.s    %[tmp6f],       %[tmp6f],         %[noise3]     \n\t"
+      "swc1     %[tmp1f],       0(%[u_ptr])                     \n\t"
+      "swc1     %[tmp3f],       8(%[u_ptr])                     \n\t"
+      "mul.s    %[tmp8f],       %[tmp8f],         %[noise4]     \n\t"
+      "mul.s    %[tmp7f],       %[tmp7f],         %[noise4]     \n\t"
+      "neg.s    %[tmp2f]                                        \n\t"  // negate the [1] (sin) parts
+      "neg.s    %[tmp4f]                                        \n\t"
+      "neg.s    %[tmp6f]                                        \n\t"
+      "neg.s    %[tmp8f]                                        \n\t"
+      "swc1     %[tmp5f],       16(%[u_ptr])                    \n\t"
+      "swc1     %[tmp7f],       24(%[u_ptr])                    \n\t"
+      "swc1     %[tmp2f],       4(%[u_ptr])                     \n\t"
+      "swc1     %[tmp4f],       12(%[u_ptr])                    \n\t"
+      "swc1     %[tmp6f],       20(%[u_ptr])                    \n\t"
+      "swc1     %[tmp8f],       28(%[u_ptr])                    \n\t"
+      "addiu    %[u_ptr],       %[u_ptr],         32            \n\t"  // advance by four bins
+      ".set     pop                                             \n\t"
+      : [u_ptr] "+r" (u_ptr),  [noisePow] "+r" (noisePow),
+        [noise] "=&f" (noise), [noise2] "=&f" (noise2),
+        [noise3] "=&f" (noise3), [noise4] "=&f" (noise4),
+        [tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f),
+        [tmp3f] "=&f" (tmp3f), [tmp4f] "=&f" (tmp4f),
+        [tmp5f] "=&f" (tmp5f), [tmp6f] "=&f" (tmp6f),
+        [tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f)
+      :
+      : "memory"
+    );
+  }
+  u[PART_LEN][1] = 0;
+  noisePow -= PART_LEN;  // undo the pointer advance made by the asm loop; noisePow[i] is used again below
+
+  u_ptr = &u[0][0];
+  float *u_ptr_end = &u[PART_LEN][0];  // the asm loop below stops here; bin PART_LEN is handled in C
+  float *efw_ptr_0 = &efw[0][0];
+  float *efw_ptr_1 = &efw[1][0];
+  float tmp9f, tmp10f;
+  const float tmp1c = 1.0;  // comparison threshold for lambda and the "1" in 1 - lambda^2
+  const float tmp2c = 0.0;  // NOTE(review): passed to the asm as an operand but no instruction references it
+
+  __asm __volatile (
+    ".set     push                                                        \n\t"
+    ".set     noreorder                                                   \n\t"
+   "1:                                                                    \n\t"  // loop: two bins (A, B) per pass
+    "lwc1     %[tmp1f],       0(%[lambda])                                \n\t"  // lambda of bin A
+    "lwc1     %[tmp6f],       4(%[lambda])                                \n\t"  // lambda of bin B
+    "addiu    %[lambda],      %[lambda],   8                              \n\t"
+    "c.lt.s   %[tmp1f],       %[tmp1c]                                    \n\t"  // lambda A < 1.0 ?
+    "bc1f     4f                                                          \n\t"  // no: bin A gets no noise
+    " nop                                                                 \n\t"  // branch delay slot
+    "c.lt.s   %[tmp6f],       %[tmp1c]                                    \n\t"  // lambda B < 1.0 ?
+    "bc1f     3f                                                          \n\t"  // no: only bin A gets noise
+    " nop                                                                 \n\t"
+   "2:                                                                    \n\t"  // both bins get noise
+    "mul.s    %[tmp1f],       %[tmp1f],         %[tmp1f]                  \n\t"  // gain = sqrt(1 - lambda^2)
+    "mul.s    %[tmp6f],       %[tmp6f],         %[tmp6f]                  \n\t"
+    "sub.s    %[tmp1f],       %[tmp1c],         %[tmp1f]                  \n\t"
+    "sub.s    %[tmp6f],       %[tmp1c],         %[tmp6f]                  \n\t"
+    "sqrt.s   %[tmp1f],       %[tmp1f]                                    \n\t"
+    "sqrt.s   %[tmp6f],       %[tmp6f]                                    \n\t"
+    "lwc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp3f],       0(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp8f],       8(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp5f],       4(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp10f],      12(%[u_ptr])                                \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp3f]                  \n\t"  // efw += gain * u
+    "add.s    %[tmp2f],       %[tmp2f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp5f]                  \n\t"
+    "add.s    %[tmp4f],       %[tmp4f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp8f]                  \n\t"
+    "add.s    %[tmp7f],       %[tmp7f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp10f]                 \n\t"
+    "add.s    %[tmp9f],       %[tmp9f],         %[tmp3f]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "madd.s   %[tmp2f],       %[tmp2f],         %[tmp1f],     %[tmp3f]    \n\t"  // efw += gain * u
+    "madd.s   %[tmp4f],       %[tmp4f],         %[tmp1f],     %[tmp5f]    \n\t"
+    "madd.s   %[tmp7f],       %[tmp7f],         %[tmp6f],     %[tmp8f]    \n\t"
+    "madd.s   %[tmp9f],       %[tmp9f],         %[tmp6f],     %[tmp10f]   \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "swc1     %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+    "swc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "b        5f                                                          \n\t"
+    " swc1    %[tmp9f],       4(%[efw_ptr_1])                             \n\t"  // delay slot: last store
+   "3:                                                                    \n\t"  // only bin A gets noise
+    "mul.s    %[tmp1f],       %[tmp1f],         %[tmp1f]                  \n\t"
+    "sub.s    %[tmp1f],       %[tmp1c],         %[tmp1f]                  \n\t"
+    "sqrt.s   %[tmp1f],       %[tmp1f]                                    \n\t"
+    "lwc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp3f],       0(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp5f],       4(%[u_ptr])                                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp3f]                  \n\t"
+    "add.s    %[tmp2f],       %[tmp2f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp5f]                  \n\t"
+    "add.s    %[tmp4f],       %[tmp4f],         %[tmp3f]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "madd.s   %[tmp2f],       %[tmp2f],         %[tmp1f],     %[tmp3f]    \n\t"
+    "madd.s   %[tmp4f],       %[tmp4f],         %[tmp1f],     %[tmp5f]    \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "b        5f                                                          \n\t"
+    " swc1    %[tmp4f],       0(%[efw_ptr_1])                             \n\t"  // delay slot: last store
+   "4:                                                                    \n\t"  // bin A skipped; test bin B
+    "c.lt.s   %[tmp6f],       %[tmp1c]                                    \n\t"
+    "bc1f     5f                                                          \n\t"  // neither bin gets noise
+    " nop                                                                 \n\t"
+    "mul.s    %[tmp6f],       %[tmp6f],         %[tmp6f]                  \n\t"
+    "sub.s    %[tmp6f],       %[tmp1c],         %[tmp6f]                  \n\t"
+    "sqrt.s   %[tmp6f],       %[tmp6f]                                    \n\t"
+    "lwc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp8f],       8(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp10f],      12(%[u_ptr])                                \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp8f]                  \n\t"
+    "add.s    %[tmp7f],       %[tmp7f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp10f]                 \n\t"
+    "add.s    %[tmp9f],       %[tmp9f],         %[tmp3f]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "madd.s   %[tmp7f],       %[tmp7f],         %[tmp6f],     %[tmp8f]    \n\t"
+    "madd.s   %[tmp9f],       %[tmp9f],         %[tmp6f],     %[tmp10f]   \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "swc1     %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+   "5:                                                                    \n\t"  // advance to the next pair of bins
+    "addiu    %[u_ptr],       %[u_ptr],         16                        \n\t"
+    "addiu    %[efw_ptr_0],   %[efw_ptr_0],     8                         \n\t"
+    "bne      %[u_ptr],       %[u_ptr_end],     1b                        \n\t"  // loop until u_ptr_end
+    " addiu   %[efw_ptr_1],   %[efw_ptr_1],     8                         \n\t"  // delay slot
+    ".set     pop                                                         \n\t"
+    : [lambda] "+r" (lambda), [u_ptr] "+r" (u_ptr),
+      [efw_ptr_0] "+r" (efw_ptr_0), [efw_ptr_1] "+r" (efw_ptr_1),
+      [tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f), [tmp3f] "=&f" (tmp3f),
+      [tmp4f] "=&f" (tmp4f), [tmp5f] "=&f" (tmp5f),
+      [tmp6f] "=&f" (tmp6f), [tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f),
+      [tmp9f] "=&f" (tmp9f), [tmp10f] "=&f" (tmp10f)
+    : [tmp1c] "f" (tmp1c), [tmp2c] "f" (tmp2c), [u_ptr_end] "r" (u_ptr_end)
+    : "memory"
+  );
+
+  lambda -= PART_LEN;  // undo the pointer advance made by the asm loop
+  tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[PART_LEN] * lambda[PART_LEN], 0));
+  // Bin PART_LEN is not reached by the asm loop above, so its noise is added here.
+  efw[0][PART_LEN] += tmp * u[PART_LEN][0];
+  efw[1][PART_LEN] += tmp * u[PART_LEN][1];
+
+  // For H band comfort noise
+  // TODO: don't compute noise and "tmp" twice. Use the previous results.
+  noiseAvg = 0.0;
+  tmpAvg = 0.0;
+  num = 0;
+  if (aec->sampFreq == 32000 && flagHbandCn == 1) {
+    for (i = 0; i < PART_LEN; i++) {
+      rand[i] = ((float)randW16[i]) / 32768;  // same random values as above, scaled by 1/32768
+    }
+
+    // average noise scale
+    // average over second half of freq spectrum (i.e., 4->8khz)
+    // TODO: we shouldn't need num. We know how many elements we're summing.
+    for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
+      num++;
+      noiseAvg += sqrtf(noisePow[i]);
+    }
+    noiseAvg /= (float)num;
+
+    // average nlp scale
+    // average over second half of freq spectrum (i.e., 4->8khz)
+    // TODO: we shouldn't need num. We know how many elements we're summing.
+    num = 0;
+    for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
+      num++;
+      tmpAvg += sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
+    }
+    tmpAvg /= (float)num;
+
+    // Use average noise for H band
+    // TODO: we should probably have a new random vector here.
+    // Reject LF noise
+    u[0][0] = 0;
+    u[0][1] = 0;
+    for (i = 1; i < PART_LEN1; i++) {  // rebuilds u from the same random values, with one averaged amplitude
+      tmp = pi2 * rand[i - 1];
+
+      // Use average noise for H band
+      u[i][0] = noiseAvg * (float)cos(tmp);
+      u[i][1] = -noiseAvg * (float)sin(tmp);
+    }
+    u[PART_LEN][1] = 0;
+
+    for (i = 0; i < PART_LEN1; i++) {
+      // Use average NLP weight for H band
+      comfortNoiseHband[i][0] = tmpAvg * u[i][0];
+      comfortNoiseHband[i][1] = tmpAvg * u[i][1];
+    }
+  }
+}
+// For every partition, accumulates the complex product of xfBuf and wfBuf into yf (MIPS FPU inline asm).
+void WebRtcAec_FilterFar_mips(AecCore *aec, float yf[2][PART_LEN1]) {  // yf is updated in place
+  int i;
+  for (i = 0; i < aec->num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;  // offset of this partition's block in xfBuf
+    int pos = i * PART_LEN1;  // offset of this partition's block in wfBuf
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >=  aec->num_partitions) {
+      xPos -=  aec->num_partitions * (PART_LEN1);
+    }
+    float *yf0 = yf[0];  // receives aRe * bRe - aIm * bIm
+    float *yf1 = yf[1];  // receives aRe * bIm + aIm * bRe
+    float *aRe = aec->xfBuf[0] + xPos;
+    float *aIm = aec->xfBuf[1] + xPos;
+    float *bRe = aec->wfBuf[0] + pos;
+    float *bIm = aec->wfBuf[1] + pos;
+    float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13;  // NOTE(review): f10, f13 never referenced
+    int len = PART_LEN1 >> 1;  // number of two-bin passes of the asm loop
+    int len1 = PART_LEN1 & 1;  // NOTE(review): asm input that no instruction references; tail runs regardless
+
+    __asm __volatile (
+      ".set       push                                                \n\t"
+      ".set       noreorder                                           \n\t"
+     "1:                                                              \n\t"  // loop: two bins per pass
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"  // load a and b for both bins
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "lwc1       %[f4],      4(%[aRe])                               \n\t"
+      "lwc1       %[f5],      4(%[bRe])                               \n\t"
+      "lwc1       %[f6],      4(%[bIm])                               \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"  // aRe * bRe
+      "mul.s      %[f0],      %[f0],          %[f2]                   \n\t"  // aRe * bIm
+      "mul.s      %[f9],      %[f4],          %[f5]                   \n\t"
+      "mul.s      %[f4],      %[f4],          %[f6]                   \n\t"
+      "lwc1       %[f7],      4(%[aIm])                               \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f12],     %[f2],          %[f3]                   \n\t"  // bIm * aIm
+      "mul.s      %[f1],      %[f3],          %[f1]                   \n\t"  // aIm * bRe
+      "mul.s      %[f11],     %[f6],          %[f7]                   \n\t"
+      "addiu      %[aRe],     %[aRe],         8                       \n\t"
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "sub.s      %[f8],      %[f8],          %[f12]                  \n\t"  // aRe * bRe - aIm * bIm
+      "mul.s      %[f12],     %[f7],          %[f5]                   \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "add.s      %[f1],      %[f0],          %[f1]                   \n\t"  // aRe * bIm + aIm * bRe
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+      "sub.s      %[f9],      %[f9],          %[f11]                  \n\t"
+      "lwc1       %[f6],      4(%[yf0])                               \n\t"
+      "add.s      %[f4],      %[f4],          %[f12]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "addiu      %[aRe],     %[aRe],         8                       \n\t"
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "nmsub.s    %[f8],      %[f8],          %[f2],      %[f3]       \n\t"  // aRe * bRe - bIm * aIm
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "madd.s     %[f1],      %[f0],          %[f3],      %[f1]       \n\t"  // aRe * bIm + aIm * bRe
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+      "nmsub.s    %[f9],      %[f9],          %[f6],      %[f7]       \n\t"
+      "lwc1       %[f6],      4(%[yf0])                               \n\t"
+      "madd.s     %[f4],      %[f4],          %[f7],      %[f5]       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "lwc1       %[f5],      4(%[yf1])                               \n\t"
+      "add.s      %[f2],      %[f2],          %[f8]                   \n\t"  // add the products to yf
+      "addiu      %[bRe],     %[bRe],         8                       \n\t"
+      "addiu      %[bIm],     %[bIm],         8                       \n\t"
+      "add.s      %[f3],      %[f3],          %[f1]                   \n\t"
+      "add.s      %[f6],      %[f6],          %[f9]                   \n\t"
+      "add.s      %[f5],      %[f5],          %[f4]                   \n\t"
+      "swc1       %[f2],      0(%[yf0])                               \n\t"
+      "swc1       %[f3],      0(%[yf1])                               \n\t"
+      "swc1       %[f6],      4(%[yf0])                               \n\t"
+      "swc1       %[f5],      4(%[yf1])                               \n\t"
+      "addiu      %[yf0],     %[yf0],         8                       \n\t"
+      "bgtz       %[len],     1b                                      \n\t"  // repeat while len > 0
+      " addiu     %[yf1],     %[yf1],         8                       \n\t"  // branch delay slot
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"  // tail: one more bin, always run
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+      "mul.s      %[f0],      %[f0],          %[f2]                   \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f12],     %[f2],          %[f3]                   \n\t"
+      "mul.s      %[f1],      %[f3],          %[f1]                   \n\t"
+      "sub.s      %[f8],      %[f8],          %[f12]                  \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "add.s      %[f1],      %[f0],          %[f1]                   \n\t"
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "nmsub.s    %[f8],      %[f8],          %[f2],      %[f3]       \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "madd.s     %[f1],      %[f0],          %[f3],      %[f1]       \n\t"
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "add.s      %[f2],      %[f2],          %[f8]                   \n\t"
+      "add.s      %[f3],      %[f3],          %[f1]                   \n\t"
+      "swc1       %[f2],      0(%[yf0])                               \n\t"
+      "swc1       %[f3],      0(%[yf1])                               \n\t"
+      ".set       pop                                                 \n\t"
+      : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+        [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+        [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+        [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
+        [f12] "=&f" (f12), [f13] "=&f" (f13), [aRe] "+r" (aRe),
+        [aIm] "+r" (aIm), [bRe] "+r" (bRe), [bIm] "+r" (bIm),
+        [yf0] "+r" (yf0), [yf1] "+r" (yf1), [len] "+r" (len)
+      : [len1] "r" (len1)
+      : "memory"
+    );
+  }
+}
+
+void WebRtcAec_FilterAdaptation_mips(AecCore *aec,
+                                     float *fft,
+                                     float ef[2][PART_LEN1]) {
+  int i;
+  for (i = 0; i < aec->num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
+    int pos;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >= aec->num_partitions) {
+      xPos -= aec->num_partitions * PART_LEN1;
+    }
+
+    pos = i * PART_LEN1;
+    float *aRe = aec->xfBuf[0] + xPos;
+    float *aIm = aec->xfBuf[1] + xPos;
+    float *bRe = ef[0];
+    float *bIm = ef[1];
+    float *fft_tmp = fft;
+
+    float f0, f1, f2, f3, f4, f5, f6 ,f7, f8, f9, f10, f11, f12;
+    int len = PART_LEN >> 1;
+
+    __asm __volatile (
+      ".set       push                                                \n\t"
+      ".set       noreorder                                           \n\t"
+     "1:                                                              \n\t"
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f4],      4(%[aRe])                               \n\t"
+      "lwc1       %[f5],      4(%[bRe])                               \n\t"
+      "lwc1       %[f6],      4(%[bIm])                               \n\t"
+      "addiu      %[aRe],     %[aRe],         8                       \n\t"
+      "addiu      %[bRe],     %[bRe],         8                       \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+      "mul.s      %[f0],      %[f0],          %[f2]                   \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "mul.s      %[f9],      %[f4],          %[f5]                   \n\t"
+      "lwc1       %[f7],      4(%[aIm])                               \n\t"
+      "mul.s      %[f4],      %[f4],          %[f6]                   \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f10],     %[f3],          %[f2]                   \n\t"
+      "mul.s      %[f1],      %[f3],          %[f1]                   \n\t"
+      "mul.s      %[f11],     %[f7],          %[f6]                   \n\t"
+      "mul.s      %[f5],      %[f7],          %[f5]                   \n\t"
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[bIm],     %[bIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "add.s      %[f8],      %[f8],          %[f10]                  \n\t"
+      "sub.s      %[f1],      %[f0],          %[f1]                   \n\t"
+      "add.s      %[f9],      %[f9],          %[f11]                  \n\t"
+      "sub.s      %[f5],      %[f4],          %[f5]                   \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[bIm],     %[bIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "madd.s     %[f8],      %[f8],          %[f3],      %[f2]       \n\t"
+      "nmsub.s    %[f1],      %[f0],          %[f3],      %[f1]       \n\t"
+      "madd.s     %[f9],      %[f9],          %[f7],      %[f6]       \n\t"
+      "nmsub.s    %[f5],      %[f4],          %[f7],      %[f5]       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "swc1       %[f8],      0(%[fft_tmp])                           \n\t"
+      "swc1       %[f1],      4(%[fft_tmp])                           \n\t"
+      "swc1       %[f9],      8(%[fft_tmp])                           \n\t"
+      "swc1       %[f5],      12(%[fft_tmp])                          \n\t"
+      "bgtz       %[len],     1b                                      \n\t"
+      " addiu     %[fft_tmp], %[fft_tmp],     16                      \n\t"
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f10],     %[f3],          %[f2]                   \n\t"
+      "add.s      %[f8],      %[f8],          %[f10]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "madd.s     %[f8],      %[f8],          %[f3],      %[f2]       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "swc1       %[f8],      4(%[fft])                               \n\t"
+      ".set       pop                                                 \n\t"
+      : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+        [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+        [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+        [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
+        [f12] "=&f" (f12), [aRe] "+r" (aRe), [aIm] "+r" (aIm),
+        [bRe] "+r" (bRe), [bIm] "+r" (bIm), [fft_tmp] "+r" (fft_tmp),
+        [len] "+r" (len), [fft] "=&r" (fft)
+      :
+      : "memory"
+    );
+
+    aec_rdft_inverse_128(fft);
+    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
+
+    // fft scaling
+    {
+      float scale = 2.0f / PART_LEN2;
+      __asm __volatile (
+        ".set     push                                    \n\t"
+        ".set     noreorder                               \n\t"
+        "addiu    %[fft_tmp], %[fft],        0            \n\t"
+        "addiu    %[len],     $zero,         8            \n\t"
+       "1:                                                \n\t"
+        "addiu    %[len],     %[len],        -1           \n\t"
+        "lwc1     %[f0],      0(%[fft_tmp])               \n\t"
+        "lwc1     %[f1],      4(%[fft_tmp])               \n\t"
+        "lwc1     %[f2],      8(%[fft_tmp])               \n\t"
+        "lwc1     %[f3],      12(%[fft_tmp])              \n\t"
+        "mul.s    %[f0],      %[f0],         %[scale]     \n\t"
+        "mul.s    %[f1],      %[f1],         %[scale]     \n\t"
+        "mul.s    %[f2],      %[f2],         %[scale]     \n\t"
+        "mul.s    %[f3],      %[f3],         %[scale]     \n\t"
+        "lwc1     %[f4],      16(%[fft_tmp])              \n\t"
+        "lwc1     %[f5],      20(%[fft_tmp])              \n\t"
+        "lwc1     %[f6],      24(%[fft_tmp])              \n\t"
+        "lwc1     %[f7],      28(%[fft_tmp])              \n\t"
+        "mul.s    %[f4],      %[f4],         %[scale]     \n\t"
+        "mul.s    %[f5],      %[f5],         %[scale]     \n\t"
+        "mul.s    %[f6],      %[f6],         %[scale]     \n\t"
+        "mul.s    %[f7],      %[f7],         %[scale]     \n\t"
+        "swc1     %[f0],      0(%[fft_tmp])               \n\t"
+        "swc1     %[f1],      4(%[fft_tmp])               \n\t"
+        "swc1     %[f2],      8(%[fft_tmp])               \n\t"
+        "swc1     %[f3],      12(%[fft_tmp])              \n\t"
+        "swc1     %[f4],      16(%[fft_tmp])              \n\t"
+        "swc1     %[f5],      20(%[fft_tmp])              \n\t"
+        "swc1     %[f6],      24(%[fft_tmp])              \n\t"
+        "swc1     %[f7],      28(%[fft_tmp])              \n\t"
+        "bgtz     %[len],     1b                          \n\t"
+        " addiu   %[fft_tmp], %[fft_tmp],    32           \n\t"
+        ".set     pop                                     \n\t"
+        : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+          [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+          [f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
+          [fft_tmp] "=&r" (fft_tmp)
+        : [scale] "f" (scale), [fft] "r" (fft)
+        : "memory"
+      );
+    }
+    aec_rdft_forward_128(fft);
+    aRe = aec->wfBuf[0] + pos;
+    aIm = aec->wfBuf[1] + pos;
+    __asm __volatile (
+      ".set     push                                    \n\t"
+      ".set     noreorder                               \n\t"
+      "addiu    %[fft_tmp], %[fft],        0            \n\t"
+      "addiu    %[len],     $zero,         31           \n\t"
+      "lwc1     %[f0],      0(%[aRe])                   \n\t"
+      "lwc1     %[f1],      0(%[fft_tmp])               \n\t"
+      "lwc1     %[f2],      256(%[aRe])                 \n\t"
+      "lwc1     %[f3],      4(%[fft_tmp])               \n\t"
+      "lwc1     %[f4],      4(%[aRe])                   \n\t"
+      "lwc1     %[f5],      8(%[fft_tmp])               \n\t"
+      "lwc1     %[f6],      4(%[aIm])                   \n\t"
+      "lwc1     %[f7],      12(%[fft_tmp])              \n\t"
+      "add.s    %[f0],      %[f0],         %[f1]        \n\t"
+      "add.s    %[f2],      %[f2],         %[f3]        \n\t"
+      "add.s    %[f4],      %[f4],         %[f5]        \n\t"
+      "add.s    %[f6],      %[f6],         %[f7]        \n\t"
+      "addiu    %[fft_tmp], %[fft_tmp],    16           \n\t"
+      "swc1     %[f0],      0(%[aRe])                   \n\t"
+      "swc1     %[f2],      256(%[aRe])                 \n\t"
+      "swc1     %[f4],      4(%[aRe])                   \n\t"
+      "addiu    %[aRe],     %[aRe],        8            \n\t"
+      "swc1     %[f6],      4(%[aIm])                   \n\t"
+      "addiu    %[aIm],     %[aIm],        8            \n\t"
+     "1:                                                \n\t"
+      "lwc1     %[f0],      0(%[aRe])                   \n\t"
+      "lwc1     %[f1],      0(%[fft_tmp])               \n\t"
+      "lwc1     %[f2],      0(%[aIm])                   \n\t"
+      "lwc1     %[f3],      4(%[fft_tmp])               \n\t"
+      "lwc1     %[f4],      4(%[aRe])                   \n\t"
+      "lwc1     %[f5],      8(%[fft_tmp])               \n\t"
+      "lwc1     %[f6],      4(%[aIm])                   \n\t"
+      "lwc1     %[f7],      12(%[fft_tmp])              \n\t"
+      "add.s    %[f0],      %[f0],         %[f1]        \n\t"
+      "add.s    %[f2],      %[f2],         %[f3]        \n\t"
+      "add.s    %[f4],      %[f4],         %[f5]        \n\t"
+      "add.s    %[f6],      %[f6],         %[f7]        \n\t"
+      "addiu    %[len],     %[len],        -1           \n\t"
+      "addiu    %[fft_tmp], %[fft_tmp],    16           \n\t"
+      "swc1     %[f0],      0(%[aRe])                   \n\t"
+      "swc1     %[f2],      0(%[aIm])                   \n\t"
+      "swc1     %[f4],      4(%[aRe])                   \n\t"
+      "addiu    %[aRe],     %[aRe],        8            \n\t"
+      "swc1     %[f6],      4(%[aIm])                   \n\t"
+      "bgtz     %[len],     1b                          \n\t"
+      " addiu   %[aIm],     %[aIm],        8            \n\t"
+      ".set     pop                                     \n\t"
+      : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+        [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+        [f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
+        [fft_tmp] "=&r" (fft_tmp)
+      : [aRe] "r" (aRe), [aIm] "r" (aIm), [fft] "r" (fft)
+      : "memory"
+    );
+  }
+}
+
+void WebRtcAec_OverdriveAndSuppress_mips(AecCore *aec,
+                                         float hNl[PART_LEN1],
+                                         const float hNlFb,
+                                         float efw[2][PART_LEN1]) {
+  int i;
+  const float one = 1.0;  // the 1 in the (1 - weight) term below
+  float *p_hNl, *p_efw0, *p_efw1;
+  float *p_WebRtcAec_wC;  // cursor into WebRtcAec_weightCurve
+  float temp1, temp2, temp3, temp4;
+  // Per bin: conditionally re-weights hNl[i], applies powf(), scales efw.
+  p_hNl = &hNl[0];
+  p_efw0 = &efw[0][0];
+  p_efw1 = &efw[1][0];
+  p_WebRtcAec_wC = (float*)&WebRtcAec_weightCurve[0];
+
+  for (i = 0; i < PART_LEN1; i++) {
+    // Weight subbands: hNl = wC * hNlFb + (1 - wC) * hNl when hNl > hNlFb.
+    __asm __volatile (
+      ".set      push                                              \n\t"
+      ".set      noreorder                                         \n\t"
+      "lwc1      %[temp1],    0(%[p_hNl])                          \n\t"
+      "lwc1      %[temp2],    0(%[p_wC])                           \n\t"
+      "c.lt.s    %[hNlFb],    %[temp1]                             \n\t"
+      "bc1f      1f                                                \n\t"  // skip if !(hNlFb < hNl)
+      " mul.s    %[temp3],    %[temp2],     %[hNlFb]               \n\t"  // delay slot (always runs)
+      "sub.s     %[temp4],    %[one],       %[temp2]               \n\t"  // temp4 = 1 - wC
+#if !defined(MIPS32_R2_LE)
+      "mul.s     %[temp1],    %[temp1],     %[temp4]               \n\t"
+      "add.s     %[temp1],    %[temp3],     %[temp1]               \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "madd.s    %[temp1],    %[temp3],     %[temp1],   %[temp4]   \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "swc1      %[temp1],    0(%[p_hNl])                          \n\t"
+     "1:                                                           \n\t"
+      "addiu     %[p_wC],     %[p_wC],      4                      \n\t"  // runs on both paths
+      ".set      pop                                               \n\t"
+      : [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
+        [temp4] "=&f" (temp4), [p_wC] "+r" (p_WebRtcAec_wC)
+      : [hNlFb] "f" (hNlFb), [one] "f" (one), [p_hNl] "r" (p_hNl)
+      : "memory"
+    );
+    // Exponent is overDriveSm times this bin's overDriveCurve entry.
+    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
+    // efw[0][i] *= hNl[i]; efw[1][i] = -(efw[1][i] * hNl[i]); advance cursors.
+    __asm __volatile (
+      "lwc1      %[temp1],    0(%[p_hNl])              \n\t"
+      "lwc1      %[temp3],    0(%[p_efw1])             \n\t"
+      "lwc1      %[temp2],    0(%[p_efw0])             \n\t"
+      "addiu     %[p_hNl],    %[p_hNl],     4          \n\t"
+      "mul.s     %[temp3],    %[temp3],     %[temp1]   \n\t"
+      "mul.s     %[temp2],    %[temp2],     %[temp1]   \n\t"
+      "addiu     %[p_efw0],   %[p_efw0],    4          \n\t"
+      "addiu     %[p_efw1],   %[p_efw1],    4          \n\t"
+      "neg.s     %[temp4],    %[temp3]                 \n\t"
+      "swc1      %[temp2],    -4(%[p_efw0])            \n\t"  // -4: cursor already advanced
+      "swc1      %[temp4],    -4(%[p_efw1])            \n\t"
+      : [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
+        [temp4] "=&f" (temp4), [p_efw0] "+r" (p_efw0), [p_efw1] "+r" (p_efw1),
+        [p_hNl] "+r" (p_hNl)
+      :
+      : "memory"
+    );
+  }
+}
+
+void WebRtcAec_ScaleErrorSignal_mips(AecCore *aec, float ef[2][PART_LEN1]) {
+  const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
+  const float error_threshold = aec->extended_filter_enabled
+                                    ? kExtendedErrorThreshold
+                                    : aec->normal_error_threshold;
+  int len = (PART_LEN1);  // remaining bins; the asm loop counts it down
+  float *ef0 = ef[0];
+  float *ef1 = ef[1];
+  float *xPow = aec->xPow;
+  float fac1 = 1e-10f;  // added to both divisors in the loop
+  float err_th2 = error_threshold * error_threshold;  // compared to |ef|^2
+  float f0, f1, f2;
+#if !defined(MIPS32_R2_LE)
+  float f3;  // scratch for the separate mul/add path
+#endif
+  // Per bin, in place: ef /= (xPow + fac1); clamp if too large; ef *= mu.
+  __asm __volatile (
+    ".set       push                                   \n\t"
+    ".set       noreorder                              \n\t"
+   "1:                                                 \n\t"
+    "lwc1       %[f0],     0(%[xPow])                  \n\t"
+    "lwc1       %[f1],     0(%[ef0])                   \n\t"
+    "lwc1       %[f2],     0(%[ef1])                   \n\t"
+    "add.s      %[f0],     %[f0],       %[fac1]        \n\t"
+    "div.s      %[f1],     %[f1],       %[f0]          \n\t"
+    "div.s      %[f2],     %[f2],       %[f0]          \n\t"
+    "mul.s      %[f0],     %[f1],       %[f1]          \n\t"  // f0 = f1^2 (+ f2^2 below)
+#if defined(MIPS32_R2_LE)
+    "madd.s     %[f0],     %[f0],       %[f2],   %[f2] \n\t"
+#else
+    "mul.s      %[f3],     %[f2],       %[f2]          \n\t"
+    "add.s      %[f0],     %[f0],       %[f3]          \n\t"
+#endif
+    "c.le.s     %[f0],     %[err_th2]                  \n\t"
+    "nop                                               \n\t"
+    "bc1t       2f                                     \n\t"  // f0 <= err_th2: no rescale
+    " nop                                              \n\t"
+    "sqrt.s     %[f0],     %[f0]                       \n\t"
+    "add.s      %[f0],     %[f0],       %[fac1]        \n\t"
+    "div.s      %[f0],     %[err_th],   %[f0]          \n\t"  // err_th / (|ef| + fac1)
+    "mul.s      %[f1],     %[f1],       %[f0]          \n\t"
+    "mul.s      %[f2],     %[f2],       %[f0]          \n\t"
+   "2:                                                 \n\t"  // scale by mu and store
+    "mul.s      %[f1],     %[f1],       %[mu]          \n\t"
+    "mul.s      %[f2],     %[f2],       %[mu]          \n\t"
+    "swc1       %[f1],     0(%[ef0])                   \n\t"
+    "swc1       %[f2],     0(%[ef1])                   \n\t"
+    "addiu      %[len],    %[len],      -1             \n\t"
+    "addiu      %[xPow],   %[xPow],     4              \n\t"
+    "addiu      %[ef0],    %[ef0],      4              \n\t"
+    "bgtz       %[len],    1b                          \n\t"
+    " addiu     %[ef1],    %[ef1],      4              \n\t"  // delay slot
+    ".set       pop                                    \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+#if !defined(MIPS32_R2_LE)
+      [f3] "=&f" (f3),
+#endif
+      [xPow] "+r" (xPow), [ef0] "+r" (ef0), [ef1] "+r" (ef1),
+      [len] "+r" (len)
+    : [fac1] "f" (fac1), [err_th2] "f" (err_th2), [mu] "f" (mu),
+      [err_th] "f" (error_threshold)
+    : "memory"
+  );
+}
+
+// Installs the MIPS-optimized routines into the global AEC function pointers.
+void WebRtcAec_InitAec_mips(void) {
+  WebRtcAec_OverdriveAndSuppress = WebRtcAec_OverdriveAndSuppress_mips;
+  WebRtcAec_ComfortNoise = WebRtcAec_ComfortNoise_mips;
+  WebRtcAec_ScaleErrorSignal = WebRtcAec_ScaleErrorSignal_mips;
+  WebRtcAec_FilterAdaptation = WebRtcAec_FilterAdaptation_mips;
+  WebRtcAec_FilterFar = WebRtcAec_FilterFar_mips;
+}
+
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft.c b/webrtc/modules/audio_processing/aec/aec_rdft.c
index a19e8877bb..7731b37b22 100644
--- a/webrtc/modules/audio_processing/aec/aec_rdft.c
+++ b/webrtc/modules/audio_processing/aec/aec_rdft.c
@@ -116,7 +116,7 @@ static void bitrv2_32(int* ip, float* a) {
   }
 }
 
-static void bitrv2_128(float* a) {
+static void bitrv2_128_C(float* a) {
   /*
       Following things have been attempted but are no faster:
       (a) Storing the swap indexes in a LUT (index calculations are done
@@ -512,7 +512,7 @@ static void cftmdl_128_C(float* a) {
   }
 }
 
-static void cftfsub_128(float* a) {
+static void cftfsub_128_C(float* a) {
   int j, j1, j2, j3, l;
   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
 
@@ -542,7 +542,7 @@ static void cftfsub_128(float* a) {
   }
 }
 
-static void cftbsub_128(float* a) {
+static void cftbsub_128_C(float* a) {
   int j, j1, j2, j3, l;
   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
 
@@ -640,16 +640,25 @@ rft_sub_128_t cft1st_128;
 rft_sub_128_t cftmdl_128;
 rft_sub_128_t rftfsub_128;
 rft_sub_128_t rftbsub_128;
+rft_sub_128_t cftfsub_128;
+rft_sub_128_t cftbsub_128;
+rft_sub_128_t bitrv2_128;
 
 void aec_rdft_init(void) {
   cft1st_128 = cft1st_128_C;
   cftmdl_128 = cftmdl_128_C;
   rftfsub_128 = rftfsub_128_C;
   rftbsub_128 = rftbsub_128_C;
+  cftfsub_128 = cftfsub_128_C;
+  cftbsub_128 = cftbsub_128_C;
+  bitrv2_128 = bitrv2_128_C;
 #if defined(WEBRTC_ARCH_X86_FAMILY)
   if (WebRtc_GetCPUInfo(kSSE2)) {
     aec_rdft_init_sse2();
   }
+#endif
+#if defined(MIPS_FPU_LE)
+  aec_rdft_init_mips();
 #endif
   // init library constants.
   makewt_32();
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft.h b/webrtc/modules/audio_processing/aec/aec_rdft.h
index 8a2e0b5071..795c57d44c 100644
--- a/webrtc/modules/audio_processing/aec/aec_rdft.h
+++ b/webrtc/modules/audio_processing/aec/aec_rdft.h
@@ -47,6 +47,9 @@ extern rft_sub_128_t rftfsub_128;
 extern rft_sub_128_t rftbsub_128;
 extern rft_sub_128_t cft1st_128;
 extern rft_sub_128_t cftmdl_128;
+extern rft_sub_128_t cftfsub_128;
+extern rft_sub_128_t cftbsub_128;
+extern rft_sub_128_t bitrv2_128;
 
 // entry points
 void aec_rdft_init(void);
@@ -54,4 +57,8 @@ void aec_rdft_init_sse2(void);
 void aec_rdft_forward_128(float* a);
 void aec_rdft_inverse_128(float* a);
 
+#if defined(MIPS_FPU_LE)
+void aec_rdft_init_mips(void);
+#endif
+
 #endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft_mips.c b/webrtc/modules/audio_processing/aec/aec_rdft_mips.c
new file mode 100644
index 0000000000..a0dac5f135
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_rdft_mips.c
@@ -0,0 +1,1213 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+#include "webrtc/typedefs.h"
+
+static void bitrv2_128_mips(float *a) {
+  // Fully unrolled: exchanges 28 fixed pairs of two-float entries (n is 128).
+  float xr, xi, yr, yi;  // temporaries for the two entries being exchanged
+
+  xr = a[8];
+  xi = a[9];
+  yr = a[16];
+  yi = a[17];
+  a[8] = yr;
+  a[9] = yi;
+  a[16] = xr;
+  a[17] = xi;
+
+  xr = a[64];
+  xi = a[65];
+  yr = a[2];
+  yi = a[3];
+  a[64] = yr;
+  a[65] = yi;
+  a[2] = xr;
+  a[3] = xi;
+
+  xr = a[72];
+  xi = a[73];
+  yr = a[18];
+  yi = a[19];
+  a[72] = yr;
+  a[73] = yi;
+  a[18] = xr;
+  a[19] = xi;
+
+  xr = a[80];
+  xi = a[81];
+  yr = a[10];
+  yi = a[11];
+  a[80] = yr;
+  a[81] = yi;
+  a[10] = xr;
+  a[11] = xi;
+
+  xr = a[88];
+  xi = a[89];
+  yr = a[26];
+  yi = a[27];
+  a[88] = yr;
+  a[89] = yi;
+  a[26] = xr;
+  a[27] = xi;
+
+  xr = a[74];
+  xi = a[75];
+  yr = a[82];
+  yi = a[83];
+  a[74] = yr;
+  a[75] = yi;
+  a[82] = xr;
+  a[83] = xi;
+
+  xr = a[32];
+  xi = a[33];
+  yr = a[4];
+  yi = a[5];
+  a[32] = yr;
+  a[33] = yi;
+  a[4] = xr;
+  a[5] = xi;
+
+  xr = a[40];
+  xi = a[41];
+  yr = a[20];
+  yi = a[21];
+  a[40] = yr;
+  a[41] = yi;
+  a[20] = xr;
+  a[21] = xi;
+
+  xr = a[48];
+  xi = a[49];
+  yr = a[12];
+  yi = a[13];
+  a[48] = yr;
+  a[49] = yi;
+  a[12] = xr;
+  a[13] = xi;
+
+  xr = a[56];
+  xi = a[57];
+  yr = a[28];
+  yi = a[29];
+  a[56] = yr;
+  a[57] = yi;
+  a[28] = xr;
+  a[29] = xi;
+
+  xr = a[34];
+  xi = a[35];
+  yr = a[68];
+  yi = a[69];
+  a[34] = yr;
+  a[35] = yi;
+  a[68] = xr;
+  a[69] = xi;
+
+  xr = a[42];
+  xi = a[43];
+  yr = a[84];
+  yi = a[85];
+  a[42] = yr;
+  a[43] = yi;
+  a[84] = xr;
+  a[85] = xi;
+
+  xr = a[50];
+  xi = a[51];
+  yr = a[76];
+  yi = a[77];
+  a[50] = yr;
+  a[51] = yi;
+  a[76] = xr;
+  a[77] = xi;
+
+  xr = a[58];
+  xi = a[59];
+  yr = a[92];
+  yi = a[93];
+  a[58] = yr;
+  a[59] = yi;
+  a[92] = xr;
+  a[93] = xi;
+
+  xr = a[44];
+  xi = a[45];
+  yr = a[52];
+  yi = a[53];
+  a[44] = yr;
+  a[45] = yi;
+  a[52] = xr;
+  a[53] = xi;
+
+  xr = a[96];
+  xi = a[97];
+  yr = a[6];
+  yi = a[7];
+  a[96] = yr;
+  a[97] = yi;
+  a[6] = xr;
+  a[7] = xi;
+
+  xr = a[104];
+  xi = a[105];
+  yr = a[22];
+  yi = a[23];
+  a[104] = yr;
+  a[105] = yi;
+  a[22] = xr;
+  a[23] = xi;
+
+  xr = a[112];
+  xi = a[113];
+  yr = a[14];
+  yi = a[15];
+  a[112] = yr;
+  a[113] = yi;
+  a[14] = xr;
+  a[15] = xi;
+
+  xr = a[120];
+  xi = a[121];
+  yr = a[30];
+  yi = a[31];
+  a[120] = yr;
+  a[121] = yi;
+  a[30] = xr;
+  a[31] = xi;
+
+  xr = a[98];
+  xi = a[99];
+  yr = a[70];
+  yi = a[71];
+  a[98] = yr;
+  a[99] = yi;
+  a[70] = xr;
+  a[71] = xi;
+
+  xr = a[106];
+  xi = a[107];
+  yr = a[86];
+  yi = a[87];
+  a[106] = yr;
+  a[107] = yi;
+  a[86] = xr;
+  a[87] = xi;
+
+  xr = a[114];
+  xi = a[115];
+  yr = a[78];
+  yi = a[79];
+  a[114] = yr;
+  a[115] = yi;
+  a[78] = xr;
+  a[79] = xi;
+
+  xr = a[122];
+  xi = a[123];
+  yr = a[94];
+  yi = a[95];
+  a[122] = yr;
+  a[123] = yi;
+  a[94] = xr;
+  a[95] = xi;
+
+  xr = a[100];
+  xi = a[101];
+  yr = a[38];
+  yi = a[39];
+  a[100] = yr;
+  a[101] = yi;
+  a[38] = xr;
+  a[39] = xi;
+
+  xr = a[108];
+  xi = a[109];
+  yr = a[54];
+  yi = a[55];
+  a[108] = yr;
+  a[109] = yi;
+  a[54] = xr;
+  a[55] = xi;
+
+  xr = a[116];
+  xi = a[117];
+  yr = a[46];
+  yi = a[47];
+  a[116] = yr;
+  a[117] = yi;
+  a[46] = xr;
+  a[47] = xi;
+
+  xr = a[124];
+  xi = a[125];
+  yr = a[62];
+  yi = a[63];
+  a[124] = yr;
+  a[125] = yi;
+  a[62] = xr;
+  a[63] = xi;
+
+  xr = a[110];
+  xi = a[111];
+  yr = a[118];
+  yi = a[119];
+  a[110] = yr;
+  a[111] = yi;
+  a[118] = xr;
+  a[119] = xi;
+}
+
+static void cft1st_128_mips(float *a) {
+  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  float f0, f1, f2, f3, f4, f5, f6, f7;
+  int a_ptr, p1_rdft, p2_rdft, count;
+
+  __asm __volatile (
+    ".set       push                                                       \n\t"
+    ".set       noreorder                                                  \n\t"
+    // first 16
+    "lwc1       %[f0],          0(%[a])                                    \n\t"
+    "lwc1       %[f1],          4(%[a])                                    \n\t"
+    "lwc1       %[f2],          8(%[a])                                    \n\t"
+    "lwc1       %[f3],          12(%[a])                                   \n\t"
+    "lwc1       %[f4],          16(%[a])                                   \n\t"
+    "lwc1       %[f5],          20(%[a])                                   \n\t"
+    "lwc1       %[f6],          24(%[a])                                   \n\t"
+    "lwc1       %[f7],          28(%[a])                                   \n\t"
+    "add.s      %[x0r],         %[f0],           %[f2]                     \n\t"
+    "add.s      %[x0i],         %[f1],           %[f3]                     \n\t"
+    "sub.s      %[x1r],         %[f0],           %[f2]                     \n\t"
+    "add.s      %[x2r],         %[f4],           %[f6]                     \n\t"
+    "add.s      %[x2i],         %[f5],           %[f7]                     \n\t"
+    "sub.s      %[x1i],         %[f1],           %[f3]                     \n\t"
+    "sub.s      %[x3r],         %[f4],           %[f6]                     \n\t"
+    "sub.s      %[x3i],         %[f5],           %[f7]                     \n\t"
+    "add.s      %[f0],          %[x0r],          %[x2r]                    \n\t"
+    "add.s      %[f1],          %[x0i],          %[x2i]                    \n\t"
+    "sub.s      %[f4],          %[x0r],          %[x2r]                    \n\t"
+    "sub.s      %[f5],          %[x0i],          %[x2i]                    \n\t"
+    "sub.s      %[f2],          %[x1r],          %[x3i]                    \n\t"
+    "add.s      %[f3],          %[x1i],          %[x3r]                    \n\t"
+    "add.s      %[f6],          %[x1r],          %[x3i]                    \n\t"
+    "sub.s      %[f7],          %[x1i],          %[x3r]                    \n\t"
+    "swc1       %[f0],          0(%[a])                                    \n\t"
+    "swc1       %[f1],          4(%[a])                                    \n\t"
+    "swc1       %[f2],          8(%[a])                                    \n\t"
+    "swc1       %[f3],          12(%[a])                                   \n\t"
+    "swc1       %[f4],          16(%[a])                                   \n\t"
+    "swc1       %[f5],          20(%[a])                                   \n\t"
+    "swc1       %[f6],          24(%[a])                                   \n\t"
+    "swc1       %[f7],          28(%[a])                                   \n\t"
+    "lwc1       %[f0],          32(%[a])                                   \n\t"
+    "lwc1       %[f1],          36(%[a])                                   \n\t"
+    "lwc1       %[f2],          40(%[a])                                   \n\t"
+    "lwc1       %[f3],          44(%[a])                                   \n\t"
+    "lwc1       %[f4],          48(%[a])                                   \n\t"
+    "lwc1       %[f5],          52(%[a])                                   \n\t"
+    "lwc1       %[f6],          56(%[a])                                   \n\t"
+    "lwc1       %[f7],          60(%[a])                                   \n\t"
+    "add.s      %[x0r],         %[f0],           %[f2]                     \n\t"
+    "add.s      %[x0i],         %[f1],           %[f3]                     \n\t"
+    "sub.s      %[x1r],         %[f0],           %[f2]                     \n\t"
+    "sub.s      %[x1i],         %[f1],           %[f3]                     \n\t"
+    "sub.s      %[x3r],         %[f4],           %[f6]                     \n\t"
+    "sub.s      %[x3i],         %[f5],           %[f7]                     \n\t"
+    "add.s      %[x2r],         %[f4],           %[f6]                     \n\t"
+    "add.s      %[x2i],         %[f5],           %[f7]                     \n\t"
+    "lwc1       %[wk2r],        8(%[rdft_w])                               \n\t"
+    "add.s      %[f3],          %[x1i],          %[x3r]                    \n\t"
+    "sub.s      %[f2],          %[x1r],          %[x3i]                    \n\t"
+    "add.s      %[f6],          %[x3i],          %[x1r]                    \n\t"
+    "sub.s      %[f7],          %[x3r],          %[x1i]                    \n\t"
+    "add.s      %[f0],          %[x0r],          %[x2r]                    \n\t"
+    "add.s      %[f1],          %[x0i],          %[x2i]                    \n\t"
+    "sub.s      %[x1r],         %[f2],           %[f3]                     \n\t"
+    "add.s      %[x1i],         %[f3],           %[f2]                     \n\t"
+    "sub.s      %[x3r],         %[f7],           %[f6]                     \n\t"
+    "add.s      %[x3i],         %[f7],           %[f6]                     \n\t"
+    "sub.s      %[f4],          %[x0r],          %[x2r]                    \n\t"
+    "mul.s      %[f2],          %[wk2r],         %[x1r]                    \n\t"
+    "mul.s      %[f3],          %[wk2r],         %[x1i]                    \n\t"
+    "mul.s      %[f6],          %[wk2r],         %[x3r]                    \n\t"
+    "mul.s      %[f7],          %[wk2r],         %[x3i]                    \n\t"
+    "sub.s      %[f5],          %[x2i],          %[x0i]                    \n\t"
+    "swc1       %[f0],          32(%[a])                                   \n\t"
+    "swc1       %[f1],          36(%[a])                                   \n\t"
+    "swc1       %[f2],          40(%[a])                                   \n\t"
+    "swc1       %[f3],          44(%[a])                                   \n\t"
+    "swc1       %[f5],          48(%[a])                                   \n\t"
+    "swc1       %[f4],          52(%[a])                                   \n\t"
+    "swc1       %[f6],          56(%[a])                                   \n\t"
+    "swc1       %[f7],          60(%[a])                                   \n\t"
+    // prepare for loop
+    "addiu      %[a_ptr],       %[a],            64                        \n\t"
+    "addiu      %[p1_rdft],     %[rdft_w],       8                         \n\t"
+    "addiu      %[p2_rdft],     %[rdft_w],       16                        \n\t"
+    "addiu      %[count],       $zero,           7                         \n\t"
+    // loop
+   "1:                                                                     \n\t"
+    "lwc1       %[f0],          0(%[a_ptr])                                \n\t"
+    "lwc1       %[f1],          4(%[a_ptr])                                \n\t"
+    "lwc1       %[f2],          8(%[a_ptr])                                \n\t"
+    "lwc1       %[f3],          12(%[a_ptr])                               \n\t"
+    "lwc1       %[f4],          16(%[a_ptr])                               \n\t"
+    "lwc1       %[f5],          20(%[a_ptr])                               \n\t"
+    "lwc1       %[f6],          24(%[a_ptr])                               \n\t"
+    "lwc1       %[f7],          28(%[a_ptr])                               \n\t"
+    "add.s      %[x0r],         %[f0],           %[f2]                     \n\t"
+    "add.s      %[x2r],         %[f4],           %[f6]                     \n\t"
+    "add.s      %[x0i],         %[f1],           %[f3]                     \n\t"
+    "add.s      %[x2i],         %[f5],           %[f7]                     \n\t"
+    "sub.s      %[x1r],         %[f0],           %[f2]                     \n\t"
+    "sub.s      %[x1i],         %[f1],           %[f3]                     \n\t"
+    "sub.s      %[x3r],         %[f4],           %[f6]                     \n\t"
+    "sub.s      %[x3i],         %[f5],           %[f7]                     \n\t"
+    "lwc1       %[wk2i],        4(%[p1_rdft])                              \n\t"
+    "sub.s      %[f0],          %[x0r],          %[x2r]                    \n\t"
+    "sub.s      %[f1],          %[x0i],          %[x2i]                    \n\t"
+    "add.s      %[f2],          %[x1i],          %[x3r]                    \n\t"
+    "sub.s      %[f3],          %[x1r],          %[x3i]                    \n\t"
+    "lwc1       %[wk1r],        0(%[p2_rdft])                              \n\t"
+    "add.s      %[f4],          %[x1r],          %[x3i]                    \n\t"
+    "sub.s      %[f5],          %[x1i],          %[x3r]                    \n\t"
+    "lwc1       %[wk3r],        8(%[first])                                \n\t"
+    "mul.s      %[x3r],         %[wk2r],         %[f0]                     \n\t"
+    "mul.s      %[x3i],         %[wk2r],         %[f1]                     \n\t"
+    "mul.s      %[x1r],         %[wk1r],         %[f3]                     \n\t"
+    "mul.s      %[x1i],         %[wk1r],         %[f2]                     \n\t"
+    "lwc1       %[wk1i],        4(%[p2_rdft])                              \n\t"
+    "mul.s      %[f6],          %[wk3r],         %[f4]                     \n\t"
+    "mul.s      %[f7],          %[wk3r],         %[f5]                     \n\t"
+    "lwc1       %[wk3i],        12(%[first])                               \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s      %[wk1r],        %[wk2i],         %[f1]                     \n\t"
+    "mul.s      %[f0],          %[wk2i],         %[f0]                     \n\t"
+    "sub.s      %[x3r],         %[x3r],          %[wk1r]                   \n\t"
+    "add.s      %[x3i],         %[x3i],          %[f0]                     \n\t"
+    "add.s      %[f0],          %[x0r],          %[x2r]                    \n\t"
+    "add.s      %[f1],          %[x0i],          %[x2i]                    \n\t"
+    "mul.s      %[x0r],         %[wk1i],         %[f2]                     \n\t"
+    "mul.s      %[f3],          %[wk1i],         %[f3]                     \n\t"
+    "mul.s      %[x2r],         %[wk3i],         %[f5]                     \n\t"
+    "mul.s      %[f4],          %[wk3i],         %[f4]                     \n\t"
+    "sub.s      %[x1r],         %[x1r],          %[x0r]                    \n\t"
+    "add.s      %[x1i],         %[x1i],          %[f3]                     \n\t"
+    "sub.s      %[f6],          %[f6],           %[x2r]                    \n\t"
+    "add.s      %[f7],          %[f7],           %[f4]                     \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "nmsub.s    %[x3r],         %[x3r],          %[wk2i],        %[f1]     \n\t"
+    "madd.s     %[x3i],         %[x3i],          %[wk2i],        %[f0]     \n\t"
+    "add.s      %[f0],          %[x0r],          %[x2r]                    \n\t"
+    "add.s      %[f1],          %[x0i],          %[x2i]                    \n\t"
+    "nmsub.s    %[x1r],         %[x1r],          %[wk1i],        %[f2]     \n\t"
+    "madd.s     %[x1i],         %[x1i],          %[wk1i],        %[f3]     \n\t"
+    "nmsub.s    %[f6],          %[f6],           %[wk3i],        %[f5]     \n\t"
+    "madd.s     %[f7],          %[f7],           %[wk3i],        %[f4]     \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1       %[f0],          0(%[a_ptr])                                \n\t"
+    "swc1       %[f1],          4(%[a_ptr])                                \n\t"
+    "swc1       %[x1r],         8(%[a_ptr])                                \n\t"
+    "swc1       %[x1i],         12(%[a_ptr])                               \n\t"
+    "swc1       %[x3r],         16(%[a_ptr])                               \n\t"
+    "swc1       %[x3i],         20(%[a_ptr])                               \n\t"
+    "swc1       %[f6],          24(%[a_ptr])                               \n\t"
+    "swc1       %[f7],          28(%[a_ptr])                               \n\t"
+    "lwc1       %[f0],          32(%[a_ptr])                               \n\t"
+    "lwc1       %[f1],          36(%[a_ptr])                               \n\t"
+    "lwc1       %[f2],          40(%[a_ptr])                               \n\t"
+    "lwc1       %[f3],          44(%[a_ptr])                               \n\t"
+    "lwc1       %[f4],          48(%[a_ptr])                               \n\t"
+    "lwc1       %[f5],          52(%[a_ptr])                               \n\t"
+    "lwc1       %[f6],          56(%[a_ptr])                               \n\t"
+    "lwc1       %[f7],          60(%[a_ptr])                               \n\t"
+    "add.s      %[x0r],         %[f0],           %[f2]                     \n\t"
+    "add.s      %[x2r],         %[f4],           %[f6]                     \n\t"
+    "add.s      %[x0i],         %[f1],           %[f3]                     \n\t"
+    "add.s      %[x2i],         %[f5],           %[f7]                     \n\t"
+    "sub.s      %[x1r],         %[f0],           %[f2]                     \n\t"
+    "sub.s      %[x1i],         %[f1],           %[f3]                     \n\t"
+    "sub.s      %[x3r],         %[f4],           %[f6]                     \n\t"
+    "sub.s      %[x3i],         %[f5],           %[f7]                     \n\t"
+    "lwc1       %[wk1r],        8(%[p2_rdft])                              \n\t"
+    "sub.s      %[f0],          %[x0r],          %[x2r]                    \n\t"
+    "sub.s      %[f1],          %[x0i],          %[x2i]                    \n\t"
+    "add.s      %[f2],          %[x1i],          %[x3r]                    \n\t"
+    "sub.s      %[f3],          %[x1r],          %[x3i]                    \n\t"
+    "add.s      %[f4],          %[x1r],          %[x3i]                    \n\t"
+    "sub.s      %[f5],          %[x1i],          %[x3r]                    \n\t"
+    "lwc1       %[wk3r],        8(%[second])                               \n\t"
+    "mul.s      %[x3r],         %[wk2i],         %[f0]                     \n\t"
+    "mul.s      %[x3i],         %[wk2i],         %[f1]                     \n\t"
+    "mul.s      %[x1r],         %[wk1r],         %[f3]                     \n\t"
+    "mul.s      %[x1i],         %[wk1r],         %[f2]                     \n\t"
+    "mul.s      %[f6],          %[wk3r],         %[f4]                     \n\t"
+    "mul.s      %[f7],          %[wk3r],         %[f5]                     \n\t"
+    "lwc1       %[wk1i],        12(%[p2_rdft])                             \n\t"
+    "lwc1       %[wk3i],        12(%[second])                              \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s      %[wk1r],        %[wk2r],         %[f1]                     \n\t"
+    "mul.s      %[f0],          %[wk2r],         %[f0]                     \n\t"
+    "add.s      %[x3r],         %[x3r],          %[wk1r]                   \n\t"
+    "neg.s      %[x3r],         %[x3r]                                     \n\t"
+    "sub.s      %[x3i],         %[f0],           %[x3i]                    \n\t"
+    "add.s      %[f0],          %[x0r],          %[x2r]                    \n\t"
+    "add.s      %[f1],          %[x0i],          %[x2i]                    \n\t"
+    "mul.s      %[x0r],         %[wk1i],         %[f2]                     \n\t"
+    "mul.s      %[f3],          %[wk1i],         %[f3]                     \n\t"
+    "mul.s      %[x2r],         %[wk3i],         %[f5]                     \n\t"
+    "mul.s      %[f4],          %[wk3i],         %[f4]                     \n\t"
+    "sub.s      %[x1r],         %[x1r],          %[x0r]                    \n\t"
+    "add.s      %[x1i],         %[x1i],          %[f3]                     \n\t"
+    "sub.s      %[f6],          %[f6],           %[x2r]                    \n\t"
+    "add.s      %[f7],          %[f7],           %[f4]                     \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "nmadd.s    %[x3r],         %[x3r],          %[wk2r],        %[f1]     \n\t"
+    "msub.s     %[x3i],         %[x3i],          %[wk2r],        %[f0]     \n\t"
+    "add.s      %[f0],          %[x0r],          %[x2r]                    \n\t"
+    "add.s      %[f1],          %[x0i],          %[x2i]                    \n\t"
+    "nmsub.s    %[x1r],         %[x1r],          %[wk1i],        %[f2]     \n\t"
+    "madd.s     %[x1i],         %[x1i],          %[wk1i],        %[f3]     \n\t"
+    "nmsub.s    %[f6],          %[f6],           %[wk3i],        %[f5]     \n\t"
+    "madd.s     %[f7],          %[f7],           %[wk3i],        %[f4]     \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "addiu      %[count],       %[count],        -1                        \n\t"
+    "lwc1       %[wk2r],        8(%[p1_rdft])                              \n\t"
+    "addiu      %[a_ptr],       %[a_ptr],        64                        \n\t"
+    "addiu      %[p1_rdft],     %[p1_rdft],      8                         \n\t"
+    "addiu      %[p2_rdft],     %[p2_rdft],      16                        \n\t"
+    "addiu      %[first],       %[first],        8                         \n\t"
+    "swc1       %[f0],          -32(%[a_ptr])                              \n\t"
+    "swc1       %[f1],          -28(%[a_ptr])                              \n\t"
+    "swc1       %[x1r],         -24(%[a_ptr])                              \n\t"
+    "swc1       %[x1i],         -20(%[a_ptr])                              \n\t"
+    "swc1       %[x3r],         -16(%[a_ptr])                              \n\t"
+    "swc1       %[x3i],         -12(%[a_ptr])                              \n\t"
+    "swc1       %[f6],          -8(%[a_ptr])                               \n\t"
+    "swc1       %[f7],          -4(%[a_ptr])                               \n\t"
+    "bgtz       %[count],       1b                                         \n\t"
+    " addiu     %[second],      %[second],       8                         \n\t"
+    ".set       pop                                                        \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [x0r] "=&f" (x0r), [x0i] "=&f" (x0i), [x1r] "=&f" (x1r),
+      [x1i] "=&f" (x1i), [x2r] "=&f" (x2r), [x2i] "=&f" (x2i),
+      [x3r] "=&f" (x3r), [x3i] "=&f" (x3i), [wk1r] "=&f" (wk1r),
+      [wk1i] "=&f" (wk1i), [wk2r] "=&f" (wk2r), [wk2i] "=&f" (wk2i),
+      [wk3r] "=&f" (wk3r), [wk3i] "=&f" (wk3i), [a_ptr] "=&r" (a_ptr),
+      [p1_rdft] "=&r" (p1_rdft), [p2_rdft] "=&r" (p2_rdft),
+      [count] "=&r" (count)
+    : [a] "r" (a), [rdft_w] "r" (rdft_w), [first] "r" (rdft_wk3ri_first),
+      [second] "r" (rdft_wk3ri_second)
+    : "memory"
+  );
+}
+
+static void cftmdl_128_mips(float *a) {  // MIPS FPU asm; rewrites a[0..127] in place.
+  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;  // Coefficients from rdft_w / rdft_wk3ri_*.
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;  // Asm scratch: pair sums/differences.
+  float f0, f1, f2, f3, f4, f5, f6, f7;  // Asm scratch: loaded inputs, then outputs.
+  int tmp_a, count;  // Asm cursor into a and loop counter. NOTE(review): address kept in an int.
+  // Group 1, a[0..31]: add/sub-only butterflies on the (re, im) pairs at tmp_a + 0/32/64/96.
+  __asm __volatile (
+    ".set       push                                      \n\t"
+    ".set       noreorder                                 \n\t"
+    "addiu      %[tmp_a],   %[a],         0               \n\t"  // tmp_a = a
+    "addiu      %[count],   $zero,        4               \n\t"  // 4 iterations
+   "1:                                                    \n\t"
+    "addiu      %[count],   %[count],     -1              \n\t"
+    "lwc1       %[f0],      0(%[tmp_a])                   \n\t"
+    "lwc1       %[f1],      4(%[tmp_a])                   \n\t"
+    "lwc1       %[f2],      32(%[tmp_a])                  \n\t"
+    "lwc1       %[f3],      36(%[tmp_a])                  \n\t"
+    "lwc1       %[f4],      64(%[tmp_a])                  \n\t"
+    "lwc1       %[f5],      68(%[tmp_a])                  \n\t"
+    "lwc1       %[f6],      96(%[tmp_a])                  \n\t"
+    "lwc1       %[f7],      100(%[tmp_a])                 \n\t"
+    "add.s      %[x0r],     %[f0],        %[f2]           \n\t"
+    "add.s      %[x0i],     %[f1],        %[f3]           \n\t"
+    "add.s      %[x2r],     %[f4],        %[f6]           \n\t"
+    "add.s      %[x2i],     %[f5],        %[f7]           \n\t"
+    "sub.s      %[x1r],     %[f0],        %[f2]           \n\t"
+    "sub.s      %[x1i],     %[f1],        %[f3]           \n\t"
+    "sub.s      %[x3r],     %[f4],        %[f6]           \n\t"
+    "sub.s      %[x3i],     %[f5],        %[f7]           \n\t"
+    "add.s      %[f0],      %[x0r],       %[x2r]          \n\t"
+    "add.s      %[f1],      %[x0i],       %[x2i]          \n\t"
+    "sub.s      %[f4],      %[x0r],       %[x2r]          \n\t"
+    "sub.s      %[f5],      %[x0i],       %[x2i]          \n\t"
+    "sub.s      %[f2],      %[x1r],       %[x3i]          \n\t"
+    "add.s      %[f3],      %[x1i],       %[x3r]          \n\t"
+    "add.s      %[f6],      %[x1r],       %[x3i]          \n\t"
+    "sub.s      %[f7],      %[x1i],       %[x3r]          \n\t"
+    "swc1       %[f0],      0(%[tmp_a])                   \n\t"
+    "swc1       %[f1],      4(%[tmp_a])                   \n\t"
+    "swc1       %[f2],      32(%[tmp_a])                  \n\t"
+    "swc1       %[f3],      36(%[tmp_a])                  \n\t"
+    "swc1       %[f4],      64(%[tmp_a])                  \n\t"
+    "swc1       %[f5],      68(%[tmp_a])                  \n\t"
+    "swc1       %[f6],      96(%[tmp_a])                  \n\t"
+    "swc1       %[f7],      100(%[tmp_a])                 \n\t"
+    "bgtz       %[count],   1b                            \n\t"
+    " addiu     %[tmp_a],   %[tmp_a],     8               \n\t"  // delay slot: next pair
+    ".set       pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [x0r] "=&f" (x0r), [x0i] "=&f" (x0i), [x1r] "=&f" (x1r),
+      [x1i] "=&f" (x1i), [x2r] "=&f" (x2r), [x2i] "=&f" (x2i),
+      [x3r] "=&f" (x3r), [x3i] "=&f" (x3i), [tmp_a] "=&r" (tmp_a),
+      [count] "=&r" (count)
+    : [a] "r" (a)
+    : "memory"
+  );
+  wk2r = rdft_w[2];  // The only coefficient group 2 multiplies by.
+  __asm __volatile (  // Group 2, a[32..63]: the +32 and +96 outputs are scaled by wk2r.
+    ".set   push                                      \n\t"
+    ".set   noreorder                                 \n\t"
+    "addiu  %[tmp_a],   %[a],         128             \n\t"  // tmp_a = &a[32]
+    "addiu  %[count],   $zero,        4               \n\t"  // 4 iterations
+   "1:                                                \n\t"
+    "addiu  %[count],   %[count],     -1              \n\t"
+    "lwc1   %[f0],      0(%[tmp_a])                   \n\t"
+    "lwc1   %[f1],      4(%[tmp_a])                   \n\t"
+    "lwc1   %[f2],      32(%[tmp_a])                  \n\t"
+    "lwc1   %[f3],      36(%[tmp_a])                  \n\t"
+    "lwc1   %[f4],      64(%[tmp_a])                  \n\t"
+    "lwc1   %[f5],      68(%[tmp_a])                  \n\t"
+    "lwc1   %[f6],      96(%[tmp_a])                  \n\t"
+    "lwc1   %[f7],      100(%[tmp_a])                 \n\t"
+    "sub.s  %[x1r],     %[f0],        %[f2]           \n\t"
+    "sub.s  %[x1i],     %[f1],        %[f3]           \n\t"
+    "sub.s  %[x3r],     %[f4],        %[f6]           \n\t"
+    "sub.s  %[x3i],     %[f5],        %[f7]           \n\t"
+    "add.s  %[x0r],     %[f0],        %[f2]           \n\t"
+    "add.s  %[x0i],     %[f1],        %[f3]           \n\t"
+    "add.s  %[x2r],     %[f4],        %[f6]           \n\t"
+    "add.s  %[x2i],     %[f5],        %[f7]           \n\t"
+    "sub.s  %[f0],      %[x1r],       %[x3i]          \n\t"
+    "add.s  %[f1],      %[x1i],       %[x3r]          \n\t"
+    "sub.s  %[f2],      %[x3r],       %[x1i]          \n\t"
+    "add.s  %[f3],      %[x3i],       %[x1r]          \n\t"
+    "add.s  %[f4],      %[x0r],       %[x2r]          \n\t"
+    "add.s  %[f5],      %[x0i],       %[x2i]          \n\t"
+    "sub.s  %[f6],      %[f0],        %[f1]           \n\t"
+    "add.s  %[f0],      %[f0],        %[f1]           \n\t"
+    "sub.s  %[f7],      %[f2],        %[f3]           \n\t"
+    "add.s  %[f2],      %[f2],        %[f3]           \n\t"
+    "sub.s  %[f1],      %[x2i],       %[x0i]          \n\t"
+    "mul.s  %[f6],      %[f6],        %[wk2r]         \n\t"
+    "mul.s  %[f0],      %[f0],        %[wk2r]         \n\t"
+    "sub.s  %[f3],      %[x0r],       %[x2r]          \n\t"
+    "mul.s  %[f7],      %[f7],        %[wk2r]         \n\t"
+    "mul.s  %[f2],      %[f2],        %[wk2r]         \n\t"
+    "swc1   %[f4],      0(%[tmp_a])                   \n\t"
+    "swc1   %[f5],      4(%[tmp_a])                   \n\t"
+    "swc1   %[f6],      32(%[tmp_a])                  \n\t"
+    "swc1   %[f0],      36(%[tmp_a])                  \n\t"
+    "swc1   %[f1],      64(%[tmp_a])                  \n\t"
+    "swc1   %[f3],      68(%[tmp_a])                  \n\t"
+    "swc1   %[f7],      96(%[tmp_a])                  \n\t"
+    "swc1   %[f2],      100(%[tmp_a])                 \n\t"
+    "bgtz   %[count],   1b                            \n\t"
+    " addiu %[tmp_a],   %[tmp_a],     8               \n\t"  // delay slot: next pair
+    ".set   pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [x0r] "=&f" (x0r), [x0i] "=&f" (x0i), [x1r] "=&f" (x1r),
+      [x1i] "=&f" (x1i), [x2r] "=&f" (x2r), [x2i] "=&f" (x2i),
+      [x3r] "=&f" (x3r), [x3i] "=&f" (x3i), [tmp_a] "=&r" (tmp_a),
+      [count] "=&r" (count)
+    : [a] "r" (a), [wk2r] "f" (wk2r)
+    : "memory"
+  );
+  wk2i = rdft_w[3];  // wk2r (rdft_w[2]) is still live from above.
+  wk1r = rdft_w[4];
+  wk1i = rdft_w[5];
+  wk3r = rdft_wk3ri_first[2];
+  wk3i = rdft_wk3ri_first[3];
+  // Group 3, a[64..95]: the +32, +64, +96 outputs are complex-multiplied by wk1, wk2, wk3.
+  __asm __volatile (
+    ".set       push                                                       \n\t"
+    ".set       noreorder                                                  \n\t"
+    "addiu      %[tmp_a],       %[a],           256                        \n\t"  // &a[64]
+    "addiu      %[count],       $zero,          4                          \n\t"
+   "1:                                                                     \n\t"
+    "addiu      %[count],       %[count],       -1                         \n\t"
+    "lwc1       %[f0],          0(%[tmp_a])                                \n\t"
+    "lwc1       %[f1],          4(%[tmp_a])                                \n\t"
+    "lwc1       %[f2],          32(%[tmp_a])                               \n\t"
+    "lwc1       %[f3],          36(%[tmp_a])                               \n\t"
+    "lwc1       %[f4],          64(%[tmp_a])                               \n\t"
+    "lwc1       %[f5],          68(%[tmp_a])                               \n\t"
+    "lwc1       %[f6],          96(%[tmp_a])                               \n\t"
+    "lwc1       %[f7],          100(%[tmp_a])                              \n\t"
+    "add.s      %[x0r],         %[f0],          %[f2]                      \n\t"
+    "add.s      %[x2r],         %[f4],          %[f6]                      \n\t"
+    "add.s      %[x0i],         %[f1],          %[f3]                      \n\t"
+    "add.s      %[x2i],         %[f5],          %[f7]                      \n\t"
+    "sub.s      %[x1r],         %[f0],          %[f2]                      \n\t"
+    "sub.s      %[x1i],         %[f1],          %[f3]                      \n\t"
+    "sub.s      %[x3r],         %[f4],          %[f6]                      \n\t"
+    "sub.s      %[x3i],         %[f5],          %[f7]                      \n\t"
+    "sub.s      %[f0],          %[x0r],         %[x2r]                     \n\t"
+    "sub.s      %[f1],          %[x0i],         %[x2i]                     \n\t"
+    "add.s      %[f2],          %[x1i],         %[x3r]                     \n\t"
+    "sub.s      %[f3],          %[x1r],         %[x3i]                     \n\t"
+    "add.s      %[f4],          %[x1r],         %[x3i]                     \n\t"
+    "sub.s      %[f5],          %[x1i],         %[x3r]                     \n\t"
+    "mul.s      %[x3r],         %[wk2r],        %[f0]                      \n\t"
+    "mul.s      %[x3i],         %[wk2r],        %[f1]                      \n\t"
+    "mul.s      %[x1r],         %[wk1r],        %[f3]                      \n\t"
+    "mul.s      %[x1i],         %[wk1r],        %[f2]                      \n\t"
+    "mul.s      %[f6],          %[wk3r],        %[f4]                      \n\t"
+    "mul.s      %[f7],          %[wk3r],        %[f5]                      \n\t"
+#if !defined(MIPS32_R2_LE)  // Separate mul.s followed by add.s/sub.s.
+    "mul.s      %[f1],          %[wk2i],        %[f1]                      \n\t"
+    "mul.s      %[f0],          %[wk2i],        %[f0]                      \n\t"
+    "sub.s      %[x3r],         %[x3r],         %[f1]                      \n\t"
+    "add.s      %[x3i],         %[x3i],         %[f0]                      \n\t"
+    "add.s      %[f0],          %[x0r],         %[x2r]                     \n\t"
+    "add.s      %[f1],          %[x0i],         %[x2i]                     \n\t"
+    "mul.s      %[f2],          %[wk1i],        %[f2]                      \n\t"
+    "mul.s      %[f3],          %[wk1i],        %[f3]                      \n\t"
+    "mul.s      %[f5],          %[wk3i],        %[f5]                      \n\t"
+    "mul.s      %[f4],          %[wk3i],        %[f4]                      \n\t"
+    "sub.s      %[x1r],         %[x1r],         %[f2]                      \n\t"
+    "add.s      %[x1i],         %[x1i],         %[f3]                      \n\t"
+    "sub.s      %[f6],          %[f6],          %[f5]                      \n\t"
+    "add.s      %[f7],          %[f7],          %[f4]                      \n\t"
+#else // #if !defined(MIPS32_R2_LE) -- same formulas via madd.s/nmsub.s.
+    "nmsub.s    %[x3r],         %[x3r],         %[wk2i],        %[f1]      \n\t"
+    "madd.s     %[x3i],         %[x3i],         %[wk2i],        %[f0]      \n\t"
+    "add.s      %[f0],          %[x0r],         %[x2r]                     \n\t"
+    "add.s      %[f1],          %[x0i],         %[x2i]                     \n\t"
+    "nmsub.s    %[x1r],         %[x1r],         %[wk1i],        %[f2]      \n\t"
+    "madd.s     %[x1i],         %[x1i],         %[wk1i],        %[f3]      \n\t"
+    "nmsub.s    %[f6],          %[f6],          %[wk3i],        %[f5]      \n\t"
+    "madd.s     %[f7],          %[f7],          %[wk3i],        %[f4]      \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1       %[f0],          0(%[tmp_a])                                \n\t"
+    "swc1       %[f1],          4(%[tmp_a])                                \n\t"
+    "swc1       %[x1r],         32(%[tmp_a])                               \n\t"
+    "swc1       %[x1i],         36(%[tmp_a])                               \n\t"
+    "swc1       %[x3r],         64(%[tmp_a])                               \n\t"
+    "swc1       %[x3i],         68(%[tmp_a])                               \n\t"
+    "swc1       %[f6],          96(%[tmp_a])                               \n\t"
+    "swc1       %[f7],          100(%[tmp_a])                              \n\t"
+    "bgtz       %[count],       1b                                         \n\t"
+    " addiu     %[tmp_a],       %[tmp_a],       8                          \n\t"  // delay slot
+    ".set       pop                                                        \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [x0r] "=&f" (x0r), [x0i] "=&f" (x0i), [x1r] "=&f" (x1r),
+      [x1i] "=&f" (x1i), [x2r] "=&f" (x2r), [x2i] "=&f" (x2i),
+      [x3r] "=&f" (x3r), [x3i] "=&f" (x3i), [tmp_a] "=&r" (tmp_a),
+      [count] "=&r" (count)
+    : [a] "r" (a),  [wk1r] "f" (wk1r), [wk1i] "f" (wk1i), [wk2r] "f" (wk2r),
+      [wk2i] "f" (wk2i), [wk3r] "f" (wk3r), [wk3i] "f" (wk3i)
+    : "memory"
+  );
+  // Group 4 reloads wk1 from rdft_w[6..7] and wk3 from rdft_wk3ri_second; wk2r/wk2i are reused.
+  wk1r = rdft_w[6];
+  wk1i = rdft_w[7];
+  wk3r = rdft_wk3ri_second[2];
+  wk3i = rdft_wk3ri_second[3];
+  // Group 4, a[96..127]: as group 3, except +64 gets (-(wk2i*re + wk2r*im), wk2r*re - wk2i*im).
+  __asm __volatile (
+    ".set       push                                                       \n\t"
+    ".set       noreorder                                                  \n\t"
+    "addiu      %[tmp_a],       %[a],           384                        \n\t"  // &a[96]
+    "addiu      %[count],       $zero,          4                          \n\t"
+   "1:                                                                     \n\t"
+    "addiu      %[count],       %[count],       -1                         \n\t"
+    "lwc1       %[f0],          0(%[tmp_a])                                \n\t"
+    "lwc1       %[f1],          4(%[tmp_a])                                \n\t"
+    "lwc1       %[f2],          32(%[tmp_a])                               \n\t"
+    "lwc1       %[f3],          36(%[tmp_a])                               \n\t"
+    "lwc1       %[f4],          64(%[tmp_a])                               \n\t"
+    "lwc1       %[f5],          68(%[tmp_a])                               \n\t"
+    "lwc1       %[f6],          96(%[tmp_a])                               \n\t"
+    "lwc1       %[f7],          100(%[tmp_a])                              \n\t"
+    "add.s      %[x0r],         %[f0],          %[f2]                      \n\t"
+    "add.s      %[x2r],         %[f4],          %[f6]                      \n\t"
+    "add.s      %[x0i],         %[f1],          %[f3]                      \n\t"
+    "add.s      %[x2i],         %[f5],          %[f7]                      \n\t"
+    "sub.s      %[x1r],         %[f0],          %[f2]                      \n\t"
+    "sub.s      %[x1i],         %[f1],          %[f3]                      \n\t"
+    "sub.s      %[x3r],         %[f4],          %[f6]                      \n\t"
+    "sub.s      %[x3i],         %[f5],          %[f7]                      \n\t"
+    "sub.s      %[f0],          %[x0r],         %[x2r]                     \n\t"
+    "sub.s      %[f1],          %[x0i],         %[x2i]                     \n\t"
+    "add.s      %[f2],          %[x1i],         %[x3r]                     \n\t"
+    "sub.s      %[f3],          %[x1r],         %[x3i]                     \n\t"
+    "add.s      %[f4],          %[x1r],         %[x3i]                     \n\t"
+    "sub.s      %[f5],          %[x1i],         %[x3r]                     \n\t"
+    "mul.s      %[x3r],         %[wk2i],        %[f0]                      \n\t"
+    "mul.s      %[x3i],         %[wk2i],        %[f1]                      \n\t"
+    "mul.s      %[x1r],         %[wk1r],        %[f3]                      \n\t"
+    "mul.s      %[x1i],         %[wk1r],        %[f2]                      \n\t"
+    "mul.s      %[f6],          %[wk3r],        %[f4]                      \n\t"
+    "mul.s      %[f7],          %[wk3r],        %[f5]                      \n\t"
+#if !defined(MIPS32_R2_LE)  // Separate mul.s, then add.s/neg.s/sub.s.
+    "mul.s      %[f1],          %[wk2r],        %[f1]                      \n\t"
+    "mul.s      %[f0],          %[wk2r],        %[f0]                      \n\t"
+    "add.s      %[x3r],         %[x3r],         %[f1]                      \n\t"
+    "neg.s      %[x3r],         %[x3r]                                     \n\t"
+    "sub.s      %[x3i],         %[f0],          %[x3i]                     \n\t"
+    "add.s      %[f0],          %[x0r],         %[x2r]                     \n\t"
+    "add.s      %[f1],          %[x0i],         %[x2i]                     \n\t"
+    "mul.s      %[f2],          %[wk1i],        %[f2]                      \n\t"
+    "mul.s      %[f3],          %[wk1i],        %[f3]                      \n\t"
+    "mul.s      %[f5],          %[wk3i],        %[f5]                      \n\t"
+    "mul.s      %[f4],          %[wk3i],        %[f4]                      \n\t"
+    "sub.s      %[x1r],         %[x1r],         %[f2]                      \n\t"
+    "add.s      %[x1i],         %[x1i],         %[f3]                      \n\t"
+    "sub.s      %[f6],          %[f6],          %[f5]                      \n\t"
+    "add.s      %[f7],          %[f7],          %[f4]                      \n\t"
+#else // #if !defined(MIPS32_R2_LE) -- same formulas via nmadd.s/msub.s/nmsub.s/madd.s.
+    "nmadd.s    %[x3r],         %[x3r],         %[wk2r],        %[f1]      \n\t"
+    "msub.s     %[x3i],         %[x3i],         %[wk2r],        %[f0]      \n\t"
+    "add.s      %[f0],          %[x0r],         %[x2r]                     \n\t"
+    "add.s      %[f1],          %[x0i],         %[x2i]                     \n\t"
+    "nmsub.s    %[x1r],         %[x1r],         %[wk1i],        %[f2]      \n\t"
+    "madd.s     %[x1i],         %[x1i],         %[wk1i],        %[f3]      \n\t"
+    "nmsub.s    %[f6],          %[f6],          %[wk3i],        %[f5]      \n\t"
+    "madd.s     %[f7],          %[f7],          %[wk3i],        %[f4]      \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1       %[f0],          0(%[tmp_a])                                \n\t"
+    "swc1       %[f1],          4(%[tmp_a])                                \n\t"
+    "swc1       %[x1r],         32(%[tmp_a])                               \n\t"
+    "swc1       %[x1i],         36(%[tmp_a])                               \n\t"
+    "swc1       %[x3r],         64(%[tmp_a])                               \n\t"
+    "swc1       %[x3i],         68(%[tmp_a])                               \n\t"
+    "swc1       %[f6],          96(%[tmp_a])                               \n\t"
+    "swc1       %[f7],          100(%[tmp_a])                              \n\t"
+    "bgtz       %[count],       1b                                         \n\t"
+    " addiu     %[tmp_a],       %[tmp_a],       8                          \n\t"  // delay slot
+    ".set       pop                                                        \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [x0r] "=&f" (x0r), [x0i] "=&f" (x0i), [x1r] "=&f" (x1r),
+      [x1i] "=&f" (x1i), [x2r] "=&f" (x2r), [x2i] "=&f" (x2i),
+      [x3r] "=&f" (x3r), [x3i] "=&f" (x3i), [tmp_a] "=&r" (tmp_a),
+      [count] "=&r" (count)
+    : [a] "r" (a), [wk1r] "f" (wk1r), [wk1i] "f" (wk1i), [wk2r] "f" (wk2r),
+      [wk2i] "f" (wk2i), [wk3r] "f" (wk3r), [wk3i] "f" (wk3i)
+    : "memory"
+  );
+}
+
+static void cftfsub_128_mips(float *a) {  // Runs cft1st_128, cftmdl_128, then one asm stage on a.
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;  // Asm scratch: pair sums/differences.
+  float f0, f1, f2, f3, f4, f5, f6, f7;  // Asm scratch: loaded inputs, then outputs.
+  int tmp_a, count;  // Asm cursor into a and loop counter. NOTE(review): address kept in an int.
+  // cft1st_128 / cftmdl_128 are declared outside this block (presumably dispatch pointers - confirm).
+  cft1st_128(a);
+  cftmdl_128(a);
+  // Last stage: 16 add/sub-only butterflies over the (re, im) pairs at tmp_a + 0/128/256/384 bytes.
+  __asm __volatile (
+    ".set       push                                      \n\t"
+    ".set       noreorder                                 \n\t"
+    "addiu      %[tmp_a],       %[a],         0           \n\t"  // tmp_a = a
+    "addiu      %[count],       $zero,        16          \n\t"  // 16 iterations
+   "1:                                                    \n\t"
+    "addiu      %[count],       %[count],     -1          \n\t"
+    "lwc1       %[f0],          0(%[tmp_a])               \n\t"
+    "lwc1       %[f1],          4(%[tmp_a])               \n\t"
+    "lwc1       %[f2],          128(%[tmp_a])             \n\t"
+    "lwc1       %[f3],          132(%[tmp_a])             \n\t"
+    "lwc1       %[f4],          256(%[tmp_a])             \n\t"
+    "lwc1       %[f5],          260(%[tmp_a])             \n\t"
+    "lwc1       %[f6],          384(%[tmp_a])             \n\t"
+    "lwc1       %[f7],          388(%[tmp_a])             \n\t"
+    "add.s      %[x0r],         %[f0],        %[f2]       \n\t"
+    "add.s      %[x0i],         %[f1],        %[f3]       \n\t"
+    "add.s      %[x2r],         %[f4],        %[f6]       \n\t"
+    "add.s      %[x2i],         %[f5],        %[f7]       \n\t"
+    "sub.s      %[x1r],         %[f0],        %[f2]       \n\t"
+    "sub.s      %[x1i],         %[f1],        %[f3]       \n\t"
+    "sub.s      %[x3r],         %[f4],        %[f6]       \n\t"
+    "sub.s      %[x3i],         %[f5],        %[f7]       \n\t"
+    "add.s      %[f0],          %[x0r],       %[x2r]      \n\t"
+    "add.s      %[f1],          %[x0i],       %[x2i]      \n\t"
+    "sub.s      %[f4],          %[x0r],       %[x2r]      \n\t"
+    "sub.s      %[f5],          %[x0i],       %[x2i]      \n\t"
+    "sub.s      %[f2],          %[x1r],       %[x3i]      \n\t"
+    "add.s      %[f3],          %[x1i],       %[x3r]      \n\t"
+    "add.s      %[f6],          %[x1r],       %[x3i]      \n\t"
+    "sub.s      %[f7],          %[x1i],       %[x3r]      \n\t"
+    "swc1       %[f0],          0(%[tmp_a])               \n\t"
+    "swc1       %[f1],          4(%[tmp_a])               \n\t"
+    "swc1       %[f2],          128(%[tmp_a])             \n\t"
+    "swc1       %[f3],          132(%[tmp_a])             \n\t"
+    "swc1       %[f4],          256(%[tmp_a])             \n\t"
+    "swc1       %[f5],          260(%[tmp_a])             \n\t"
+    "swc1       %[f6],          384(%[tmp_a])             \n\t"
+    "swc1       %[f7],          388(%[tmp_a])             \n\t"
+    "bgtz       %[count],       1b                        \n\t"
+    " addiu     %[tmp_a],       %[tmp_a],   8             \n\t"  // delay slot: next pair
+    ".set       pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [x0r] "=&f" (x0r), [x0i] "=&f" (x0i), [x1r] "=&f" (x1r),
+      [x1i] "=&f" (x1i), [x2r] "=&f" (x2r), [x2i] "=&f" (x2i),
+      [x3r] "=&f" (x3r), [x3i] "=&f" (x3i), [tmp_a] "=&r" (tmp_a),
+      [count] "=&r" (count)
+    : [a] "r" (a)
+    : "memory"
+  );
+}
+
+static void cftbsub_128_mips(float *a) {  // Runs cft1st_128 and cftmdl_128 on a, then one asm butterfly pass over a[0..127] in place.
+  float f0, f1, f2, f3, f4, f5, f6, f7;  // f0..f23: scratch FP operands for the asm; several are reused within one iteration
+  float f8, f9, f10, f11, f12, f13, f14, f15;
+  float f16, f17, f18, f19, f20, f21, f22, f23;
+  int tmp_a, count;  // tmp_a: running address into a; count: iterations left. NOTE(review): address kept in an int - assumes 32-bit pointers, confirm for the target
+
+  cft1st_128(a);  // both calls operate on a before the asm pass below
+  cftmdl_128(a);
+
+  __asm __volatile (
+    ".set       push                                        \n\t"
+    ".set       noreorder                                   \n\t"  // assembler must not reorder: delay slot is filled by hand
+    "addiu      %[tmp_a],   %[a],           0               \n\t"  // tmp_a = a
+    "addiu      %[count],   $zero,          8               \n\t"  // 8 iterations, 4 floats (two complex elements) each
+   "1:                                                      \n\t"  // below, j is the float index tmp_a points at (0, 4, ..., 28)
+    "addiu      %[count],   %[count],       -1              \n\t"
+    "lwc1       %[f0],      0(%[tmp_a])                     \n\t"  // f0, f1 = a[j], a[j+1]
+    "lwc1       %[f1],      4(%[tmp_a])                     \n\t"
+    "lwc1       %[f2],      128(%[tmp_a])                   \n\t"  // f2, f3 = a[j+32], a[j+33]
+    "lwc1       %[f3],      132(%[tmp_a])                   \n\t"
+    "lwc1       %[f4],      256(%[tmp_a])                   \n\t"  // f4, f5 = a[j+64], a[j+65]
+    "lwc1       %[f5],      260(%[tmp_a])                   \n\t"
+    "lwc1       %[f6],      384(%[tmp_a])                   \n\t"  // f6, f7 = a[j+96], a[j+97]
+    "lwc1       %[f7],      388(%[tmp_a])                   \n\t"
+    "lwc1       %[f8],      8(%[tmp_a])                     \n\t"  // f8..f15: the same four pairs for the next element (a[j+2], a[j+3], ...)
+    "lwc1       %[f9],      12(%[tmp_a])                    \n\t"
+    "lwc1       %[f10],     136(%[tmp_a])                   \n\t"
+    "lwc1       %[f11],     140(%[tmp_a])                   \n\t"
+    "lwc1       %[f12],     264(%[tmp_a])                   \n\t"
+    "lwc1       %[f13],     268(%[tmp_a])                   \n\t"
+    "lwc1       %[f14],     392(%[tmp_a])                   \n\t"
+    "lwc1       %[f15],     396(%[tmp_a])                   \n\t"
+    "add.s      %[f16],     %[f0],          %[f2]           \n\t"  // first level, element j: f16 = a[j] + a[j+32]
+    "add.s      %[f17],     %[f1],          %[f3]           \n\t"  // f17 = a[j+1] + a[j+33]; its sign is handled by neg.s / operand order below
+    "add.s      %[f18],     %[f4],          %[f6]           \n\t"
+    "add.s      %[f19],     %[f5],          %[f7]           \n\t"
+    "sub.s      %[f20],     %[f0],          %[f2]           \n\t"
+    "sub.s      %[f21],     %[f3],          %[f1]           \n\t"  // operands reversed: f21 = a[j+33] - a[j+1]
+    "sub.s      %[f22],     %[f4],          %[f6]           \n\t"
+    "sub.s      %[f23],     %[f5],          %[f7]           \n\t"
+    "add.s      %[f0],      %[f8],          %[f10]          \n\t"  // first level, element j+2: results reuse f0..f7
+    "add.s      %[f1],      %[f9],          %[f11]          \n\t"
+    "add.s      %[f2],      %[f12],         %[f14]          \n\t"
+    "add.s      %[f3],      %[f13],         %[f15]          \n\t"
+    "sub.s      %[f4],      %[f8],          %[f10]          \n\t"
+    "sub.s      %[f5],      %[f11],         %[f9]           \n\t"
+    "sub.s      %[f6],      %[f12],         %[f14]          \n\t"
+    "sub.s      %[f7],      %[f13],         %[f15]          \n\t"
+    "add.s      %[f8],      %[f16],         %[f18]          \n\t"  // second level, element j: outputs go to f8..f15
+    "add.s      %[f9],      %[f17],         %[f19]          \n\t"
+    "sub.s      %[f12],     %[f16],         %[f18]          \n\t"
+    "sub.s      %[f13],     %[f19],         %[f17]          \n\t"
+    "sub.s      %[f10],     %[f20],         %[f23]          \n\t"
+    "sub.s      %[f11],     %[f21],         %[f22]          \n\t"
+    "add.s      %[f14],     %[f20],         %[f23]          \n\t"
+    "add.s      %[f15],     %[f21],         %[f22]          \n\t"
+    "neg.s      %[f9],      %[f9]                           \n\t"  // f9 = -(f17 + f19), stored to a[j+1]
+    "add.s      %[f16],      %[f0],         %[f2]           \n\t"  // second level, element j+2: outputs overwrite f16..f23
+    "add.s      %[f17],      %[f1],         %[f3]           \n\t"
+    "sub.s      %[f20],      %[f0],         %[f2]           \n\t"
+    "sub.s      %[f21],      %[f3],         %[f1]           \n\t"
+    "sub.s      %[f18],      %[f4],         %[f7]           \n\t"
+    "sub.s      %[f19],      %[f5],         %[f6]           \n\t"
+    "add.s      %[f22],      %[f4],         %[f7]           \n\t"
+    "add.s      %[f23],      %[f5],         %[f6]           \n\t"
+    "neg.s      %[f17],      %[f17]                         \n\t"  // f17 = -(f1 + f3), stored to a[j+3]
+    "swc1       %[f8],      0(%[tmp_a])                     \n\t"  // write back element j results
+    "swc1       %[f10],     128(%[tmp_a])                   \n\t"
+    "swc1       %[f11],     132(%[tmp_a])                   \n\t"
+    "swc1       %[f12],     256(%[tmp_a])                   \n\t"
+    "swc1       %[f13],     260(%[tmp_a])                   \n\t"
+    "swc1       %[f14],     384(%[tmp_a])                   \n\t"
+    "swc1       %[f15],     388(%[tmp_a])                   \n\t"
+    "swc1       %[f9],      4(%[tmp_a])                     \n\t"
+    "swc1       %[f16],     8(%[tmp_a])                     \n\t"  // write back element j+2 results
+    "swc1       %[f18],     136(%[tmp_a])                   \n\t"
+    "swc1       %[f19],     140(%[tmp_a])                   \n\t"
+    "swc1       %[f20],     264(%[tmp_a])                   \n\t"
+    "swc1       %[f21],     268(%[tmp_a])                   \n\t"
+    "swc1       %[f22],     392(%[tmp_a])                   \n\t"
+    "swc1       %[f23],     396(%[tmp_a])                   \n\t"
+    "swc1       %[f17],     12(%[tmp_a])                    \n\t"
+    "bgtz       %[count],   1b                              \n\t"  // repeat while count > 0
+    " addiu     %[tmp_a],   %[tmp_a],       16              \n\t"  // branch delay slot: advance 16 bytes on every pass, taken or not
+    ".set       pop                                         \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),  // every output is early-clobber (&)
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
+      [f12] "=&f" (f12), [f13] "=&f" (f13), [f14] "=&f" (f14),
+      [f15] "=&f" (f15), [f16] "=&f" (f16), [f17] "=&f" (f17),
+      [f18] "=&f" (f18), [f19] "=&f" (f19), [f20] "=&f" (f20),
+      [f21] "=&f" (f21), [f22] "=&f" (f22), [f23] "=&f" (f23),
+      [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a)
+    : "memory"  // the asm reads and writes a[] behind the compiler's back
+  );
+}
+
+static void rftfsub_128_mips(float *a) {  // In-place asm pass pairing a[j], a[j+1] (j = 2..62) with a[k], a[k+1] (k = 128 - j), using weights read from rdft_w.
+  const float *c = rdft_w + 32;  // end sentinel: the loop stops once c2 has walked down to c
+  float wkr, wki, xr, xi, yr, yi;
+  const float temp = 0.5f;  // constant used to form wkr = 0.5 - ck1
+  float aj20=0, aj21=0, ak20=0, ak21=0, ck1=0;
+  float *a1 = a;  // walks downward; addressed with +504/+508 (and +496/+500) byte offsets for the a[k] side
+  float *a2 = a;  // walks upward for the a[j] side
+  float *c1 = rdft_w + 33;  // c[1]; walks upward, source of wki
+  float *c2 = c1 + 30;  // c[31]; walks downward, source of ck1
+
+  __asm __volatile (
+    ".set      push                                             \n\t"
+    ".set      noreorder                                        \n\t"  // delay slot at the end of the loop is filled by hand
+    "lwc1      %[aj20],     8(%[a2])                            \n\t"  // single first pass: a[2], a[3] against a[126], a[127]
+    "lwc1      %[ak20],     504(%[a1])                          \n\t"
+    "lwc1      %[ck1],      0(%[c2])                            \n\t"
+    "lwc1      %[aj21],     12(%[a2])                           \n\t"
+    "lwc1      %[ak21],     508(%[a1])                          \n\t"
+    "sub.s     %[wkr],      %[temp],      %[ck1]                \n\t"  // wkr = 0.5 - ck1
+    "sub.s     %[xr],       %[aj20],      %[ak20]               \n\t"  // xr = a[j] - a[k]
+    "add.s     %[xi],       %[aj21],      %[ak21]               \n\t"  // xi = a[j+1] + a[k+1]
+    "lwc1      %[wki],      0(%[c1])                            \n\t"
+    "addiu     %[c2],       %[c2],-4                            \n\t"  // c2 steps down one float
+    "mul.s     %[yr],       %[wkr],       %[xr]                 \n\t"
+    "mul.s     %[yi],       %[wkr],       %[xi]                 \n\t"
+#if !defined(MIPS32_R2_LE)  // path built from mul.s/add.s/sub.s only
+    "mul.s     %[xi],       %[wki],       %[xi]                 \n\t"
+    "mul.s     %[xr],       %[wki],       %[xr]                 \n\t"
+    "sub.s     %[yr],       %[yr],        %[xi]                 \n\t"  // yr = wkr*xr - wki*xi
+    "add.s     %[yi],       %[yi],        %[xr]                 \n\t"  // yi = wkr*xi + wki*xr
+#else // MIPS32_R2_LE defined: same two results via nmsub.s/madd.s
+    "nmsub.s   %[yr],       %[yr],        %[wki],     %[xi]     \n\t"  // yr = yr - wki*xi
+    "madd.s    %[yi],       %[yi],        %[wki],     %[xr]     \n\t"  // yi = yi + wki*xr
+#endif // #if !defined(MIPS32_R2_LE)
+    "addiu     %[c1],       %[c1],        4                     \n\t"  // c1 steps up one float
+    "sub.s     %[aj20],     %[aj20],      %[yr]                 \n\t"  // a[j]   -= yr
+    "sub.s     %[aj21],     %[aj21],      %[yi]                 \n\t"  // a[j+1] -= yi
+    "add.s     %[ak20],     %[ak20],      %[yr]                 \n\t"  // a[k]   += yr
+    "sub.s     %[ak21],     %[ak21],      %[yi]                 \n\t"  // a[k+1] -= yi
+    "addiu     %[a2],       %[a2],        8                     \n\t"  // a2 += 2 floats first, so offsets 0/4 below still address a[2], a[3]
+    "swc1      %[aj20],     0(%[a2])                            \n\t"
+    "swc1      %[aj21],     4(%[a2])                            \n\t"
+    "swc1      %[ak20],     504(%[a1])                          \n\t"
+    "swc1      %[ak21],     508(%[a1])                          \n\t"
+    "addiu     %[a1],       %[a1],        -8                    \n\t"  // a1 -= 2 floats
+    // Main loop: 15 iterations, each handling two (j, k) pairs.
+   "1:                                                          \n\t"
+    "lwc1      %[ck1],      0(%[c2])                            \n\t"  // first pair of the iteration uses the named operands
+    "lwc1      %[aj20],     8(%[a2])                            \n\t"
+    "lwc1      %[aj21],     12(%[a2])                           \n\t"
+    "lwc1      %[ak20],     504(%[a1])                          \n\t"
+    "lwc1      %[ak21],     508(%[a1])                          \n\t"
+    "lwc1      $f0,         -4(%[c2])                           \n\t"  // second pair uses hard-coded $f0..$f8 (declared in the clobber list)
+    "lwc1      $f2,         16(%[a2])                           \n\t"
+    "lwc1      $f3,         20(%[a2])                           \n\t"
+    "lwc1      $f8,         496(%[a1])                          \n\t"
+    "lwc1      $f7,         500(%[a1])                          \n\t"
+    "sub.s     %[wkr],      %[temp],      %[ck1]                \n\t"
+    "sub.s     %[xr],       %[aj20],      %[ak20]               \n\t"
+    "add.s     %[xi],       %[aj21],      %[ak21]               \n\t"
+    "lwc1      %[wki],      0(%[c1])                            \n\t"
+    "sub.s     $f0,         %[temp],      $f0                   \n\t"  // second pair: $f0 = wkr, $f6 = xr, $f4 = xi, $f5 = wki
+    "sub.s     $f6,         $f2,          $f8                   \n\t"
+    "add.s     $f4,         $f3,          $f7                   \n\t"
+    "lwc1      $f5,         4(%[c1])                            \n\t"
+    "mul.s     %[yr],       %[wkr],       %[xr]                 \n\t"
+    "mul.s     %[yi],       %[wkr],       %[xi]                 \n\t"
+    "mul.s     $f1,         $f0,          $f6                   \n\t"  // $f1 = wkr*xr (becomes yr)
+    "mul.s     $f0,         $f0,          $f4                   \n\t"  // $f0 = wkr*xi (becomes yi)
+    "addiu     %[c2],       %[c2],        -8                    \n\t"  // c2 steps down two floats
+#if !defined(MIPS32_R2_LE)  // path built from mul.s/add.s/sub.s only
+    "mul.s     %[xi],       %[wki],       %[xi]                 \n\t"
+    "mul.s     %[xr],       %[wki],       %[xr]                 \n\t"
+    "mul.s     $f4,         $f5,          $f4                   \n\t"
+    "mul.s     $f6,         $f5,          $f6                   \n\t"
+    "sub.s     %[yr],       %[yr],        %[xi]                 \n\t"
+    "add.s     %[yi],       %[yi],        %[xr]                 \n\t"
+    "sub.s     $f1,         $f1,          $f4                   \n\t"
+    "add.s     $f0,         $f0,          $f6                   \n\t"
+#else // MIPS32_R2_LE defined: same results via nmsub.s/madd.s
+    "nmsub.s   %[yr],       %[yr],        %[wki],     %[xi]     \n\t"
+    "madd.s    %[yi],       %[yi],        %[wki],     %[xr]     \n\t"
+    "nmsub.s   $f1,         $f1,          $f5,        $f4       \n\t"
+    "madd.s    $f0,         $f0,          $f5,        $f6       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "addiu     %[c1],       %[c1],        8                     \n\t"  // c1 steps up two floats
+    "sub.s     %[aj20],     %[aj20],      %[yr]                 \n\t"
+    "sub.s     %[aj21],     %[aj21],      %[yi]                 \n\t"
+    "add.s     %[ak20],     %[ak20],      %[yr]                 \n\t"
+    "sub.s     %[ak21],     %[ak21],      %[yi]                 \n\t"
+    "sub.s     $f2,         $f2,          $f1                   \n\t"  // second pair: same four updates as above
+    "sub.s     $f3,         $f3,          $f0                   \n\t"
+    "add.s     $f1,         $f8,          $f1                   \n\t"
+    "sub.s     $f0,         $f7,          $f0                   \n\t"
+    "swc1      %[aj20],     8(%[a2])                            \n\t"
+    "swc1      %[aj21],     12(%[a2])                           \n\t"
+    "swc1      %[ak20],     504(%[a1])                          \n\t"
+    "swc1      %[ak21],     508(%[a1])                          \n\t"
+    "swc1      $f2,         16(%[a2])                           \n\t"
+    "swc1      $f3,         20(%[a2])                           \n\t"
+    "swc1      $f1,         496(%[a1])                          \n\t"
+    "swc1      $f0,         500(%[a1])                          \n\t"
+    "addiu     %[a2],       %[a2],        16                    \n\t"  // a2 += 4 floats
+    "bne       %[c2],       %[c],         1b                    \n\t"  // loop until c2 reaches c
+    " addiu    %[a1],       %[a1],        -16                   \n\t"  // branch delay slot: a1 -= 4 floats on every pass
+    ".set      pop                                              \n\t"
+    : [a] "+r" (a), [c] "+r" (c), [a1] "+r" (a1), [a2] "+r" (a2),  // NOTE(review): the asm never references %[a] and only reads %[c]
+      [c1] "+r" (c1), [c2] "+r" (c2), [wkr] "=&f" (wkr), [wki] "=&f" (wki),
+      [xr] "=&f" (xr), [xi] "=&f" (xi), [yr] "=&f" (yr), [yi] "=&f" (yi),
+      [aj20] "=&f" (aj20), [aj21] "=&f" (aj21), [ak20] "=&f" (ak20),
+      [ak21] "=&f" (ak21), [ck1] "=&f" (ck1)
+    : [temp] "f" (temp)
+    : "memory", "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "$f8"  // hard-coded registers used for the second pair
+  );
+}
+
+static void rftbsub_128_mips(float *a) {  // Negates a[1] and a[65], then runs an in-place asm pass pairing a[j], a[j+1] (j = 2..62) with a[k], a[k+1] (k = 128 - j).
+  const float *c = rdft_w + 32;  // end sentinel: the loop stops once c2 has walked down to c
+  float wkr, wki, xr, xi, yr, yi;
+  a[1] = -a[1];  // neither a[1] nor a[65] is touched by the asm below
+  a[65] = -a[65];
+  const float temp = 0.5f;  // constant used to form wkr = 0.5 - ck1
+  float aj20=0, aj21=0, ak20=0, ak21=0, ck1=0;
+  float *a1 = a;  // walks downward; addressed with +504/+508 (and +496/+500) byte offsets for the a[k] side
+  float *a2 = a;  // walks upward for the a[j] side
+  float *c1 = rdft_w + 33;  // c[1]; walks upward, source of wki
+  float *c2 = c1 + 30;  // c[31]; walks downward, source of ck1
+
+  __asm __volatile (
+    ".set      push                                           \n\t"
+    ".set      noreorder                                      \n\t"  // delay slot at the end of the loop is filled by hand
+    "lwc1      %[aj20],     8(%[a2])                          \n\t"  // single first pass: a[2], a[3] against a[126], a[127]
+    "lwc1      %[ak20],     504(%[a1])                        \n\t"
+    "lwc1      %[ck1],      0(%[c2])                          \n\t"
+    "lwc1      %[aj21],     12(%[a2])                         \n\t"
+    "lwc1      %[ak21],     508(%[a1])                        \n\t"
+    "sub.s     %[wkr],      %[temp],    %[ck1]                \n\t"  // wkr = 0.5 - ck1
+    "sub.s     %[xr],       %[aj20],    %[ak20]               \n\t"  // xr = a[j] - a[k]
+    "add.s     %[xi],       %[aj21],    %[ak21]               \n\t"  // xi = a[j+1] + a[k+1]
+    "lwc1      %[wki],      0(%[c1])                          \n\t"
+    "addiu     %[c2],       %[c2],       -4                   \n\t"  // c2 steps down one float
+    "mul.s     %[yr],       %[wkr],     %[xr]                 \n\t"
+    "mul.s     %[yi],       %[wkr],     %[xi]                 \n\t"
+#if !defined(MIPS32_R2_LE)  // path built from mul.s/add.s/sub.s only
+    "mul.s     %[xi],       %[wki],     %[xi]                 \n\t"
+    "mul.s     %[xr],       %[wki],     %[xr]                 \n\t"
+    "add.s     %[yr],       %[yr],      %[xi]                 \n\t"  // yr = wkr*xr + wki*xi
+    "sub.s     %[yi],       %[yi],      %[xr]                 \n\t"  // yi = wkr*xi - wki*xr
+#else // MIPS32_R2_LE defined: same two results via madd.s/nmsub.s
+    "madd.s    %[yr],       %[yr],      %[wki],     %[xi]     \n\t"  // yr = yr + wki*xi
+    "nmsub.s   %[yi],       %[yi],      %[wki],     %[xr]     \n\t"  // yi = yi - wki*xr
+#endif // #if !defined(MIPS32_R2_LE)
+    "addiu     %[c1],       %[c1],4                           \n\t"  // c1 steps up one float
+    "sub.s     %[aj20],     %[aj20],    %[yr]                 \n\t"  // a[j]   = a[j] - yr
+    "sub.s     %[aj21],     %[yi],      %[aj21]               \n\t"  // a[j+1] = yi - a[j+1]
+    "add.s     %[ak20],     %[ak20],    %[yr]                 \n\t"  // a[k]   = a[k] + yr
+    "sub.s     %[ak21],     %[yi],      %[ak21]               \n\t"  // a[k+1] = yi - a[k+1]
+    "addiu     %[a2],       %[a2],      8                     \n\t"  // a2 += 2 floats first, so offsets 0/4 below still address a[2], a[3]
+    "swc1      %[aj20],     0(%[a2])                          \n\t"
+    "swc1      %[aj21],     4(%[a2])                          \n\t"
+    "swc1      %[ak20],     504(%[a1])                        \n\t"
+    "swc1      %[ak21],     508(%[a1])                        \n\t"
+    "addiu     %[a1],       %[a1],      -8                    \n\t"  // a1 -= 2 floats
+    // Main loop: 15 iterations, each handling two (j, k) pairs.
+   "1:                                                        \n\t"
+    "lwc1      %[ck1],      0(%[c2])                          \n\t"  // first pair of the iteration uses the named operands
+    "lwc1      %[aj20],     8(%[a2])                          \n\t"
+    "lwc1      %[aj21],     12(%[a2])                         \n\t"
+    "lwc1      %[ak20],     504(%[a1])                        \n\t"
+    "lwc1      %[ak21],     508(%[a1])                        \n\t"
+    "lwc1      $f0,         -4(%[c2])                         \n\t"  // second pair uses hard-coded $f0..$f8 (declared in the clobber list)
+    "lwc1      $f2,         16(%[a2])                         \n\t"
+    "lwc1      $f3,         20(%[a2])                         \n\t"
+    "lwc1      $f8,         496(%[a1])                        \n\t"
+    "lwc1      $f7,         500(%[a1])                        \n\t"
+    "sub.s     %[wkr],      %[temp],    %[ck1]                \n\t"
+    "sub.s     %[xr],       %[aj20],    %[ak20]               \n\t"
+    "add.s     %[xi],       %[aj21],    %[ak21]               \n\t"
+    "lwc1      %[wki],      0(%[c1])                          \n\t"
+    "sub.s     $f0,         %[temp],    $f0                   \n\t"  // second pair: $f0 = wkr, $f6 = xr, $f4 = xi, $f5 = wki
+    "sub.s     $f6,         $f2,        $f8                   \n\t"
+    "add.s     $f4,         $f3,        $f7                   \n\t"
+    "lwc1      $f5,         4(%[c1])                          \n\t"
+    "mul.s     %[yr],       %[wkr],     %[xr]                 \n\t"
+    "mul.s     %[yi],       %[wkr],     %[xi]                 \n\t"
+    "mul.s     $f1,         $f0,        $f6                   \n\t"  // $f1 = wkr*xr (becomes yr)
+    "mul.s     $f0,         $f0,        $f4                   \n\t"  // $f0 = wkr*xi (becomes yi)
+    "addiu     %[c2],       %[c2],      -8                    \n\t"  // c2 steps down two floats
+#if !defined(MIPS32_R2_LE)  // path built from mul.s/add.s/sub.s only
+    "mul.s     %[xi],       %[wki],     %[xi]                 \n\t"
+    "mul.s     %[xr],       %[wki],     %[xr]                 \n\t"
+    "mul.s     $f4,         $f5,        $f4                   \n\t"
+    "mul.s     $f6,         $f5,        $f6                   \n\t"
+    "add.s     %[yr],       %[yr],      %[xi]                 \n\t"
+    "sub.s     %[yi],       %[yi],      %[xr]                 \n\t"
+    "add.s     $f1,         $f1,        $f4                   \n\t"
+    "sub.s     $f0,         $f0,        $f6                   \n\t"
+#else // MIPS32_R2_LE defined: same results via madd.s/nmsub.s
+    "madd.s    %[yr],       %[yr],      %[wki],     %[xi]     \n\t"
+    "nmsub.s   %[yi],       %[yi],      %[wki],     %[xr]     \n\t"
+    "madd.s    $f1,         $f1,        $f5,        $f4       \n\t"
+    "nmsub.s   $f0,         $f0,        $f5,        $f6       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "addiu     %[c1],       %[c1],      8                     \n\t"  // c1 steps up two floats
+    "sub.s     %[aj20],     %[aj20],    %[yr]                 \n\t"
+    "sub.s     %[aj21],     %[yi],      %[aj21]               \n\t"
+    "add.s     %[ak20],     %[ak20],    %[yr]                 \n\t"
+    "sub.s     %[ak21],     %[yi],      %[ak21]               \n\t"
+    "sub.s     $f2,         $f2,        $f1                   \n\t"  // second pair: same four updates as above
+    "sub.s     $f3,         $f0,        $f3                   \n\t"
+    "add.s     $f1,         $f8,        $f1                   \n\t"
+    "sub.s     $f0,         $f0,        $f7                   \n\t"
+    "swc1      %[aj20],     8(%[a2])                          \n\t"
+    "swc1      %[aj21],     12(%[a2])                         \n\t"
+    "swc1      %[ak20],     504(%[a1])                        \n\t"
+    "swc1      %[ak21],     508(%[a1])                        \n\t"
+    "swc1      $f2,         16(%[a2])                         \n\t"
+    "swc1      $f3,         20(%[a2])                         \n\t"
+    "swc1      $f1,         496(%[a1])                        \n\t"
+    "swc1      $f0,         500(%[a1])                        \n\t"
+    "addiu     %[a2],       %[a2],      16                    \n\t"  // a2 += 4 floats
+    "bne       %[c2],       %[c],       1b                    \n\t"  // loop until c2 reaches c
+    " addiu    %[a1],       %[a1],      -16                   \n\t"  // branch delay slot: a1 -= 4 floats on every pass
+    ".set      pop                                            \n\t"
+    : [a] "+r" (a), [c] "+r" (c), [a1] "+r" (a1), [a2] "+r" (a2),  // NOTE(review): the asm never references %[a] and only reads %[c]
+      [c1] "+r" (c1), [c2] "+r" (c2), [wkr] "=&f" (wkr), [wki] "=&f" (wki),
+      [xr] "=&f" (xr), [xi] "=&f" (xi), [yr] "=&f" (yr), [yi] "=&f" (yi),
+      [aj20] "=&f" (aj20), [aj21] "=&f" (aj21), [ak20] "=&f" (ak20),
+      [ak21] "=&f" (ak21), [ck1] "=&f" (ck1)
+    : [temp] "f" (temp)
+    : "memory", "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "$f8"  // hard-coded registers used for the second pair
+  );
+}
+
+void aec_rdft_init_mips(void) {  // Points each FFT stage hook at the *_mips implementation from this file.
+  bitrv2_128 = bitrv2_128_mips;
+  cft1st_128 = cft1st_128_mips;    // cft* hooks
+  cftmdl_128 = cftmdl_128_mips;
+  cftfsub_128 = cftfsub_128_mips;
+  cftbsub_128 = cftbsub_128_mips;
+  rftfsub_128 = rftfsub_128_mips;  // rft* hooks
+  rftbsub_128 = rftbsub_128_mips;
+}
diff --git a/webrtc/modules/audio_processing/audio_processing.gypi b/webrtc/modules/audio_processing/audio_processing.gypi
index dd5a586cf8..da6121749a 100644
--- a/webrtc/modules/audio_processing/audio_processing.gypi
+++ b/webrtc/modules/audio_processing/audio_processing.gypi
@@ -136,6 +136,14 @@
           'sources': [
             'aecm/aecm_core_mips.c',
           ],
+          'conditions': [
+            ['mips_fpu==1', {
+              'sources': [
+                'aec/aec_core_mips.c',
+                'aec/aec_rdft_mips.c',
+              ],
+            }],
+          ],
         }, {
           'sources': [
             'aecm/aecm_core_c.c',