Revert of Added ARM Neon SIMD optimizations for AEC3 (patchset #2 id:970001 of https://codereview.webrtc.org/2834073005/ )

Reason for revert: The bug number for the chromium bug was wrong. Original issue's description: > Added ARM Neon optimizations for AEC3 > > This CL adds Neon SIMD optimizations for AEC3 on ARM, resulting > in an 8 times complexity reduction. The optimizations are basically > identical to what was already in place for SSE2. > > BUG=chromium:14993, webrtc:6018 > > Review-Url: https://codereview.webrtc.org/2834073005 > Cr-Commit-Position: refs/heads/master@{#17993} > Committed: f246b91eba TBR=ivoc@webrtc.org # Skipping CQ checks because original CL landed less than 1 days ago. NOPRESUBMIT=true NOTREECHECKS=true NOTRY=true BUG=chromium:14993, webrtc:6018 Review-Url: https://codereview.webrtc.org/2856113003 Cr-Commit-Position: refs/heads/master@{#17994}
2017-05-03 06:40:47 -07:00 · 2017-05-03 06:40:47 -07:00 · b70f8cfd4d
commit b70f8cfd4d
parent f246b91eba
10 changed files with 6 additions and 621 deletions
--- a/webrtc/modules/audio_processing/aec3/adaptive_fir_filter.cc
+++ b/webrtc/modules/audio_processing/aec3/adaptive_fir_filter.cc
@ -10,9 +10,6 @@

 #include "webrtc/modules/audio_processing/aec3/adaptive_fir_filter.h"

-#if defined(WEBRTC_HAS_NEON)
-#include <arm_neon.h>
-#endif
 #include "webrtc/typedefs.h"
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 #include <emmintrin.h>
@ -55,26 +52,6 @@ void UpdateFrequencyResponse(
  }
 }

-#if defined(WEBRTC_HAS_NEON)
-// Computes and stores the frequency response of the filter.
-void UpdateFrequencyResponse_NEON(
-    rtc::ArrayView<const FftData> H,
-    std::vector<std::array<float, kFftLengthBy2Plus1>>* H2) {
-  RTC_DCHECK_EQ(H.size(), H2->size());
-  for (size_t k = 0; k < H.size(); ++k) {
-    for (size_t j = 0; j < kFftLengthBy2; j += 4) {
-      const float32x4_t re = vld1q_f32(&H[k].re[j]);
-      const float32x4_t im = vld1q_f32(&H[k].im[j]);
-      float32x4_t H2_k_j = vmulq_f32(re, re);
-      H2_k_j = vmlaq_f32(H2_k_j, im, im);
-      vst1q_f32(&(*H2)[k][j], H2_k_j);
-    }
-    (*H2)[k][kFftLengthBy2] = H[k].re[kFftLengthBy2] * H[k].re[kFftLengthBy2] +
-                              H[k].im[kFftLengthBy2] * H[k].im[kFftLengthBy2];
-  }
-}
-#endif
-
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 // Computes and stores the frequency response of the filter.
 void UpdateFrequencyResponse_SSE2(
@ -108,25 +85,6 @@ void UpdateErlEstimator(
  }
 }

-#if defined(WEBRTC_HAS_NEON)
-// Computes and stores the echo return loss estimate of the filter, which is the
-// sum of the partition frequency responses.
-void UpdateErlEstimator_NEON(
-    const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
-    std::array<float, kFftLengthBy2Plus1>* erl) {
-  erl->fill(0.f);
-  for (auto& H2_j : H2) {
-    for (size_t k = 0; k < kFftLengthBy2; k += 4) {
-      const float32x4_t H2_j_k = vld1q_f32(&H2_j[k]);
-      float32x4_t erl_k = vld1q_f32(&(*erl)[k]);
-      erl_k = vaddq_f32(erl_k, H2_j_k);
-      vst1q_f32(&(*erl)[k], erl_k);
-    }
-    (*erl)[kFftLengthBy2] += H2_j[kFftLengthBy2];
-  }
-}
-#endif
-
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 // Computes and stores the echo return loss estimate of the filter, which is the
 // sum of the partition frequency responses.
@ -163,63 +121,6 @@ void AdaptPartitions(const RenderBuffer& render_buffer,
  }
 }

-#if defined(WEBRTC_HAS_NEON)
-// Adapts the filter partitions. (NEON variant)
-void AdaptPartitions_NEON(const RenderBuffer& render_buffer,
-                          const FftData& G,
-                          rtc::ArrayView<FftData> H) {
-  rtc::ArrayView<const FftData> render_buffer_data = render_buffer.Buffer();
-  const int lim1 =
-      std::min(render_buffer_data.size() - render_buffer.Position(), H.size());
-  const int lim2 = H.size();
-  constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
-  FftData* H_j = &H[0];
-  const FftData* X = &render_buffer_data[render_buffer.Position()];
-  int limit = lim1;
-  int j = 0;
-  do {
-    for (; j < limit; ++j, ++H_j, ++X) {
-      for (int k = 0, n = 0; n < kNumFourBinBands; ++n, k += 4) {
-        const float32x4_t G_re = vld1q_f32(&G.re[k]);
-        const float32x4_t G_im = vld1q_f32(&G.im[k]);
-        const float32x4_t X_re = vld1q_f32(&X->re[k]);
-        const float32x4_t X_im = vld1q_f32(&X->im[k]);
-        const float32x4_t H_re = vld1q_f32(&H_j->re[k]);
-        const float32x4_t H_im = vld1q_f32(&H_j->im[k]);
-        const float32x4_t a = vmulq_f32(X_re, G_re);
-        const float32x4_t e = vmlaq_f32(a, X_im, G_im);
-        const float32x4_t c = vmulq_f32(X_re, G_im);
-        const float32x4_t f = vmlsq_f32(c, X_im, G_re);
-        const float32x4_t g = vaddq_f32(H_re, e);
-        const float32x4_t h = vaddq_f32(H_im, f);
-
-        vst1q_f32(&H_j->re[k], g);
-        vst1q_f32(&H_j->im[k], h);
-      }
-    }
-
-    X = &render_buffer_data[0];
-    limit = lim2;
-  } while (j < lim2);
-
-  H_j = &H[0];
-  X = &render_buffer_data[render_buffer.Position()];
-  limit = lim1;
-  j = 0;
-  do {
-    for (; j < limit; ++j, ++H_j, ++X) {
-      H_j->re[kFftLengthBy2] += X->re[kFftLengthBy2] * G.re[kFftLengthBy2] +
-                                X->im[kFftLengthBy2] * G.im[kFftLengthBy2];
-      H_j->im[kFftLengthBy2] += X->re[kFftLengthBy2] * G.im[kFftLengthBy2] -
-                                X->im[kFftLengthBy2] * G.re[kFftLengthBy2];
-    }
-
-    X = &render_buffer_data[0];
-    limit = lim2;
-  } while (j < lim2);
-}
-#endif
-
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 // Adapts the filter partitions. (SSE2 variant)
 void AdaptPartitions_SSE2(const RenderBuffer& render_buffer,
@ -302,65 +203,6 @@ void ApplyFilter(const RenderBuffer& render_buffer,
  }
 }

-#if defined(WEBRTC_HAS_NEON)
-// Produces the filter output (NEON variant).
-void ApplyFilter_NEON(const RenderBuffer& render_buffer,
-                      rtc::ArrayView<const FftData> H,
-                      FftData* S) {
-  RTC_DCHECK_GE(H.size(), H.size() - 1);
-  S->re.fill(0.f);
-  S->im.fill(0.f);
-
-  rtc::ArrayView<const FftData> render_buffer_data = render_buffer.Buffer();
-  const int lim1 =
-      std::min(render_buffer_data.size() - render_buffer.Position(), H.size());
-  const int lim2 = H.size();
-  constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
-  const FftData* H_j = &H[0];
-  const FftData* X = &render_buffer_data[render_buffer.Position()];
-
-  int j = 0;
-  int limit = lim1;
-  do {
-    for (; j < limit; ++j, ++H_j, ++X) {
-      for (int k = 0, n = 0; n < kNumFourBinBands; ++n, k += 4) {
-        const float32x4_t X_re = vld1q_f32(&X->re[k]);
-        const float32x4_t X_im = vld1q_f32(&X->im[k]);
-        const float32x4_t H_re = vld1q_f32(&H_j->re[k]);
-        const float32x4_t H_im = vld1q_f32(&H_j->im[k]);
-        const float32x4_t S_re = vld1q_f32(&S->re[k]);
-        const float32x4_t S_im = vld1q_f32(&S->im[k]);
-        const float32x4_t a = vmulq_f32(X_re, H_re);
-        const float32x4_t e = vmlsq_f32(a, X_im, H_im);
-        const float32x4_t c = vmulq_f32(X_re, H_im);
-        const float32x4_t f = vmlaq_f32(c, X_im, H_re);
-        const float32x4_t g = vaddq_f32(S_re, e);
-        const float32x4_t h = vaddq_f32(S_im, f);
-        vst1q_f32(&S->re[k], g);
-        vst1q_f32(&S->im[k], h);
-      }
-    }
-    limit = lim2;
-    X = &render_buffer_data[0];
-  } while (j < lim2);
-
-  H_j = &H[0];
-  X = &render_buffer_data[render_buffer.Position()];
-  j = 0;
-  limit = lim1;
-  do {
-    for (; j < limit; ++j, ++H_j, ++X) {
-      S->re[kFftLengthBy2] += X->re[kFftLengthBy2] * H_j->re[kFftLengthBy2] -
-                              X->im[kFftLengthBy2] * H_j->im[kFftLengthBy2];
-      S->im[kFftLengthBy2] += X->re[kFftLengthBy2] * H_j->im[kFftLengthBy2] +
-                              X->im[kFftLengthBy2] * H_j->re[kFftLengthBy2];
-    }
-    limit = lim2;
-    X = &render_buffer_data[0];
-  } while (j < lim2);
-}
-#endif
-
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 // Produces the filter output (SSE2 variant).
 void ApplyFilter_SSE2(const RenderBuffer& render_buffer,
@ -463,11 +305,6 @@ void AdaptiveFirFilter::Filter(const RenderBuffer& render_buffer,
    case Aec3Optimization::kSse2:
      aec3::ApplyFilter_SSE2(render_buffer, H_, S);
      break;
-#endif
-#if defined(WEBRTC_HAS_NEON)
-    case Aec3Optimization::kNeon:
-      aec3::ApplyFilter_NEON(render_buffer, H_, S);
-      break;
 #endif
    default:
      aec3::ApplyFilter(render_buffer, H_, S);
@ -482,11 +319,6 @@ void AdaptiveFirFilter::Adapt(const RenderBuffer& render_buffer,
    case Aec3Optimization::kSse2:
      aec3::AdaptPartitions_SSE2(render_buffer, G, H_);
      break;
-#endif
-#if defined(WEBRTC_HAS_NEON)
-    case Aec3Optimization::kNeon:
-      aec3::AdaptPartitions_NEON(render_buffer, G, H_);
-      break;
 #endif
    default:
      aec3::AdaptPartitions(render_buffer, G, H_);
@ -505,12 +337,6 @@ void AdaptiveFirFilter::Adapt(const RenderBuffer& render_buffer,
      aec3::UpdateFrequencyResponse_SSE2(H_, &H2_);
      aec3::UpdateErlEstimator_SSE2(H2_, &erl_);
      break;
-#endif
-#if defined(WEBRTC_HAS_NEON)
-    case Aec3Optimization::kNeon:
-      aec3::UpdateFrequencyResponse_NEON(H_, &H2_);
-      aec3::UpdateErlEstimator_NEON(H2_, &erl_);
-      break;
 #endif
    default:
      aec3::UpdateFrequencyResponse(H_, &H2_);
--- a/webrtc/modules/audio_processing/aec3/adaptive_fir_filter.h
+++ b/webrtc/modules/audio_processing/aec3/adaptive_fir_filter.h
@ -29,11 +29,6 @@ namespace aec3 {
 void UpdateFrequencyResponse(
    rtc::ArrayView<const FftData> H,
    std::vector<std::array<float, kFftLengthBy2Plus1>>* H2);
-#if defined(WEBRTC_HAS_NEON)
-void UpdateFrequencyResponse_NEON(
-    rtc::ArrayView<const FftData> H,
-    std::vector<std::array<float, kFftLengthBy2Plus1>>* H2);
-#endif
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 void UpdateFrequencyResponse_SSE2(
    rtc::ArrayView<const FftData> H,
@ -45,11 +40,6 @@ void UpdateFrequencyResponse_SSE2(
 void UpdateErlEstimator(
    const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
    std::array<float, kFftLengthBy2Plus1>* erl);
-#if defined(WEBRTC_HAS_NEON)
-void UpdateErlEstimator_NEON(
-    const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
-    std::array<float, kFftLengthBy2Plus1>* erl);
-#endif
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 void UpdateErlEstimator_SSE2(
    const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
@ -60,11 +50,6 @@ void UpdateErlEstimator_SSE2(
 void AdaptPartitions(const RenderBuffer& render_buffer,
                     const FftData& G,
                     rtc::ArrayView<FftData> H);
-#if defined(WEBRTC_HAS_NEON)
-void AdaptPartitions_NEON(const RenderBuffer& render_buffer,
-                          const FftData& G,
-                          rtc::ArrayView<FftData> H);
-#endif
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 void AdaptPartitions_SSE2(const RenderBuffer& render_buffer,
                          const FftData& G,
@ -75,11 +60,6 @@ void AdaptPartitions_SSE2(const RenderBuffer& render_buffer,
 void ApplyFilter(const RenderBuffer& render_buffer,
                 rtc::ArrayView<const FftData> H,
                 FftData* S);
-#if defined(WEBRTC_HAS_NEON)
-void ApplyFilter_NEON(const RenderBuffer& render_buffer,
-                      rtc::ArrayView<const FftData> H,
-                      FftData* S);
-#endif
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 void ApplyFilter_SSE2(const RenderBuffer& render_buffer,
                      rtc::ArrayView<const FftData> H,
--- a/webrtc/modules/audio_processing/aec3/adaptive_fir_filter_unittest.cc
+++ b/webrtc/modules/audio_processing/aec3/adaptive_fir_filter_unittest.cc
@ -10,7 +10,6 @@

 #include "webrtc/modules/audio_processing/aec3/adaptive_fir_filter.h"

-#include <math.h>
 #include <algorithm>
 #include <numeric>
 #include <string>
@ -42,114 +41,10 @@ std::string ProduceDebugText(size_t delay) {

 }  // namespace

-#if defined(WEBRTC_HAS_NEON)
-// Verifies that the optimized methods for filter adaptation are similar to
-// their reference counterparts.
-TEST(AdaptiveFirFilter, FilterAdaptationNeonOptimizations) {
-  RenderBuffer render_buffer(Aec3Optimization::kNone, 3, 12,
-                             std::vector<size_t>(1, 12));
-  Random random_generator(42U);
-  std::vector<std::vector<float>> x(3, std::vector<float>(kBlockSize, 0.f));
-  FftData S_C;
-  FftData S_NEON;
-  FftData G;
-  Aec3Fft fft;
-  std::vector<FftData> H_C(10);
-  std::vector<FftData> H_NEON(10);
-  for (auto& H_j : H_C) {
-    H_j.Clear();
-  }
-  for (auto& H_j : H_NEON) {
-    H_j.Clear();
-  }
-
-  for (size_t k = 0; k < 30; ++k) {
-    RandomizeSampleVector(&random_generator, x[0]);
-    render_buffer.Insert(x);
-  }
-
-  for (size_t j = 0; j < G.re.size(); ++j) {
-    G.re[j] = j / 10001.f;
-  }
-  for (size_t j = 1; j < G.im.size() - 1; ++j) {
-    G.im[j] = j / 20001.f;
-  }
-  G.im[0] = 0.f;
-  G.im[G.im.size() - 1] = 0.f;
-
-  AdaptPartitions_NEON(render_buffer, G, H_NEON);
-  AdaptPartitions(render_buffer, G, H_C);
-  AdaptPartitions_NEON(render_buffer, G, H_NEON);
-  AdaptPartitions(render_buffer, G, H_C);
-
-  for (size_t l = 0; l < H_C.size(); ++l) {
-    for (size_t j = 0; j < H_C[l].im.size(); ++j) {
-      EXPECT_NEAR(H_C[l].re[j], H_NEON[l].re[j], fabs(H_C[l].re[j] * 0.00001f));
-      EXPECT_NEAR(H_C[l].im[j], H_NEON[l].im[j], fabs(H_C[l].im[j] * 0.00001f));
-    }
-  }
-
-  ApplyFilter_NEON(render_buffer, H_NEON, &S_NEON);
-  ApplyFilter(render_buffer, H_C, &S_C);
-  for (size_t j = 0; j < S_C.re.size(); ++j) {
-    EXPECT_NEAR(S_C.re[j], S_NEON.re[j], fabs(S_C.re[j] * 0.00001f));
-    EXPECT_NEAR(S_C.im[j], S_NEON.im[j], fabs(S_C.re[j] * 0.00001f));
-  }
-}
-
-// Verifies that the optimized method for frequency response computation is
-// bitexact to the reference counterpart.
-TEST(AdaptiveFirFilter, UpdateFrequencyResponseNeonOptimization) {
-  const size_t kNumPartitions = 12;
-  std::vector<FftData> H(kNumPartitions);
-  std::vector<std::array<float, kFftLengthBy2Plus1>> H2(kNumPartitions);
-  std::vector<std::array<float, kFftLengthBy2Plus1>> H2_NEON(kNumPartitions);
-
-  for (size_t j = 0; j < H.size(); ++j) {
-    for (size_t k = 0; k < H[j].re.size(); ++k) {
-      H[j].re[k] = k + j / 3.f;
-      H[j].im[k] = j + k / 7.f;
-    }
-  }
-
-  UpdateFrequencyResponse(H, &H2);
-  UpdateFrequencyResponse_NEON(H, &H2_NEON);
-
-  for (size_t j = 0; j < H2.size(); ++j) {
-    for (size_t k = 0; k < H[j].re.size(); ++k) {
-      EXPECT_FLOAT_EQ(H2[j][k], H2_NEON[j][k]);
-    }
-  }
-}
-
-// Verifies that the optimized method for echo return loss computation is
-// bitexact to the reference counterpart.
-TEST(AdaptiveFirFilter, UpdateErlNeonOptimization) {
-  const size_t kNumPartitions = 12;
-  std::vector<std::array<float, kFftLengthBy2Plus1>> H2(kNumPartitions);
-  std::array<float, kFftLengthBy2Plus1> erl;
-  std::array<float, kFftLengthBy2Plus1> erl_NEON;
-
-  for (size_t j = 0; j < H2.size(); ++j) {
-    for (size_t k = 0; k < H2[j].size(); ++k) {
-      H2[j][k] = k + j / 3.f;
-    }
-  }
-
-  UpdateErlEstimator(H2, &erl);
-  UpdateErlEstimator_NEON(H2, &erl_NEON);
-
-  for (size_t j = 0; j < erl.size(); ++j) {
-    EXPECT_FLOAT_EQ(erl[j], erl_NEON[j]);
-  }
-}
-
-#endif
-
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 // Verifies that the optimized methods for filter adaptation are bitexact to
 // their reference counterparts.
-TEST(AdaptiveFirFilter, FilterAdaptationSse2Optimizations) {
+TEST(AdaptiveFirFilter, FilterAdaptationOptimizations) {
  bool use_sse2 = (WebRtc_GetCPUInfo(kSSE2) != 0);
  if (use_sse2) {
    RenderBuffer render_buffer(Aec3Optimization::kNone, 3, 12,
@ -200,7 +95,7 @@ TEST(AdaptiveFirFilter, FilterAdaptationSse2Optimizations) {

 // Verifies that the optimized method for frequency response computation is
 // bitexact to the reference counterpart.
-TEST(AdaptiveFirFilter, UpdateFrequencyResponseSse2Optimization) {
+TEST(AdaptiveFirFilter, UpdateFrequencyResponseOptimization) {
  bool use_sse2 = (WebRtc_GetCPUInfo(kSSE2) != 0);
  if (use_sse2) {
    const size_t kNumPartitions = 12;
@ -228,7 +123,7 @@ TEST(AdaptiveFirFilter, UpdateFrequencyResponseSse2Optimization) {

 // Verifies that the optimized method for echo return loss computation is
 // bitexact to the reference counterpart.
-TEST(AdaptiveFirFilter, UpdateErlSse2Optimization) {
+TEST(AdaptiveFirFilter, UpdateErlOptimization) {
  bool use_sse2 = (WebRtc_GetCPUInfo(kSSE2) != 0);
  if (use_sse2) {
    const size_t kNumPartitions = 12;
--- a/webrtc/modules/audio_processing/aec3/aec3_common.cc
+++ b/webrtc/modules/audio_processing/aec3/aec3_common.cc
@ -21,11 +21,6 @@ Aec3Optimization DetectOptimization() {
    return Aec3Optimization::kSse2;
  }
 #endif
-
-#if defined(WEBRTC_HAS_NEON)
-  return Aec3Optimization::kNeon;
-#endif
-
  return Aec3Optimization::kNone;
 }

--- a/webrtc/modules/audio_processing/aec3/aec3_common.h
+++ b/webrtc/modules/audio_processing/aec3/aec3_common.h
@ -24,7 +24,7 @@ namespace webrtc {
 #define ALIGN16_END __attribute__((aligned(16)))
 #endif

-enum class Aec3Optimization { kNone, kSse2, kNeon };
+enum class Aec3Optimization { kNone, kSse2 };

 constexpr int kNumBlocksPerSecond = 250;

--- a/webrtc/modules/audio_processing/aec3/matched_filter.cc
+++ b/webrtc/modules/audio_processing/aec3/matched_filter.cc
@ -9,9 +9,6 @@
 */
 #include "webrtc/modules/audio_processing/aec3/matched_filter.h"

-#if defined(WEBRTC_HAS_NEON)
-#include <arm_neon.h>
-#endif
 #include "webrtc/typedefs.h"
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 #include <emmintrin.h>
@ -25,114 +22,6 @@
 namespace webrtc {
 namespace aec3 {

-#if defined(WEBRTC_HAS_NEON)
-
-void MatchedFilterCore_NEON(size_t x_start_index,
-                            float x2_sum_threshold,
-                            rtc::ArrayView<const float> x,
-                            rtc::ArrayView<const float> y,
-                            rtc::ArrayView<float> h,
-                            bool* filters_updated,
-                            float* error_sum) {
-  const int h_size = static_cast<int>(h.size());
-  const int x_size = static_cast<int>(x.size());
-  RTC_DCHECK_EQ(0, h_size % 4);
-
-  // Process for all samples in the sub-block.
-  for (size_t i = 0; i < kSubBlockSize; ++i) {
-    // Apply the matched filter as filter * x, and compute x * x.
-
-    RTC_DCHECK_GT(x_size, x_start_index);
-    const float* x_p = &x[x_start_index];
-    const float* h_p = &h[0];
-
-    // Initialize values for the accumulation.
-    float32x4_t s_128 = vdupq_n_f32(0);
-    float32x4_t x2_sum_128 = vdupq_n_f32(0);
-    float x2_sum = 0.f;
-    float s = 0;
-
-    // Compute loop chunk sizes until, and after, the wraparound of the circular
-    // buffer for x.
-    const int chunk1 =
-        std::min(h_size, static_cast<int>(x_size - x_start_index));
-
-    // Perform the loop in two chunks.
-    const int chunk2 = h_size - chunk1;
-    for (int limit : {chunk1, chunk2}) {
-      // Perform 128 bit vector operations.
-      const int limit_by_4 = limit >> 2;
-      for (int k = limit_by_4; k > 0; --k, h_p += 4, x_p += 4) {
-        // Load the data into 128 bit vectors.
-        const float32x4_t x_k = vld1q_f32(x_p);
-        const float32x4_t h_k = vld1q_f32(h_p);
-        // Compute and accumulate x * x and h * x.
-        x2_sum_128 = vmlaq_f32(x2_sum_128, x_k, x_k);
-        s_128 = vmlaq_f32(s_128, h_k, x_k);
-      }
-
-      // Perform non-vector operations for any remaining items.
-      for (int k = limit - limit_by_4 * 4; k > 0; --k, ++h_p, ++x_p) {
-        const float x_k = *x_p;
-        x2_sum += x_k * x_k;
-        s += *h_p * x_k;
-      }
-
-      x_p = &x[0];
-    }
-
-    // Combine the accumulated vector and scalar values.
-    float* v = reinterpret_cast<float*>(&x2_sum_128);
-    x2_sum += v[0] + v[1] + v[2] + v[3];
-    v = reinterpret_cast<float*>(&s_128);
-    s += v[0] + v[1] + v[2] + v[3];
-
-    // Compute the matched filter error.
-    const float e = std::min(32767.f, std::max(-32768.f, y[i] - s));
-    *error_sum += e * e;
-
-    // Update the matched filter estimate in an NLMS manner.
-    if (x2_sum > x2_sum_threshold) {
-      RTC_DCHECK_LT(0.f, x2_sum);
-      const float alpha = 0.7f * e / x2_sum;
-      const float32x4_t alpha_128 = vmovq_n_f32(alpha);
-
-      // filter = filter + 0.7 * (y - filter * x) / x * x.
-      float* h_p = &h[0];
-      x_p = &x[x_start_index];
-
-      // Perform the loop in two chunks.
-      for (int limit : {chunk1, chunk2}) {
-        // Perform 128 bit vector operations.
-        const int limit_by_4 = limit >> 2;
-        for (int k = limit_by_4; k > 0; --k, h_p += 4, x_p += 4) {
-          // Load the data into 128 bit vectors.
-          float32x4_t h_k = vld1q_f32(h_p);
-          const float32x4_t x_k = vld1q_f32(x_p);
-          // Compute h = h + alpha * x.
-          h_k = vmlaq_f32(h_k, alpha_128, x_k);
-
-          // Store the result.
-          vst1q_f32(h_p, h_k);
-        }
-
-        // Perform non-vector operations for any remaining items.
-        for (int k = limit - limit_by_4 * 4; k > 0; --k, ++h_p, ++x_p) {
-          *h_p += alpha * *x_p;
-        }
-
-        x_p = &x[0];
-      }
-
-      *filters_updated = true;
-    }
-
-    x_start_index = x_start_index > 0 ? x_start_index - 1 : x_size - 1;
-  }
-}
-
-#endif
-
 #if defined(WEBRTC_ARCH_X86_FAMILY)

 void MatchedFilterCore_SSE2(size_t x_start_index,
@ -337,13 +226,6 @@ void MatchedFilter::Update(const DownsampledRenderBuffer& render_buffer,
                                     render_buffer.buffer, y, filters_[n],
                                     &filters_updated, &error_sum);
        break;
-#endif
-#if defined(WEBRTC_HAS_NEON)
-      case Aec3Optimization::kNeon:
-        aec3::MatchedFilterCore_NEON(x_start_index, x2_sum_threshold,
-                                     render_buffer.buffer, y, filters_[n],
-                                     &filters_updated, &error_sum);
-        break;
 #endif
      default:
        aec3::MatchedFilterCore(x_start_index, x2_sum_threshold,
--- a/webrtc/modules/audio_processing/aec3/matched_filter.h
+++ b/webrtc/modules/audio_processing/aec3/matched_filter.h
@ -23,19 +23,6 @@
 namespace webrtc {
 namespace aec3 {

-#if defined(WEBRTC_HAS_NEON)
-
-// Filter core for the matched filter that is optimized for NEON.
-void MatchedFilterCore_NEON(size_t x_start_index,
-                            float x2_sum_threshold,
-                            rtc::ArrayView<const float> x,
-                            rtc::ArrayView<const float> y,
-                            rtc::ArrayView<float> h,
-                            bool* filters_updated,
-                            float* error_sum);
-
-#endif
-
 #if defined(WEBRTC_ARCH_X86_FAMILY)

 // Filter core for the matched filter that is optimized for SSE2.
--- a/webrtc/modules/audio_processing/aec3/matched_filter_unittest.cc
+++ b/webrtc/modules/audio_processing/aec3/matched_filter_unittest.cc
@ -43,47 +43,10 @@ constexpr size_t kNumMatchedFilters = 4;

 }  // namespace

-#if defined(WEBRTC_HAS_NEON)
-// Verifies that the optimized methods for NEON are similar to their reference
-// counterparts.
-TEST(MatchedFilter, TestNeonOptimizations) {
-  Random random_generator(42U);
-  std::vector<float> x(2000);
-  RandomizeSampleVector(&random_generator, x);
-  std::vector<float> y(kSubBlockSize);
-  std::vector<float> h_NEON(512);
-  std::vector<float> h(512);
-  int x_index = 0;
-  for (int k = 0; k < 1000; ++k) {
-    RandomizeSampleVector(&random_generator, y);
-
-    bool filters_updated = false;
-    float error_sum = 0.f;
-    bool filters_updated_NEON = false;
-    float error_sum_NEON = 0.f;
-
-    MatchedFilterCore_NEON(x_index, h.size() * 150.f * 150.f, x, y, h_NEON,
-                           &filters_updated_NEON, &error_sum_NEON);
-
-    MatchedFilterCore(x_index, h.size() * 150.f * 150.f, x, y, h,
-                      &filters_updated, &error_sum);
-
-    EXPECT_EQ(filters_updated, filters_updated_NEON);
-    EXPECT_NEAR(error_sum, error_sum_NEON, error_sum / 100000.f);
-
-    for (size_t j = 0; j < h.size(); ++j) {
-      EXPECT_NEAR(h[j], h_NEON[j], 0.00001f);
-    }
-
-    x_index = (x_index + kSubBlockSize) % x.size();
-  }
-}
-#endif
-
 #if defined(WEBRTC_ARCH_X86_FAMILY)
-// Verifies that the optimized methods for SSE2 are bitexact to their reference
+// Verifies that the optimized methods are bitexact to their reference
 // counterparts.
-TEST(MatchedFilter, TestSse2Optimizations) {
+TEST(MatchedFilter, TestOptimizations) {
  bool use_sse2 = (WebRtc_GetCPUInfo(kSSE2) != 0);
  if (use_sse2) {
    Random random_generator(42U);
--- a/webrtc/modules/audio_processing/aec3/vector_math.h
+++ b/webrtc/modules/audio_processing/aec3/vector_math.h
@ -12,9 +12,6 @@
 #define WEBRTC_MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_

 #include "webrtc/typedefs.h"
-#if defined(WEBRTC_HAS_NEON)
-#include <arm_neon.h>
-#endif
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 #include <emmintrin.h>
 #endif
@ -56,51 +53,6 @@ class VectorMath {
        }
      } break;
 #endif
-#if defined(WEBRTC_HAS_NEON)
-      case Aec3Optimization::kNeon: {
-        const int x_size = static_cast<int>(x.size());
-        const int vector_limit = x_size >> 2;
-
-        int j = 0;
-        for (; j < vector_limit * 4; j += 4) {
-          float32x4_t g = vld1q_f32(&x[j]);
-#if !defined(WEBRTC_ARCH_ARM64)
-          float32x4_t y = vrsqrteq_f32(g);
-
-          // Code to handle sqrt(0).
-          // If the input to sqrtf() is zero, a zero will be returned.
-          // If the input to vrsqrteq_f32() is zero, positive infinity is
-          // returned.
-          const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
-          // check for divide by zero
-          const uint32x4_t div_by_zero =
-              vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(y));
-          // zero out the positive infinity results
-          y = vreinterpretq_f32_u32(
-              vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(y)));
-          // from arm documentation
-          // The Newton-Raphson iteration:
-          //     y[n+1] = y[n] * (3 - d * (y[n] * y[n])) / 2)
-          // converges to (1/√d) if y0 is the result of VRSQRTE applied to d.
-          //
-          // Note: The precision did not improve after 2 iterations.
-          for (int i = 0; i < 2; i++) {
-            y = vmulq_f32(vrsqrtsq_f32(vmulq_f32(y, y), g), y);
-          }
-          // sqrt(g) = g * 1/sqrt(g)
-          g = vmulq_f32(g, y);
-#else
-          g = vsqrtq_f32(g);
-#endif
-          vst1q_f32(&x[j], g);
-        }
-
-        for (; j < x_size; ++j) {
-          x[j] = sqrtf(x[j]);
-        }
-      }
-#endif
-      break;
      default:
        std::for_each(x.begin(), x.end(), [](float& a) { a = sqrtf(a); });
    }
@ -131,24 +83,6 @@ class VectorMath {
        }
      } break;
 #endif
-#if defined(WEBRTC_HAS_NEON)
-      case Aec3Optimization::kNeon: {
-        const int x_size = static_cast<int>(x.size());
-        const int vector_limit = x_size >> 2;
-
-        int j = 0;
-        for (; j < vector_limit * 4; j += 4) {
-          const float32x4_t x_j = vld1q_f32(&x[j]);
-          const float32x4_t y_j = vld1q_f32(&y[j]);
-          const float32x4_t z_j = vmulq_f32(x_j, y_j);
-          vst1q_f32(&z[j], z_j);
-        }
-
-        for (; j < x_size; ++j) {
-          z[j] = x[j] * y[j];
-        }
-      } break;
-#endif
      default:
        std::transform(x.begin(), x.end(), y.begin(), z.begin(),
                       std::multiplies<float>());
@ -177,24 +111,6 @@ class VectorMath {
        }
      } break;
 #endif
-#if defined(WEBRTC_HAS_NEON)
-      case Aec3Optimization::kNeon: {
-        const int x_size = static_cast<int>(x.size());
-        const int vector_limit = x_size >> 2;
-
-        int j = 0;
-        for (; j < vector_limit * 4; j += 4) {
-          const float32x4_t x_j = vld1q_f32(&x[j]);
-          float32x4_t z_j = vld1q_f32(&z[j]);
-          z_j = vaddq_f32(z_j, x_j);
-          vst1q_f32(&z[j], z_j);
-        }
-
-        for (; j < x_size; ++j) {
-          z[j] += x[j];
-        }
-      } break;
-#endif
      default:
        std::transform(x.begin(), x.end(), z.begin(), z.begin(),
                       std::plus<float>());
--- a/webrtc/modules/audio_processing/aec3/vector_math_unittest.cc
+++ b/webrtc/modules/audio_processing/aec3/vector_math_unittest.cc
@ -18,65 +18,6 @@

 namespace webrtc {

-#if defined(WEBRTC_HAS_NEON)
-
-TEST(VectorMath, Sqrt) {
-  std::array<float, kFftLengthBy2Plus1> x;
-  std::array<float, kFftLengthBy2Plus1> z;
-  std::array<float, kFftLengthBy2Plus1> z_neon;
-
-  for (size_t k = 0; k < x.size(); ++k) {
-    x[k] = (2.f / 3.f) * k;
-  }
-
-  std::copy(x.begin(), x.end(), z.begin());
-  aec3::VectorMath(Aec3Optimization::kNone).Sqrt(z);
-  std::copy(x.begin(), x.end(), z_neon.begin());
-  aec3::VectorMath(Aec3Optimization::kNeon).Sqrt(z_neon);
-  for (size_t k = 0; k < z.size(); ++k) {
-    EXPECT_NEAR(z[k], z_neon[k], 0.0001f);
-    EXPECT_NEAR(sqrtf(x[k]), z_neon[k], 0.0001f);
-  }
-}
-
-TEST(VectorMath, Multiply) {
-  std::array<float, kFftLengthBy2Plus1> x;
-  std::array<float, kFftLengthBy2Plus1> y;
-  std::array<float, kFftLengthBy2Plus1> z;
-  std::array<float, kFftLengthBy2Plus1> z_neon;
-
-  for (size_t k = 0; k < x.size(); ++k) {
-    x[k] = k;
-    y[k] = (2.f / 3.f) * k;
-  }
-
-  aec3::VectorMath(Aec3Optimization::kNone).Multiply(x, y, z);
-  aec3::VectorMath(Aec3Optimization::kNeon).Multiply(x, y, z_neon);
-  for (size_t k = 0; k < z.size(); ++k) {
-    EXPECT_FLOAT_EQ(z[k], z_neon[k]);
-    EXPECT_FLOAT_EQ(x[k] * y[k], z_neon[k]);
-  }
-}
-
-TEST(VectorMath, Accumulate) {
-  std::array<float, kFftLengthBy2Plus1> x;
-  std::array<float, kFftLengthBy2Plus1> z;
-  std::array<float, kFftLengthBy2Plus1> z_neon;
-
-  for (size_t k = 0; k < x.size(); ++k) {
-    x[k] = k;
-    z[k] = z_neon[k] = 2.f * k;
-  }
-
-  aec3::VectorMath(Aec3Optimization::kNone).Accumulate(x, z);
-  aec3::VectorMath(Aec3Optimization::kNeon).Accumulate(x, z_neon);
-  for (size_t k = 0; k < z.size(); ++k) {
-    EXPECT_FLOAT_EQ(z[k], z_neon[k]);
-    EXPECT_FLOAT_EQ(x[k] + 2.f * x[k], z_neon[k]);
-  }
-}
-#endif
-
 #if defined(WEBRTC_ARCH_X86_FAMILY)

 TEST(VectorMath, Sqrt) {