Revert of Added ARM Neon SIMD optimizations for AEC3 (patchset #2 id:970001 of https://codereview.webrtc.org/2834073005/ )

Reason for revert:
The bug number for the chromium bug was wrong.

Original issue's description:
> Added ARM Neon optimizations for AEC3
>
> This CL adds Neon SIMD optimizations for AEC3 on ARM, resulting
> in an 8 times complexity reduction. The optimizations are basically
> identical to what was already in place for SSE2.
>
> BUG=chromium:14993, webrtc:6018
>
> Review-Url: https://codereview.webrtc.org/2834073005
> Cr-Commit-Position: refs/heads/master@{#17993}
> Committed: f246b91eba

TBR=ivoc@webrtc.org
# Skipping CQ checks because original CL landed less than 1 days ago.
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=chromium:14993, webrtc:6018

Review-Url: https://codereview.webrtc.org/2856113003
Cr-Commit-Position: refs/heads/master@{#17994}
This commit is contained in:
peah 2017-05-03 06:40:47 -07:00 committed by Commit bot
parent f246b91eba
commit b70f8cfd4d
10 changed files with 6 additions and 621 deletions

View File

@ -10,9 +10,6 @@
#include "webrtc/modules/audio_processing/aec3/adaptive_fir_filter.h"
#if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h>
#endif
#include "webrtc/typedefs.h"
#if defined(WEBRTC_ARCH_X86_FAMILY)
#include <emmintrin.h>
@ -55,26 +52,6 @@ void UpdateFrequencyResponse(
}
}
#if defined(WEBRTC_HAS_NEON)
// Computes and stores the frequency response of the filter.
void UpdateFrequencyResponse_NEON(
rtc::ArrayView<const FftData> H,
std::vector<std::array<float, kFftLengthBy2Plus1>>* H2) {
RTC_DCHECK_EQ(H.size(), H2->size());
for (size_t k = 0; k < H.size(); ++k) {
for (size_t j = 0; j < kFftLengthBy2; j += 4) {
const float32x4_t re = vld1q_f32(&H[k].re[j]);
const float32x4_t im = vld1q_f32(&H[k].im[j]);
float32x4_t H2_k_j = vmulq_f32(re, re);
H2_k_j = vmlaq_f32(H2_k_j, im, im);
vst1q_f32(&(*H2)[k][j], H2_k_j);
}
(*H2)[k][kFftLengthBy2] = H[k].re[kFftLengthBy2] * H[k].re[kFftLengthBy2] +
H[k].im[kFftLengthBy2] * H[k].im[kFftLengthBy2];
}
}
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
// Computes and stores the frequency response of the filter.
void UpdateFrequencyResponse_SSE2(
@ -108,25 +85,6 @@ void UpdateErlEstimator(
}
}
#if defined(WEBRTC_HAS_NEON)
// Computes and stores the echo return loss estimate of the filter, which is the
// sum of the partition frequency responses.
void UpdateErlEstimator_NEON(
const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
std::array<float, kFftLengthBy2Plus1>* erl) {
erl->fill(0.f);
for (auto& H2_j : H2) {
for (size_t k = 0; k < kFftLengthBy2; k += 4) {
const float32x4_t H2_j_k = vld1q_f32(&H2_j[k]);
float32x4_t erl_k = vld1q_f32(&(*erl)[k]);
erl_k = vaddq_f32(erl_k, H2_j_k);
vst1q_f32(&(*erl)[k], erl_k);
}
(*erl)[kFftLengthBy2] += H2_j[kFftLengthBy2];
}
}
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
// Computes and stores the echo return loss estimate of the filter, which is the
// sum of the partition frequency responses.
@ -163,63 +121,6 @@ void AdaptPartitions(const RenderBuffer& render_buffer,
}
}
#if defined(WEBRTC_HAS_NEON)
// Adapts the filter partitions. (NEON variant)
void AdaptPartitions_NEON(const RenderBuffer& render_buffer,
const FftData& G,
rtc::ArrayView<FftData> H) {
rtc::ArrayView<const FftData> render_buffer_data = render_buffer.Buffer();
const int lim1 =
std::min(render_buffer_data.size() - render_buffer.Position(), H.size());
const int lim2 = H.size();
constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
FftData* H_j = &H[0];
const FftData* X = &render_buffer_data[render_buffer.Position()];
int limit = lim1;
int j = 0;
do {
for (; j < limit; ++j, ++H_j, ++X) {
for (int k = 0, n = 0; n < kNumFourBinBands; ++n, k += 4) {
const float32x4_t G_re = vld1q_f32(&G.re[k]);
const float32x4_t G_im = vld1q_f32(&G.im[k]);
const float32x4_t X_re = vld1q_f32(&X->re[k]);
const float32x4_t X_im = vld1q_f32(&X->im[k]);
const float32x4_t H_re = vld1q_f32(&H_j->re[k]);
const float32x4_t H_im = vld1q_f32(&H_j->im[k]);
const float32x4_t a = vmulq_f32(X_re, G_re);
const float32x4_t e = vmlaq_f32(a, X_im, G_im);
const float32x4_t c = vmulq_f32(X_re, G_im);
const float32x4_t f = vmlsq_f32(c, X_im, G_re);
const float32x4_t g = vaddq_f32(H_re, e);
const float32x4_t h = vaddq_f32(H_im, f);
vst1q_f32(&H_j->re[k], g);
vst1q_f32(&H_j->im[k], h);
}
}
X = &render_buffer_data[0];
limit = lim2;
} while (j < lim2);
H_j = &H[0];
X = &render_buffer_data[render_buffer.Position()];
limit = lim1;
j = 0;
do {
for (; j < limit; ++j, ++H_j, ++X) {
H_j->re[kFftLengthBy2] += X->re[kFftLengthBy2] * G.re[kFftLengthBy2] +
X->im[kFftLengthBy2] * G.im[kFftLengthBy2];
H_j->im[kFftLengthBy2] += X->re[kFftLengthBy2] * G.im[kFftLengthBy2] -
X->im[kFftLengthBy2] * G.re[kFftLengthBy2];
}
X = &render_buffer_data[0];
limit = lim2;
} while (j < lim2);
}
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
// Adapts the filter partitions. (SSE2 variant)
void AdaptPartitions_SSE2(const RenderBuffer& render_buffer,
@ -302,65 +203,6 @@ void ApplyFilter(const RenderBuffer& render_buffer,
}
}
#if defined(WEBRTC_HAS_NEON)
// Produces the filter output (NEON variant).
void ApplyFilter_NEON(const RenderBuffer& render_buffer,
rtc::ArrayView<const FftData> H,
FftData* S) {
RTC_DCHECK_GE(H.size(), H.size() - 1);
S->re.fill(0.f);
S->im.fill(0.f);
rtc::ArrayView<const FftData> render_buffer_data = render_buffer.Buffer();
const int lim1 =
std::min(render_buffer_data.size() - render_buffer.Position(), H.size());
const int lim2 = H.size();
constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
const FftData* H_j = &H[0];
const FftData* X = &render_buffer_data[render_buffer.Position()];
int j = 0;
int limit = lim1;
do {
for (; j < limit; ++j, ++H_j, ++X) {
for (int k = 0, n = 0; n < kNumFourBinBands; ++n, k += 4) {
const float32x4_t X_re = vld1q_f32(&X->re[k]);
const float32x4_t X_im = vld1q_f32(&X->im[k]);
const float32x4_t H_re = vld1q_f32(&H_j->re[k]);
const float32x4_t H_im = vld1q_f32(&H_j->im[k]);
const float32x4_t S_re = vld1q_f32(&S->re[k]);
const float32x4_t S_im = vld1q_f32(&S->im[k]);
const float32x4_t a = vmulq_f32(X_re, H_re);
const float32x4_t e = vmlsq_f32(a, X_im, H_im);
const float32x4_t c = vmulq_f32(X_re, H_im);
const float32x4_t f = vmlaq_f32(c, X_im, H_re);
const float32x4_t g = vaddq_f32(S_re, e);
const float32x4_t h = vaddq_f32(S_im, f);
vst1q_f32(&S->re[k], g);
vst1q_f32(&S->im[k], h);
}
}
limit = lim2;
X = &render_buffer_data[0];
} while (j < lim2);
H_j = &H[0];
X = &render_buffer_data[render_buffer.Position()];
j = 0;
limit = lim1;
do {
for (; j < limit; ++j, ++H_j, ++X) {
S->re[kFftLengthBy2] += X->re[kFftLengthBy2] * H_j->re[kFftLengthBy2] -
X->im[kFftLengthBy2] * H_j->im[kFftLengthBy2];
S->im[kFftLengthBy2] += X->re[kFftLengthBy2] * H_j->im[kFftLengthBy2] +
X->im[kFftLengthBy2] * H_j->re[kFftLengthBy2];
}
limit = lim2;
X = &render_buffer_data[0];
} while (j < lim2);
}
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
// Produces the filter output (SSE2 variant).
void ApplyFilter_SSE2(const RenderBuffer& render_buffer,
@ -463,11 +305,6 @@ void AdaptiveFirFilter::Filter(const RenderBuffer& render_buffer,
case Aec3Optimization::kSse2:
aec3::ApplyFilter_SSE2(render_buffer, H_, S);
break;
#endif
#if defined(WEBRTC_HAS_NEON)
case Aec3Optimization::kNeon:
aec3::ApplyFilter_NEON(render_buffer, H_, S);
break;
#endif
default:
aec3::ApplyFilter(render_buffer, H_, S);
@ -482,11 +319,6 @@ void AdaptiveFirFilter::Adapt(const RenderBuffer& render_buffer,
case Aec3Optimization::kSse2:
aec3::AdaptPartitions_SSE2(render_buffer, G, H_);
break;
#endif
#if defined(WEBRTC_HAS_NEON)
case Aec3Optimization::kNeon:
aec3::AdaptPartitions_NEON(render_buffer, G, H_);
break;
#endif
default:
aec3::AdaptPartitions(render_buffer, G, H_);
@ -505,12 +337,6 @@ void AdaptiveFirFilter::Adapt(const RenderBuffer& render_buffer,
aec3::UpdateFrequencyResponse_SSE2(H_, &H2_);
aec3::UpdateErlEstimator_SSE2(H2_, &erl_);
break;
#endif
#if defined(WEBRTC_HAS_NEON)
case Aec3Optimization::kNeon:
aec3::UpdateFrequencyResponse_NEON(H_, &H2_);
aec3::UpdateErlEstimator_NEON(H2_, &erl_);
break;
#endif
default:
aec3::UpdateFrequencyResponse(H_, &H2_);

View File

@ -29,11 +29,6 @@ namespace aec3 {
void UpdateFrequencyResponse(
rtc::ArrayView<const FftData> H,
std::vector<std::array<float, kFftLengthBy2Plus1>>* H2);
#if defined(WEBRTC_HAS_NEON)
void UpdateFrequencyResponse_NEON(
rtc::ArrayView<const FftData> H,
std::vector<std::array<float, kFftLengthBy2Plus1>>* H2);
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
void UpdateFrequencyResponse_SSE2(
rtc::ArrayView<const FftData> H,
@ -45,11 +40,6 @@ void UpdateFrequencyResponse_SSE2(
void UpdateErlEstimator(
const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
std::array<float, kFftLengthBy2Plus1>* erl);
#if defined(WEBRTC_HAS_NEON)
void UpdateErlEstimator_NEON(
const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
std::array<float, kFftLengthBy2Plus1>* erl);
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
void UpdateErlEstimator_SSE2(
const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
@ -60,11 +50,6 @@ void UpdateErlEstimator_SSE2(
void AdaptPartitions(const RenderBuffer& render_buffer,
const FftData& G,
rtc::ArrayView<FftData> H);
#if defined(WEBRTC_HAS_NEON)
void AdaptPartitions_NEON(const RenderBuffer& render_buffer,
const FftData& G,
rtc::ArrayView<FftData> H);
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
void AdaptPartitions_SSE2(const RenderBuffer& render_buffer,
const FftData& G,
@ -75,11 +60,6 @@ void AdaptPartitions_SSE2(const RenderBuffer& render_buffer,
void ApplyFilter(const RenderBuffer& render_buffer,
rtc::ArrayView<const FftData> H,
FftData* S);
#if defined(WEBRTC_HAS_NEON)
void ApplyFilter_NEON(const RenderBuffer& render_buffer,
rtc::ArrayView<const FftData> H,
FftData* S);
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
void ApplyFilter_SSE2(const RenderBuffer& render_buffer,
rtc::ArrayView<const FftData> H,

View File

@ -10,7 +10,6 @@
#include "webrtc/modules/audio_processing/aec3/adaptive_fir_filter.h"
#include <math.h>
#include <algorithm>
#include <numeric>
#include <string>
@ -42,114 +41,10 @@ std::string ProduceDebugText(size_t delay) {
} // namespace
#if defined(WEBRTC_HAS_NEON)
// Verifies that the optimized methods for filter adaptation are similar to
// their reference counterparts.
TEST(AdaptiveFirFilter, FilterAdaptationNeonOptimizations) {
RenderBuffer render_buffer(Aec3Optimization::kNone, 3, 12,
std::vector<size_t>(1, 12));
Random random_generator(42U);
std::vector<std::vector<float>> x(3, std::vector<float>(kBlockSize, 0.f));
FftData S_C;
FftData S_NEON;
FftData G;
Aec3Fft fft;
std::vector<FftData> H_C(10);
std::vector<FftData> H_NEON(10);
for (auto& H_j : H_C) {
H_j.Clear();
}
for (auto& H_j : H_NEON) {
H_j.Clear();
}
for (size_t k = 0; k < 30; ++k) {
RandomizeSampleVector(&random_generator, x[0]);
render_buffer.Insert(x);
}
for (size_t j = 0; j < G.re.size(); ++j) {
G.re[j] = j / 10001.f;
}
for (size_t j = 1; j < G.im.size() - 1; ++j) {
G.im[j] = j / 20001.f;
}
G.im[0] = 0.f;
G.im[G.im.size() - 1] = 0.f;
AdaptPartitions_NEON(render_buffer, G, H_NEON);
AdaptPartitions(render_buffer, G, H_C);
AdaptPartitions_NEON(render_buffer, G, H_NEON);
AdaptPartitions(render_buffer, G, H_C);
for (size_t l = 0; l < H_C.size(); ++l) {
for (size_t j = 0; j < H_C[l].im.size(); ++j) {
EXPECT_NEAR(H_C[l].re[j], H_NEON[l].re[j], fabs(H_C[l].re[j] * 0.00001f));
EXPECT_NEAR(H_C[l].im[j], H_NEON[l].im[j], fabs(H_C[l].im[j] * 0.00001f));
}
}
ApplyFilter_NEON(render_buffer, H_NEON, &S_NEON);
ApplyFilter(render_buffer, H_C, &S_C);
for (size_t j = 0; j < S_C.re.size(); ++j) {
EXPECT_NEAR(S_C.re[j], S_NEON.re[j], fabs(S_C.re[j] * 0.00001f));
EXPECT_NEAR(S_C.im[j], S_NEON.im[j], fabs(S_C.re[j] * 0.00001f));
}
}
// Verifies that the optimized method for frequency response computation is
// bitexact to the reference counterpart.
TEST(AdaptiveFirFilter, UpdateFrequencyResponseNeonOptimization) {
const size_t kNumPartitions = 12;
std::vector<FftData> H(kNumPartitions);
std::vector<std::array<float, kFftLengthBy2Plus1>> H2(kNumPartitions);
std::vector<std::array<float, kFftLengthBy2Plus1>> H2_NEON(kNumPartitions);
for (size_t j = 0; j < H.size(); ++j) {
for (size_t k = 0; k < H[j].re.size(); ++k) {
H[j].re[k] = k + j / 3.f;
H[j].im[k] = j + k / 7.f;
}
}
UpdateFrequencyResponse(H, &H2);
UpdateFrequencyResponse_NEON(H, &H2_NEON);
for (size_t j = 0; j < H2.size(); ++j) {
for (size_t k = 0; k < H[j].re.size(); ++k) {
EXPECT_FLOAT_EQ(H2[j][k], H2_NEON[j][k]);
}
}
}
// Verifies that the optimized method for echo return loss computation is
// bitexact to the reference counterpart.
TEST(AdaptiveFirFilter, UpdateErlNeonOptimization) {
const size_t kNumPartitions = 12;
std::vector<std::array<float, kFftLengthBy2Plus1>> H2(kNumPartitions);
std::array<float, kFftLengthBy2Plus1> erl;
std::array<float, kFftLengthBy2Plus1> erl_NEON;
for (size_t j = 0; j < H2.size(); ++j) {
for (size_t k = 0; k < H2[j].size(); ++k) {
H2[j][k] = k + j / 3.f;
}
}
UpdateErlEstimator(H2, &erl);
UpdateErlEstimator_NEON(H2, &erl_NEON);
for (size_t j = 0; j < erl.size(); ++j) {
EXPECT_FLOAT_EQ(erl[j], erl_NEON[j]);
}
}
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
// Verifies that the optimized methods for filter adaptation are bitexact to
// their reference counterparts.
TEST(AdaptiveFirFilter, FilterAdaptationSse2Optimizations) {
TEST(AdaptiveFirFilter, FilterAdaptationOptimizations) {
bool use_sse2 = (WebRtc_GetCPUInfo(kSSE2) != 0);
if (use_sse2) {
RenderBuffer render_buffer(Aec3Optimization::kNone, 3, 12,
@ -200,7 +95,7 @@ TEST(AdaptiveFirFilter, FilterAdaptationSse2Optimizations) {
// Verifies that the optimized method for frequency response computation is
// bitexact to the reference counterpart.
TEST(AdaptiveFirFilter, UpdateFrequencyResponseSse2Optimization) {
TEST(AdaptiveFirFilter, UpdateFrequencyResponseOptimization) {
bool use_sse2 = (WebRtc_GetCPUInfo(kSSE2) != 0);
if (use_sse2) {
const size_t kNumPartitions = 12;
@ -228,7 +123,7 @@ TEST(AdaptiveFirFilter, UpdateFrequencyResponseSse2Optimization) {
// Verifies that the optimized method for echo return loss computation is
// bitexact to the reference counterpart.
TEST(AdaptiveFirFilter, UpdateErlSse2Optimization) {
TEST(AdaptiveFirFilter, UpdateErlOptimization) {
bool use_sse2 = (WebRtc_GetCPUInfo(kSSE2) != 0);
if (use_sse2) {
const size_t kNumPartitions = 12;

View File

@ -21,11 +21,6 @@ Aec3Optimization DetectOptimization() {
return Aec3Optimization::kSse2;
}
#endif
#if defined(WEBRTC_HAS_NEON)
return Aec3Optimization::kNeon;
#endif
return Aec3Optimization::kNone;
}

View File

@ -24,7 +24,7 @@ namespace webrtc {
#define ALIGN16_END __attribute__((aligned(16)))
#endif
enum class Aec3Optimization { kNone, kSse2, kNeon };
enum class Aec3Optimization { kNone, kSse2 };
constexpr int kNumBlocksPerSecond = 250;

View File

@ -9,9 +9,6 @@
*/
#include "webrtc/modules/audio_processing/aec3/matched_filter.h"
#if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h>
#endif
#include "webrtc/typedefs.h"
#if defined(WEBRTC_ARCH_X86_FAMILY)
#include <emmintrin.h>
@ -25,114 +22,6 @@
namespace webrtc {
namespace aec3 {
#if defined(WEBRTC_HAS_NEON)
void MatchedFilterCore_NEON(size_t x_start_index,
float x2_sum_threshold,
rtc::ArrayView<const float> x,
rtc::ArrayView<const float> y,
rtc::ArrayView<float> h,
bool* filters_updated,
float* error_sum) {
const int h_size = static_cast<int>(h.size());
const int x_size = static_cast<int>(x.size());
RTC_DCHECK_EQ(0, h_size % 4);
// Process for all samples in the sub-block.
for (size_t i = 0; i < kSubBlockSize; ++i) {
// Apply the matched filter as filter * x, and compute x * x.
RTC_DCHECK_GT(x_size, x_start_index);
const float* x_p = &x[x_start_index];
const float* h_p = &h[0];
// Initialize values for the accumulation.
float32x4_t s_128 = vdupq_n_f32(0);
float32x4_t x2_sum_128 = vdupq_n_f32(0);
float x2_sum = 0.f;
float s = 0;
// Compute loop chunk sizes until, and after, the wraparound of the circular
// buffer for x.
const int chunk1 =
std::min(h_size, static_cast<int>(x_size - x_start_index));
// Perform the loop in two chunks.
const int chunk2 = h_size - chunk1;
for (int limit : {chunk1, chunk2}) {
// Perform 128 bit vector operations.
const int limit_by_4 = limit >> 2;
for (int k = limit_by_4; k > 0; --k, h_p += 4, x_p += 4) {
// Load the data into 128 bit vectors.
const float32x4_t x_k = vld1q_f32(x_p);
const float32x4_t h_k = vld1q_f32(h_p);
// Compute and accumulate x * x and h * x.
x2_sum_128 = vmlaq_f32(x2_sum_128, x_k, x_k);
s_128 = vmlaq_f32(s_128, h_k, x_k);
}
// Perform non-vector operations for any remaining items.
for (int k = limit - limit_by_4 * 4; k > 0; --k, ++h_p, ++x_p) {
const float x_k = *x_p;
x2_sum += x_k * x_k;
s += *h_p * x_k;
}
x_p = &x[0];
}
// Combine the accumulated vector and scalar values.
float* v = reinterpret_cast<float*>(&x2_sum_128);
x2_sum += v[0] + v[1] + v[2] + v[3];
v = reinterpret_cast<float*>(&s_128);
s += v[0] + v[1] + v[2] + v[3];
// Compute the matched filter error.
const float e = std::min(32767.f, std::max(-32768.f, y[i] - s));
*error_sum += e * e;
// Update the matched filter estimate in an NLMS manner.
if (x2_sum > x2_sum_threshold) {
RTC_DCHECK_LT(0.f, x2_sum);
const float alpha = 0.7f * e / x2_sum;
const float32x4_t alpha_128 = vmovq_n_f32(alpha);
// filter = filter + 0.7 * (y - filter * x) / x * x.
float* h_p = &h[0];
x_p = &x[x_start_index];
// Perform the loop in two chunks.
for (int limit : {chunk1, chunk2}) {
// Perform 128 bit vector operations.
const int limit_by_4 = limit >> 2;
for (int k = limit_by_4; k > 0; --k, h_p += 4, x_p += 4) {
// Load the data into 128 bit vectors.
float32x4_t h_k = vld1q_f32(h_p);
const float32x4_t x_k = vld1q_f32(x_p);
// Compute h = h + alpha * x.
h_k = vmlaq_f32(h_k, alpha_128, x_k);
// Store the result.
vst1q_f32(h_p, h_k);
}
// Perform non-vector operations for any remaining items.
for (int k = limit - limit_by_4 * 4; k > 0; --k, ++h_p, ++x_p) {
*h_p += alpha * *x_p;
}
x_p = &x[0];
}
*filters_updated = true;
}
x_start_index = x_start_index > 0 ? x_start_index - 1 : x_size - 1;
}
}
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
void MatchedFilterCore_SSE2(size_t x_start_index,
@ -337,13 +226,6 @@ void MatchedFilter::Update(const DownsampledRenderBuffer& render_buffer,
render_buffer.buffer, y, filters_[n],
&filters_updated, &error_sum);
break;
#endif
#if defined(WEBRTC_HAS_NEON)
case Aec3Optimization::kNeon:
aec3::MatchedFilterCore_NEON(x_start_index, x2_sum_threshold,
render_buffer.buffer, y, filters_[n],
&filters_updated, &error_sum);
break;
#endif
default:
aec3::MatchedFilterCore(x_start_index, x2_sum_threshold,

View File

@ -23,19 +23,6 @@
namespace webrtc {
namespace aec3 {
#if defined(WEBRTC_HAS_NEON)
// Filter core for the matched filter that is optimized for NEON.
void MatchedFilterCore_NEON(size_t x_start_index,
float x2_sum_threshold,
rtc::ArrayView<const float> x,
rtc::ArrayView<const float> y,
rtc::ArrayView<float> h,
bool* filters_updated,
float* error_sum);
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
// Filter core for the matched filter that is optimized for SSE2.

View File

@ -43,47 +43,10 @@ constexpr size_t kNumMatchedFilters = 4;
} // namespace
#if defined(WEBRTC_HAS_NEON)
// Verifies that the optimized methods for NEON are similar to their reference
// counterparts.
TEST(MatchedFilter, TestNeonOptimizations) {
Random random_generator(42U);
std::vector<float> x(2000);
RandomizeSampleVector(&random_generator, x);
std::vector<float> y(kSubBlockSize);
std::vector<float> h_NEON(512);
std::vector<float> h(512);
int x_index = 0;
for (int k = 0; k < 1000; ++k) {
RandomizeSampleVector(&random_generator, y);
bool filters_updated = false;
float error_sum = 0.f;
bool filters_updated_NEON = false;
float error_sum_NEON = 0.f;
MatchedFilterCore_NEON(x_index, h.size() * 150.f * 150.f, x, y, h_NEON,
&filters_updated_NEON, &error_sum_NEON);
MatchedFilterCore(x_index, h.size() * 150.f * 150.f, x, y, h,
&filters_updated, &error_sum);
EXPECT_EQ(filters_updated, filters_updated_NEON);
EXPECT_NEAR(error_sum, error_sum_NEON, error_sum / 100000.f);
for (size_t j = 0; j < h.size(); ++j) {
EXPECT_NEAR(h[j], h_NEON[j], 0.00001f);
}
x_index = (x_index + kSubBlockSize) % x.size();
}
}
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
// Verifies that the optimized methods for SSE2 are bitexact to their reference
// Verifies that the optimized methods are bitexact to their reference
// counterparts.
TEST(MatchedFilter, TestSse2Optimizations) {
TEST(MatchedFilter, TestOptimizations) {
bool use_sse2 = (WebRtc_GetCPUInfo(kSSE2) != 0);
if (use_sse2) {
Random random_generator(42U);

View File

@ -12,9 +12,6 @@
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC3_VECTOR_MATH_H_
#include "webrtc/typedefs.h"
#if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h>
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
#include <emmintrin.h>
#endif
@ -56,51 +53,6 @@ class VectorMath {
}
} break;
#endif
#if defined(WEBRTC_HAS_NEON)
case Aec3Optimization::kNeon: {
const int x_size = static_cast<int>(x.size());
const int vector_limit = x_size >> 2;
int j = 0;
for (; j < vector_limit * 4; j += 4) {
float32x4_t g = vld1q_f32(&x[j]);
#if !defined(WEBRTC_ARCH_ARM64)
float32x4_t y = vrsqrteq_f32(g);
// Code to handle sqrt(0).
// If the input to sqrtf() is zero, a zero will be returned.
// If the input to vrsqrteq_f32() is zero, positive infinity is
// returned.
const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
// check for divide by zero
const uint32x4_t div_by_zero =
vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(y));
// zero out the positive infinity results
y = vreinterpretq_f32_u32(
vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(y)));
// from arm documentation
// The Newton-Raphson iteration:
// y[n+1] = y[n] * (3 - d * (y[n] * y[n])) / 2)
// converges to (1/√d) if y0 is the result of VRSQRTE applied to d.
//
// Note: The precision did not improve after 2 iterations.
for (int i = 0; i < 2; i++) {
y = vmulq_f32(vrsqrtsq_f32(vmulq_f32(y, y), g), y);
}
// sqrt(g) = g * 1/sqrt(g)
g = vmulq_f32(g, y);
#else
g = vsqrtq_f32(g);
#endif
vst1q_f32(&x[j], g);
}
for (; j < x_size; ++j) {
x[j] = sqrtf(x[j]);
}
}
#endif
break;
default:
std::for_each(x.begin(), x.end(), [](float& a) { a = sqrtf(a); });
}
@ -131,24 +83,6 @@ class VectorMath {
}
} break;
#endif
#if defined(WEBRTC_HAS_NEON)
case Aec3Optimization::kNeon: {
const int x_size = static_cast<int>(x.size());
const int vector_limit = x_size >> 2;
int j = 0;
for (; j < vector_limit * 4; j += 4) {
const float32x4_t x_j = vld1q_f32(&x[j]);
const float32x4_t y_j = vld1q_f32(&y[j]);
const float32x4_t z_j = vmulq_f32(x_j, y_j);
vst1q_f32(&z[j], z_j);
}
for (; j < x_size; ++j) {
z[j] = x[j] * y[j];
}
} break;
#endif
default:
std::transform(x.begin(), x.end(), y.begin(), z.begin(),
std::multiplies<float>());
@ -177,24 +111,6 @@ class VectorMath {
}
} break;
#endif
#if defined(WEBRTC_HAS_NEON)
case Aec3Optimization::kNeon: {
const int x_size = static_cast<int>(x.size());
const int vector_limit = x_size >> 2;
int j = 0;
for (; j < vector_limit * 4; j += 4) {
const float32x4_t x_j = vld1q_f32(&x[j]);
float32x4_t z_j = vld1q_f32(&z[j]);
z_j = vaddq_f32(z_j, x_j);
vst1q_f32(&z[j], z_j);
}
for (; j < x_size; ++j) {
z[j] += x[j];
}
} break;
#endif
default:
std::transform(x.begin(), x.end(), z.begin(), z.begin(),
std::plus<float>());

View File

@ -18,65 +18,6 @@
namespace webrtc {
#if defined(WEBRTC_HAS_NEON)
TEST(VectorMath, Sqrt) {
std::array<float, kFftLengthBy2Plus1> x;
std::array<float, kFftLengthBy2Plus1> z;
std::array<float, kFftLengthBy2Plus1> z_neon;
for (size_t k = 0; k < x.size(); ++k) {
x[k] = (2.f / 3.f) * k;
}
std::copy(x.begin(), x.end(), z.begin());
aec3::VectorMath(Aec3Optimization::kNone).Sqrt(z);
std::copy(x.begin(), x.end(), z_neon.begin());
aec3::VectorMath(Aec3Optimization::kNeon).Sqrt(z_neon);
for (size_t k = 0; k < z.size(); ++k) {
EXPECT_NEAR(z[k], z_neon[k], 0.0001f);
EXPECT_NEAR(sqrtf(x[k]), z_neon[k], 0.0001f);
}
}
TEST(VectorMath, Multiply) {
std::array<float, kFftLengthBy2Plus1> x;
std::array<float, kFftLengthBy2Plus1> y;
std::array<float, kFftLengthBy2Plus1> z;
std::array<float, kFftLengthBy2Plus1> z_neon;
for (size_t k = 0; k < x.size(); ++k) {
x[k] = k;
y[k] = (2.f / 3.f) * k;
}
aec3::VectorMath(Aec3Optimization::kNone).Multiply(x, y, z);
aec3::VectorMath(Aec3Optimization::kNeon).Multiply(x, y, z_neon);
for (size_t k = 0; k < z.size(); ++k) {
EXPECT_FLOAT_EQ(z[k], z_neon[k]);
EXPECT_FLOAT_EQ(x[k] * y[k], z_neon[k]);
}
}
TEST(VectorMath, Accumulate) {
std::array<float, kFftLengthBy2Plus1> x;
std::array<float, kFftLengthBy2Plus1> z;
std::array<float, kFftLengthBy2Plus1> z_neon;
for (size_t k = 0; k < x.size(); ++k) {
x[k] = k;
z[k] = z_neon[k] = 2.f * k;
}
aec3::VectorMath(Aec3Optimization::kNone).Accumulate(x, z);
aec3::VectorMath(Aec3Optimization::kNeon).Accumulate(x, z_neon);
for (size_t k = 0; k < z.size(); ++k) {
EXPECT_FLOAT_EQ(z[k], z_neon[k]);
EXPECT_FLOAT_EQ(x[k] + 2.f * x[k], z_neon[k]);
}
}
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
TEST(VectorMath, Sqrt) {