From ed9f5f85fda4c0b039472c1b5de7a04bdabcd845 Mon Sep 17 00:00:00 2001 From: Alessio Bazzica Date: Mon, 28 Dec 2020 17:57:24 +0100 Subject: [PATCH] RNN VAD optimizations: `VectorMath::DotProduct()` NEON arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Results: RNN VAD realtime factor improved from 140x to 195x (+55x) Test device: Pixel 2 XL Benchmark setup: max clock speed forced on all the cores by setting "performance" as scaling governor Bug: webrtc:10480 Change-Id: I3e92f643853ad1fe990db909c578ce78ee826c03 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/198842 Reviewed-by: Per Ã…hgren Commit-Queue: Alessio Bazzica Cr-Commit-Position: refs/heads/master@{#32888} --- modules/audio_processing/agc2/BUILD.gn | 7 +++++ .../audio_processing/agc2/rnn_vad/BUILD.gn | 22 +++++++++++++++ .../agc2/rnn_vad/rnn_vad_unittest.cc | 3 ++ .../agc2/rnn_vad/vector_math.h | 28 ++++++++++++++++++- .../agc2/rnn_vad/vector_math_unittest.cc | 3 ++ 5 files changed, 62 insertions(+), 1 deletion(-) diff --git a/modules/audio_processing/agc2/BUILD.gn b/modules/audio_processing/agc2/BUILD.gn index c6667df420..7b71f6a8e7 100644 --- a/modules/audio_processing/agc2/BUILD.gn +++ b/modules/audio_processing/agc2/BUILD.gn @@ -162,6 +162,13 @@ rtc_library("rnn_vad_with_level") { "vad_with_level.cc", "vad_with_level.h", ] + + defines = [] + if (rtc_build_with_neon && current_cpu != "arm64") { + suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ] + cflags = [ "-mfpu=neon" ] + } + deps = [ ":common", ":cpu_features", diff --git a/modules/audio_processing/agc2/rnn_vad/BUILD.gn b/modules/audio_processing/agc2/rnn_vad/BUILD.gn index 9895b76e25..4732efd082 100644 --- a/modules/audio_processing/agc2/rnn_vad/BUILD.gn +++ b/modules/audio_processing/agc2/rnn_vad/BUILD.gn @@ -17,6 +17,7 @@ rtc_library("rnn_vad") { "rnn.h", ] + defines = [] if (rtc_build_with_neon && current_cpu != "arm64") { suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ] cflags = [ "-mfpu=neon" ] @@ -84,6 +85,13 @@ rtc_source_set("rnn_vad_layers") { "rnn_gru.cc", "rnn_gru.h", ] + + defines = [] + if (rtc_build_with_neon && current_cpu != "arm64") { + suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ] + cflags = [ "-mfpu=neon" ] + } + deps = [ ":rnn_vad_common", ":vector_math", @@ -138,6 +146,13 @@ rtc_library("rnn_vad_pitch") { "pitch_search_internal.cc", "pitch_search_internal.h", ] + + defines = [] + if (rtc_build_with_neon && current_cpu != "arm64") { + suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ] + cflags = [ "-mfpu=neon" ] + } + deps = [ ":rnn_vad_auto_correlation", ":rnn_vad_common", @@ -253,6 +268,13 @@ if (rtc_include_tests) { "symmetric_matrix_buffer_unittest.cc", "vector_math_unittest.cc", ] + + defines = [] + if (rtc_build_with_neon && current_cpu != "arm64") { + suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ] + cflags = [ "-mfpu=neon" ] + } + deps = [ ":rnn_vad", ":rnn_vad_auto_correlation", diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc index f223d587ee..989b235705 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc @@ -166,6 +166,9 @@ std::vector GetCpuFeaturesToTest() { if (available.sse2) { v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false}); } + if (available.neon) { + v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/true}); + } return v; } diff --git a/modules/audio_processing/agc2/rnn_vad/vector_math.h b/modules/audio_processing/agc2/rnn_vad/vector_math.h index 0600b904eb..47f681196a 100644 --- a/modules/audio_processing/agc2/rnn_vad/vector_math.h +++ b/modules/audio_processing/agc2/rnn_vad/vector_math.h @@ -14,6 +14,9 @@ // Defines WEBRTC_ARCH_X86_FAMILY, used below. #include "rtc_base/system/arch.h" +#if defined(WEBRTC_HAS_NEON) +#include +#endif #if defined(WEBRTC_ARCH_X86_FAMILY) #include #endif @@ -70,8 +73,31 @@ class VectorMath { } return dot_product; } +#elif defined(WEBRTC_HAS_NEON) && defined(WEBRTC_ARCH_ARM64) + if (cpu_features_.neon) { + float32x4_t accumulator = vdupq_n_f32(0.f); + constexpr int kBlockSizeLog2 = 2; + constexpr int kBlockSize = 1 << kBlockSizeLog2; + const int incomplete_block_index = (x.size() >> kBlockSizeLog2) + << kBlockSizeLog2; + for (int i = 0; i < incomplete_block_index; i += kBlockSize) { + RTC_DCHECK_LE(i + kBlockSize, x.size()); + const float32x4_t x_i = vld1q_f32(&x[i]); + const float32x4_t y_i = vld1q_f32(&y[i]); + accumulator = vfmaq_f32(accumulator, x_i, y_i); + } + // Reduce `accumulator` by addition. + const float32x2_t tmp = + vpadd_f32(vget_low_f32(accumulator), vget_high_f32(accumulator)); + float dot_product = vget_lane_f32(vpadd_f32(tmp, vrev64_f32(tmp)), 0); + // Add the result for the last block if incomplete. + for (int i = incomplete_block_index; + i < rtc::dchecked_cast(x.size()); ++i) { + dot_product += x[i] * y[i]; + } + return dot_product; + } #endif - // TODO(bugs.webrtc.org/10480): Add NEON alternative implementation. return std::inner_product(x.begin(), x.end(), y.begin(), 0.f); } diff --git a/modules/audio_processing/agc2/rnn_vad/vector_math_unittest.cc b/modules/audio_processing/agc2/rnn_vad/vector_math_unittest.cc index 9a2d5bc116..45fd65d61e 100644 --- a/modules/audio_processing/agc2/rnn_vad/vector_math_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/vector_math_unittest.cc @@ -52,6 +52,9 @@ std::vector GetCpuFeaturesToTest() { if (available.sse2) { v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false}); } + if (available.neon) { + v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/true}); + } return v; }