From e8ee4626810c453c9c10990b31229dcbf8e314e7 Mon Sep 17 00:00:00 2001 From: Alessio Bazzica Date: Tue, 8 Dec 2020 11:30:47 +0100 Subject: [PATCH] RNN VAD: FC layer simplified The implementations for the fully connected layer can be simplified by using `VectorMath::DotProduct()`. In this way, it is also possible to remove (nearly) duplicated SIMD code, reduce the binary size and more easily maintain the code. This CL also forces unoptimized code for the output layer of the VAD, which is a FC 24x1 layer. A slight improvement of the realtime has been measured (delta ~ +5x). Bug: webrtc:10480 Change-Id: Iee93bd59f7905ebf96275dbbfeb3c921baf4e8db Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/195580 Commit-Queue: Alessio Bazzica Reviewed-by: Ivo Creusen Cr-Commit-Position: refs/heads/master@{#32806} --- modules/audio_processing/agc2/cpu_features.cc | 4 ++ modules/audio_processing/agc2/cpu_features.h | 3 ++ .../audio_processing/agc2/rnn_vad/BUILD.gn | 1 - .../rnn_vad/pitch_search_internal_unittest.cc | 10 ++-- modules/audio_processing/agc2/rnn_vad/rnn.cc | 3 +- .../audio_processing/agc2/rnn_vad/rnn_fc.cc | 54 ++----------------- .../audio_processing/agc2/rnn_vad/rnn_fc.h | 8 +-- .../agc2/rnn_vad/rnn_fc_unittest.cc | 2 +- .../agc2/rnn_vad/rnn_gru_unittest.cc | 2 +- .../agc2/rnn_vad/rnn_vad_unittest.cc | 2 +- 10 files changed, 21 insertions(+), 68 deletions(-) diff --git a/modules/audio_processing/agc2/cpu_features.cc b/modules/audio_processing/agc2/cpu_features.cc index 10f9a19e28..cced7614bc 100644 --- a/modules/audio_processing/agc2/cpu_features.cc +++ b/modules/audio_processing/agc2/cpu_features.cc @@ -55,4 +55,8 @@ AvailableCpuFeatures GetAvailableCpuFeatures() { #endif } +AvailableCpuFeatures NoAvailableCpuFeatures() { + return {/*sse2=*/false, /*avx2=*/false, /*neon=*/false}; +} + } // namespace webrtc diff --git a/modules/audio_processing/agc2/cpu_features.h b/modules/audio_processing/agc2/cpu_features.h index bf73c3e562..54ddfb3055 100644 --- 
a/modules/audio_processing/agc2/cpu_features.h +++ b/modules/audio_processing/agc2/cpu_features.h @@ -31,6 +31,9 @@ struct AvailableCpuFeatures { // Detects what CPU features are available. AvailableCpuFeatures GetAvailableCpuFeatures(); +// Returns the CPU feature flags all set to false. +AvailableCpuFeatures NoAvailableCpuFeatures(); + } // namespace webrtc #endif // MODULES_AUDIO_PROCESSING_AGC2_CPU_FEATURES_H_ diff --git a/modules/audio_processing/agc2/rnn_vad/BUILD.gn b/modules/audio_processing/agc2/rnn_vad/BUILD.gn index ef2370c878..9895b76e25 100644 --- a/modules/audio_processing/agc2/rnn_vad/BUILD.gn +++ b/modules/audio_processing/agc2/rnn_vad/BUILD.gn @@ -92,7 +92,6 @@ rtc_source_set("rnn_vad_layers") { "../../../../api:function_view", "../../../../rtc_base:checks", "../../../../rtc_base:safe_conversions", - "../../../../rtc_base/system:arch", "//third_party/rnnoise:rnn_vad", ] if (current_cpu == "x86" || current_cpu == "x64") { diff --git a/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc b/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc index 8c336af90f..2a6e68f157 100644 --- a/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc @@ -41,17 +41,13 @@ std::string PrintTestIndexAndCpuFeatures( // Finds the relevant CPU features combinations to test. 
std::vector GetCpuFeaturesToTest() { std::vector v; - v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false}); + v.push_back(NoAvailableCpuFeatures()); AvailableCpuFeatures available = GetAvailableCpuFeatures(); if (available.avx2) { - AvailableCpuFeatures features( - {/*sse2=*/false, /*avx2=*/true, /*neon=*/false}); - v.push_back(features); + v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false}); } if (available.sse2) { - AvailableCpuFeatures features( - {/*sse2=*/true, /*avx2=*/false, /*neon=*/false}); - v.push_back(features); + v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false}); } return v; } diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.cc b/modules/audio_processing/agc2/rnn_vad/rnn.cc index f828a248c3..475bef9775 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn.cc @@ -57,7 +57,8 @@ RnnVad::RnnVad(const AvailableCpuFeatures& cpu_features) kOutputDenseBias, kOutputDenseWeights, ActivationFunction::kSigmoidApproximated, - cpu_features, + // The output layer is just 24x1. The unoptimized code is faster. + NoAvailableCpuFeatures(), /*layer_name=*/"FC2") { // Input-output chaining size checks. RTC_DCHECK_EQ(input_.size(), hidden_.input_size()) diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc b/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc index 2363317bcf..b04807f19f 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc @@ -8,13 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -// Defines WEBRTC_ARCH_X86_FAMILY, used below. 
-#include "rtc_base/system/arch.h" - -#if defined(WEBRTC_ARCH_X86_FAMILY) -#include -#endif - #include #include @@ -84,7 +77,7 @@ FullyConnectedLayer::FullyConnectedLayer( output_size_(output_size), bias_(GetScaledParams(bias)), weights_(PreprocessWeights(weights, output_size)), - cpu_features_(cpu_features), + vector_math_(cpu_features), activation_function_(GetActivationFunction(activation_function)) { RTC_DCHECK_LE(output_size_, kFullyConnectedLayerMaxUnits) << "Insufficient FC layer over-allocation (" << layer_name << ")."; @@ -100,52 +93,13 @@ FullyConnectedLayer::~FullyConnectedLayer() = default; void FullyConnectedLayer::ComputeOutput(rtc::ArrayView input) { RTC_DCHECK_EQ(input.size(), input_size_); -#if defined(WEBRTC_ARCH_X86_FAMILY) - // TODO(bugs.chromium.org/10480): Add AVX2. - if (cpu_features_.sse2) { - ComputeOutputSse2(input); - return; - } -#endif - // TODO(bugs.chromium.org/10480): Add Neon. - - // Un-optimized implementation. + rtc::ArrayView weights(weights_); for (int o = 0; o < output_size_; ++o) { - output_[o] = bias_[o]; - // TODO(bugs.chromium.org/9076): Benchmark how different layouts for - // |weights_| change the performance across different platforms. - for (int i = 0; i < input_size_; ++i) { - output_[o] += input[i] * weights_[o * input_size_ + i]; - } - output_[o] = activation_function_(output_[o]); - } -} - -#if defined(WEBRTC_ARCH_X86_FAMILY) -void FullyConnectedLayer::ComputeOutputSse2(rtc::ArrayView input) { - const int input_size_by_4 = input_size_ >> 2; - const int offset = input_size_ & ~3; - // TODO(bugs.chromium.org/10480): Check if reinterpret_cast below is ok. - __m128 sum_wx_128; - const float* v = reinterpret_cast(&sum_wx_128); - for (int o = 0; o < output_size_; ++o) { - // Perform 128 bit vector operations. 
- sum_wx_128 = _mm_set1_ps(0); - const float* x_p = input.data(); - const float* w_p = weights_.data() + o * input.size(); - for (int i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) { - sum_wx_128 = _mm_add_ps(sum_wx_128, - _mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p))); - } - // Perform non-vector operations for any remaining items, sum up bias term - // and results from the vectorized code, and apply the activation function. output_[o] = activation_function_( - std::inner_product(input.begin() + offset, input.end(), - weights_.begin() + o * input.size() + offset, - bias_[o] + v[0] + v[1] + v[2] + v[3])); + bias_[o] + vector_math_.DotProduct( + input, weights.subview(o * input_size_, input_size_))); } } -#endif // defined(WEBRTC_ARCH_X86_FAMILY) } // namespace rnn_vad } // namespace webrtc diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc.h b/modules/audio_processing/agc2/rnn_vad/rnn_fc.h index d05d95cc4b..d23957a6f2 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_fc.h +++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc.h @@ -18,7 +18,7 @@ #include "api/array_view.h" #include "api/function_view.h" #include "modules/audio_processing/agc2/cpu_features.h" -#include "rtc_base/system/arch.h" +#include "modules/audio_processing/agc2/rnn_vad/vector_math.h" namespace webrtc { namespace rnn_vad { @@ -56,15 +56,11 @@ class FullyConnectedLayer { void ComputeOutput(rtc::ArrayView input); private: -#if defined(WEBRTC_ARCH_X86_FAMILY) - void ComputeOutputSse2(rtc::ArrayView input); -#endif - const int input_size_; const int output_size_; const std::vector bias_; const std::vector weights_; - const AvailableCpuFeatures cpu_features_; + const VectorMath vector_math_; rtc::FunctionView activation_function_; // Over-allocated array with size equal to `output_size_`. 
std::array output_; diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc index 900ce63121..3074b34335 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc @@ -84,7 +84,7 @@ TEST_P(RnnFcParametrization, DISABLED_BenchmarkFullyConnectedLayer) { // Finds the relevant CPU features combinations to test. std::vector GetCpuFeaturesToTest() { std::vector v; - v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false}); + v.push_back(NoAvailableCpuFeatures()); AvailableCpuFeatures available = GetAvailableCpuFeatures(); if (available.sse2) { v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false}); diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc index ee8bdac994..f9b712554f 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc @@ -160,7 +160,7 @@ TEST_P(RnnGruParametrization, DISABLED_BenchmarkGatedRecurrentLayer) { std::vector GetCpuFeaturesToTest() { std::vector v; AvailableCpuFeatures available = GetAvailableCpuFeatures(); - v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false}); + v.push_back(NoAvailableCpuFeatures()); if (available.avx2) { v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false}); } diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc index 7eb699c39f..f223d587ee 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc @@ -158,7 +158,7 @@ TEST_P(RnnVadProbabilityParametrization, DISABLED_RnnVadPerformance) { // Finds the relevant CPU features combinations to test. 
std::vector GetCpuFeaturesToTest() { std::vector v; - v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false}); + v.push_back(NoAvailableCpuFeatures()); AvailableCpuFeatures available = GetAvailableCpuFeatures(); if (available.avx2 && available.sse2) { v.push_back({/*sse2=*/true, /*avx2=*/true, /*neon=*/false});