From 5ab21f8853892205594ae8559a00b431f30a8a06 Mon Sep 17 00:00:00 2001 From: Alessio Bazzica Date: Tue, 5 Nov 2019 21:46:02 +0000 Subject: [PATCH] Revert "RNN VAD: prepare for SIMD optimization" This reverts commit 7350a902374c796dec8ce583cfaf4b9697f3a525. Reason for revert: possibly breaking downstream projects Original change's description: > RNN VAD: prepare for SIMD optimization > > This CL adds the boilerplate for SIMD optimization of FC and GRU layers > in rnn.cc. The same scheme of AEC3 has been used. Unit tests for the > optimized architectures have been added (the same unoptimized > implementation will run). > > Minor changes: > - unnecessary const removed in rnn.h > - FC and GRU test data in the anon namespace as constexpr > > Bug: webrtc:10480 > Change-Id: Ifae4e970326e7e7c603d49aeaf61194b5efdabd3 > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/141419 > Commit-Queue: Alessio Bazzica > Reviewed-by: Gustaf Ullberg > Cr-Commit-Position: refs/heads/master@{#29696} TBR=gustaf@webrtc.org,alessiob@webrtc.org,fhernqvist@webrtc.org Change-Id: I9ae82f4bd2d30797646fabfb5ad16bea378208b8 No-Presubmit: true No-Tree-Checks: true No-Try: true Bug: webrtc:10480 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/158893 Reviewed-by: Alessio Bazzica Commit-Queue: Alessio Bazzica Cr-Commit-Position: refs/heads/master@{#29699} --- .../audio_processing/agc2/rnn_vad/BUILD.gn | 10 -- .../audio_processing/agc2/rnn_vad/common.cc | 34 ---- .../audio_processing/agc2/rnn_vad/common.h | 7 - modules/audio_processing/agc2/rnn_vad/rnn.cc | 72 +------- modules/audio_processing/agc2/rnn_vad/rnn.h | 30 ++-- .../agc2/rnn_vad/rnn_unittest.cc | 155 ++++++++---------- 6 files changed, 87 insertions(+), 221 deletions(-) delete mode 100644 modules/audio_processing/agc2/rnn_vad/common.cc diff --git a/modules/audio_processing/agc2/rnn_vad/BUILD.gn b/modules/audio_processing/agc2/rnn_vad/BUILD.gn index 852abd88bf..71e02fb575 100644 --- a/modules/audio_processing/agc2/rnn_vad/BUILD.gn +++ b/modules/audio_processing/agc2/rnn_vad/BUILD.gn @@ -13,7 +13,6 @@ rtc_library("rnn_vad") { sources = [ "auto_correlation.cc", "auto_correlation.h", - "common.cc", "common.h", "features_extraction.cc", "features_extraction.h", @@ -34,20 +33,11 @@ rtc_library("rnn_vad") { "spectral_features_internal.h", "symmetric_matrix_buffer.h", ] - - defines = [] - if (rtc_build_with_neon && current_cpu != "arm64") { - suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ] - cflags = [ "-mfpu=neon" ] - } - deps = [ "..:biquad_filter", "../../../../api:array_view", "../../../../rtc_base:checks", "../../../../rtc_base:rtc_base_approved", - "../../../../rtc_base/system:arch", - "../../../../system_wrappers:cpu_features_api", "../../utility:pffft_wrapper", "//third_party/rnnoise:rnn_vad", ] diff --git a/modules/audio_processing/agc2/rnn_vad/common.cc b/modules/audio_processing/agc2/rnn_vad/common.cc deleted file mode 100644 index 744c87fea2..0000000000 --- a/modules/audio_processing/agc2/rnn_vad/common.cc +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "modules/audio_processing/agc2/rnn_vad/common.h" - -#include "rtc_base/system/arch.h" -#include "system_wrappers/include/cpu_features_wrapper.h" - -namespace webrtc { -namespace rnn_vad { - -Optimization DetectOptimization() { -#if defined(WEBRTC_ARCH_X86_FAMILY) - if (WebRtc_GetCPUInfo(kSSE2) != 0) { - return Optimization::kSse2; - } -#endif - -#if defined(WEBRTC_HAS_NEON) - return Optimization::kNeon; -#endif - - return Optimization::kNone; -} - -} // namespace rnn_vad -} // namespace webrtc diff --git a/modules/audio_processing/agc2/rnn_vad/common.h b/modules/audio_processing/agc2/rnn_vad/common.h index c2e8df6905..6b434d2171 100644 --- a/modules/audio_processing/agc2/rnn_vad/common.h +++ b/modules/audio_processing/agc2/rnn_vad/common.h @@ -11,8 +11,6 @@ #ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_COMMON_H_ #define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_COMMON_H_ -#include - namespace webrtc { namespace rnn_vad { @@ -65,11 +63,6 @@ static_assert(kCepstralCoeffsHistorySize > 2, constexpr size_t kFeatureVectorSize = 42; -enum class Optimization { kNone, kSse2, kNeon }; - -// Detects what kind of optimizations to use for the code. -Optimization DetectOptimization(); - } // namespace rnn_vad } // namespace webrtc diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.cc b/modules/audio_processing/agc2/rnn_vad/rnn.cc index e6ef2f3a41..94cc254045 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn.cc @@ -10,15 +10,6 @@ #include "modules/audio_processing/agc2/rnn_vad/rnn.h" -// Defines WEBRTC_ARCH_X86_FAMILY, used below. -#include "rtc_base/system/arch.h" - -#if defined(WEBRTC_HAS_NEON) -#include -#endif -#if defined(WEBRTC_ARCH_X86_FAMILY) -#include -#endif #include #include #include @@ -78,14 +69,12 @@ FullyConnectedLayer::FullyConnectedLayer( const size_t output_size, const rtc::ArrayView bias, const rtc::ArrayView weights, - float (*const activation_function)(float), - Optimization optimization) + float (*const activation_function)(float)) : input_size_(input_size), output_size_(output_size), bias_(GetScaledParams(bias)), weights_(GetScaledParams(weights)), - activation_function_(activation_function), - optimization_(optimization) { + activation_function_(activation_function) { RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits) << "Static over-allocation of fully-connected layers output vectors is " "not sufficient."; @@ -102,26 +91,8 @@ rtc::ArrayView FullyConnectedLayer::GetOutput() const { } void FullyConnectedLayer::ComputeOutput(rtc::ArrayView input) { - switch (optimization_) { -#if defined(WEBRTC_ARCH_X86_FAMILY) - case Optimization::kSse2: - // TODO(bugs.chromium.org/10480): Handle Optimization::kSse2. - ComputeOutput_NONE(input); - break; -#endif -#if defined(WEBRTC_HAS_NEON) - case Optimization::kNeon: - // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon. - ComputeOutput_NONE(input); - break; -#endif - default: - ComputeOutput_NONE(input); - } -} - -void FullyConnectedLayer::ComputeOutput_NONE( - rtc::ArrayView input) { + // TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add + // operations. for (size_t o = 0; o < output_size_; ++o) { output_[o] = bias_[o]; // TODO(bugs.chromium.org/9076): Benchmark how different layouts for @@ -138,14 +109,12 @@ GatedRecurrentLayer::GatedRecurrentLayer( const size_t output_size, const rtc::ArrayView bias, const rtc::ArrayView weights, - const rtc::ArrayView recurrent_weights, - Optimization optimization) + const rtc::ArrayView recurrent_weights) : input_size_(input_size), output_size_(output_size), bias_(GetScaledParams(bias)), weights_(GetScaledParams(weights)), - recurrent_weights_(GetScaledParams(recurrent_weights)), - optimization_(optimization) { + recurrent_weights_(GetScaledParams(recurrent_weights)) { RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits) << "Static over-allocation of recurrent layers state vectors is not " << "sufficient."; @@ -170,26 +139,6 @@ void GatedRecurrentLayer::Reset() { } void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView input) { - switch (optimization_) { -#if defined(WEBRTC_ARCH_X86_FAMILY) - case Optimization::kSse2: - // TODO(bugs.chromium.org/10480): Handle Optimization::kSse2. - ComputeOutput_NONE(input); - break; -#endif -#if defined(WEBRTC_HAS_NEON) - case Optimization::kNeon: - // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon. - ComputeOutput_NONE(input); - break; -#endif - default: - ComputeOutput_NONE(input); - } -} - -void GatedRecurrentLayer::ComputeOutput_NONE( - rtc::ArrayView input) { // TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add // operations. // Stride and offset used to read parameter arrays. @@ -254,20 +203,17 @@ RnnBasedVad::RnnBasedVad() kInputLayerOutputSize, kInputDenseBias, kInputDenseWeights, - TansigApproximated, - DetectOptimization()), + TansigApproximated), hidden_layer_(kInputLayerOutputSize, kHiddenLayerOutputSize, kHiddenGruBias, kHiddenGruWeights, - kHiddenGruRecurrentWeights, - DetectOptimization()), + kHiddenGruRecurrentWeights), output_layer_(kHiddenLayerOutputSize, kOutputLayerOutputSize, kOutputDenseBias, kOutputDenseWeights, - SigmoidApproximated, - DetectOptimization()) { + SigmoidApproximated) { // Input-output chaining size checks. RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size()) << "The input and the hidden layers sizes do not match."; diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.h b/modules/audio_processing/agc2/rnn_vad/rnn.h index f53a09379d..c38ff01b3e 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn.h +++ b/modules/audio_processing/agc2/rnn_vad/rnn.h @@ -38,12 +38,11 @@ constexpr size_t kRecurrentLayersMaxUnits = 24; // Fully-connected layer. class FullyConnectedLayer { public: - FullyConnectedLayer(size_t input_size, - size_t output_size, - rtc::ArrayView bias, - rtc::ArrayView weights, - float (*const activation_function)(float), - Optimization optimization); + FullyConnectedLayer(const size_t input_size, + const size_t output_size, + const rtc::ArrayView bias, + const rtc::ArrayView weights, + float (*const activation_function)(float)); FullyConnectedLayer(const FullyConnectedLayer&) = delete; FullyConnectedLayer& operator=(const FullyConnectedLayer&) = delete; ~FullyConnectedLayer(); @@ -54,15 +53,11 @@ class FullyConnectedLayer { void ComputeOutput(rtc::ArrayView input); private: - // No SIMD optimizations. - void ComputeOutput_NONE(rtc::ArrayView input); - const size_t input_size_; const size_t output_size_; const std::vector bias_; const std::vector weights_; float (*const activation_function_)(float); - const Optimization optimization_; // The output vector of a recurrent layer has length equal to |output_size_|. // However, for efficiency, over-allocation is used. std::array output_; @@ -72,12 +67,11 @@ class FullyConnectedLayer { // activation functions for the update/reset and output gates respectively. class GatedRecurrentLayer { public: - GatedRecurrentLayer(size_t input_size, - size_t output_size, - rtc::ArrayView bias, - rtc::ArrayView weights, - rtc::ArrayView recurrent_weights, - Optimization optimization); + GatedRecurrentLayer(const size_t input_size, + const size_t output_size, + const rtc::ArrayView bias, + const rtc::ArrayView weights, + const rtc::ArrayView recurrent_weights); GatedRecurrentLayer(const GatedRecurrentLayer&) = delete; GatedRecurrentLayer& operator=(const GatedRecurrentLayer&) = delete; ~GatedRecurrentLayer(); @@ -89,9 +83,6 @@ class GatedRecurrentLayer { void ComputeOutput(rtc::ArrayView input); private: - // No SIMD optimizations. - void ComputeOutput_NONE(rtc::ArrayView input); - const size_t input_size_; const size_t output_size_; const std::vector bias_; @@ -100,7 +91,6 @@ class GatedRecurrentLayer { // The state vector of a recurrent layer has length equal to |output_size_|. // However, to avoid dynamic allocation, over-allocation is used. std::array state_; - const Optimization optimization_; }; // Recurrent network based VAD. diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc index 97ede1811a..61e6f2670e 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc @@ -14,7 +14,6 @@ #include "modules/audio_processing/agc2/rnn_vad/test_utils.h" #include "rtc_base/checks.h" -#include "rtc_base/logging.h" #include "test/gtest.h" #include "third_party/rnnoise/src/rnn_activations.h" #include "third_party/rnnoise/src/rnn_vad_weights.h" @@ -61,104 +60,86 @@ void TestGatedRecurrentLayer( } } -// Fully connected layer test data. -constexpr size_t kFullyConnectedInputSize = 24; -constexpr size_t kFullyConnectedOutputSize = 1; -constexpr std::array kFullyConnectedBias = {-50}; -constexpr std::array kFullyConnectedWeights = { - 127, 127, 127, 127, 127, 20, 127, -126, -126, -54, 14, 125, - -126, -126, 127, -125, -126, 127, -127, -127, -57, -30, 127, 80}; -constexpr std::array kFullyConnectedInputVectors = { - // Input 1. - 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.215833917f, 0.290601075f, 0.238759011f, - 0.244751841f, 0.f, 0.0461241305f, 0.106401242f, 0.223070428f, 0.630603909f, - 0.690453172f, 0.f, 0.387645692f, 0.166913897f, 0.f, 0.0327451192f, 0.f, - 0.136149868f, 0.446351469f, - // Input 2. - 0.592162728f, 0.529089332f, 1.18205106f, 1.21736848f, 0.f, 0.470851123f, - 0.130675942f, 0.320903003f, 0.305496395f, 0.0571633279f, 1.57001138f, - 0.0182026215f, 0.0977443159f, 0.347477973f, 0.493206412f, 0.9688586f, - 0.0320267938f, 0.244722098f, 0.312745273f, 0.f, 0.00650715502f, - 0.312553257f, 1.62619662f, 0.782880902f, - // Input 3. - 0.395022154f, 0.333681047f, 0.76302278f, 0.965480626f, 0.f, 0.941198349f, - 0.0892967582f, 0.745046318f, 0.635769248f, 0.238564298f, 0.970656633f, - 0.014159563f, 0.094203949f, 0.446816623f, 0.640755892f, 1.20532358f, - 0.0254284926f, 0.283327013f, 0.726210058f, 0.0550272502f, 0.000344108557f, - 0.369803518f, 1.56680179f, 0.997883797f}; -constexpr std::array kFullyConnectedExpectedOutputs = { - 0.436567038f, 0.874741316f, 0.672785878f}; - -// Gated recurrent units layer test data. -constexpr size_t kGruInputSize = 5; -constexpr size_t kGruOutputSize = 4; -constexpr std::array kGruBias = {96, -99, -81, -114, 49, 119, - -118, 68, -76, 91, 121, 125}; -constexpr std::array kGruWeights = { - 124, 9, 1, 116, -66, -21, -118, -110, 104, 75, -23, -51, - -72, -111, 47, 93, 77, -98, 41, -8, 40, -23, -43, -107, - 9, -73, 30, -32, -2, 64, -26, 91, -48, -24, -28, -104, - 74, -46, 116, 15, 32, 52, -126, -38, -121, 12, -16, 110, - -95, 66, -103, -35, -38, 3, -126, -61, 28, 98, -117, -43}; -constexpr std::array kGruRecurrentWeights = { - -3, 87, 50, 51, -22, 27, -39, 62, 31, -83, -52, -48, - -6, 83, -19, 104, 105, 48, 23, 68, 23, 40, 7, -120, - 64, -62, 117, 85, -51, -43, 54, -105, 120, 56, -128, -107, - 39, 50, -17, -47, -117, 14, 108, 12, -7, -72, 103, -87, - -66, 82, 84, 100, -98, 102, -49, 44, 122, 106, -20, -69}; -constexpr std::array kGruInputSequence = { - 0.89395463f, 0.93224651f, 0.55788344f, 0.32341808f, 0.93355054f, - 0.13475326f, 0.97370994f, 0.14253306f, 0.93710381f, 0.76093364f, - 0.65780413f, 0.41657975f, 0.49403164f, 0.46843281f, 0.75138855f, - 0.24517593f, 0.47657707f, 0.57064998f, 0.435184f, 0.19319285f}; -constexpr std::array kGruExpectedOutputSequence = { - 0.0239123f, 0.5773077f, 0.f, 0.f, - 0.01282811f, 0.64330572f, 0.f, 0.04863098f, - 0.00781069f, 0.75267816f, 0.f, 0.02579715f, - 0.00471378f, 0.59162533f, 0.11087593f, 0.01334511f}; - } // namespace -class OptimizationTest : public ::testing::Test, - public ::testing::WithParamInterface {}; - // Checks that the output of a fully connected layer is within tolerance given // test input data. -TEST_P(OptimizationTest, CheckFullyConnectedLayerOutput) { - const Optimization optimization = GetParam(); - RTC_LOG(LS_VERBOSE) << optimization; - FullyConnectedLayer fc(kFullyConnectedInputSize, kFullyConnectedOutputSize, - kFullyConnectedBias, kFullyConnectedWeights, - SigmoidApproximated, optimization); +TEST(RnnVadTest, CheckFullyConnectedLayerOutput) { + const std::array bias = {-50}; + const std::array weights = { + 127, 127, 127, 127, 127, 20, 127, -126, -126, -54, 14, 125, + -126, -126, 127, -125, -126, 127, -127, -127, -57, -30, 127, 80}; + FullyConnectedLayer fc(24, 1, bias, weights, SigmoidApproximated); // Test on different inputs. - static_assert( - kFullyConnectedInputVectors.size() % kFullyConnectedInputSize == 0, ""); - constexpr size_t kNumInputVectors = - kFullyConnectedInputVectors.size() / kFullyConnectedInputSize; - static_assert(kFullyConnectedExpectedOutputs.size() == kNumInputVectors, ""); - for (size_t i = 0; i < kNumInputVectors; ++i) { - rtc::ArrayView input( - kFullyConnectedInputVectors.data() + kFullyConnectedInputSize * i, - kFullyConnectedInputSize); - TestFullyConnectedLayer(&fc, input, kFullyConnectedExpectedOutputs[i]); + { + const std::array input_vector = { + 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.215833917f, 0.290601075f, 0.238759011f, 0.244751841f, + 0.f, 0.0461241305f, 0.106401242f, 0.223070428f, 0.630603909f, + 0.690453172f, 0.f, 0.387645692f, 0.166913897f, 0.f, + 0.0327451192f, 0.f, 0.136149868f, 0.446351469f}; + TestFullyConnectedLayer(&fc, input_vector, 0.436567038f); + } + { + const std::array input_vector = { + 0.592162728f, 0.529089332f, 1.18205106f, + 1.21736848f, 0.f, 0.470851123f, + 0.130675942f, 0.320903003f, 0.305496395f, + 0.0571633279f, 1.57001138f, 0.0182026215f, + 0.0977443159f, 0.347477973f, 0.493206412f, + 0.9688586f, 0.0320267938f, 0.244722098f, + 0.312745273f, 0.f, 0.00650715502f, + 0.312553257f, 1.62619662f, 0.782880902f}; + TestFullyConnectedLayer(&fc, input_vector, 0.874741316f); + } + { + const std::array input_vector = { + 0.395022154f, 0.333681047f, 0.76302278f, + 0.965480626f, 0.f, 0.941198349f, + 0.0892967582f, 0.745046318f, 0.635769248f, + 0.238564298f, 0.970656633f, 0.014159563f, + 0.094203949f, 0.446816623f, 0.640755892f, + 1.20532358f, 0.0254284926f, 0.283327013f, + 0.726210058f, 0.0550272502f, 0.000344108557f, + 0.369803518f, 1.56680179f, 0.997883797f}; + TestFullyConnectedLayer(&fc, input_vector, 0.672785878f); } } // Checks that the output of a GRU layer is within tolerance given test input // data. -TEST_P(OptimizationTest, CheckGatedRecurrentLayer) { - const Optimization optimization = GetParam(); - RTC_LOG(LS_VERBOSE) << optimization; - GatedRecurrentLayer gru(kGruInputSize, kGruOutputSize, kGruBias, kGruWeights, - kGruRecurrentWeights, optimization); - TestGatedRecurrentLayer(&gru, kGruInputSequence, kGruExpectedOutputSequence); +TEST(RnnVadTest, CheckGatedRecurrentLayer) { + const std::array bias = {96, -99, -81, -114, 49, 119, + -118, 68, -76, 91, 121, 125}; + const std::array weights = { + 124, 9, 1, 116, -66, -21, -118, -110, 104, 75, -23, -51, + -72, -111, 47, 93, 77, -98, 41, -8, 40, -23, -43, -107, + 9, -73, 30, -32, -2, 64, -26, 91, -48, -24, -28, -104, + 74, -46, 116, 15, 32, 52, -126, -38, -121, 12, -16, 110, + -95, 66, -103, -35, -38, 3, -126, -61, 28, 98, -117, -43}; + const std::array recurrent_weights = { + -3, 87, 50, 51, -22, 27, -39, 62, 31, -83, -52, -48, + -6, 83, -19, 104, 105, 48, 23, 68, 23, 40, 7, -120, + 64, -62, 117, 85, -51, -43, 54, -105, 120, 56, -128, -107, + 39, 50, -17, -47, -117, 14, 108, 12, -7, -72, 103, -87, + -66, 82, 84, 100, -98, 102, -49, 44, 122, 106, -20, -69}; + GatedRecurrentLayer gru(5, 4, bias, weights, recurrent_weights); + // Test on different inputs. + { + const std::array input_sequence = { + 0.89395463f, 0.93224651f, 0.55788344f, 0.32341808f, 0.93355054f, + 0.13475326f, 0.97370994f, 0.14253306f, 0.93710381f, 0.76093364f, + 0.65780413f, 0.41657975f, 0.49403164f, 0.46843281f, 0.75138855f, + 0.24517593f, 0.47657707f, 0.57064998f, 0.435184f, 0.19319285f}; + const std::array expected_output_sequence = { + 0.0239123f, 0.5773077f, 0.f, 0.f, + 0.01282811f, 0.64330572f, 0.f, 0.04863098f, + 0.00781069f, 0.75267816f, 0.f, 0.02579715f, + 0.00471378f, 0.59162533f, 0.11087593f, 0.01334511f}; + TestGatedRecurrentLayer(&gru, input_sequence, expected_output_sequence); + } } -INSTANTIATE_TEST_SUITE_P(RnnVadTest, - OptimizationTest, - ::testing::Values(Optimization::kNone, - DetectOptimization())); - } // namespace test } // namespace rnn_vad } // namespace webrtc