diff --git a/modules/audio_processing/agc2/cpu_features.cc b/modules/audio_processing/agc2/cpu_features.cc index 10f9a19e28..cced7614bc 100644 --- a/modules/audio_processing/agc2/cpu_features.cc +++ b/modules/audio_processing/agc2/cpu_features.cc @@ -55,4 +55,8 @@ AvailableCpuFeatures GetAvailableCpuFeatures() { #endif } +AvailableCpuFeatures NoAvailableCpuFeatures() { + return {/*sse2=*/false, /*avx2=*/false, /*neon=*/false}; +} + } // namespace webrtc diff --git a/modules/audio_processing/agc2/cpu_features.h b/modules/audio_processing/agc2/cpu_features.h index bf73c3e562..54ddfb3055 100644 --- a/modules/audio_processing/agc2/cpu_features.h +++ b/modules/audio_processing/agc2/cpu_features.h @@ -31,6 +31,9 @@ struct AvailableCpuFeatures { // Detects what CPU features are available. AvailableCpuFeatures GetAvailableCpuFeatures(); +// Returns the CPU feature flags all set to false. +AvailableCpuFeatures NoAvailableCpuFeatures(); + } // namespace webrtc #endif // MODULES_AUDIO_PROCESSING_AGC2_CPU_FEATURES_H_ diff --git a/modules/audio_processing/agc2/rnn_vad/BUILD.gn b/modules/audio_processing/agc2/rnn_vad/BUILD.gn index ef2370c878..9895b76e25 100644 --- a/modules/audio_processing/agc2/rnn_vad/BUILD.gn +++ b/modules/audio_processing/agc2/rnn_vad/BUILD.gn @@ -92,7 +92,6 @@ rtc_source_set("rnn_vad_layers") { "../../../../api:function_view", "../../../../rtc_base:checks", "../../../../rtc_base:safe_conversions", - "../../../../rtc_base/system:arch", "//third_party/rnnoise:rnn_vad", ] if (current_cpu == "x86" || current_cpu == "x64") { diff --git a/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc b/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc index 8c336af90f..2a6e68f157 100644 --- a/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc @@ -41,17 +41,13 @@ std::string PrintTestIndexAndCpuFeatures( // Finds the relevant CPU features combinations to test. std::vector GetCpuFeaturesToTest() { std::vector v; - v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false}); + v.push_back(NoAvailableCpuFeatures()); AvailableCpuFeatures available = GetAvailableCpuFeatures(); if (available.avx2) { - AvailableCpuFeatures features( - {/*sse2=*/false, /*avx2=*/true, /*neon=*/false}); - v.push_back(features); + v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false}); } if (available.sse2) { - AvailableCpuFeatures features( - {/*sse2=*/true, /*avx2=*/false, /*neon=*/false}); - v.push_back(features); + v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false}); } return v; } diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.cc b/modules/audio_processing/agc2/rnn_vad/rnn.cc index f828a248c3..475bef9775 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn.cc @@ -57,7 +57,8 @@ RnnVad::RnnVad(const AvailableCpuFeatures& cpu_features) kOutputDenseBias, kOutputDenseWeights, ActivationFunction::kSigmoidApproximated, - cpu_features, + // The output layer is just 24x1. The unoptimized code is faster. + NoAvailableCpuFeatures(), /*layer_name=*/"FC2") { // Input-output chaining size checks. RTC_DCHECK_EQ(input_.size(), hidden_.input_size()) diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc b/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc index 2363317bcf..b04807f19f 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc @@ -8,13 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -// Defines WEBRTC_ARCH_X86_FAMILY, used below. -#include "rtc_base/system/arch.h" - -#if defined(WEBRTC_ARCH_X86_FAMILY) -#include -#endif - #include #include @@ -84,7 +77,7 @@ FullyConnectedLayer::FullyConnectedLayer( output_size_(output_size), bias_(GetScaledParams(bias)), weights_(PreprocessWeights(weights, output_size)), - cpu_features_(cpu_features), + vector_math_(cpu_features), activation_function_(GetActivationFunction(activation_function)) { RTC_DCHECK_LE(output_size_, kFullyConnectedLayerMaxUnits) << "Insufficient FC layer over-allocation (" << layer_name << ")."; @@ -100,52 +93,13 @@ FullyConnectedLayer::~FullyConnectedLayer() = default; void FullyConnectedLayer::ComputeOutput(rtc::ArrayView input) { RTC_DCHECK_EQ(input.size(), input_size_); -#if defined(WEBRTC_ARCH_X86_FAMILY) - // TODO(bugs.chromium.org/10480): Add AVX2. - if (cpu_features_.sse2) { - ComputeOutputSse2(input); - return; - } -#endif - // TODO(bugs.chromium.org/10480): Add Neon. - - // Un-optimized implementation. + rtc::ArrayView weights(weights_); for (int o = 0; o < output_size_; ++o) { - output_[o] = bias_[o]; - // TODO(bugs.chromium.org/9076): Benchmark how different layouts for - // |weights_| change the performance across different platforms. - for (int i = 0; i < input_size_; ++i) { - output_[o] += input[i] * weights_[o * input_size_ + i]; - } - output_[o] = activation_function_(output_[o]); - } -} - -#if defined(WEBRTC_ARCH_X86_FAMILY) -void FullyConnectedLayer::ComputeOutputSse2(rtc::ArrayView input) { - const int input_size_by_4 = input_size_ >> 2; - const int offset = input_size_ & ~3; - // TODO(bugs.chromium.org/10480): Check if reinterpret_cast below is ok. - __m128 sum_wx_128; - const float* v = reinterpret_cast(&sum_wx_128); - for (int o = 0; o < output_size_; ++o) { - // Perform 128 bit vector operations. - sum_wx_128 = _mm_set1_ps(0); - const float* x_p = input.data(); - const float* w_p = weights_.data() + o * input.size(); - for (int i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) { - sum_wx_128 = _mm_add_ps(sum_wx_128, - _mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p))); - } - // Perform non-vector operations for any remaining items, sum up bias term - // and results from the vectorized code, and apply the activation function. output_[o] = activation_function_( - std::inner_product(input.begin() + offset, input.end(), - weights_.begin() + o * input.size() + offset, - bias_[o] + v[0] + v[1] + v[2] + v[3])); + bias_[o] + vector_math_.DotProduct( + input, weights.subview(o * input_size_, input_size_))); } } -#endif // defined(WEBRTC_ARCH_X86_FAMILY) } // namespace rnn_vad } // namespace webrtc diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc.h b/modules/audio_processing/agc2/rnn_vad/rnn_fc.h index d05d95cc4b..d23957a6f2 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_fc.h +++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc.h @@ -18,7 +18,7 @@ #include "api/array_view.h" #include "api/function_view.h" #include "modules/audio_processing/agc2/cpu_features.h" -#include "rtc_base/system/arch.h" +#include "modules/audio_processing/agc2/rnn_vad/vector_math.h" namespace webrtc { namespace rnn_vad { @@ -56,15 +56,11 @@ class FullyConnectedLayer { void ComputeOutput(rtc::ArrayView input); private: -#if defined(WEBRTC_ARCH_X86_FAMILY) - void ComputeOutputSse2(rtc::ArrayView input); -#endif - const int input_size_; const int output_size_; const std::vector bias_; const std::vector weights_; - const AvailableCpuFeatures cpu_features_; + const VectorMath vector_math_; rtc::FunctionView activation_function_; // Over-allocated array with size equal to `output_size_`. std::array output_; diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc index 900ce63121..3074b34335 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc @@ -84,7 +84,7 @@ TEST_P(RnnFcParametrization, DISABLED_BenchmarkFullyConnectedLayer) { // Finds the relevant CPU features combinations to test. std::vector GetCpuFeaturesToTest() { std::vector v; - v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false}); + v.push_back(NoAvailableCpuFeatures()); AvailableCpuFeatures available = GetAvailableCpuFeatures(); if (available.sse2) { v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false}); diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc index ee8bdac994..f9b712554f 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc @@ -160,7 +160,7 @@ TEST_P(RnnGruParametrization, DISABLED_BenchmarkGatedRecurrentLayer) { std::vector GetCpuFeaturesToTest() { std::vector v; AvailableCpuFeatures available = GetAvailableCpuFeatures(); - v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false}); + v.push_back(NoAvailableCpuFeatures()); if (available.avx2) { v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false}); } diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc index 7eb699c39f..f223d587ee 100644 --- a/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc +++ b/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc @@ -158,7 +158,7 @@ TEST_P(RnnVadProbabilityParametrization, DISABLED_RnnVadPerformance) { // Finds the relevant CPU features combinations to test. std::vector GetCpuFeaturesToTest() { std::vector v; - v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false}); + v.push_back(NoAvailableCpuFeatures()); AvailableCpuFeatures available = GetAvailableCpuFeatures(); if (available.avx2 && available.sse2) { v.push_back({/*sse2=*/true, /*avx2=*/true, /*neon=*/false});