RNN VAD: FC layer simplified

The implementations for the fully connected layer can be simplified by
using `VectorMath::DotProduct()`. In this way, it is also possible to
remove (nearly) duplicated SIMD code, reduce the binary size and more
easily maintain the code.

This CL also forces unoptimized code for the output layer of the VAD,
which is an FC 24x1 layer. A slight improvement of the real-time factor
has been measured (delta ~ +5x).

Bug: webrtc:10480
Change-Id: Iee93bd59f7905ebf96275dbbfeb3c921baf4e8db
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/195580
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#32806}
This commit is contained in:
Alessio Bazzica 2020-12-08 11:30:47 +01:00 committed by Commit Bot
parent e3dd5660ea
commit e8ee462681
10 changed files with 21 additions and 68 deletions

View File

@ -55,4 +55,8 @@ AvailableCpuFeatures GetAvailableCpuFeatures() {
#endif
}
// Returns an `AvailableCpuFeatures` value in which the SSE2, AVX2 and NEON
// flags are all false.
AvailableCpuFeatures NoAvailableCpuFeatures() {
return {/*sse2=*/false, /*avx2=*/false, /*neon=*/false};
}
} // namespace webrtc

View File

@ -31,6 +31,9 @@ struct AvailableCpuFeatures {
// Detects what CPU features are available.
AvailableCpuFeatures GetAvailableCpuFeatures();
// Returns the CPU feature flags all set to false.
AvailableCpuFeatures NoAvailableCpuFeatures();
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_AGC2_CPU_FEATURES_H_

View File

@ -92,7 +92,6 @@ rtc_source_set("rnn_vad_layers") {
"../../../../api:function_view",
"../../../../rtc_base:checks",
"../../../../rtc_base:safe_conversions",
"../../../../rtc_base/system:arch",
"//third_party/rnnoise:rnn_vad",
]
if (current_cpu == "x86" || current_cpu == "x64") {

View File

@ -41,17 +41,13 @@ std::string PrintTestIndexAndCpuFeatures(
// Finds the relevant CPU features combinations to test.
std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
std::vector<AvailableCpuFeatures> v;
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
v.push_back(NoAvailableCpuFeatures());
AvailableCpuFeatures available = GetAvailableCpuFeatures();
if (available.avx2) {
AvailableCpuFeatures features(
{/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
v.push_back(features);
v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
}
if (available.sse2) {
AvailableCpuFeatures features(
{/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
v.push_back(features);
v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
}
return v;
}

View File

@ -57,7 +57,8 @@ RnnVad::RnnVad(const AvailableCpuFeatures& cpu_features)
kOutputDenseBias,
kOutputDenseWeights,
ActivationFunction::kSigmoidApproximated,
cpu_features,
// The output layer is just 24x1. The unoptimized code is faster.
NoAvailableCpuFeatures(),
/*layer_name=*/"FC2") {
// Input-output chaining size checks.
RTC_DCHECK_EQ(input_.size(), hidden_.input_size())

View File

@ -8,13 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
#include "rtc_base/system/arch.h"
#if defined(WEBRTC_ARCH_X86_FAMILY)
#include <emmintrin.h>
#endif
#include <algorithm>
#include <numeric>
@ -84,7 +77,7 @@ FullyConnectedLayer::FullyConnectedLayer(
output_size_(output_size),
bias_(GetScaledParams(bias)),
weights_(PreprocessWeights(weights, output_size)),
cpu_features_(cpu_features),
vector_math_(cpu_features),
activation_function_(GetActivationFunction(activation_function)) {
RTC_DCHECK_LE(output_size_, kFullyConnectedLayerMaxUnits)
<< "Insufficient FC layer over-allocation (" << layer_name << ").";
@ -100,52 +93,13 @@ FullyConnectedLayer::~FullyConnectedLayer() = default;
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
RTC_DCHECK_EQ(input.size(), input_size_);
#if defined(WEBRTC_ARCH_X86_FAMILY)
// TODO(bugs.chromium.org/10480): Add AVX2.
if (cpu_features_.sse2) {
ComputeOutputSse2(input);
return;
}
#endif
// TODO(bugs.chromium.org/10480): Add Neon.
// Un-optimized implementation.
rtc::ArrayView<const float> weights(weights_);
for (int o = 0; o < output_size_; ++o) {
output_[o] = bias_[o];
// TODO(bugs.chromium.org/9076): Benchmark how different layouts for
// |weights_| change the performance across different platforms.
for (int i = 0; i < input_size_; ++i) {
output_[o] += input[i] * weights_[o * input_size_ + i];
}
output_[o] = activation_function_(output_[o]);
}
}
#if defined(WEBRTC_ARCH_X86_FAMILY)
void FullyConnectedLayer::ComputeOutputSse2(rtc::ArrayView<const float> input) {
const int input_size_by_4 = input_size_ >> 2;
const int offset = input_size_ & ~3;
// TODO(bugs.chromium.org/10480): Check if reinterpret_cast below is ok.
__m128 sum_wx_128;
const float* v = reinterpret_cast<const float*>(&sum_wx_128);
for (int o = 0; o < output_size_; ++o) {
// Perform 128 bit vector operations.
sum_wx_128 = _mm_set1_ps(0);
const float* x_p = input.data();
const float* w_p = weights_.data() + o * input.size();
for (int i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
sum_wx_128 = _mm_add_ps(sum_wx_128,
_mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
}
// Perform non-vector operations for any remaining items, sum up bias term
// and results from the vectorized code, and apply the activation function.
output_[o] = activation_function_(
std::inner_product(input.begin() + offset, input.end(),
weights_.begin() + o * input.size() + offset,
bias_[o] + v[0] + v[1] + v[2] + v[3]));
bias_[o] + vector_math_.DotProduct(
input, weights.subview(o * input_size_, input_size_)));
}
}
#endif // defined(WEBRTC_ARCH_X86_FAMILY)
} // namespace rnn_vad
} // namespace webrtc

View File

@ -18,7 +18,7 @@
#include "api/array_view.h"
#include "api/function_view.h"
#include "modules/audio_processing/agc2/cpu_features.h"
#include "rtc_base/system/arch.h"
#include "modules/audio_processing/agc2/rnn_vad/vector_math.h"
namespace webrtc {
namespace rnn_vad {
@ -56,15 +56,11 @@ class FullyConnectedLayer {
void ComputeOutput(rtc::ArrayView<const float> input);
private:
#if defined(WEBRTC_ARCH_X86_FAMILY)
void ComputeOutputSse2(rtc::ArrayView<const float> input);
#endif
const int input_size_;
const int output_size_;
const std::vector<float> bias_;
const std::vector<float> weights_;
const AvailableCpuFeatures cpu_features_;
const VectorMath vector_math_;
rtc::FunctionView<float(float)> activation_function_;
// Over-allocated array with size equal to `output_size_`.
std::array<float, kFullyConnectedLayerMaxUnits> output_;

View File

@ -84,7 +84,7 @@ TEST_P(RnnFcParametrization, DISABLED_BenchmarkFullyConnectedLayer) {
// Finds the relevant CPU features combinations to test.
std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
std::vector<AvailableCpuFeatures> v;
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
v.push_back(NoAvailableCpuFeatures());
AvailableCpuFeatures available = GetAvailableCpuFeatures();
if (available.sse2) {
v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});

View File

@ -160,7 +160,7 @@ TEST_P(RnnGruParametrization, DISABLED_BenchmarkGatedRecurrentLayer) {
std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
std::vector<AvailableCpuFeatures> v;
AvailableCpuFeatures available = GetAvailableCpuFeatures();
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
v.push_back(NoAvailableCpuFeatures());
if (available.avx2) {
v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
}

View File

@ -158,7 +158,7 @@ TEST_P(RnnVadProbabilityParametrization, DISABLED_RnnVadPerformance) {
// Finds the relevant CPU features combinations to test.
std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
std::vector<AvailableCpuFeatures> v;
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
v.push_back(NoAvailableCpuFeatures());
AvailableCpuFeatures available = GetAvailableCpuFeatures();
if (available.avx2 && available.sse2) {
v.push_back({/*sse2=*/true, /*avx2=*/true, /*neon=*/false});