RNN VAD: FC layer simplified
The implementations for the fully connected layer can be simplified by using `VectorMath::DotProduct()`. In this way, it is also possible to remove (nearly) duplicated SIMD code, reduce the binary size and more easily maintain the code. This CL also forces unoptimized code for the output layer of the VAD, which is a FC 24x1 layer. A slight improvement of the realtime has been measured (delta ~ +5x). Bug: webrtc:10480 Change-Id: Iee93bd59f7905ebf96275dbbfeb3c921baf4e8db Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/195580 Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> Reviewed-by: Ivo Creusen <ivoc@webrtc.org> Cr-Commit-Position: refs/heads/master@{#32806}
This commit is contained in:
parent
e3dd5660ea
commit
e8ee462681
@ -55,4 +55,8 @@ AvailableCpuFeatures GetAvailableCpuFeatures() {
|
||||
#endif
|
||||
}
|
||||
|
||||
// Returns an `AvailableCpuFeatures` value in which the sse2, avx2 and neon
// flags are all set to false.
AvailableCpuFeatures NoAvailableCpuFeatures() {
|
||||
return {/*sse2=*/false, /*avx2=*/false, /*neon=*/false};
|
||||
}
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
@ -31,6 +31,9 @@ struct AvailableCpuFeatures {
|
||||
// Detects what CPU features are available.
|
||||
AvailableCpuFeatures GetAvailableCpuFeatures();
|
||||
|
||||
// Returns the CPU feature flags all set to false.
|
||||
AvailableCpuFeatures NoAvailableCpuFeatures();
|
||||
|
||||
} // namespace webrtc
|
||||
|
||||
#endif // MODULES_AUDIO_PROCESSING_AGC2_CPU_FEATURES_H_
|
||||
|
||||
@ -92,7 +92,6 @@ rtc_source_set("rnn_vad_layers") {
|
||||
"../../../../api:function_view",
|
||||
"../../../../rtc_base:checks",
|
||||
"../../../../rtc_base:safe_conversions",
|
||||
"../../../../rtc_base/system:arch",
|
||||
"//third_party/rnnoise:rnn_vad",
|
||||
]
|
||||
if (current_cpu == "x86" || current_cpu == "x64") {
|
||||
|
||||
@ -41,17 +41,13 @@ std::string PrintTestIndexAndCpuFeatures(
|
||||
// Finds the relevant CPU features combinations to test.
|
||||
std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
|
||||
std::vector<AvailableCpuFeatures> v;
|
||||
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
|
||||
v.push_back(NoAvailableCpuFeatures());
|
||||
AvailableCpuFeatures available = GetAvailableCpuFeatures();
|
||||
if (available.avx2) {
|
||||
AvailableCpuFeatures features(
|
||||
{/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
|
||||
v.push_back(features);
|
||||
v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
|
||||
}
|
||||
if (available.sse2) {
|
||||
AvailableCpuFeatures features(
|
||||
{/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
|
||||
v.push_back(features);
|
||||
v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
@ -57,7 +57,8 @@ RnnVad::RnnVad(const AvailableCpuFeatures& cpu_features)
|
||||
kOutputDenseBias,
|
||||
kOutputDenseWeights,
|
||||
ActivationFunction::kSigmoidApproximated,
|
||||
cpu_features,
|
||||
// The output layer is just 24x1. The unoptimized code is faster.
|
||||
NoAvailableCpuFeatures(),
|
||||
/*layer_name=*/"FC2") {
|
||||
// Input-output chaining size checks.
|
||||
RTC_DCHECK_EQ(input_.size(), hidden_.input_size())
|
||||
|
||||
@ -8,13 +8,6 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
|
||||
#include "rtc_base/system/arch.h"
|
||||
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
|
||||
@ -84,7 +77,7 @@ FullyConnectedLayer::FullyConnectedLayer(
|
||||
output_size_(output_size),
|
||||
bias_(GetScaledParams(bias)),
|
||||
weights_(PreprocessWeights(weights, output_size)),
|
||||
cpu_features_(cpu_features),
|
||||
vector_math_(cpu_features),
|
||||
activation_function_(GetActivationFunction(activation_function)) {
|
||||
RTC_DCHECK_LE(output_size_, kFullyConnectedLayerMaxUnits)
|
||||
<< "Insufficient FC layer over-allocation (" << layer_name << ").";
|
||||
@ -100,52 +93,13 @@ FullyConnectedLayer::~FullyConnectedLayer() = default;
|
||||
|
||||
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
|
||||
RTC_DCHECK_EQ(input.size(), input_size_);
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
// TODO(bugs.chromium.org/10480): Add AVX2.
|
||||
if (cpu_features_.sse2) {
|
||||
ComputeOutputSse2(input);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
// TODO(bugs.chromium.org/10480): Add Neon.
|
||||
|
||||
// Un-optimized implementation.
|
||||
rtc::ArrayView<const float> weights(weights_);
|
||||
for (int o = 0; o < output_size_; ++o) {
|
||||
output_[o] = bias_[o];
|
||||
// TODO(bugs.chromium.org/9076): Benchmark how different layouts for
|
||||
// |weights_| change the performance across different platforms.
|
||||
for (int i = 0; i < input_size_; ++i) {
|
||||
output_[o] += input[i] * weights_[o * input_size_ + i];
|
||||
}
|
||||
output_[o] = activation_function_(output_[o]);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
void FullyConnectedLayer::ComputeOutputSse2(rtc::ArrayView<const float> input) {
|
||||
const int input_size_by_4 = input_size_ >> 2;
|
||||
const int offset = input_size_ & ~3;
|
||||
// TODO(bugs.chromium.org/10480): Check if reinterpret_cast below is ok.
|
||||
__m128 sum_wx_128;
|
||||
const float* v = reinterpret_cast<const float*>(&sum_wx_128);
|
||||
for (int o = 0; o < output_size_; ++o) {
|
||||
// Perform 128 bit vector operations.
|
||||
sum_wx_128 = _mm_set1_ps(0);
|
||||
const float* x_p = input.data();
|
||||
const float* w_p = weights_.data() + o * input.size();
|
||||
for (int i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
|
||||
sum_wx_128 = _mm_add_ps(sum_wx_128,
|
||||
_mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
|
||||
}
|
||||
// Perform non-vector operations for any remaining items, sum up bias term
|
||||
// and results from the vectorized code, and apply the activation function.
|
||||
output_[o] = activation_function_(
|
||||
std::inner_product(input.begin() + offset, input.end(),
|
||||
weights_.begin() + o * input.size() + offset,
|
||||
bias_[o] + v[0] + v[1] + v[2] + v[3]));
|
||||
bias_[o] + vector_math_.DotProduct(
|
||||
input, weights.subview(o * input_size_, input_size_)));
|
||||
}
|
||||
}
|
||||
#endif // defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
|
||||
} // namespace rnn_vad
|
||||
} // namespace webrtc
|
||||
|
||||
@ -18,7 +18,7 @@
|
||||
#include "api/array_view.h"
|
||||
#include "api/function_view.h"
|
||||
#include "modules/audio_processing/agc2/cpu_features.h"
|
||||
#include "rtc_base/system/arch.h"
|
||||
#include "modules/audio_processing/agc2/rnn_vad/vector_math.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace rnn_vad {
|
||||
@ -56,15 +56,11 @@ class FullyConnectedLayer {
|
||||
void ComputeOutput(rtc::ArrayView<const float> input);
|
||||
|
||||
private:
|
||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||
void ComputeOutputSse2(rtc::ArrayView<const float> input);
|
||||
#endif
|
||||
|
||||
const int input_size_;
|
||||
const int output_size_;
|
||||
const std::vector<float> bias_;
|
||||
const std::vector<float> weights_;
|
||||
const AvailableCpuFeatures cpu_features_;
|
||||
const VectorMath vector_math_;
|
||||
rtc::FunctionView<float(float)> activation_function_;
|
||||
// Over-allocated array with size equal to `output_size_`.
|
||||
std::array<float, kFullyConnectedLayerMaxUnits> output_;
|
||||
|
||||
@ -84,7 +84,7 @@ TEST_P(RnnFcParametrization, DISABLED_BenchmarkFullyConnectedLayer) {
|
||||
// Finds the relevant CPU features combinations to test.
|
||||
std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
|
||||
std::vector<AvailableCpuFeatures> v;
|
||||
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
|
||||
v.push_back(NoAvailableCpuFeatures());
|
||||
AvailableCpuFeatures available = GetAvailableCpuFeatures();
|
||||
if (available.sse2) {
|
||||
v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
|
||||
|
||||
@ -160,7 +160,7 @@ TEST_P(RnnGruParametrization, DISABLED_BenchmarkGatedRecurrentLayer) {
|
||||
std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
|
||||
std::vector<AvailableCpuFeatures> v;
|
||||
AvailableCpuFeatures available = GetAvailableCpuFeatures();
|
||||
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
|
||||
v.push_back(NoAvailableCpuFeatures());
|
||||
if (available.avx2) {
|
||||
v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
|
||||
}
|
||||
|
||||
@ -158,7 +158,7 @@ TEST_P(RnnVadProbabilityParametrization, DISABLED_RnnVadPerformance) {
|
||||
// Finds the relevant CPU features combinations to test.
|
||||
std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
|
||||
std::vector<AvailableCpuFeatures> v;
|
||||
v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
|
||||
v.push_back(NoAvailableCpuFeatures());
|
||||
AvailableCpuFeatures available = GetAvailableCpuFeatures();
|
||||
if (available.avx2 && available.sse2) {
|
||||
v.push_back({/*sse2=*/true, /*avx2=*/true, /*neon=*/false});
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user