This CL adds the GRU weights memory layout optimization with which it will be easier to add SSE2 code in a follow up CL. The new memory layout also improves the performance of the unoptimized code. This CL also includes a bug fix in the GRU layer input validation. It was a silent bug since the GRU layer of the RNN VAD has the same input and output size. This was caught by changing memory layout of the recurrent weights. The unit test has been adapted by removing the unused recurrent weights (the expected result does not change). Bug: webrtc:10480 Change-Id: Ia1551abde4cb24aa7e109c4447e0fffe7c839077 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/142177 Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org> Cr-Commit-Position: refs/heads/master@{#29717}
426 lines
16 KiB
C++
426 lines
16 KiB
C++
/*
|
|
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#include "modules/audio_processing/agc2/rnn_vad/rnn.h"
|
|
|
|
// Defines WEBRTC_ARCH_X86_FAMILY, used below.
|
|
#include "rtc_base/system/arch.h"
|
|
|
|
#if defined(WEBRTC_HAS_NEON)
|
|
#include <arm_neon.h>
|
|
#endif
|
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
|
#include <emmintrin.h>
|
|
#endif
|
|
#include <algorithm>
|
|
#include <array>
|
|
#include <cmath>
|
|
#include <numeric>
|
|
|
|
#include "rtc_base/checks.h"
|
|
#include "rtc_base/logging.h"
|
|
#include "third_party/rnnoise/src/rnn_activations.h"
|
|
#include "third_party/rnnoise/src/rnn_vad_weights.h"
|
|
|
|
namespace webrtc {
|
|
namespace rnn_vad {
|
|
namespace {
|
|
|
|
using rnnoise::kWeightsScale;
|
|
|
|
using rnnoise::kInputLayerInputSize;
|
|
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
|
|
using rnnoise::kInputDenseBias;
|
|
using rnnoise::kInputDenseWeights;
|
|
using rnnoise::kInputLayerOutputSize;
|
|
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
|
|
"Increase kFullyConnectedLayersMaxUnits.");
|
|
|
|
using rnnoise::kHiddenGruBias;
|
|
using rnnoise::kHiddenGruRecurrentWeights;
|
|
using rnnoise::kHiddenGruWeights;
|
|
using rnnoise::kHiddenLayerOutputSize;
|
|
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
|
|
"Increase kRecurrentLayersMaxUnits.");
|
|
|
|
using rnnoise::kOutputDenseBias;
|
|
using rnnoise::kOutputDenseWeights;
|
|
using rnnoise::kOutputLayerOutputSize;
|
|
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
|
|
"Increase kFullyConnectedLayersMaxUnits.");
|
|
|
|
using rnnoise::SigmoidApproximated;
|
|
using rnnoise::TansigApproximated;
|
|
|
|
// Rectified linear unit activation: strictly negative inputs are mapped to
// zero, any other value (including -0 and NaN) is returned unchanged.
inline float RectifiedLinearUnit(float x) {
  if (x < 0.f) {
    return 0.f;
  }
  return x;
}
|
|
|
|
// Casts the quantized |params| to float and multiplies each of them by
// rnnoise::kWeightsScale. The order of the elements is preserved.
std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
  std::vector<float> scaled_params;
  scaled_params.reserve(params.size());
  for (const int8_t quantized : params) {
    scaled_params.push_back(rnnoise::kWeightsScale *
                            static_cast<float>(quantized));
  }
  return scaled_params;
}
|
|
|
|
// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
|
|
// function to improve setup time.
|
|
// Casts and scales |weights| and re-arranges the layout.
|
|
std::vector<float> GetPreprocessedFcWeights(
|
|
rtc::ArrayView<const int8_t> weights,
|
|
size_t output_size) {
|
|
if (output_size == 1) {
|
|
return GetScaledParams(weights);
|
|
}
|
|
// Transpose, scale and cast.
|
|
const size_t input_size = rtc::CheckedDivExact(weights.size(), output_size);
|
|
std::vector<float> w(weights.size());
|
|
for (size_t o = 0; o < output_size; ++o) {
|
|
for (size_t i = 0; i < input_size; ++i) {
|
|
w[o * input_size + i] = rnnoise::kWeightsScale *
|
|
static_cast<float>(weights[i * output_size + o]);
|
|
}
|
|
}
|
|
return w;
|
|
}
|
|
|
|
// Number of gates of a GRU layer. The GRU parameter tensors used in this file
// hold one slice per gate.
constexpr size_t kNumGruGates = 3; // Update, reset, output.
|
|
|
|
// TODO(bugs.chromium.org/10480): Hard-coded optimized layout and remove this
|
|
// function to improve setup time.
|
|
// Casts and scales |tensor_src| for a GRU layer and re-arranges the layout.
|
|
// It works both for weights, recurrent weights and bias.
|
|
std::vector<float> GetPreprocessedGruTensor(
|
|
rtc::ArrayView<const int8_t> tensor_src,
|
|
size_t output_size) {
|
|
// Transpose, cast and scale.
|
|
// |n| is the size of the first dimension of the 3-dim tensor |weights|.
|
|
const size_t n =
|
|
rtc::CheckedDivExact(tensor_src.size(), output_size * kNumGruGates);
|
|
const size_t stride_src = kNumGruGates * output_size;
|
|
const size_t stride_dst = n * output_size;
|
|
std::vector<float> tensor_dst(tensor_src.size());
|
|
for (size_t g = 0; g < kNumGruGates; ++g) {
|
|
for (size_t o = 0; o < output_size; ++o) {
|
|
for (size_t i = 0; i < n; ++i) {
|
|
tensor_dst[g * stride_dst + o * n + i] =
|
|
rnnoise::kWeightsScale *
|
|
static_cast<float>(
|
|
tensor_src[i * stride_src + g * output_size + o]);
|
|
}
|
|
}
|
|
}
|
|
return tensor_dst;
|
|
}
|
|
|
|
void ComputeGruUpdateResetGates(size_t input_size,
|
|
size_t output_size,
|
|
rtc::ArrayView<const float> weights,
|
|
rtc::ArrayView<const float> recurrent_weights,
|
|
rtc::ArrayView<const float> bias,
|
|
rtc::ArrayView<const float> input,
|
|
rtc::ArrayView<const float> state,
|
|
rtc::ArrayView<float> gate) {
|
|
for (size_t o = 0; o < output_size; ++o) {
|
|
gate[o] = bias[o];
|
|
for (size_t i = 0; i < input_size; ++i) {
|
|
gate[o] += input[i] * weights[o * input_size + i];
|
|
}
|
|
for (size_t s = 0; s < output_size; ++s) {
|
|
gate[o] += state[s] * recurrent_weights[o * output_size + s];
|
|
}
|
|
gate[o] = SigmoidApproximated(gate[o]);
|
|
}
|
|
}
|
|
|
|
void ComputeGruOutputGate(size_t input_size,
|
|
size_t output_size,
|
|
rtc::ArrayView<const float> weights,
|
|
rtc::ArrayView<const float> recurrent_weights,
|
|
rtc::ArrayView<const float> bias,
|
|
rtc::ArrayView<const float> input,
|
|
rtc::ArrayView<const float> state,
|
|
rtc::ArrayView<const float> reset,
|
|
rtc::ArrayView<float> gate) {
|
|
for (size_t o = 0; o < output_size; ++o) {
|
|
gate[o] = bias[o];
|
|
for (size_t i = 0; i < input_size; ++i) {
|
|
gate[o] += input[i] * weights[o * input_size + i];
|
|
}
|
|
for (size_t s = 0; s < output_size; ++s) {
|
|
gate[o] += state[s] * recurrent_weights[o * output_size + s] * reset[s];
|
|
}
|
|
gate[o] = RectifiedLinearUnit(gate[o]);
|
|
}
|
|
}
|
|
|
|
// Gated recurrent unit (GRU) layer un-optimized implementation.
|
|
void ComputeGruLayerOutput(size_t input_size,
|
|
size_t output_size,
|
|
rtc::ArrayView<const float> input,
|
|
rtc::ArrayView<const float> weights,
|
|
rtc::ArrayView<const float> recurrent_weights,
|
|
rtc::ArrayView<const float> bias,
|
|
rtc::ArrayView<float> state) {
|
|
RTC_DCHECK_EQ(input_size, input.size());
|
|
// Stride and offset used to read parameter arrays.
|
|
const size_t stride_in = input_size * output_size;
|
|
const size_t stride_out = output_size * output_size;
|
|
|
|
// Update gate.
|
|
std::array<float, kRecurrentLayersMaxUnits> update;
|
|
ComputeGruUpdateResetGates(
|
|
input_size, output_size, weights.subview(0, stride_in),
|
|
recurrent_weights.subview(0, stride_out), bias.subview(0, output_size),
|
|
input, state, update);
|
|
|
|
// Reset gate.
|
|
std::array<float, kRecurrentLayersMaxUnits> reset;
|
|
ComputeGruUpdateResetGates(
|
|
input_size, output_size, weights.subview(stride_in, stride_in),
|
|
recurrent_weights.subview(stride_out, stride_out),
|
|
bias.subview(output_size, output_size), input, state, reset);
|
|
|
|
// Output gate.
|
|
std::array<float, kRecurrentLayersMaxUnits> output;
|
|
ComputeGruOutputGate(
|
|
input_size, output_size, weights.subview(2 * stride_in, stride_in),
|
|
recurrent_weights.subview(2 * stride_out, stride_out),
|
|
bias.subview(2 * output_size, output_size), input, state, reset, output);
|
|
|
|
// Update output through the update gates and update the state.
|
|
for (size_t o = 0; o < output_size; ++o) {
|
|
output[o] = update[o] * state[o] + (1.f - update[o]) * output[o];
|
|
state[o] = output[o];
|
|
}
|
|
}
|
|
|
|
// Fully connected layer un-optimized implementation.
|
|
void ComputeFullyConnectedLayerOutput(
|
|
size_t input_size,
|
|
size_t output_size,
|
|
rtc::ArrayView<const float> input,
|
|
rtc::ArrayView<const float> bias,
|
|
rtc::ArrayView<const float> weights,
|
|
rtc::FunctionView<float(float)> activation_function,
|
|
rtc::ArrayView<float> output) {
|
|
RTC_DCHECK_EQ(input.size(), input_size);
|
|
RTC_DCHECK_EQ(bias.size(), output_size);
|
|
RTC_DCHECK_EQ(weights.size(), input_size * output_size);
|
|
for (size_t o = 0; o < output_size; ++o) {
|
|
output[o] = bias[o];
|
|
// TODO(bugs.chromium.org/9076): Benchmark how different layouts for
|
|
// |weights_| change the performance across different platforms.
|
|
for (size_t i = 0; i < input_size; ++i) {
|
|
output[o] += input[i] * weights[o * input_size + i];
|
|
}
|
|
output[o] = activation_function(output[o]);
|
|
}
|
|
}
|
|
|
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
|
// Fully connected layer SSE2 implementation.
|
|
void ComputeFullyConnectedLayerOutputSse2(
|
|
size_t input_size,
|
|
size_t output_size,
|
|
rtc::ArrayView<const float> input,
|
|
rtc::ArrayView<const float> bias,
|
|
rtc::ArrayView<const float> weights,
|
|
rtc::FunctionView<float(float)> activation_function,
|
|
rtc::ArrayView<float> output) {
|
|
RTC_DCHECK_EQ(input.size(), input_size);
|
|
RTC_DCHECK_EQ(bias.size(), output_size);
|
|
RTC_DCHECK_EQ(weights.size(), input_size * output_size);
|
|
const size_t input_size_by_4 = input_size >> 2;
|
|
const size_t offset = input_size & ~3;
|
|
__m128 sum_wx_128;
|
|
const float* v = reinterpret_cast<const float*>(&sum_wx_128);
|
|
for (size_t o = 0; o < output_size; ++o) {
|
|
// Perform 128 bit vector operations.
|
|
sum_wx_128 = _mm_set1_ps(0);
|
|
const float* x_p = input.data();
|
|
const float* w_p = weights.data() + o * input_size;
|
|
for (size_t i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
|
|
sum_wx_128 = _mm_add_ps(sum_wx_128,
|
|
_mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
|
|
}
|
|
// Perform non-vector operations for any remaining items, sum up bias term
|
|
// and results from the vectorized code, and apply the activation function.
|
|
output[o] = activation_function(
|
|
std::inner_product(input.begin() + offset, input.end(),
|
|
weights.begin() + o * input_size + offset,
|
|
bias[o] + v[0] + v[1] + v[2] + v[3]));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
} // namespace
|
|
|
|
// Builds a fully connected layer from quantized parameters. |bias| is cast to
// float and scaled, |weights| is additionally re-arranged via
// GetPreprocessedFcWeights(). In debug builds, the sizes of the converted
// parameters are checked against |input_size| and |output_size|.
FullyConnectedLayer::FullyConnectedLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    rtc::FunctionView<float(float)> activation_function,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetScaledParams(bias)),
      weights_(GetPreprocessedFcWeights(weights, output_size)),
      activation_function_(activation_function),
      optimization_(optimization) {
  // The output must not exceed the maximum number of units supported.
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
      << "Static over-allocation of fully-connected layers output vectors is "
         "not sufficient.";
  // One bias term per output unit.
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  // One weight per (input, output) pair.
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
}
|
|
|
|
// Compiler-generated destructor, defined out of line in this file.
FullyConnectedLayer::~FullyConnectedLayer() = default;
|
|
|
|
// Returns a read-only view on the first |output_size_| items of |output_|.
rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
  const float* const first_item = output_.data();
  return rtc::ArrayView<const float>(first_item, output_size_);
}
|
|
|
|
// Computes the layer output for |input| and stores it into |output_|. The
// implementation is chosen according to |optimization_|; only SSE2 has a
// dedicated code path, any other value uses the un-optimized implementation.
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    case Optimization::kSse2:
      ComputeFullyConnectedLayerOutputSse2(input_size_, output_size_, input,
                                           bias_, weights_,
                                           activation_function_, output_);
      return;
#endif
#if defined(WEBRTC_HAS_NEON)
    // TODO(bugs.webrtc.org/10480): Handle Optimization::kNeon.
    case Optimization::kNeon:
#endif
    default:
      ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
                                       weights_, activation_function_, output_);
      return;
  }
}
|
|
|
|
// Builds a GRU layer from quantized parameters. |bias|, |weights| and
// |recurrent_weights| are cast to float, scaled and re-arranged via
// GetPreprocessedGruTensor(). In debug builds, the sizes of the converted
// parameters are checked against |input_size| and |output_size|. The state is
// reset before returning.
GatedRecurrentLayer::GatedRecurrentLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    const rtc::ArrayView<const int8_t> recurrent_weights,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetPreprocessedGruTensor(bias, output_size)),
      weights_(GetPreprocessedGruTensor(weights, output_size)),
      recurrent_weights_(
          GetPreprocessedGruTensor(recurrent_weights, output_size)),
      optimization_(optimization) {
  // The state must not exceed the maximum number of units supported.
  RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
      << "Static over-allocation of recurrent layers state vectors is not "
      << "sufficient.";
  // One bias term per gate and output unit.
  RTC_DCHECK_EQ(kNumGruGates * output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  // One weight per gate and (input, output) pair.
  RTC_DCHECK_EQ(kNumGruGates * input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
  // One recurrent weight per gate and (state, output) pair.
  RTC_DCHECK_EQ(kNumGruGates * output_size_ * output_size_,
                recurrent_weights_.size())
      << "Mismatching input-output size and recurrent weight coefficients array"
      << " size.";
  Reset();
}
|
|
|
|
// Compiler-generated destructor, defined out of line in this file.
GatedRecurrentLayer::~GatedRecurrentLayer() = default;
|
|
|
|
// Returns a read-only view on the first |output_size_| items of |state_|; the
// layer output coincides with its state.
rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
  const float* const first_item = state_.data();
  return rtc::ArrayView<const float>(first_item, output_size_);
}
|
|
|
|
// Resets the layer by setting every item of |state_| to zero.
void GatedRecurrentLayer::Reset() {
  std::fill(state_.begin(), state_.end(), 0.f);
}
|
|
|
|
// Computes the layer output for |input| and updates |state_| accordingly.
// No optimized implementation is available yet, hence every value of
// |optimization_| falls back to the un-optimized code.
void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    // TODO(bugs.webrtc.org/10480): Handle Optimization::kSse2.
    case Optimization::kSse2:
#endif
#if defined(WEBRTC_HAS_NEON)
    // TODO(bugs.webrtc.org/10480): Handle Optimization::kNeon.
    case Optimization::kNeon:
#endif
    default:
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
      return;
  }
}
|
|
|
|
// Builds the three layers of the network from the rnnoise parameters: a fully
// connected input layer, a GRU hidden layer and a fully connected output
// layer. Each layer gets the optimization returned by DetectOptimization().
RnnBasedVad::RnnBasedVad()
    : input_layer_(kInputLayerInputSize,
                   kInputLayerOutputSize,
                   kInputDenseBias,
                   kInputDenseWeights,
                   TansigApproximated,
                   DetectOptimization()),
      hidden_layer_(kInputLayerOutputSize,
                    kHiddenLayerOutputSize,
                    kHiddenGruBias,
                    kHiddenGruWeights,
                    kHiddenGruRecurrentWeights,
                    DetectOptimization()),
      output_layer_(kHiddenLayerOutputSize,
                    kOutputLayerOutputSize,
                    kOutputDenseBias,
                    kOutputDenseWeights,
                    SigmoidApproximated,
                    DetectOptimization()) {
  // The output of each layer feeds the next one, hence the sizes must agree.
  RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
      << "The input and the hidden layers sizes do not match.";
  RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
      << "The hidden and the output layers sizes do not match.";
}
|
|
|
|
// Compiler-generated destructor, defined out of line in this file.
RnnBasedVad::~RnnBasedVad() = default;
|
|
|
|
// Resets the VAD by resetting the hidden (recurrent) layer, which is the only
// layer reset here.
void RnnBasedVad::Reset() {
  hidden_layer_.Reset();
}
|
|
|
|
// Feeds |feature_vector| through the input, hidden and output layers and
// returns the first item of the output layer. When |is_silence| is true, the
// network is not run: the VAD is reset and 0 is returned.
float RnnBasedVad::ComputeVadProbability(
    rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
    bool is_silence) {
  if (!is_silence) {
    input_layer_.ComputeOutput(feature_vector);
    hidden_layer_.ComputeOutput(input_layer_.GetOutput());
    output_layer_.ComputeOutput(hidden_layer_.GetOutput());
    return output_layer_.GetOutput()[0];
  }
  Reset();
  return 0.f;
}
|
|
|
|
} // namespace rnn_vad
|
|
} // namespace webrtc
|