From b81ab995a29adb3d88caaa9ab5657417da789536 Mon Sep 17 00:00:00 2001
From: Alessio Bazzica <alessiob@webrtc.org>
Date: Fri, 1 Nov 2019 20:06:33 +0100
Subject: [PATCH] RNN VAD: Optimize GRU (recurrent) weights memory layout

This CL adds the GRU weights memory layout optimization with which it
will be easier to add SSE2 code in a follow up CL. The new memory
layout also improves the performance of the unoptimized code.

This CL also includes a bug fix in the GRU layer input validation.
It was a silent bug since the GRU layer of the RNN VAD has the same
input and output size. It was caught by changing the memory layout of
the recurrent weights. The unit test has been adapted by removing the
unused recurrent weights (the expected result does not change).

Bug: webrtc:10480
Change-Id: Ia1551abde4cb24aa7e109c4447e0fffe7c839077
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/142177
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#29717}
---
 modules/audio_processing/agc2/rnn_vad/rnn.cc  | 205 +++++++++++-------
 modules/audio_processing/agc2/rnn_vad/rnn.h   |   3 -
 .../agc2/rnn_vad/rnn_unittest.cc              | 108 +++++++--
 3 files changed, 223 insertions(+), 93 deletions(-)
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.cc b/modules/audio_processing/agc2/rnn_vad/rnn.cc
index a5f7b4b4ab..1cd8ae7dbc 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn.cc
@@ -25,6 +25,7 @@
 #include <numeric>
 
 #include "rtc_base/checks.h"
+#include "rtc_base/logging.h"
 #include "third_party/rnnoise/src/rnn_activations.h"
 #include "third_party/rnnoise/src/rnn_vad_weights.h"
 
@@ -71,9 +72,12 @@ std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
   return scaled_params;
 }
 
+// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
+// function to improve setup time.
 // Casts and scales |weights| and re-arranges the layout.
-std::vector<float> GetPreprocessedWeights(rtc::ArrayView<const int8_t> weights,
-                                          const size_t output_size) {
+std::vector<float> GetPreprocessedFcWeights(
+    rtc::ArrayView<const int8_t> weights,
+    size_t output_size) {
   if (output_size == 1) {
     return GetScaledParams(weights);
   }
@@ -89,6 +93,117 @@ std::vector<float> GetPreprocessedWeights(rtc::ArrayView<const int8_t> weights,
   return w;
 }
 
+constexpr size_t kNumGruGates = 3;  // Update, reset, output.
+
+// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
+// function to improve setup time.
+// Casts and scales |tensor_src| for a GRU layer and re-arranges the layout.
+// It works for weights, recurrent weights and bias.
+std::vector<float> GetPreprocessedGruTensor(
+    rtc::ArrayView<const int8_t> tensor_src,
+    size_t output_size) {
+  // Transpose, cast and scale.
+  // |n| is the size of the first dimension of the 3-dim tensor |tensor_src|.
+  const size_t n =
+      rtc::CheckedDivExact(tensor_src.size(), output_size * kNumGruGates);
+  const size_t stride_src = kNumGruGates * output_size;
+  const size_t stride_dst = n * output_size;
+  std::vector<float> tensor_dst(tensor_src.size());
+  for (size_t g = 0; g < kNumGruGates; ++g) {
+    for (size_t o = 0; o < output_size; ++o) {
+      for (size_t i = 0; i < n; ++i) {
+        tensor_dst[g * stride_dst + o * n + i] =
+            rnnoise::kWeightsScale *
+            static_cast<float>(
+                tensor_src[i * stride_src + g * output_size + o]);
+      }
+    }
+  }
+  return tensor_dst;
+}
+
+void ComputeGruUpdateResetGates(size_t input_size,
+                                size_t output_size,
+                                rtc::ArrayView<const float> weights,
+                                rtc::ArrayView<const float> recurrent_weights,
+                                rtc::ArrayView<const float> bias,
+                                rtc::ArrayView<const float> input,
+                                rtc::ArrayView<const float> state,
+                                rtc::ArrayView<float> gate) {
+  for (size_t o = 0; o < output_size; ++o) {
+    gate[o] = bias[o];
+    for (size_t i = 0; i < input_size; ++i) {
+      gate[o] += input[i] * weights[o * input_size + i];
+    }
+    for (size_t s = 0; s < output_size; ++s) {
+      gate[o] += state[s] * recurrent_weights[o * output_size + s];
+    }
+    gate[o] = SigmoidApproximated(gate[o]);
+  }
+}
+
+void ComputeGruOutputGate(size_t input_size,
+                          size_t output_size,
+                          rtc::ArrayView<const float> weights,
+                          rtc::ArrayView<const float> recurrent_weights,
+                          rtc::ArrayView<const float> bias,
+                          rtc::ArrayView<const float> input,
+                          rtc::ArrayView<const float> state,
+                          rtc::ArrayView<const float> reset,
+                          rtc::ArrayView<float> gate) {
+  for (size_t o = 0; o < output_size; ++o) {
+    gate[o] = bias[o];
+    for (size_t i = 0; i < input_size; ++i) {
+      gate[o] += input[i] * weights[o * input_size + i];
+    }
+    for (size_t s = 0; s < output_size; ++s) {
+      gate[o] += state[s] * recurrent_weights[o * output_size + s] * reset[s];
+    }
+    gate[o] = RectifiedLinearUnit(gate[o]);
+  }
+}
+
+// Gated recurrent unit (GRU) layer un-optimized implementation.
+void ComputeGruLayerOutput(size_t input_size,
+                           size_t output_size,
+                           rtc::ArrayView<const float> input,
+                           rtc::ArrayView<const float> weights,
+                           rtc::ArrayView<const float> recurrent_weights,
+                           rtc::ArrayView<const float> bias,
+                           rtc::ArrayView<float> state) {
+  RTC_DCHECK_EQ(input_size, input.size());
+  // Strides used to slice the per-gate blocks of the parameter arrays.
+  const size_t stride_in = input_size * output_size;
+  const size_t stride_out = output_size * output_size;
+
+  // Update gate.
+  std::array<float, kRecurrentLayersMaxUnits> update;
+  ComputeGruUpdateResetGates(
+      input_size, output_size, weights.subview(0, stride_in),
+      recurrent_weights.subview(0, stride_out), bias.subview(0, output_size),
+      input, state, update);
+
+  // Reset gate.
+  std::array<float, kRecurrentLayersMaxUnits> reset;
+  ComputeGruUpdateResetGates(
+      input_size, output_size, weights.subview(stride_in, stride_in),
+      recurrent_weights.subview(stride_out, stride_out),
+      bias.subview(output_size, output_size), input, state, reset);
+
+  // Output gate.
+  std::array<float, kRecurrentLayersMaxUnits> output;
+  ComputeGruOutputGate(
+      input_size, output_size, weights.subview(2 * stride_in, stride_in),
+      recurrent_weights.subview(2 * stride_out, stride_out),
+      bias.subview(2 * output_size, output_size), input, state, reset, output);
+
+  // Update output through the update gates and update the state.
+  for (size_t o = 0; o < output_size; ++o) {
+    output[o] = update[o] * state[o] + (1.f - update[o]) * output[o];
+    state[o] = output[o];
+  }
+}
+
 // Fully connected layer un-optimized implementation.
 void ComputeFullyConnectedLayerOutput(
     size_t input_size,
@@ -160,7 +275,7 @@ FullyConnectedLayer::FullyConnectedLayer(
     : input_size_(input_size),
       output_size_(output_size),
       bias_(GetScaledParams(bias)),
-      weights_(GetPreprocessedWeights(weights, output_size)),
+      weights_(GetPreprocessedFcWeights(weights, output_size)),
       activation_function_(activation_function),
       optimization_(optimization) {
   RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
@@ -209,18 +324,20 @@ GatedRecurrentLayer::GatedRecurrentLayer(
     Optimization optimization)
     : input_size_(input_size),
       output_size_(output_size),
-      bias_(GetScaledParams(bias)),
-      weights_(GetScaledParams(weights)),
-      recurrent_weights_(GetScaledParams(recurrent_weights)),
+      bias_(GetPreprocessedGruTensor(bias, output_size)),
+      weights_(GetPreprocessedGruTensor(weights, output_size)),
+      recurrent_weights_(
+          GetPreprocessedGruTensor(recurrent_weights, output_size)),
       optimization_(optimization) {
   RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
       << "Static over-allocation of recurrent layers state vectors is not "
       << "sufficient.";
-  RTC_DCHECK_EQ(3 * output_size_, bias_.size())
+  RTC_DCHECK_EQ(kNumGruGates * output_size_, bias_.size())
       << "Mismatching output size and bias terms array size.";
-  RTC_DCHECK_EQ(3 * input_size_ * output_size_, weights_.size())
+  RTC_DCHECK_EQ(kNumGruGates * input_size_ * output_size_, weights_.size())
       << "Mismatching input-output size and weight coefficients array size.";
-  RTC_DCHECK_EQ(3 * input_size_ * output_size_, recurrent_weights_.size())
+  RTC_DCHECK_EQ(kNumGruGates * output_size_ * output_size_,
+                recurrent_weights_.size())
       << "Mismatching input-output size and recurrent weight coefficients array"
       << " size.";
   Reset();
@@ -241,81 +358,23 @@ void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
 #if defined(WEBRTC_ARCH_X86_FAMILY)
     case Optimization::kSse2:
       // TODO(bugs.chromium.org/10480): Handle Optimization::kSse2.
-      ComputeOutput_NONE(input);
+      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
+                            recurrent_weights_, bias_, state_);
       break;
 #endif
 #if defined(WEBRTC_HAS_NEON)
     case Optimization::kNeon:
       // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
-      ComputeOutput_NONE(input);
+      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
+                            recurrent_weights_, bias_, state_);
       break;
 #endif
     default:
-      ComputeOutput_NONE(input);
+      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
+                            recurrent_weights_, bias_, state_);
   }
 }
 
-void GatedRecurrentLayer::ComputeOutput_NONE(
-    rtc::ArrayView<const float> input) {
-  // TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
-  // operations.
-  // Stride and offset used to read parameter arrays.
-  const size_t stride = 3 * output_size_;
-  size_t offset = 0;
-
-  // Compute update gates.
-  std::array<float, kRecurrentLayersMaxUnits> update;
-  for (size_t o = 0; o < output_size_; ++o) {
-    update[o] = bias_[o];
-    // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
-    // |weights_| and |recurrent_weights_| change the performance across
-    // different platforms.
-    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
-      update[o] += input[i] * weights_[i * stride + o];
-    }
-    for (size_t s = 0; s < output_size_; ++s) {
-      update[o] += state_[s] * recurrent_weights_[s * stride + o];
-    }  // Add state.
-    update[o] = SigmoidApproximated(update[o]);
-  }
-
-  // Compute reset gates.
-  offset += output_size_;
-  std::array<float, kRecurrentLayersMaxUnits> reset;
-  for (size_t o = 0; o < output_size_; ++o) {
-    reset[o] = bias_[offset + o];
-    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
-      reset[o] += input[i] * weights_[offset + i * stride + o];
-    }
-    for (size_t s = 0; s < output_size_; ++s) {  // Add state.
-      reset[o] += state_[s] * recurrent_weights_[offset + s * stride + o];
-    }
-    reset[o] = SigmoidApproximated(reset[o]);
-  }
-
-  // Compute output.
-  offset += output_size_;
-  std::array<float, kRecurrentLayersMaxUnits> output;
-  for (size_t o = 0; o < output_size_; ++o) {
-    output[o] = bias_[offset + o];
-    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
-      output[o] += input[i] * weights_[offset + i * stride + o];
-    }
-    for (size_t s = 0; s < output_size_;
-         ++s) {  // Add state through reset gates.
-      output[o] +=
-          state_[s] * recurrent_weights_[offset + s * stride + o] * reset[s];
-    }
-    output[o] = RectifiedLinearUnit(output[o]);
-    // Update output through the update gates.
-    output[o] = update[o] * state_[o] + (1.f - update[o]) * output[o];
-  }
-
-  // Update the state. Not done in the previous loop since that would pollute
-  // the current state and lead to incorrect output values.
-  std::copy(output.begin(), output.end(), state_.begin());
-}
-
 RnnBasedVad::RnnBasedVad()
     : input_layer_(kInputLayerInputSize,
                    kInputLayerOutputSize,
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.h b/modules/audio_processing/agc2/rnn_vad/rnn.h
index 29ee20744b..58274b2e1e 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn.h
+++ b/modules/audio_processing/agc2/rnn_vad/rnn.h
@@ -90,9 +90,6 @@ class GatedRecurrentLayer {
   void ComputeOutput(rtc::ArrayView<const float> input);
 
  private:
-  // No SIMD optimizations.
-  void ComputeOutput_NONE(rtc::ArrayView<const float> input);
-
   const size_t input_size_;
   const size_t output_size_;
   const std::vector<float> bias_;
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc
index 74974164a1..6e9f6f3690 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc
@@ -82,17 +82,45 @@ constexpr size_t kGruOutputSize = 4;
 constexpr std::array<int8_t, 12> kGruBias = {96,   -99, -81, -114, 49,  119,
                                              -118, 68,  -76, 91,   121, 125};
 constexpr std::array<int8_t, 60> kGruWeights = {
-    124, 9,    1,    116, -66, -21, -118, -110, 104,  75,  -23,  -51,
-    -72, -111, 47,   93,  77,  -98, 41,   -8,   40,   -23, -43,  -107,
-    9,   -73,  30,   -32, -2,  64,  -26,  91,   -48,  -24, -28,  -104,
-    74,  -46,  116,  15,  32,  52,  -126, -38,  -121, 12,  -16,  110,
-    -95, 66,   -103, -35, -38, 3,   -126, -61,  28,   98,  -117, -43};
-constexpr std::array<int8_t, 60> kGruRecurrentWeights = {
-    -3,  87,  50,  51,  -22,  27,  -39, 62,   31,  -83, -52,  -48,
-    -6,  83,  -19, 104, 105,  48,  23,  68,   23,  40,  7,    -120,
-    64,  -62, 117, 85,  -51,  -43, 54,  -105, 120, 56,  -128, -107,
-    39,  50,  -17, -47, -117, 14,  108, 12,   -7,  -72, 103,  -87,
-    -66, 82,  84,  100, -98,  102, -49, 44,   122, 106, -20,  -69};
+    // Input 0.
+    124, 9, 1, 116,        // Update.
+    -66, -21, -118, -110,  // Reset.
+    104, 75, -23, -51,     // Output.
+    // Input 1.
+    -72, -111, 47, 93,   // Update.
+    77, -98, 41, -8,     // Reset.
+    40, -23, -43, -107,  // Output.
+    // Input 2.
+    9, -73, 30, -32,      // Update.
+    -2, 64, -26, 91,      // Reset.
+    -48, -24, -28, -104,  // Output.
+    // Input 3.
+    74, -46, 116, 15,    // Update.
+    32, 52, -126, -38,   // Reset.
+    -121, 12, -16, 110,  // Output.
+    // Input 4.
+    -95, 66, -103, -35,  // Update.
+    -38, 3, -126, -61,   // Reset.
+    28, 98, -117, -43    // Output.
+};
+constexpr std::array<int8_t, 48> kGruRecurrentWeights = {
+    // Output 0.
+    -3, 87, 50, 51,     // Update.
+    -22, 27, -39, 62,   // Reset.
+    31, -83, -52, -48,  // Output.
+    // Output 1.
+    -6, 83, -19, 104,  // Update.
+    105, 48, 23, 68,   // Reset.
+    23, 40, 7, -120,   // Output.
+    // Output 2.
+    64, -62, 117, 85,     // Update.
+    -51, -43, 54, -105,   // Reset.
+    120, 56, -128, -107,  // Output.
+    // Output 3.
+    39, 50, -17, -47,   // Update.
+    -117, 14, 108, 12,  // Reset.
+    -7, -72, 103, -87,  // Output.
+};
 constexpr std::array<float, 20> kGruInputSequence = {
     0.89395463f, 0.93224651f, 0.55788344f, 0.32341808f, 0.93355054f,
     0.13475326f, 0.97370994f, 0.14253306f, 0.93710381f, 0.76093364f,
@@ -115,6 +143,12 @@ std::string GetOptimizationName(Optimization optimization) {
   }
 }
 
+struct Result {
+  Optimization optimization;
+  double average_us;
+  double std_dev_us;
+};
+
 }  // namespace
 
 // Checks that the output of a fully connected layer is within tolerance given
@@ -152,6 +186,17 @@ TEST(RnnVadTest, CheckFullyConnectedLayerOutputSse2) {
                           kFullyConnectedExpectedOutput);
 }
 
+// Like CheckGatedRecurrentLayer, but testing the SSE2 implementation.
+TEST(RnnVadTest, CheckGatedRecurrentLayerSse2) {
+  if (!IsOptimizationAvailable(Optimization::kSse2)) {
+    return;
+  }
+
+  GatedRecurrentLayer gru(kGruInputSize, kGruOutputSize, kGruBias, kGruWeights,
+                          kGruRecurrentWeights, Optimization::kSse2);
+  TestGatedRecurrentLayer(&gru, kGruInputSequence, kGruExpectedOutputSequence);
+}
+
 #endif  // WEBRTC_ARCH_X86_FAMILY
 
 TEST(RnnVadTest, DISABLED_BenchmarkFullyConnectedLayer) {
@@ -167,13 +212,7 @@ TEST(RnnVadTest, DISABLED_BenchmarkFullyConnectedLayer) {
         rnnoise::TansigApproximated, Optimization::kSse2));
   }
 
-  struct Result {
-    Optimization optimization;
-    double average_us;
-    double std_dev_us;
-  };
   std::vector<Result> results;
-
   constexpr size_t number_of_tests = 10000;
   for (auto& fc : implementations) {
     ::webrtc::test::PerformanceTimer perf_timer(number_of_tests);
@@ -193,6 +232,41 @@ TEST(RnnVadTest, DISABLED_BenchmarkFullyConnectedLayer) {
   }
 }
 
+TEST(RnnVadTest, DISABLED_BenchmarkGatedRecurrentLayer) {
+  std::vector<std::unique_ptr<GatedRecurrentLayer>> implementations;
+  implementations.emplace_back(std::make_unique<GatedRecurrentLayer>(
+      kGruInputSize, kGruOutputSize, kGruBias, kGruWeights,
+      kGruRecurrentWeights, Optimization::kNone));
+
+  rtc::ArrayView<const float> input_sequence(kGruInputSequence);
+  static_assert(kGruInputSequence.size() % kGruInputSize == 0, "");
+  constexpr size_t input_sequence_length =
+      kGruInputSequence.size() / kGruInputSize;
+
+  std::vector<Result> results;
+  constexpr size_t number_of_tests = 10000;
+  for (auto& gru : implementations) {
+    ::webrtc::test::PerformanceTimer perf_timer(number_of_tests);
+    gru->Reset();
+    for (size_t k = 0; k < number_of_tests; ++k) {
+      perf_timer.StartTimer();
+      for (size_t i = 0; i < input_sequence_length; ++i) {
+        gru->ComputeOutput(
+            input_sequence.subview(i * gru->input_size(), gru->input_size()));
+      }
+      perf_timer.StopTimer();
+    }
+    results.push_back({gru->optimization(), perf_timer.GetDurationAverage(),
+                       perf_timer.GetDurationStandardDeviation()});
+  }
+
+  for (const auto& result : results) {
+    RTC_LOG(LS_INFO) << GetOptimizationName(result.optimization) << ": "
+                     << (result.average_us / 1e3) << " +/- "
+                     << (result.std_dev_us / 1e3) << " ms";
+  }
+}
+
 }  // namespace test
 }  // namespace rnn_vad
 }  // namespace webrtc