From 7350a902374c796dec8ce583cfaf4b9697f3a525 Mon Sep 17 00:00:00 2001
From: Alessio Bazzica <alessiob@webrtc.org>
Date: Tue, 5 Nov 2019 17:22:52 +0100
Subject: [PATCH] RNN VAD: prepare for SIMD optimization

This CL adds the boilerplate for SIMD optimization of FC and GRU layers
in rnn.cc. The same scheme of AEC3 has been used. Unit tests for the
optimized architectures have been added (the same unoptimized
implementation will run).

Minor changes:
- unnecessary const removed in rnn.h
- FC and GRU test data in the anon namespace as constexpr

Bug: webrtc:10480
Change-Id: Ifae4e970326e7e7c603d49aeaf61194b5efdabd3
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/141419
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#29696}
---
 .../audio_processing/agc2/rnn_vad/BUILD.gn    |  10 ++
 .../audio_processing/agc2/rnn_vad/common.cc   |  34 ++++
 .../audio_processing/agc2/rnn_vad/common.h    |   7 +
 modules/audio_processing/agc2/rnn_vad/rnn.cc  |  72 +++++++-
 modules/audio_processing/agc2/rnn_vad/rnn.h   |  30 ++--
 .../agc2/rnn_vad/rnn_unittest.cc              | 155 ++++++++++--------
 6 files changed, 221 insertions(+), 87 deletions(-)
 create mode 100644 modules/audio_processing/agc2/rnn_vad/common.cc

diff --git a/modules/audio_processing/agc2/rnn_vad/BUILD.gn b/modules/audio_processing/agc2/rnn_vad/BUILD.gn
index 71e02fb575..852abd88bf 100644
--- a/modules/audio_processing/agc2/rnn_vad/BUILD.gn
+++ b/modules/audio_processing/agc2/rnn_vad/BUILD.gn
@@ -13,6 +13,7 @@ rtc_library("rnn_vad") {
   sources = [
     "auto_correlation.cc",
     "auto_correlation.h",
+    "common.cc",
     "common.h",
     "features_extraction.cc",
     "features_extraction.h",
@@ -33,11 +34,20 @@ rtc_library("rnn_vad") {
     "spectral_features_internal.h",
     "symmetric_matrix_buffer.h",
   ]
+
+  defines = []
+  if (rtc_build_with_neon && current_cpu != "arm64") {
+    suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ]
+    cflags = [ "-mfpu=neon" ]
+  }
+
   deps = [
     "..:biquad_filter",
     "../../../../api:array_view",
     "../../../../rtc_base:checks",
     "../../../../rtc_base:rtc_base_approved",
+    "../../../../rtc_base/system:arch",
+    "../../../../system_wrappers:cpu_features_api",
     "../../utility:pffft_wrapper",
     "//third_party/rnnoise:rnn_vad",
   ]
diff --git a/modules/audio_processing/agc2/rnn_vad/common.cc b/modules/audio_processing/agc2/rnn_vad/common.cc
new file mode 100644
index 0000000000..744c87fea2
--- /dev/null
+++ b/modules/audio_processing/agc2/rnn_vad/common.cc
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/rnn_vad/common.h"
+
+#include "rtc_base/system/arch.h"
+#include "system_wrappers/include/cpu_features_wrapper.h"
+
+namespace webrtc {
+namespace rnn_vad {
+
+Optimization DetectOptimization() {
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+  if (WebRtc_GetCPUInfo(kSSE2) != 0) {
+    return Optimization::kSse2;
+  }
+#endif
+
+#if defined(WEBRTC_HAS_NEON)
+  return Optimization::kNeon;
+#endif
+
+  return Optimization::kNone;
+}
+
+}  // namespace rnn_vad
+}  // namespace webrtc
diff --git a/modules/audio_processing/agc2/rnn_vad/common.h b/modules/audio_processing/agc2/rnn_vad/common.h
index 6b434d2171..c2e8df6905 100644
--- a/modules/audio_processing/agc2/rnn_vad/common.h
+++ b/modules/audio_processing/agc2/rnn_vad/common.h
@@ -11,6 +11,8 @@
 #ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_COMMON_H_
 #define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_COMMON_H_
 
+#include <stddef.h>
+
 namespace webrtc {
 namespace rnn_vad {
 
@@ -63,6 +65,11 @@ static_assert(kCepstralCoeffsHistorySize > 2,
 
 constexpr size_t kFeatureVectorSize = 42;
 
+enum class Optimization { kNone, kSse2, kNeon };
+
+// Detects what kind of optimizations to use for the code.
+Optimization DetectOptimization();
+
 }  // namespace rnn_vad
 }  // namespace webrtc
 
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.cc b/modules/audio_processing/agc2/rnn_vad/rnn.cc
index 94cc254045..e6ef2f3a41 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn.cc
@@ -10,6 +10,15 @@
 
 #include "modules/audio_processing/agc2/rnn_vad/rnn.h"
 
+// Defines WEBRTC_ARCH_X86_FAMILY, used below.
+#include "rtc_base/system/arch.h"
+
+#if defined(WEBRTC_HAS_NEON)
+#include <arm_neon.h>
+#endif
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+#include <emmintrin.h>
+#endif
 #include <algorithm>
 #include <array>
 #include <cmath>
@@ -69,12 +78,14 @@ FullyConnectedLayer::FullyConnectedLayer(
     const size_t output_size,
     const rtc::ArrayView<const int8_t> bias,
     const rtc::ArrayView<const int8_t> weights,
-    float (*const activation_function)(float))
+    float (*const activation_function)(float),
+    Optimization optimization)
     : input_size_(input_size),
       output_size_(output_size),
       bias_(GetScaledParams(bias)),
       weights_(GetScaledParams(weights)),
-      activation_function_(activation_function) {
+      activation_function_(activation_function),
+      optimization_(optimization) {
   RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
       << "Static over-allocation of fully-connected layers output vectors is "
          "not sufficient.";
@@ -91,8 +102,26 @@ rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
 }
 
 void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
-  // TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
-  // operations.
+  switch (optimization_) {
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+    case Optimization::kSse2:
+      // TODO(bugs.chromium.org/10480): Handle Optimization::kSse2.
+      ComputeOutput_NONE(input);
+      break;
+#endif
+#if defined(WEBRTC_HAS_NEON)
+    case Optimization::kNeon:
+      // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
+      ComputeOutput_NONE(input);
+      break;
+#endif
+    default:
+      ComputeOutput_NONE(input);
+  }
+}
+
+void FullyConnectedLayer::ComputeOutput_NONE(
+    rtc::ArrayView<const float> input) {
   for (size_t o = 0; o < output_size_; ++o) {
     output_[o] = bias_[o];
     // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
@@ -109,12 +138,14 @@ GatedRecurrentLayer::GatedRecurrentLayer(
     const size_t output_size,
     const rtc::ArrayView<const int8_t> bias,
     const rtc::ArrayView<const int8_t> weights,
-    const rtc::ArrayView<const int8_t> recurrent_weights)
+    const rtc::ArrayView<const int8_t> recurrent_weights,
+    Optimization optimization)
     : input_size_(input_size),
       output_size_(output_size),
       bias_(GetScaledParams(bias)),
       weights_(GetScaledParams(weights)),
-      recurrent_weights_(GetScaledParams(recurrent_weights)) {
+      recurrent_weights_(GetScaledParams(recurrent_weights)),
+      optimization_(optimization) {
   RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
       << "Static over-allocation of recurrent layers state vectors is not "
       << "sufficient.";
@@ -139,6 +170,26 @@ void GatedRecurrentLayer::Reset() {
 }
 
 void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
+  switch (optimization_) {
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+    case Optimization::kSse2:
+      // TODO(bugs.chromium.org/10480): Handle Optimization::kSse2.
+      ComputeOutput_NONE(input);
+      break;
+#endif
+#if defined(WEBRTC_HAS_NEON)
+    case Optimization::kNeon:
+      // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
+      ComputeOutput_NONE(input);
+      break;
+#endif
+    default:
+      ComputeOutput_NONE(input);
+  }
+}
+
+void GatedRecurrentLayer::ComputeOutput_NONE(
+    rtc::ArrayView<const float> input) {
   // TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
   // operations.
   // Stride and offset used to read parameter arrays.
@@ -203,17 +254,20 @@ RnnBasedVad::RnnBasedVad()
                    kInputLayerOutputSize,
                    kInputDenseBias,
                    kInputDenseWeights,
-                   TansigApproximated),
+                   TansigApproximated,
+                   DetectOptimization()),
       hidden_layer_(kInputLayerOutputSize,
                     kHiddenLayerOutputSize,
                     kHiddenGruBias,
                     kHiddenGruWeights,
-                    kHiddenGruRecurrentWeights),
+                    kHiddenGruRecurrentWeights,
+                    DetectOptimization()),
       output_layer_(kHiddenLayerOutputSize,
                     kOutputLayerOutputSize,
                     kOutputDenseBias,
                     kOutputDenseWeights,
-                    SigmoidApproximated) {
+                    SigmoidApproximated,
+                    DetectOptimization()) {
   // Input-output chaining size checks.
   RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
       << "The input and the hidden layers sizes do not match.";
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.h b/modules/audio_processing/agc2/rnn_vad/rnn.h
index c38ff01b3e..f53a09379d 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn.h
+++ b/modules/audio_processing/agc2/rnn_vad/rnn.h
@@ -38,11 +38,12 @@ constexpr size_t kRecurrentLayersMaxUnits = 24;
 // Fully-connected layer.
 class FullyConnectedLayer {
  public:
-  FullyConnectedLayer(const size_t input_size,
-                      const size_t output_size,
-                      const rtc::ArrayView<const int8_t> bias,
-                      const rtc::ArrayView<const int8_t> weights,
-                      float (*const activation_function)(float));
+  FullyConnectedLayer(size_t input_size,
+                      size_t output_size,
+                      rtc::ArrayView<const int8_t> bias,
+                      rtc::ArrayView<const int8_t> weights,
+                      float (*const activation_function)(float),
+                      Optimization optimization);
   FullyConnectedLayer(const FullyConnectedLayer&) = delete;
   FullyConnectedLayer& operator=(const FullyConnectedLayer&) = delete;
   ~FullyConnectedLayer();
@@ -53,11 +54,15 @@ class FullyConnectedLayer {
   void ComputeOutput(rtc::ArrayView<const float> input);
 
  private:
+  // No SIMD optimizations.
+  void ComputeOutput_NONE(rtc::ArrayView<const float> input);
+
   const size_t input_size_;
   const size_t output_size_;
   const std::vector<float> bias_;
   const std::vector<float> weights_;
   float (*const activation_function_)(float);
+  const Optimization optimization_;
   // The output vector of a recurrent layer has length equal to |output_size_|.
   // However, for efficiency, over-allocation is used.
   std::array<float, kFullyConnectedLayersMaxUnits> output_;
@@ -67,11 +72,12 @@ class FullyConnectedLayer {
 // activation functions for the update/reset and output gates respectively.
 class GatedRecurrentLayer {
  public:
-  GatedRecurrentLayer(const size_t input_size,
-                      const size_t output_size,
-                      const rtc::ArrayView<const int8_t> bias,
-                      const rtc::ArrayView<const int8_t> weights,
-                      const rtc::ArrayView<const int8_t> recurrent_weights);
+  GatedRecurrentLayer(size_t input_size,
+                      size_t output_size,
+                      rtc::ArrayView<const int8_t> bias,
+                      rtc::ArrayView<const int8_t> weights,
+                      rtc::ArrayView<const int8_t> recurrent_weights,
+                      Optimization optimization);
   GatedRecurrentLayer(const GatedRecurrentLayer&) = delete;
   GatedRecurrentLayer& operator=(const GatedRecurrentLayer&) = delete;
   ~GatedRecurrentLayer();
@@ -83,6 +89,9 @@ class GatedRecurrentLayer {
   void ComputeOutput(rtc::ArrayView<const float> input);
 
  private:
+  // No SIMD optimizations.
+  void ComputeOutput_NONE(rtc::ArrayView<const float> input);
+
   const size_t input_size_;
   const size_t output_size_;
   const std::vector<float> bias_;
@@ -91,6 +100,7 @@ class GatedRecurrentLayer {
   // The state vector of a recurrent layer has length equal to |output_size_|.
   // However, to avoid dynamic allocation, over-allocation is used.
   std::array<float, kRecurrentLayersMaxUnits> state_;
+  const Optimization optimization_;
 };
 
 // Recurrent network based VAD.
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc
index 61e6f2670e..97ede1811a 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc
@@ -14,6 +14,7 @@
 
 #include "modules/audio_processing/agc2/rnn_vad/test_utils.h"
 #include "rtc_base/checks.h"
+#include "rtc_base/logging.h"
 #include "test/gtest.h"
 #include "third_party/rnnoise/src/rnn_activations.h"
 #include "third_party/rnnoise/src/rnn_vad_weights.h"
@@ -60,86 +61,104 @@ void TestGatedRecurrentLayer(
   }
 }
 
+// Fully connected layer test data.
+constexpr size_t kFullyConnectedInputSize = 24;
+constexpr size_t kFullyConnectedOutputSize = 1;
+constexpr std::array<int8_t, 1> kFullyConnectedBias = {-50};
+constexpr std::array<int8_t, 24> kFullyConnectedWeights = {
+    127,  127,  127, 127,  127,  20,  127,  -126, -126, -54, 14,  125,
+    -126, -126, 127, -125, -126, 127, -127, -127, -57,  -30, 127, 80};
+constexpr std::array<float, 24 * 3> kFullyConnectedInputVectors = {
+    // Input 1.
+    0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.215833917f, 0.290601075f, 0.238759011f,
+    0.244751841f, 0.f, 0.0461241305f, 0.106401242f, 0.223070428f, 0.630603909f,
+    0.690453172f, 0.f, 0.387645692f, 0.166913897f, 0.f, 0.0327451192f, 0.f,
+    0.136149868f, 0.446351469f,
+    // Input 2.
+    0.592162728f, 0.529089332f, 1.18205106f, 1.21736848f, 0.f, 0.470851123f,
+    0.130675942f, 0.320903003f, 0.305496395f, 0.0571633279f, 1.57001138f,
+    0.0182026215f, 0.0977443159f, 0.347477973f, 0.493206412f, 0.9688586f,
+    0.0320267938f, 0.244722098f, 0.312745273f, 0.f, 0.00650715502f,
+    0.312553257f, 1.62619662f, 0.782880902f,
+    // Input 3.
+    0.395022154f, 0.333681047f, 0.76302278f, 0.965480626f, 0.f, 0.941198349f,
+    0.0892967582f, 0.745046318f, 0.635769248f, 0.238564298f, 0.970656633f,
+    0.014159563f, 0.094203949f, 0.446816623f, 0.640755892f, 1.20532358f,
+    0.0254284926f, 0.283327013f, 0.726210058f, 0.0550272502f, 0.000344108557f,
+    0.369803518f, 1.56680179f, 0.997883797f};
+constexpr std::array<float, 3> kFullyConnectedExpectedOutputs = {
+    0.436567038f, 0.874741316f, 0.672785878f};
+
+// Gated recurrent units layer test data.
+constexpr size_t kGruInputSize = 5;
+constexpr size_t kGruOutputSize = 4;
+constexpr std::array<int8_t, 12> kGruBias = {96,   -99, -81, -114, 49,  119,
+                                             -118, 68,  -76, 91,   121, 125};
+constexpr std::array<int8_t, 60> kGruWeights = {
+    124, 9,    1,    116, -66, -21, -118, -110, 104,  75,  -23,  -51,
+    -72, -111, 47,   93,  77,  -98, 41,   -8,   40,   -23, -43,  -107,
+    9,   -73,  30,   -32, -2,  64,  -26,  91,   -48,  -24, -28,  -104,
+    74,  -46,  116,  15,  32,  52,  -126, -38,  -121, 12,  -16,  110,
+    -95, 66,   -103, -35, -38, 3,   -126, -61,  28,   98,  -117, -43};
+constexpr std::array<int8_t, 60> kGruRecurrentWeights = {
+    -3,  87,  50,  51,  -22,  27,  -39, 62,   31,  -83, -52,  -48,
+    -6,  83,  -19, 104, 105,  48,  23,  68,   23,  40,  7,    -120,
+    64,  -62, 117, 85,  -51,  -43, 54,  -105, 120, 56,  -128, -107,
+    39,  50,  -17, -47, -117, 14,  108, 12,   -7,  -72, 103,  -87,
+    -66, 82,  84,  100, -98,  102, -49, 44,   122, 106, -20,  -69};
+constexpr std::array<float, 20> kGruInputSequence = {
+    0.89395463f, 0.93224651f, 0.55788344f, 0.32341808f, 0.93355054f,
+    0.13475326f, 0.97370994f, 0.14253306f, 0.93710381f, 0.76093364f,
+    0.65780413f, 0.41657975f, 0.49403164f, 0.46843281f, 0.75138855f,
+    0.24517593f, 0.47657707f, 0.57064998f, 0.435184f,   0.19319285f};
+constexpr std::array<float, 16> kGruExpectedOutputSequence = {
+    0.0239123f,  0.5773077f,  0.f,         0.f,
+    0.01282811f, 0.64330572f, 0.f,         0.04863098f,
+    0.00781069f, 0.75267816f, 0.f,         0.02579715f,
+    0.00471378f, 0.59162533f, 0.11087593f, 0.01334511f};
+
 }  // namespace
 
+class OptimizationTest : public ::testing::Test,
+                         public ::testing::WithParamInterface<Optimization> {};
+
 // Checks that the output of a fully connected layer is within tolerance given
 // test input data.
-TEST(RnnVadTest, CheckFullyConnectedLayerOutput) {
-  const std::array<int8_t, 1> bias = {-50};
-  const std::array<int8_t, 24> weights = {
-      127,  127,  127, 127,  127,  20,  127,  -126, -126, -54, 14,  125,
-      -126, -126, 127, -125, -126, 127, -127, -127, -57,  -30, 127, 80};
-  FullyConnectedLayer fc(24, 1, bias, weights, SigmoidApproximated);
+TEST_P(OptimizationTest, CheckFullyConnectedLayerOutput) {
+  const Optimization optimization = GetParam();
+  RTC_LOG(LS_VERBOSE) << optimization;
+  FullyConnectedLayer fc(kFullyConnectedInputSize, kFullyConnectedOutputSize,
+                         kFullyConnectedBias, kFullyConnectedWeights,
+                         SigmoidApproximated, optimization);
   // Test on different inputs.
-  {
-    const std::array<float, 24> input_vector = {
-        0.f,           0.f,           0.f,          0.f,          0.f,
-        0.f,           0.215833917f,  0.290601075f, 0.238759011f, 0.244751841f,
-        0.f,           0.0461241305f, 0.106401242f, 0.223070428f, 0.630603909f,
-        0.690453172f,  0.f,           0.387645692f, 0.166913897f, 0.f,
-        0.0327451192f, 0.f,           0.136149868f, 0.446351469f};
-    TestFullyConnectedLayer(&fc, input_vector, 0.436567038f);
-  }
-  {
-    const std::array<float, 24> input_vector = {
-        0.592162728f,  0.529089332f,  1.18205106f,
-        1.21736848f,   0.f,           0.470851123f,
-        0.130675942f,  0.320903003f,  0.305496395f,
-        0.0571633279f, 1.57001138f,   0.0182026215f,
-        0.0977443159f, 0.347477973f,  0.493206412f,
-        0.9688586f,    0.0320267938f, 0.244722098f,
-        0.312745273f,  0.f,           0.00650715502f,
-        0.312553257f,  1.62619662f,   0.782880902f};
-    TestFullyConnectedLayer(&fc, input_vector, 0.874741316f);
-  }
-  {
-    const std::array<float, 24> input_vector = {
-        0.395022154f,  0.333681047f,  0.76302278f,
-        0.965480626f,  0.f,           0.941198349f,
-        0.0892967582f, 0.745046318f,  0.635769248f,
-        0.238564298f,  0.970656633f,  0.014159563f,
-        0.094203949f,  0.446816623f,  0.640755892f,
-        1.20532358f,   0.0254284926f, 0.283327013f,
-        0.726210058f,  0.0550272502f, 0.000344108557f,
-        0.369803518f,  1.56680179f,   0.997883797f};
-    TestFullyConnectedLayer(&fc, input_vector, 0.672785878f);
+  static_assert(
+      kFullyConnectedInputVectors.size() % kFullyConnectedInputSize == 0, "");
+  constexpr size_t kNumInputVectors =
+      kFullyConnectedInputVectors.size() / kFullyConnectedInputSize;
+  static_assert(kFullyConnectedExpectedOutputs.size() == kNumInputVectors, "");
+  for (size_t i = 0; i < kNumInputVectors; ++i) {
+    rtc::ArrayView<const float> input(
+        kFullyConnectedInputVectors.data() + kFullyConnectedInputSize * i,
+        kFullyConnectedInputSize);
+    TestFullyConnectedLayer(&fc, input, kFullyConnectedExpectedOutputs[i]);
   }
 }
 
 // Checks that the output of a GRU layer is within tolerance given test input
 // data.
-TEST(RnnVadTest, CheckGatedRecurrentLayer) {
-  const std::array<int8_t, 12> bias = {96,   -99, -81, -114, 49,  119,
-                                       -118, 68,  -76, 91,   121, 125};
-  const std::array<int8_t, 60> weights = {
-      124, 9,    1,    116, -66, -21, -118, -110, 104,  75,  -23,  -51,
-      -72, -111, 47,   93,  77,  -98, 41,   -8,   40,   -23, -43,  -107,
-      9,   -73,  30,   -32, -2,  64,  -26,  91,   -48,  -24, -28,  -104,
-      74,  -46,  116,  15,  32,  52,  -126, -38,  -121, 12,  -16,  110,
-      -95, 66,   -103, -35, -38, 3,   -126, -61,  28,   98,  -117, -43};
-  const std::array<int8_t, 60> recurrent_weights = {
-      -3,  87,  50,  51,  -22,  27,  -39, 62,   31,  -83, -52,  -48,
-      -6,  83,  -19, 104, 105,  48,  23,  68,   23,  40,  7,    -120,
-      64,  -62, 117, 85,  -51,  -43, 54,  -105, 120, 56,  -128, -107,
-      39,  50,  -17, -47, -117, 14,  108, 12,   -7,  -72, 103,  -87,
-      -66, 82,  84,  100, -98,  102, -49, 44,   122, 106, -20,  -69};
-  GatedRecurrentLayer gru(5, 4, bias, weights, recurrent_weights);
-  // Test on different inputs.
-  {
-    const std::array<float, 20> input_sequence = {
-        0.89395463f, 0.93224651f, 0.55788344f, 0.32341808f, 0.93355054f,
-        0.13475326f, 0.97370994f, 0.14253306f, 0.93710381f, 0.76093364f,
-        0.65780413f, 0.41657975f, 0.49403164f, 0.46843281f, 0.75138855f,
-        0.24517593f, 0.47657707f, 0.57064998f, 0.435184f,   0.19319285f};
-    const std::array<float, 16> expected_output_sequence = {
-        0.0239123f,  0.5773077f,  0.f,         0.f,
-        0.01282811f, 0.64330572f, 0.f,         0.04863098f,
-        0.00781069f, 0.75267816f, 0.f,         0.02579715f,
-        0.00471378f, 0.59162533f, 0.11087593f, 0.01334511f};
-    TestGatedRecurrentLayer(&gru, input_sequence, expected_output_sequence);
-  }
+TEST_P(OptimizationTest, CheckGatedRecurrentLayer) {
+  const Optimization optimization = GetParam();
+  RTC_LOG(LS_VERBOSE) << optimization;
+  GatedRecurrentLayer gru(kGruInputSize, kGruOutputSize, kGruBias, kGruWeights,
+                          kGruRecurrentWeights, optimization);
+  TestGatedRecurrentLayer(&gru, kGruInputSequence, kGruExpectedOutputSequence);
 }
 
+INSTANTIATE_TEST_SUITE_P(RnnVadTest,
+                         OptimizationTest,
+                         ::testing::Values(Optimization::kNone,
+                                           DetectOptimization()));
+
 }  // namespace test
 }  // namespace rnn_vad
 }  // namespace webrtc