RNN VAD: FC layer isolated into rnn_fc.h/.cc

Refactoring done to make it easier and cleaner to add SIMD optimizations
and to remove `FullyConnectedLayer` from the RNN VAD API.

Minor improvements (readability, API):
- `FullyConnectedLayer` now takes an `ActivationFunction` enum value
  instead of a function view
- SSE2 optimization moved into `FullyConnectedLayer::ComputeOutputSse2`
- layer name added for improved logs

Bug: webrtc:10480
Change-Id: Ida4903a67655e19ef0464f378c433c1f6e96dca7
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/195444
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Sam Zackrisson <saza@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#32766}
diff --git a/modules/audio_processing/agc2/rnn_vad/BUILD.gn b/modules/audio_processing/agc2/rnn_vad/BUILD.gn
index 4351afd..c57971a 100644
--- a/modules/audio_processing/agc2/rnn_vad/BUILD.gn
+++ b/modules/audio_processing/agc2/rnn_vad/BUILD.gn
@@ -24,6 +24,7 @@
 
   deps = [
     ":rnn_vad_common",
+    ":rnn_vad_layers",
     ":rnn_vad_lp_residual",
     ":rnn_vad_pitch",
     ":rnn_vad_sequence_buffer",
@@ -78,6 +79,24 @@
   ]
 }
 
+rtc_source_set("rnn_vad_layers") {
+  sources = [
+    "rnn_fc.cc",
+    "rnn_fc.h",
+  ]
+  deps = [
+    ":rnn_vad_common",
+    "..:cpu_features",
+    "../../../../api:array_view",
+    "../../../../api:function_view",
+    "../../../../rtc_base:checks",
+    "../../../../rtc_base:safe_conversions",
+    "../../../../rtc_base/system:arch",
+    "//third_party/rnnoise:rnn_vad",
+  ]
+  absl_deps = [ "//third_party/abseil-cpp/absl/strings" ]
+}
+
 rtc_source_set("vector_math") {
   sources = [ "vector_math.h" ]
   deps = [
@@ -221,6 +240,7 @@
       "pitch_search_internal_unittest.cc",
       "pitch_search_unittest.cc",
       "ring_buffer_unittest.cc",
+      "rnn_fc_unittest.cc",
       "rnn_unittest.cc",
       "rnn_vad_unittest.cc",
       "sequence_buffer_unittest.cc",
@@ -233,6 +253,7 @@
       ":rnn_vad",
       ":rnn_vad_auto_correlation",
       ":rnn_vad_common",
+      ":rnn_vad_layers",
       ":rnn_vad_lp_residual",
       ":rnn_vad_pitch",
       ":rnn_vad_ring_buffer",
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.cc b/modules/audio_processing/agc2/rnn_vad/rnn.cc
index 1c9b736..9d6d28f 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn.cc
@@ -60,37 +60,6 @@
   return x < 0.f ? 0.f : x;
 }
 
-std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
-  std::vector<float> scaled_params(params.size());
-  std::transform(params.begin(), params.end(), scaled_params.begin(),
-                 [](int8_t x) -> float {
-                   return rnnoise::kWeightsScale * static_cast<float>(x);
-                 });
-  return scaled_params;
-}
-
-// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
-// function to improve setup time.
-// Casts and scales |weights| and re-arranges the layout.
-std::vector<float> GetPreprocessedFcWeights(
-    rtc::ArrayView<const int8_t> weights,
-    int output_size) {
-  if (output_size == 1) {
-    return GetScaledParams(weights);
-  }
-  // Transpose, scale and cast.
-  const int input_size = rtc::CheckedDivExact(
-      rtc::dchecked_cast<int>(weights.size()), output_size);
-  std::vector<float> w(weights.size());
-  for (int o = 0; o < output_size; ++o) {
-    for (int i = 0; i < input_size; ++i) {
-      w[o * input_size + i] = rnnoise::kWeightsScale *
-                              static_cast<float>(weights[i * output_size + o]);
-    }
-  }
-  return w;
-}
-
 constexpr int kNumGruGates = 3;  // Update, reset, output.
 
 // TODO(bugs.chromium.org/10480): Hard-coded optimized layout and remove this
@@ -202,106 +171,8 @@
   }
 }
 
-// Fully connected layer un-optimized implementation.
-void ComputeFullyConnectedLayerOutput(
-    int input_size,
-    int output_size,
-    rtc::ArrayView<const float> input,
-    rtc::ArrayView<const float> bias,
-    rtc::ArrayView<const float> weights,
-    rtc::FunctionView<float(float)> activation_function,
-    rtc::ArrayView<float> output) {
-  RTC_DCHECK_EQ(input.size(), input_size);
-  RTC_DCHECK_EQ(bias.size(), output_size);
-  RTC_DCHECK_EQ(weights.size(), input_size * output_size);
-  for (int o = 0; o < output_size; ++o) {
-    output[o] = bias[o];
-    // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
-    // |weights_| change the performance across different platforms.
-    for (int i = 0; i < input_size; ++i) {
-      output[o] += input[i] * weights[o * input_size + i];
-    }
-    output[o] = activation_function(output[o]);
-  }
-}
-
-#if defined(WEBRTC_ARCH_X86_FAMILY)
-// Fully connected layer SSE2 implementation.
-void ComputeFullyConnectedLayerOutputSse2(
-    int input_size,
-    int output_size,
-    rtc::ArrayView<const float> input,
-    rtc::ArrayView<const float> bias,
-    rtc::ArrayView<const float> weights,
-    rtc::FunctionView<float(float)> activation_function,
-    rtc::ArrayView<float> output) {
-  RTC_DCHECK_EQ(input.size(), input_size);
-  RTC_DCHECK_EQ(bias.size(), output_size);
-  RTC_DCHECK_EQ(weights.size(), input_size * output_size);
-  const int input_size_by_4 = input_size >> 2;
-  const int offset = input_size & ~3;
-  __m128 sum_wx_128;
-  const float* v = reinterpret_cast<const float*>(&sum_wx_128);
-  for (int o = 0; o < output_size; ++o) {
-    // Perform 128 bit vector operations.
-    sum_wx_128 = _mm_set1_ps(0);
-    const float* x_p = input.data();
-    const float* w_p = weights.data() + o * input_size;
-    for (int i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
-      sum_wx_128 = _mm_add_ps(sum_wx_128,
-                              _mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
-    }
-    // Perform non-vector operations for any remaining items, sum up bias term
-    // and results from the vectorized code, and apply the activation function.
-    output[o] = activation_function(
-        std::inner_product(input.begin() + offset, input.end(),
-                           weights.begin() + o * input_size + offset,
-                           bias[o] + v[0] + v[1] + v[2] + v[3]));
-  }
-}
-#endif
-
 }  // namespace
 
-FullyConnectedLayer::FullyConnectedLayer(
-    const int input_size,
-    const int output_size,
-    const rtc::ArrayView<const int8_t> bias,
-    const rtc::ArrayView<const int8_t> weights,
-    rtc::FunctionView<float(float)> activation_function,
-    const AvailableCpuFeatures& cpu_features)
-    : input_size_(input_size),
-      output_size_(output_size),
-      bias_(GetScaledParams(bias)),
-      weights_(GetPreprocessedFcWeights(weights, output_size)),
-      activation_function_(activation_function),
-      cpu_features_(cpu_features) {
-  RTC_DCHECK_LE(output_size_, kFullyConnectedLayerMaxUnits)
-      << "Static over-allocation of fully-connected layers output vectors is "
-         "not sufficient.";
-  RTC_DCHECK_EQ(output_size_, bias_.size())
-      << "Mismatching output size and bias terms array size.";
-  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
-      << "Mismatching input-output size and weight coefficients array size.";
-}
-
-FullyConnectedLayer::~FullyConnectedLayer() = default;
-
-void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
-#if defined(WEBRTC_ARCH_X86_FAMILY)
-  // TODO(bugs.chromium.org/10480): Add AVX2.
-  if (cpu_features_.sse2) {
-    ComputeFullyConnectedLayerOutputSse2(input_size_, output_size_, input,
-                                         bias_, weights_, activation_function_,
-                                         output_);
-    return;
-  }
-#endif
-  // TODO(bugs.chromium.org/10480): Add Neon.
-  ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
-                                   weights_, activation_function_, output_);
-}
-
 GatedRecurrentLayer::GatedRecurrentLayer(
     const int input_size,
     const int output_size,
@@ -346,8 +217,9 @@
              kInputLayerOutputSize,
              kInputDenseBias,
              kInputDenseWeights,
-             TansigApproximated,
-             cpu_features),
+             ActivationFunction::kTansigApproximated,
+             cpu_features,
+             /*layer_name=*/"FC1"),
       hidden_(kInputLayerOutputSize,
               kHiddenLayerOutputSize,
               kHiddenGruBias,
@@ -357,8 +229,9 @@
               kOutputLayerOutputSize,
               kOutputDenseBias,
               kOutputDenseWeights,
-              SigmoidApproximated,
-              cpu_features) {
+              ActivationFunction::kSigmoidApproximated,
+              cpu_features,
+              /*layer_name=*/"FC2") {
   // Input-output chaining size checks.
   RTC_DCHECK_EQ(input_.size(), hidden_.input_size())
       << "The input and the hidden layers sizes do not match.";
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.h b/modules/audio_processing/agc2/rnn_vad/rnn.h
index c886034..df99c3c 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn.h
+++ b/modules/audio_processing/agc2/rnn_vad/rnn.h
@@ -21,54 +21,15 @@
 #include "api/function_view.h"
 #include "modules/audio_processing/agc2/cpu_features.h"
 #include "modules/audio_processing/agc2/rnn_vad/common.h"
+#include "modules/audio_processing/agc2/rnn_vad/rnn_fc.h"
 #include "rtc_base/system/arch.h"
 
 namespace webrtc {
 namespace rnn_vad {
 
-// Maximum number of units for an FC layer.
-constexpr int kFullyConnectedLayerMaxUnits = 24;
-
 // Maximum number of units for a GRU layer.
 constexpr int kGruLayerMaxUnits = 24;
 
-// Fully-connected layer with a custom activation function which owns the output
-// buffer.
-class FullyConnectedLayer {
- public:
-  // Ctor. `output_size` cannot be greater than `kFullyConnectedLayerMaxUnits`.
-  FullyConnectedLayer(int input_size,
-                      int output_size,
-                      rtc::ArrayView<const int8_t> bias,
-                      rtc::ArrayView<const int8_t> weights,
-                      rtc::FunctionView<float(float)> activation_function,
-                      const AvailableCpuFeatures& cpu_features);
-  FullyConnectedLayer(const FullyConnectedLayer&) = delete;
-  FullyConnectedLayer& operator=(const FullyConnectedLayer&) = delete;
-  ~FullyConnectedLayer();
-
-  // Returns the size of the input vector.
-  int input_size() const { return input_size_; }
-  // Returns the pointer to the first element of the output buffer.
-  const float* data() const { return output_.data(); }
-  // Returns the size of the output buffer.
-  int size() const { return output_size_; }
-
-  // Computes the fully-connected layer output.
-  void ComputeOutput(rtc::ArrayView<const float> input);
-
- private:
-  const int input_size_;
-  const int output_size_;
-  const std::vector<float> bias_;
-  const std::vector<float> weights_;
-  rtc::FunctionView<float(float)> activation_function_;
-  // The output vector of a recurrent layer has length equal to |output_size_|.
-  // However, for efficiency, over-allocation is used.
-  std::array<float, kFullyConnectedLayerMaxUnits> output_;
-  const AvailableCpuFeatures cpu_features_;
-};
-
 // Recurrent layer with gated recurrent units (GRUs) with sigmoid and ReLU as
 // activation functions for the update/reset and output gates respectively. It
 // owns the output buffer.
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc b/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc
new file mode 100644
index 0000000..2363317
--- /dev/null
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Defines WEBRTC_ARCH_X86_FAMILY, used below.
+#include "rtc_base/system/arch.h"
+
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+#include <emmintrin.h>
+#endif
+
+#include <algorithm>
+#include <numeric>
+
+#include "modules/audio_processing/agc2/rnn_vad/rnn_fc.h"
+#include "rtc_base/checks.h"
+#include "rtc_base/numerics/safe_conversions.h"
+#include "third_party/rnnoise/src/rnn_activations.h"
+#include "third_party/rnnoise/src/rnn_vad_weights.h"
+
+namespace webrtc {
+namespace rnn_vad {
+namespace {
+
+std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
+  std::vector<float> scaled_params(params.size());
+  std::transform(params.begin(), params.end(), scaled_params.begin(),
+                 [](int8_t x) -> float {
+                   return ::rnnoise::kWeightsScale * static_cast<float>(x);
+                 });
+  return scaled_params;
+}
+
+// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
+// function to improve setup time.
+// Returns |weights| cast to float, scaled by `kWeightsScale` and transposed.
+std::vector<float> PreprocessWeights(rtc::ArrayView<const int8_t> weights,
+                                     int output_size) {
+  if (output_size == 1) {  // Single output: the transpose is a no-op.
+    return GetScaledParams(weights);
+  }
+  // Transpose: source item (i, o) is scaled, cast and stored at (o, i) in |w|.
+  const int input_size = rtc::CheckedDivExact(
+      rtc::dchecked_cast<int>(weights.size()), output_size);
+  std::vector<float> w(weights.size());
+  for (int o = 0; o < output_size; ++o) {
+    for (int i = 0; i < input_size; ++i) {
+      w[o * input_size + i] = rnnoise::kWeightsScale *
+                              static_cast<float>(weights[i * output_size + o]);
+    }
+  }
+  return w;
+}
+
+rtc::FunctionView<float(float)> GetActivationFunction(
+    ActivationFunction activation_function) {
+  switch (activation_function) {  // Maps the enum to the rnnoise function.
+    case ActivationFunction::kTansigApproximated:
+      return ::rnnoise::TansigApproximated;
+    case ActivationFunction::kSigmoidApproximated:
+      return ::rnnoise::SigmoidApproximated;
+  }
+  RTC_NOTREACHED();  // Invalid enum value: never fall off the end (UB).
+  return ::rnnoise::TansigApproximated;
+}
+
+}  // namespace
+
+FullyConnectedLayer::FullyConnectedLayer(
+    const int input_size,
+    const int output_size,
+    const rtc::ArrayView<const int8_t> bias,
+    const rtc::ArrayView<const int8_t> weights,
+    ActivationFunction activation_function,
+    const AvailableCpuFeatures& cpu_features,
+    absl::string_view layer_name)
+    : input_size_(input_size),
+      output_size_(output_size),
+      bias_(GetScaledParams(bias)),
+      weights_(PreprocessWeights(weights, output_size)),
+      cpu_features_(cpu_features),
+      activation_function_(GetActivationFunction(activation_function)) {
+  RTC_DCHECK_LE(output_size_, kFullyConnectedLayerMaxUnits)
+      << "Insufficient FC layer over-allocation (" << layer_name << ").";
+  RTC_DCHECK_EQ(output_size_, bias_.size())
+      << "Mismatching output size and bias terms array size (" << layer_name
+      << ").";
+  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
+      << "Mismatching input-output size and weight coefficients array size ("
+      << layer_name << ").";
+}
+
+FullyConnectedLayer::~FullyConnectedLayer() = default;
+
+void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
+  RTC_DCHECK_EQ(input.size(), input_size_);
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+  // TODO(bugs.chromium.org/10480): Add AVX2.
+  if (cpu_features_.sse2) {
+    ComputeOutputSse2(input);
+    return;
+  }
+#endif
+  // TODO(bugs.chromium.org/10480): Add Neon.
+
+  // Un-optimized implementation.
+  for (int o = 0; o < output_size_; ++o) {
+    output_[o] = bias_[o];
+    // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
+    // |weights_| change the performance across different platforms.
+    for (int i = 0; i < input_size_; ++i) {
+      output_[o] += input[i] * weights_[o * input_size_ + i];
+    }
+    output_[o] = activation_function_(output_[o]);
+  }
+}
+
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+void FullyConnectedLayer::ComputeOutputSse2(rtc::ArrayView<const float> input) {
+  const int input_size_by_4 = input_size_ >> 2;  // Number of 4-float chunks.
+  const int offset = input_size_ & ~3;  // Index of the first leftover item.
+  // TODO(bugs.chromium.org/10480): Check if reinterpret_cast below is ok.
+  __m128 sum_wx_128;
+  const float* v = reinterpret_cast<const float*>(&sum_wx_128);
+  for (int o = 0; o < output_size_; ++o) {
+    // Perform 128 bit vector operations on row |o| of |weights_|.
+    sum_wx_128 = _mm_set1_ps(0);
+    const float* x_p = input.data();
+    const float* w_p = weights_.data() + o * input_size_;
+    for (int i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
+      sum_wx_128 = _mm_add_ps(sum_wx_128,
+                              _mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
+    }
+    // Perform non-vector operations for any remaining items, sum up bias term
+    // and results from the vectorized code, and apply the activation function.
+    output_[o] = activation_function_(
+        std::inner_product(input.begin() + offset, input.end(),
+                           weights_.begin() + o * input_size_ + offset,
+                           bias_[o] + v[0] + v[1] + v[2] + v[3]));
+  }
+}
+#endif  // defined(WEBRTC_ARCH_X86_FAMILY)
+
+}  // namespace rnn_vad
+}  // namespace webrtc
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc.h b/modules/audio_processing/agc2/rnn_vad/rnn_fc.h
new file mode 100644
index 0000000..d05d95c
--- /dev/null
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc.h
@@ -0,0 +1,76 @@
+/*
+ *  Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_FC_H_
+#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_FC_H_
+
+#include <array>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+#include "api/array_view.h"
+#include "api/function_view.h"
+#include "modules/audio_processing/agc2/cpu_features.h"
+#include "rtc_base/system/arch.h"
+
+namespace webrtc {
+namespace rnn_vad {
+
+// Activation function for a neural network cell.
+enum class ActivationFunction { kTansigApproximated, kSigmoidApproximated };
+
+// Maximum number of units for an FC layer.
+constexpr int kFullyConnectedLayerMaxUnits = 24;
+
+// Fully-connected layer with a custom activation function which owns the output
+// buffer.
+class FullyConnectedLayer {
+ public:
+  // Ctor. `output_size` cannot be greater than `kFullyConnectedLayerMaxUnits`.
+  FullyConnectedLayer(int input_size,
+                      int output_size,
+                      rtc::ArrayView<const int8_t> bias,
+                      rtc::ArrayView<const int8_t> weights,
+                      ActivationFunction activation_function,
+                      const AvailableCpuFeatures& cpu_features,
+                      absl::string_view layer_name);  // For DCHECK messages.
+  FullyConnectedLayer(const FullyConnectedLayer&) = delete;
+  FullyConnectedLayer& operator=(const FullyConnectedLayer&) = delete;
+  ~FullyConnectedLayer();
+
+  // Returns the size of the input vector.
+  int input_size() const { return input_size_; }
+  // Returns the pointer to the first element of the output buffer.
+  const float* data() const { return output_.data(); }
+  // Returns the number of output values, i.e., `output_size`.
+  int size() const { return output_size_; }
+
+  // Computes the layer output; |input| must have `input_size()` elements.
+  void ComputeOutput(rtc::ArrayView<const float> input);
+
+ private:
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+  void ComputeOutputSse2(rtc::ArrayView<const float> input);
+#endif
+
+  const int input_size_;
+  const int output_size_;
+  const std::vector<float> bias_;
+  const std::vector<float> weights_;
+  const AvailableCpuFeatures cpu_features_;
+  rtc::FunctionView<float(float)> activation_function_;
+  // Over-allocated array; only the first `output_size_` items are used.
+  std::array<float, kFullyConnectedLayerMaxUnits> output_;
+};
+
+}  // namespace rnn_vad
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_RNN_FC_H_
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc
new file mode 100644
index 0000000..1094832
--- /dev/null
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/rnn_vad/rnn_fc.h"
+
+#include <array>
+#include <vector>
+
+#include "api/array_view.h"
+#include "modules/audio_processing/agc2/cpu_features.h"
+#include "modules/audio_processing/agc2/rnn_vad/test_utils.h"
+#include "modules/audio_processing/test/performance_timer.h"
+#include "rtc_base/logging.h"
+#include "rtc_base/system/arch.h"
+#include "test/gtest.h"
+#include "third_party/rnnoise/src/rnn_vad_weights.h"
+
+namespace webrtc {
+namespace rnn_vad {
+namespace test {
+namespace {
+
+using ::rnnoise::kInputDenseBias;
+using ::rnnoise::kInputDenseWeights;
+using ::rnnoise::kInputLayerInputSize;
+using ::rnnoise::kInputLayerOutputSize;
+
+// Fully connected layer test data.
+constexpr std::array<float, 42> kFullyConnectedInputVector = {
+    -1.00131f,   -0.627069f, -7.81097f,  7.86285f,    -2.87145f,  3.32365f,
+    -0.653161f,  0.529839f,  -0.425307f, 0.25583f,    0.235094f,  0.230527f,
+    -0.144687f,  0.182785f,  0.57102f,   0.125039f,   0.479482f,  -0.0255439f,
+    -0.0073141f, -0.147346f, -0.217106f, -0.0846906f, -8.34943f,  3.09065f,
+    1.42628f,    -0.85235f,  -0.220207f, -0.811163f,  2.09032f,   -2.01425f,
+    -0.690268f,  -0.925327f, -0.541354f, 0.58455f,    -0.606726f, -0.0372358f,
+    0.565991f,   0.435854f,  0.420812f,  0.162198f,   -2.13f,     10.0089f};
+constexpr std::array<float, 24> kFullyConnectedExpectedOutput = {
+    -0.623293f, -0.988299f, 0.999378f,  0.967168f,  0.103087f,  -0.978545f,
+    -0.856347f, 0.346675f,  1.f,        -0.717442f, -0.544176f, 0.960363f,
+    0.983443f,  0.999991f,  -0.824335f, 0.984742f,  0.990208f,  0.938179f,
+    0.875092f,  0.999846f,  0.997707f,  -0.999382f, 0.973153f,  -0.966605f};
+
+class RnnParametrization
+    : public ::testing::TestWithParam<AvailableCpuFeatures> {};
+
+// Checks that the output of a fully connected layer is within tolerance given
+// test input data.
+TEST_P(RnnParametrization, CheckFullyConnectedLayerOutput) {
+  FullyConnectedLayer fc(kInputLayerInputSize, kInputLayerOutputSize,
+                         kInputDenseBias, kInputDenseWeights,
+                         ActivationFunction::kTansigApproximated,
+                         /*cpu_features=*/GetParam(),
+                         /*layer_name=*/"FC");
+  fc.ComputeOutput(kFullyConnectedInputVector);
+  ExpectNearAbsolute(kFullyConnectedExpectedOutput, fc, 1e-5f);
+}
+
+TEST_P(RnnParametrization, DISABLED_BenchmarkFullyConnectedLayer) {
+  const AvailableCpuFeatures cpu_features = GetParam();
+  FullyConnectedLayer fc(kInputLayerInputSize, kInputLayerOutputSize,
+                         kInputDenseBias, kInputDenseWeights,
+                         ActivationFunction::kTansigApproximated, cpu_features,
+                         /*layer_name=*/"FC");
+
+  constexpr int kNumTests = 10000;
+  ::webrtc::test::PerformanceTimer perf_timer(kNumTests);
+  for (int k = 0; k < kNumTests; ++k) {
+    perf_timer.StartTimer();
+    fc.ComputeOutput(kFullyConnectedInputVector);
+    perf_timer.StopTimer();
+  }
+  RTC_LOG(LS_INFO) << "CPU features: " << cpu_features.ToString() << " | "
+                   << (perf_timer.GetDurationAverage() / 1000) << " +/- "
+                   << (perf_timer.GetDurationStandardDeviation() / 1000)
+                   << " ms";
+}
+
+// Finds the relevant CPU features combinations to test.
+std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
+  std::vector<AvailableCpuFeatures> v;
+  v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
+  AvailableCpuFeatures available = GetAvailableCpuFeatures();
+  if (available.sse2) {
+    AvailableCpuFeatures features(
+        {/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
+    v.push_back(features);
+  }
+  return v;
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    RnnVadTest,
+    RnnParametrization,
+    ::testing::ValuesIn(GetCpuFeaturesToTest()),
+    [](const ::testing::TestParamInfo<AvailableCpuFeatures>& info) {
+      return info.param.ToString();
+    });
+
+}  // namespace
+}  // namespace test
+}  // namespace rnn_vad
+}  // namespace webrtc
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc
index 19e0afd..4f42d11 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_unittest.cc
@@ -20,9 +20,7 @@
 #include "rtc_base/checks.h"
 #include "rtc_base/logging.h"
 #include "rtc_base/numerics/safe_conversions.h"
-#include "rtc_base/system/arch.h"
 #include "test/gtest.h"
-#include "third_party/rnnoise/src/rnn_activations.h"
 #include "third_party/rnnoise/src/rnn_vad_weights.h"
 
 namespace webrtc {
@@ -67,21 +65,6 @@
   }
 }
 
-// Fully connected layer test data.
-constexpr std::array<float, 42> kFullyConnectedInputVector = {
-    -1.00131f,   -0.627069f, -7.81097f,  7.86285f,    -2.87145f,  3.32365f,
-    -0.653161f,  0.529839f,  -0.425307f, 0.25583f,    0.235094f,  0.230527f,
-    -0.144687f,  0.182785f,  0.57102f,   0.125039f,   0.479482f,  -0.0255439f,
-    -0.0073141f, -0.147346f, -0.217106f, -0.0846906f, -8.34943f,  3.09065f,
-    1.42628f,    -0.85235f,  -0.220207f, -0.811163f,  2.09032f,   -2.01425f,
-    -0.690268f,  -0.925327f, -0.541354f, 0.58455f,    -0.606726f, -0.0372358f,
-    0.565991f,   0.435854f,  0.420812f,  0.162198f,   -2.13f,     10.0089f};
-constexpr std::array<float, 24> kFullyConnectedExpectedOutput = {
-    -0.623293f, -0.988299f, 0.999378f,  0.967168f,  0.103087f,  -0.978545f,
-    -0.856347f, 0.346675f,  1.f,        -0.717442f, -0.544176f, 0.960363f,
-    0.983443f,  0.999991f,  -0.824335f, 0.984742f,  0.990208f,  0.938179f,
-    0.875092f,  0.999846f,  0.997707f,  -0.999382f, 0.973153f,  -0.966605f};
-
 // Gated recurrent units layer test data.
 constexpr int kGruInputSize = 5;
 constexpr int kGruOutputSize = 4;
@@ -170,61 +153,6 @@
                    << " ms";
 }
 
-class RnnParametrization
-    : public ::testing::TestWithParam<AvailableCpuFeatures> {};
-
-// Checks that the output of a fully connected layer is within tolerance given
-// test input data.
-TEST_P(RnnParametrization, CheckFullyConnectedLayerOutput) {
-  FullyConnectedLayer fc(
-      rnnoise::kInputLayerInputSize, rnnoise::kInputLayerOutputSize,
-      rnnoise::kInputDenseBias, rnnoise::kInputDenseWeights,
-      rnnoise::TansigApproximated, /*cpu_features=*/GetParam());
-  fc.ComputeOutput(kFullyConnectedInputVector);
-  ExpectNearAbsolute(kFullyConnectedExpectedOutput, fc, 1e-5f);
-}
-
-TEST_P(RnnParametrization, DISABLED_BenchmarkFullyConnectedLayer) {
-  const AvailableCpuFeatures cpu_features = GetParam();
-  FullyConnectedLayer fc(rnnoise::kInputLayerInputSize,
-                         rnnoise::kInputLayerOutputSize,
-                         rnnoise::kInputDenseBias, rnnoise::kInputDenseWeights,
-                         rnnoise::TansigApproximated, cpu_features);
-
-  constexpr int kNumTests = 10000;
-  ::webrtc::test::PerformanceTimer perf_timer(kNumTests);
-  for (int k = 0; k < kNumTests; ++k) {
-    perf_timer.StartTimer();
-    fc.ComputeOutput(kFullyConnectedInputVector);
-    perf_timer.StopTimer();
-  }
-  RTC_LOG(LS_INFO) << "CPU features: " << cpu_features.ToString() << " | "
-                   << (perf_timer.GetDurationAverage() / 1000) << " +/- "
-                   << (perf_timer.GetDurationStandardDeviation() / 1000)
-                   << " ms";
-}
-
-// Finds the relevant CPU features combinations to test.
-std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
-  std::vector<AvailableCpuFeatures> v;
-  v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
-  AvailableCpuFeatures available = GetAvailableCpuFeatures();
-  if (available.sse2) {
-    AvailableCpuFeatures features(
-        {/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
-    v.push_back(features);
-  }
-  return v;
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    RnnVadTest,
-    RnnParametrization,
-    ::testing::ValuesIn(GetCpuFeaturesToTest()),
-    [](const ::testing::TestParamInfo<AvailableCpuFeatures>& info) {
-      return info.param.ToString();
-    });
-
 // Checks that the speech probability is zero with silence.
 TEST(RnnVadTest, CheckZeroProbabilityWithSilence) {
   RnnVad rnn_vad(GetAvailableCpuFeatures());