RNN VAD: FC layer simplified

The implementations for the fully connected layer can be simplified by
using `VectorMath::DotProduct()`. In this way, it is also possible to
remove (nearly) duplicated SIMD code, reduce the binary size and more
easily maintain the code.

This CL also forces unoptimized code for the output layer of the VAD,
which is an FC 24x1 layer. A slight improvement of the realtime factor
has been measured (delta ~ +5x).

Bug: webrtc:10480
Change-Id: Iee93bd59f7905ebf96275dbbfeb3c921baf4e8db
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/195580
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#32806}
diff --git a/modules/audio_processing/agc2/cpu_features.cc b/modules/audio_processing/agc2/cpu_features.cc
index 10f9a19..cced761 100644
--- a/modules/audio_processing/agc2/cpu_features.cc
+++ b/modules/audio_processing/agc2/cpu_features.cc
@@ -55,4 +55,8 @@
 #endif
 }
 
+AvailableCpuFeatures NoAvailableCpuFeatures() {
+  return {/*sse2=*/false, /*avx2=*/false, /*neon=*/false};
+}
+
 }  // namespace webrtc
diff --git a/modules/audio_processing/agc2/cpu_features.h b/modules/audio_processing/agc2/cpu_features.h
index bf73c3e..54ddfb3 100644
--- a/modules/audio_processing/agc2/cpu_features.h
+++ b/modules/audio_processing/agc2/cpu_features.h
@@ -31,6 +31,9 @@
 // Detects what CPU features are available.
 AvailableCpuFeatures GetAvailableCpuFeatures();
 
+// Returns the CPU feature flags all set to false.
+AvailableCpuFeatures NoAvailableCpuFeatures();
+
 }  // namespace webrtc
 
 #endif  // MODULES_AUDIO_PROCESSING_AGC2_CPU_FEATURES_H_
diff --git a/modules/audio_processing/agc2/rnn_vad/BUILD.gn b/modules/audio_processing/agc2/rnn_vad/BUILD.gn
index ef2370c..9895b76 100644
--- a/modules/audio_processing/agc2/rnn_vad/BUILD.gn
+++ b/modules/audio_processing/agc2/rnn_vad/BUILD.gn
@@ -92,7 +92,6 @@
     "../../../../api:function_view",
     "../../../../rtc_base:checks",
     "../../../../rtc_base:safe_conversions",
-    "../../../../rtc_base/system:arch",
     "//third_party/rnnoise:rnn_vad",
   ]
   if (current_cpu == "x86" || current_cpu == "x64") {
diff --git a/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc b/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc
index 8c336af..2a6e68f 100644
--- a/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc
+++ b/modules/audio_processing/agc2/rnn_vad/pitch_search_internal_unittest.cc
@@ -41,17 +41,13 @@
 // Finds the relevant CPU features combinations to test.
 std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
   std::vector<AvailableCpuFeatures> v;
-  v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
+  v.push_back(NoAvailableCpuFeatures());
   AvailableCpuFeatures available = GetAvailableCpuFeatures();
   if (available.avx2) {
-    AvailableCpuFeatures features(
-        {/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
-    v.push_back(features);
+    v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
   }
   if (available.sse2) {
-    AvailableCpuFeatures features(
-        {/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
-    v.push_back(features);
+    v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
   }
   return v;
 }
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn.cc b/modules/audio_processing/agc2/rnn_vad/rnn.cc
index f828a24..475bef9 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn.cc
@@ -57,7 +57,8 @@
               kOutputDenseBias,
               kOutputDenseWeights,
               ActivationFunction::kSigmoidApproximated,
-              cpu_features,
+              // The output layer is just 24x1. The unoptimized code is faster.
+              NoAvailableCpuFeatures(),
               /*layer_name=*/"FC2") {
   // Input-output chaining size checks.
   RTC_DCHECK_EQ(input_.size(), hidden_.input_size())
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc b/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc
index 2363317..b04807f 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc.cc
@@ -8,13 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-// Defines WEBRTC_ARCH_X86_FAMILY, used below.
-#include "rtc_base/system/arch.h"
-
-#if defined(WEBRTC_ARCH_X86_FAMILY)
-#include <emmintrin.h>
-#endif
-
 #include <algorithm>
 #include <numeric>
 
@@ -84,7 +77,7 @@
       output_size_(output_size),
       bias_(GetScaledParams(bias)),
       weights_(PreprocessWeights(weights, output_size)),
-      cpu_features_(cpu_features),
+      vector_math_(cpu_features),
       activation_function_(GetActivationFunction(activation_function)) {
   RTC_DCHECK_LE(output_size_, kFullyConnectedLayerMaxUnits)
       << "Insufficient FC layer over-allocation (" << layer_name << ").";
@@ -100,52 +93,13 @@
 
 void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
   RTC_DCHECK_EQ(input.size(), input_size_);
-#if defined(WEBRTC_ARCH_X86_FAMILY)
-  // TODO(bugs.chromium.org/10480): Add AVX2.
-  if (cpu_features_.sse2) {
-    ComputeOutputSse2(input);
-    return;
-  }
-#endif
-  // TODO(bugs.chromium.org/10480): Add Neon.
-
-  // Un-optimized implementation.
+  rtc::ArrayView<const float> weights(weights_);
   for (int o = 0; o < output_size_; ++o) {
-    output_[o] = bias_[o];
-    // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
-    // |weights_| change the performance across different platforms.
-    for (int i = 0; i < input_size_; ++i) {
-      output_[o] += input[i] * weights_[o * input_size_ + i];
-    }
-    output_[o] = activation_function_(output_[o]);
-  }
-}
-
-#if defined(WEBRTC_ARCH_X86_FAMILY)
-void FullyConnectedLayer::ComputeOutputSse2(rtc::ArrayView<const float> input) {
-  const int input_size_by_4 = input_size_ >> 2;
-  const int offset = input_size_ & ~3;
-  // TODO(bugs.chromium.org/10480): Check if reinterpret_cast below is ok.
-  __m128 sum_wx_128;
-  const float* v = reinterpret_cast<const float*>(&sum_wx_128);
-  for (int o = 0; o < output_size_; ++o) {
-    // Perform 128 bit vector operations.
-    sum_wx_128 = _mm_set1_ps(0);
-    const float* x_p = input.data();
-    const float* w_p = weights_.data() + o * input.size();
-    for (int i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
-      sum_wx_128 = _mm_add_ps(sum_wx_128,
-                              _mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
-    }
-    // Perform non-vector operations for any remaining items, sum up bias term
-    // and results from the vectorized code, and apply the activation function.
     output_[o] = activation_function_(
-        std::inner_product(input.begin() + offset, input.end(),
-                           weights_.begin() + o * input.size() + offset,
-                           bias_[o] + v[0] + v[1] + v[2] + v[3]));
+        bias_[o] + vector_math_.DotProduct(
+                       input, weights.subview(o * input_size_, input_size_)));
   }
 }
-#endif  // defined(WEBRTC_ARCH_X86_FAMILY)
 
 }  // namespace rnn_vad
 }  // namespace webrtc
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc.h b/modules/audio_processing/agc2/rnn_vad/rnn_fc.h
index d05d95c..d23957a 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn_fc.h
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc.h
@@ -18,7 +18,7 @@
 #include "api/array_view.h"
 #include "api/function_view.h"
 #include "modules/audio_processing/agc2/cpu_features.h"
-#include "rtc_base/system/arch.h"
+#include "modules/audio_processing/agc2/rnn_vad/vector_math.h"
 
 namespace webrtc {
 namespace rnn_vad {
@@ -56,15 +56,11 @@
   void ComputeOutput(rtc::ArrayView<const float> input);
 
  private:
-#if defined(WEBRTC_ARCH_X86_FAMILY)
-  void ComputeOutputSse2(rtc::ArrayView<const float> input);
-#endif
-
   const int input_size_;
   const int output_size_;
   const std::vector<float> bias_;
   const std::vector<float> weights_;
-  const AvailableCpuFeatures cpu_features_;
+  const VectorMath vector_math_;
   rtc::FunctionView<float(float)> activation_function_;
   // Over-allocated array with size equal to `output_size_`.
   std::array<float, kFullyConnectedLayerMaxUnits> output_;
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc
index 900ce63..3074b34 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_fc_unittest.cc
@@ -84,7 +84,7 @@
 // Finds the relevant CPU features combinations to test.
 std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
   std::vector<AvailableCpuFeatures> v;
-  v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
+  v.push_back(NoAvailableCpuFeatures());
   AvailableCpuFeatures available = GetAvailableCpuFeatures();
   if (available.sse2) {
     v.push_back({/*sse2=*/true, /*avx2=*/false, /*neon=*/false});
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc
index ee8bdac..f9b7125 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_gru_unittest.cc
@@ -160,7 +160,7 @@
 std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
   std::vector<AvailableCpuFeatures> v;
   AvailableCpuFeatures available = GetAvailableCpuFeatures();
-  v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
+  v.push_back(NoAvailableCpuFeatures());
   if (available.avx2) {
     v.push_back({/*sse2=*/false, /*avx2=*/true, /*neon=*/false});
   }
diff --git a/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc b/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc
index 7eb699c..f223d58 100644
--- a/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc
+++ b/modules/audio_processing/agc2/rnn_vad/rnn_vad_unittest.cc
@@ -158,7 +158,7 @@
 // Finds the relevant CPU features combinations to test.
 std::vector<AvailableCpuFeatures> GetCpuFeaturesToTest() {
   std::vector<AvailableCpuFeatures> v;
-  v.push_back({/*sse2=*/false, /*avx2=*/false, /*neon=*/false});
+  v.push_back(NoAvailableCpuFeatures());
   AvailableCpuFeatures available = GetAvailableCpuFeatures();
   if (available.avx2 && available.sse2) {
     v.push_back({/*sse2=*/true, /*avx2=*/true, /*neon=*/false});