RNN VAD: LP residual optimized (part 4)

This CL removes the circular shift to compute the convolution in
`ComputeLpResidual()`, which is now 4x faster (benchmarked with the
`RnnVadTest.DISABLED_ComputeLpResidualBenchmark` unit test).

Note that the `RnnVadTest.LpResidualPipelineBitExactness` unit test
is still passing.

Bug: webrtc:10480
Change-Id: Ia7767d9b57378c12c8ff31f58fea03905be5c5de
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/189964
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Per Ã…hgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#32491}
diff --git a/modules/audio_processing/agc2/rnn_vad/BUILD.gn b/modules/audio_processing/agc2/rnn_vad/BUILD.gn
index 8b01122..fcf179c 100644
--- a/modules/audio_processing/agc2/rnn_vad/BUILD.gn
+++ b/modules/audio_processing/agc2/rnn_vad/BUILD.gn
@@ -75,6 +75,7 @@
   deps = [
     "../../../../api:array_view",
     "../../../../rtc_base:checks",
+    "../../../../rtc_base:safe_compare",
   ]
 }
 
diff --git a/modules/audio_processing/agc2/rnn_vad/lp_residual.cc b/modules/audio_processing/agc2/rnn_vad/lp_residual.cc
index e62bcc4..f732b97 100644
--- a/modules/audio_processing/agc2/rnn_vad/lp_residual.cc
+++ b/modules/audio_processing/agc2/rnn_vad/lp_residual.cc
@@ -16,6 +16,7 @@
 #include <numeric>
 
 #include "rtc_base/checks.h"
+#include "rtc_base/numerics/safe_compare.h"
 
 namespace webrtc {
 namespace rnn_vad {
@@ -117,19 +118,22 @@
     rtc::ArrayView<const float, kNumLpcCoefficients> lpc_coeffs,
     rtc::ArrayView<const float> x,
     rtc::ArrayView<float> y) {
-  RTC_DCHECK_LT(kNumLpcCoefficients, x.size());
+  RTC_DCHECK_GT(x.size(), kNumLpcCoefficients);
   RTC_DCHECK_EQ(x.size(), y.size());
-  std::array<float, kNumLpcCoefficients> input_chunk;
-  input_chunk.fill(0.f);
-  for (size_t i = 0; i < y.size(); ++i) {
-    const float sum = std::inner_product(input_chunk.begin(), input_chunk.end(),
-                                         lpc_coeffs.begin(), x[i]);
-    // Circular shift and add a new sample.
-    for (size_t j = kNumLpcCoefficients - 1; j > 0; --j)
-      input_chunk[j] = input_chunk[j - 1];
-    input_chunk[0] = x[i];
-    // Copy result.
-    y[i] = sum;
+  // The code below implements the following operation:
+  // y[i] = x[i] + dot_product({x[i], ..., x[i - kNumLpcCoefficients + 1]},
+  //                           lpc_coeffs)
+  // Edge case: i < kNumLpcCoefficients.
+  y[0] = x[0];
+  for (int i = 1; i < kNumLpcCoefficients; ++i) {
+    y[i] =
+        std::inner_product(x.crend() - i, x.crend(), lpc_coeffs.cbegin(), x[i]);
+  }
+  // Regular case.
+  auto last = x.crend();
+  for (int i = kNumLpcCoefficients; rtc::SafeLt(i, y.size()); ++i, --last) {
+    y[i] = std::inner_product(last - kNumLpcCoefficients, last,
+                              lpc_coeffs.cbegin(), x[i]);
   }
 }
 
diff --git a/modules/audio_processing/agc2/rnn_vad/lp_residual.h b/modules/audio_processing/agc2/rnn_vad/lp_residual.h
index cddedca..2e54dd9 100644
--- a/modules/audio_processing/agc2/rnn_vad/lp_residual.h
+++ b/modules/audio_processing/agc2/rnn_vad/lp_residual.h
@@ -19,7 +19,7 @@
 namespace rnn_vad {
 
 // LPC inverse filter length.
-constexpr size_t kNumLpcCoefficients = 5;
+constexpr int kNumLpcCoefficients = 5;
 
 // Given a frame |x|, computes a post-processed version of LPC coefficients
 // tailored for pitch estimation.