audio: remove neon shadowing variables

Also add an IWYU keep pragma to prevent incorrect removal of an include file (when IWYU is run in an Android checkout).

BUG=webrtc:42223409

Change-Id: I513dfa3cd3ef71362c759ab1c003ee9c12529f6d
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/387922
Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
Reviewed-by: Per Åhgren <peah@webrtc.org>
Commit-Queue: Philipp Hancke <phancke@meta.com>
Cr-Commit-Position: refs/heads/main@{#44483}
diff --git a/modules/audio_processing/aec3/matched_filter.cc b/modules/audio_processing/aec3/matched_filter.cc
index 90d550a..866f941 100644
--- a/modules/audio_processing/aec3/matched_filter.cc
+++ b/modules/audio_processing/aec3/matched_filter.cc
@@ -13,7 +13,7 @@
 #include <vector>
 
 // Defines WEBRTC_ARCH_X86_FAMILY, used below.
-#include "rtc_base/system/arch.h"
+#include "rtc_base/system/arch.h"  // IWYU pragma: keep
 
 #if defined(WEBRTC_HAS_NEON)
 #include <arm_neon.h>
@@ -118,7 +118,7 @@
     }
     const float* x_p =
         chunk1 != h_size ? scratch_memory.data() : &x[x_start_index];
-    const float* h_p = &h[0];
+    const float* h_cp = &h[0];
     float* accumulated_error_p = &accumulated_error[0];
     // Initialize values for the accumulation.
     float32x4_t x2_sum_128 = vdupq_n_f32(0);
@@ -127,10 +127,10 @@
     // Perform 128 bit vector operations.
     const int limit_by_4 = h_size >> 2;
     for (int k = limit_by_4; k > 0;
-         --k, h_p += 4, x_p += 4, accumulated_error_p++) {
+         --k, h_cp += 4, x_p += 4, accumulated_error_p++) {
       // Load the data into 128 bit vectors.
       const float32x4_t x_k = vld1q_f32(x_p);
-      const float32x4_t h_k = vld1q_f32(h_p);
+      const float32x4_t h_k = vld1q_f32(h_cp);
       // Compute and accumulate x * x.
       x2_sum_128 = vmlaq_f32(x2_sum_128, x_k, x_k);
       // Compute x * h
@@ -154,7 +154,6 @@
       float* h_p = &h[0];
       x_p = chunk1 != h_size ? scratch_memory.data() : &x[x_start_index];
       // Perform 128 bit vector operations.
-      const int limit_by_4 = h_size >> 2;
       for (int k = limit_by_4; k > 0; --k, h_p += 4, x_p += 4) {
         // Load the data into 128 bit vectors.
         float32x4_t h_k = vld1q_f32(h_p);
@@ -197,7 +196,7 @@
 
     RTC_DCHECK_GT(x_size, x_start_index);
     const float* x_p = &x[x_start_index];
-    const float* h_p = &h[0];
+    const float* h_cp = &h[0];
 
     // Initialize values for the accumulation.
     float32x4_t s_128 = vdupq_n_f32(0);
@@ -215,20 +214,20 @@
     for (int limit : {chunk1, chunk2}) {
       // Perform 128 bit vector operations.
       const int limit_by_4 = limit >> 2;
-      for (int k = limit_by_4; k > 0; --k, h_p += 4, x_p += 4) {
+      for (int k = limit_by_4; k > 0; --k, h_cp += 4, x_p += 4) {
         // Load the data into 128 bit vectors.
         const float32x4_t x_k = vld1q_f32(x_p);
-        const float32x4_t h_k = vld1q_f32(h_p);
+        const float32x4_t h_k = vld1q_f32(h_cp);
         // Compute and accumulate x * x and h * x.
         x2_sum_128 = vmlaq_f32(x2_sum_128, x_k, x_k);
         s_128 = vmlaq_f32(s_128, h_k, x_k);
       }
 
       // Perform non-vector operations for any remaining items.
-      for (int k = limit - limit_by_4 * 4; k > 0; --k, ++h_p, ++x_p) {
+      for (int k = limit - limit_by_4 * 4; k > 0; --k, ++h_cp, ++x_p) {
         const float x_k = *x_p;
         x2_sum += x_k * x_k;
-        s += *h_p * x_k;
+        s += *h_cp * x_k;
       }
 
       x_p = &x[0];