Revert "Support AVX2/FMA intrinsics in Audio Resampler module"

This reverts commit 1ca8d87239f1209031bbc77a6443bc7ac2dcee8c.

Reason for revert: breaks downstream project

Original change's description:
> Support AVX2/FMA intrinsics in Audio Resampler module
> 
> From the test result, using AVX2/FMA is 1.60x faster than SSE on atlas.
> 
> Bug: webrtc:11663
> Test: common_audio_unittests on atlas and octopus.
> Change-Id: Ibd45ea46aa97d5790a24e5116f741592b95f6416
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/176382
> Reviewed-by: Per Åhgren <peah@webrtc.org>
> Reviewed-by: Henrik Andreassson <henrika@webrtc.org>
> Reviewed-by: Mirko Bonadei <mbonadei@webrtc.org>
> Reviewed-by: Sam Zackrisson <saza@webrtc.org>
> Commit-Queue: Sam Zackrisson <saza@webrtc.org>
> Cr-Commit-Position: refs/heads/master@{#31810}

TBR=mbonadei@webrtc.org,henrika@webrtc.org,henrik.lundin@webrtc.org,saza@webrtc.org,peah@webrtc.org,mflodman@webrtc.org,zhaoliang.ma@intel.com

Change-Id: I1dad31df446e336dacb29ff637bd66f809376458
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Bug: webrtc:11663
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/180622
Reviewed-by: Åsa Persson <asapersson@webrtc.org>
Commit-Queue: Åsa Persson <asapersson@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#31813}
diff --git a/BUILD.gn b/BUILD.gn
index c844e385..b676d1e 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -129,10 +129,6 @@
     defines += [ "RTC_DISABLE_CHECK_MSG" ]
   }
 
-  if (rtc_enable_avx2) {
-    defines += [ "WEBRTC_ENABLE_AVX2" ]
-  }
-
   # Some tests need to declare their own trace event handlers. If this define is
   # not set, the first time TRACE_EVENT_* is called it will store the return
   # value for the current handler in an static variable, so that subsequent
diff --git a/common_audio/BUILD.gn b/common_audio/BUILD.gn
index fc76351..4077486 100644
--- a/common_audio/BUILD.gn
+++ b/common_audio/BUILD.gn
@@ -67,7 +67,6 @@
 
   if (current_cpu == "x86" || current_cpu == "x64") {
     deps += [ ":common_audio_sse2" ]
-    deps += [ ":common_audio_avx2" ]
   }
 }
 
@@ -236,7 +235,6 @@
   ]
   if (current_cpu == "x86" || current_cpu == "x64") {
     deps += [ ":common_audio_sse2" ]
-    deps += [ ":common_audio_avx2" ]
   }
   if (rtc_build_with_neon) {
     deps += [ ":common_audio_neon" ]
@@ -263,27 +261,6 @@
       "../rtc_base/memory:aligned_malloc",
     ]
   }
-
-  rtc_library("common_audio_avx2") {
-    sources = [ "resampler/sinc_resampler_avx2.cc" ]
-
-    if (is_win) {
-      cflags = [ "/arch:AVX2" ]
-    } else {
-      cflags = [
-        "-mavx2",
-        "-mfma",
-      ]
-    }
-
-    deps = [
-      ":fir_filter",
-      ":sinc_resampler",
-      "../rtc_base:checks",
-      "../rtc_base:rtc_base_approved",
-      "../rtc_base/memory:aligned_malloc",
-    ]
-  }
 }
 
 if (rtc_build_with_neon) {
diff --git a/common_audio/resampler/sinc_resampler.cc b/common_audio/resampler/sinc_resampler.cc
index 831ce53..21707e9 100644
--- a/common_audio/resampler/sinc_resampler.cc
+++ b/common_audio/resampler/sinc_resampler.cc
@@ -122,22 +122,28 @@
 const size_t SincResampler::kKernelSize;
 
 // If we know the minimum architecture at compile time, avoid CPU detection.
-void SincResampler::InitializeCPUSpecificFeatures() {
-#if defined(WEBRTC_HAS_NEON)
-  convolve_proc_ = Convolve_NEON;
-#elif defined(WEBRTC_ARCH_X86_FAMILY)
-  // Using AVX2 instead of SSE2 when AVX2 supported.
-  if (WebRtc_GetCPUInfo(kAVX2))
-    convolve_proc_ = Convolve_AVX2;
-  else if (WebRtc_GetCPUInfo(kSSE2))
-    convolve_proc_ = Convolve_SSE;
-  else
-    convolve_proc_ = Convolve_C;
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(__SSE2__)
+#define CONVOLVE_FUNC Convolve_SSE
+void SincResampler::InitializeCPUSpecificFeatures() {}
 #else
-  // Unknown architecture.
-  convolve_proc_ = Convolve_C;
-#endif
+// x86 CPU detection required.  Function will be set by
+// InitializeCPUSpecificFeatures().
+// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed.
+#define CONVOLVE_FUNC convolve_proc_
+
+void SincResampler::InitializeCPUSpecificFeatures() {
+  convolve_proc_ = WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C;
 }
+#endif
+#elif defined(WEBRTC_HAS_NEON)
+#define CONVOLVE_FUNC Convolve_NEON
+void SincResampler::InitializeCPUSpecificFeatures() {}
+#else
+// Unknown architecture.
+#define CONVOLVE_FUNC Convolve_C
+void SincResampler::InitializeCPUSpecificFeatures() {}
+#endif
 
 SincResampler::SincResampler(double io_sample_rate_ratio,
                              size_t request_frames,
@@ -146,20 +152,24 @@
       read_cb_(read_cb),
       request_frames_(request_frames),
       input_buffer_size_(request_frames_ + kKernelSize),
-      // Create input buffers with a 32-byte alignment for SIMD optimizations.
+      // Create input buffers with a 16-byte alignment for SSE optimizations.
       kernel_storage_(static_cast<float*>(
-          AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
+          AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
       kernel_pre_sinc_storage_(static_cast<float*>(
-          AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
+          AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
       kernel_window_storage_(static_cast<float*>(
-          AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
+          AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
       input_buffer_(static_cast<float*>(
-          AlignedMalloc(sizeof(float) * input_buffer_size_, 32))),
+          AlignedMalloc(sizeof(float) * input_buffer_size_, 16))),
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
       convolve_proc_(nullptr),
+#endif
       r1_(input_buffer_.get()),
       r2_(input_buffer_.get() + kKernelSize / 2) {
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
   InitializeCPUSpecificFeatures();
   RTC_DCHECK(convolve_proc_);
+#endif
   RTC_DCHECK_GT(request_frames_, 0);
   Flush();
   RTC_DCHECK_GT(block_size_, kKernelSize);
@@ -292,10 +302,10 @@
       const float* const k1 = kernel_ptr + offset_idx * kKernelSize;
       const float* const k2 = k1 + kKernelSize;
 
-      // Ensure |k1|, |k2| are 32-byte aligned for SIMD usage.  Should always be
-      // true so long as kKernelSize is a multiple of 32.
-      RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k1) % 32);
-      RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k2) % 32);
+      // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage.  Should always be
+      // true so long as kKernelSize is a multiple of 16.
+      RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k1) % 16);
+      RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k2) % 16);
 
       // Initialize input pointer based on quantized |virtual_source_idx_|.
       const float* const input_ptr = r1_ + source_idx;
@@ -304,7 +314,7 @@
       const double kernel_interpolation_factor =
           virtual_offset_idx - offset_idx;
       *destination++ =
-          convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor);
+          CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor);
 
       // Advance the virtual index.
       virtual_source_idx_ += current_io_ratio;
diff --git a/common_audio/resampler/sinc_resampler.h b/common_audio/resampler/sinc_resampler.h
index a72a0c6..5181c18 100644
--- a/common_audio/resampler/sinc_resampler.h
+++ b/common_audio/resampler/sinc_resampler.h
@@ -112,10 +112,6 @@
                             const float* k1,
                             const float* k2,
                             double kernel_interpolation_factor);
-  static float Convolve_AVX2(const float* input_ptr,
-                             const float* k1,
-                             const float* k2,
-                             double kernel_interpolation_factor);
 #elif defined(WEBRTC_HAS_NEON)
   static float Convolve_NEON(const float* input_ptr,
                              const float* k1,
@@ -159,11 +155,13 @@
 // TODO(ajm): Move to using a global static which must only be initialized
 // once by the user. We're not doing this initially, because we don't have
 // e.g. a LazyInstance helper in webrtc.
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
   typedef float (*ConvolveProc)(const float*,
                                 const float*,
                                 const float*,
                                 double);
   ConvolveProc convolve_proc_;
+#endif
 
   // Pointers to the various regions inside |input_buffer_|.  See the diagram at
   // the top of the .cc file for more information.
diff --git a/common_audio/resampler/sinc_resampler_avx2.cc b/common_audio/resampler/sinc_resampler_avx2.cc
deleted file mode 100644
index 3eb5d4a..0000000
--- a/common_audio/resampler/sinc_resampler_avx2.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- *  Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <xmmintrin.h>
-
-#include "common_audio/resampler/sinc_resampler.h"
-
-namespace webrtc {
-
-float SincResampler::Convolve_AVX2(const float* input_ptr,
-                                   const float* k1,
-                                   const float* k2,
-                                   double kernel_interpolation_factor) {
-  __m256 m_input;
-  __m256 m_sums1 = _mm256_setzero_ps();
-  __m256 m_sums2 = _mm256_setzero_ps();
-
-  // Based on |input_ptr| alignment, we need to use loadu or load.  Unrolling
-  // these loops has not been tested or benchmarked.
-  bool aligned_input = (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0;
-  if (!aligned_input) {
-    for (size_t i = 0; i < kKernelSize; i += 8) {
-      m_input = _mm256_loadu_ps(input_ptr + i);
-      m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
-      m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
-    }
-  } else {
-    for (size_t i = 0; i < kKernelSize; i += 8) {
-      m_input = _mm256_load_ps(input_ptr + i);
-      m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
-      m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
-    }
-  }
-
-  // Linearly interpolate the two "convolutions".
-  __m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0),
-                                 _mm256_extractf128_ps(m_sums1, 1));
-  __m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0),
-                                 _mm256_extractf128_ps(m_sums2, 1));
-  m128_sums1 = _mm_mul_ps(
-      m128_sums1,
-      _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
-  m128_sums2 = _mm_mul_ps(
-      m128_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
-  m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2);
-
-  // Sum components together.
-  float result;
-  m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1);
-  _mm_store_ss(&result, _mm_add_ss(m128_sums2,
-                                   _mm_shuffle_ps(m128_sums2, m128_sums2, 1)));
-
-  return result;
-}
-
-}  // namespace webrtc
diff --git a/common_audio/resampler/sinc_resampler_unittest.cc b/common_audio/resampler/sinc_resampler_unittest.cc
index ece6af0..b067b23 100644
--- a/common_audio/resampler/sinc_resampler_unittest.cc
+++ b/common_audio/resampler/sinc_resampler_unittest.cc
@@ -116,9 +116,17 @@
   printf("SetRatio() took %.2fms.\n", total_time_c_us / 1000);
 }
 
+// Define platform independent function name for Convolve* tests.
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+#define CONVOLVE_FUNC Convolve_SSE
+#elif defined(WEBRTC_ARCH_ARM_V7)
+#define CONVOLVE_FUNC Convolve_NEON
+#endif
+
 // Ensure various optimized Convolve() methods return the same value.  Only run
 // this test if other optimized methods exist, otherwise the default Convolve()
 // will be tested by the parameterized SincResampler tests below.
+#if defined(CONVOLVE_FUNC)
 TEST(SincResamplerTest, Convolve) {
 #if defined(WEBRTC_ARCH_X86_FAMILY)
   ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2));
@@ -140,7 +148,7 @@
   double result = resampler.Convolve_C(
       resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
       resampler.kernel_storage_.get(), kKernelInterpolationFactor);
-  double result2 = resampler.convolve_proc_(
+  double result2 = resampler.CONVOLVE_FUNC(
       resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
       resampler.kernel_storage_.get(), kKernelInterpolationFactor);
   EXPECT_NEAR(result2, result, kEpsilon);
@@ -149,11 +157,12 @@
   result = resampler.Convolve_C(
       resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
       resampler.kernel_storage_.get(), kKernelInterpolationFactor);
-  result2 = resampler.convolve_proc_(
+  result2 = resampler.CONVOLVE_FUNC(
       resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
       resampler.kernel_storage_.get(), kKernelInterpolationFactor);
   EXPECT_NEAR(result2, result, kEpsilon);
 }
+#endif
 
 // Benchmark for the various Convolve() methods.  Make sure to build with
 // branding=Chrome so that RTC_DCHECKs are compiled out when benchmarking.
@@ -181,6 +190,7 @@
       (rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
   printf("Convolve_C took %.2fms.\n", total_time_c_us / 1000);
 
+#if defined(CONVOLVE_FUNC)
 #if defined(WEBRTC_ARCH_X86_FAMILY)
   ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2));
 #elif defined(WEBRTC_ARCH_ARM_V7)
@@ -190,33 +200,36 @@
   // Benchmark with unaligned input pointer.
   start = rtc::TimeNanos();
   for (int j = 0; j < kConvolveIterations; ++j) {
-    resampler.convolve_proc_(
+    resampler.CONVOLVE_FUNC(
         resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
         resampler.kernel_storage_.get(), kKernelInterpolationFactor);
   }
   double total_time_optimized_unaligned_us =
       (rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
-  printf(STRINGIZE(convolve_proc_) "(unaligned) took %.2fms; which is %.2fx "
+  printf(STRINGIZE(CONVOLVE_FUNC) "(unaligned) took %.2fms; which is %.2fx "
          "faster than Convolve_C.\n", total_time_optimized_unaligned_us / 1000,
          total_time_c_us / total_time_optimized_unaligned_us);
 
   // Benchmark with aligned input pointer.
   start = rtc::TimeNanos();
   for (int j = 0; j < kConvolveIterations; ++j) {
-    resampler.convolve_proc_(
+    resampler.CONVOLVE_FUNC(
         resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
         resampler.kernel_storage_.get(), kKernelInterpolationFactor);
   }
   double total_time_optimized_aligned_us =
       (rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
-  printf(STRINGIZE(convolve_proc_) " (aligned) took %.2fms; which is %.2fx "
+  printf(STRINGIZE(CONVOLVE_FUNC) " (aligned) took %.2fms; which is %.2fx "
          "faster than Convolve_C and %.2fx faster than "
-         STRINGIZE(convolve_proc_) " (unaligned).\n",
+         STRINGIZE(CONVOLVE_FUNC) " (unaligned).\n",
          total_time_optimized_aligned_us / 1000,
          total_time_c_us / total_time_optimized_aligned_us,
          total_time_optimized_unaligned_us / total_time_optimized_aligned_us);
+#endif
 }
 
+#undef CONVOLVE_FUNC
+
 typedef std::tuple<int, int, double, double> SincResamplerTestData;
 class SincResamplerTest
     : public ::testing::TestWithParam<SincResamplerTestData> {
@@ -339,7 +352,7 @@
         std::make_tuple(16000, 44100, kResamplingRMSError, -62.54),
         std::make_tuple(22050, 44100, kResamplingRMSError, -73.53),
         std::make_tuple(32000, 44100, kResamplingRMSError, -63.32),
-        std::make_tuple(44100, 44100, kResamplingRMSError, -73.52),
+        std::make_tuple(44100, 44100, kResamplingRMSError, -73.53),
         std::make_tuple(48000, 44100, -15.01, -64.04),
         std::make_tuple(96000, 44100, -18.49, -25.51),
         std::make_tuple(192000, 44100, -20.50, -13.31),
@@ -347,7 +360,7 @@
         // To 48kHz
         std::make_tuple(8000, 48000, kResamplingRMSError, -63.43),
         std::make_tuple(11025, 48000, kResamplingRMSError, -62.61),
-        std::make_tuple(16000, 48000, kResamplingRMSError, -63.95),
+        std::make_tuple(16000, 48000, kResamplingRMSError, -63.96),
         std::make_tuple(22050, 48000, kResamplingRMSError, -62.42),
         std::make_tuple(32000, 48000, kResamplingRMSError, -64.04),
         std::make_tuple(44100, 48000, kResamplingRMSError, -62.63),
diff --git a/system_wrappers/include/cpu_features_wrapper.h b/system_wrappers/include/cpu_features_wrapper.h
index 02d54b4..739161a 100644
--- a/system_wrappers/include/cpu_features_wrapper.h
+++ b/system_wrappers/include/cpu_features_wrapper.h
@@ -18,7 +18,7 @@
 #endif
 
 // List of features in x86.
-typedef enum { kSSE2, kSSE3, kAVX2 } CPUFeature;
+typedef enum { kSSE2, kSSE3 } CPUFeature;
 
 // List of features in ARM.
 enum {
diff --git a/system_wrappers/source/cpu_features.cc b/system_wrappers/source/cpu_features.cc
index 1667e46..ebcb48c 100644
--- a/system_wrappers/source/cpu_features.cc
+++ b/system_wrappers/source/cpu_features.cc
@@ -24,20 +24,6 @@
 }
 
 #if defined(WEBRTC_ARCH_X86_FAMILY)
-
-// xgetbv returns the value of an Intel Extended Control Register (XCR).
-// Currently only XCR0 is defined by Intel so |xcr| should always be zero.
-uint64_t xgetbv(uint32_t xcr) {
-#if defined(_MSC_VER)
-  return _xgetbv(xcr);
-#else
-  uint32_t eax, edx;
-
-  __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr));
-  return (static_cast<uint64_t>(edx) << 32) | eax;
-#endif  // _MSC_VER
-}
-
 #ifndef _MSC_VER
 // Intrinsic for "cpuid".
 #if defined(__pic__) && defined(__i386__)
@@ -55,7 +41,7 @@
   __asm__ volatile("cpuid\n"
                    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
                      "=d"(cpu_info[3])
-                   : "a"(info_type), "c"(0));
+                   : "a"(info_type));
 }
 #endif
 #endif  // _MSC_VER
@@ -65,8 +51,6 @@
 // Actual feature detection for x86.
 static int GetCPUInfo(CPUFeature feature) {
   int cpu_info[4];
-  __cpuid(cpu_info, 0);
-  int num_ids = cpu_info[0];
   __cpuid(cpu_info, 1);
   if (feature == kSSE2) {
     return 0 != (cpu_info[3] & 0x04000000);
@@ -74,23 +58,6 @@
   if (feature == kSSE3) {
     return 0 != (cpu_info[2] & 0x00000001);
   }
-  if (feature == kAVX2) {
-    // Interpret CPU feature information.
-    int cpu_info7[4] = {-1};
-    if (num_ids >= 7) {
-      __cpuid(cpu_info7, 7);
-    }
-
-#if defined(WEBRTC_ENABLE_AVX2)
-    return (cpu_info[2] & 0x10000000) != 0 &&
-           (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ &&
-           (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ &&
-           (xgetbv(0) & 0x00000006) == 6 /* XSAVE enabled by kernel */ &&
-           (cpu_info7[1] & 0x00000020) != 0;
-#else
-    return 0;
-#endif  // WEBRTC_ENABLE_AVX2
-  }
   return 0;
 }
 #else
diff --git a/webrtc.gni b/webrtc.gni
index 17a66f9..b3f9a714 100644
--- a/webrtc.gni
+++ b/webrtc.gni
@@ -242,10 +242,6 @@
   # standalone WebRTC.
   rtc_include_internal_audio_device = !build_with_chromium
 
-  # Set this to true to enable the avx2 support in webrtc.
-  # TODO(bugs.webrtc.org/11663): Default this to true and eventually remove.
-  rtc_enable_avx2 = false
-
   # Include tests in standalone checkout.
   rtc_include_tests = !build_with_chromium && !build_with_mozilla