Reland "Support AVX2/FMA intrinsics in Audio Resampler module"
This is a reland of 1ca8d87239f1209031bbc77a6443bc7ac2dcee8c
Original change's description:
> Support AVX2/FMA intrinsics in Audio Resampler module
>
> From the test result, using AVX2/FMA is 1.60x faster than SSE on atlas.
>
> Bug: webrtc:11663
> Test: common_audio_unittests on atlas and octopus.
> Change-Id: Ibd45ea46aa97d5790a24e5116f741592b95f6416
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/176382
> Reviewed-by: Per Åhgren <peah@webrtc.org>
> Reviewed-by: Henrik Andreassson <henrika@webrtc.org>
> Reviewed-by: Mirko Bonadei <mbonadei@webrtc.org>
> Reviewed-by: Sam Zackrisson <saza@webrtc.org>
> Commit-Queue: Sam Zackrisson <saza@webrtc.org>
> Cr-Commit-Position: refs/heads/master@{#31810}
Bug: webrtc:11663
Change-Id: I92f5832a42c0314853c9fead46425c08e2040dc0
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/181800
Commit-Queue: Mirko Bonadei <mbonadei@webrtc.org>
Reviewed-by: Niels Moller <nisse@webrtc.org>
Reviewed-by: Per Åhgren <peah@webrtc.org>
Reviewed-by: Mirko Bonadei <mbonadei@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#31945}
diff --git a/BUILD.gn b/BUILD.gn
index b676d1e..c844e385 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -129,6 +129,10 @@
defines += [ "RTC_DISABLE_CHECK_MSG" ]
}
+ if (rtc_enable_avx2) {
+ defines += [ "WEBRTC_ENABLE_AVX2" ]
+ }
+
# Some tests need to declare their own trace event handlers. If this define is
# not set, the first time TRACE_EVENT_* is called it will store the return
# value for the current handler in an static variable, so that subsequent
diff --git a/common_audio/BUILD.gn b/common_audio/BUILD.gn
index 4077486..fc76351 100644
--- a/common_audio/BUILD.gn
+++ b/common_audio/BUILD.gn
@@ -67,6 +67,7 @@
if (current_cpu == "x86" || current_cpu == "x64") {
deps += [ ":common_audio_sse2" ]
+ deps += [ ":common_audio_avx2" ]
}
}
@@ -235,6 +236,7 @@
]
if (current_cpu == "x86" || current_cpu == "x64") {
deps += [ ":common_audio_sse2" ]
+ deps += [ ":common_audio_avx2" ]
}
if (rtc_build_with_neon) {
deps += [ ":common_audio_neon" ]
@@ -261,6 +263,27 @@
"../rtc_base/memory:aligned_malloc",
]
}
+
+ rtc_library("common_audio_avx2") {
+ sources = [ "resampler/sinc_resampler_avx2.cc" ]
+
+ if (is_win) {
+ cflags = [ "/arch:AVX2" ]
+ } else {
+ cflags = [
+ "-mavx2",
+ "-mfma",
+ ]
+ }
+
+ deps = [
+ ":fir_filter",
+ ":sinc_resampler",
+ "../rtc_base:checks",
+ "../rtc_base:rtc_base_approved",
+ "../rtc_base/memory:aligned_malloc",
+ ]
+ }
}
if (rtc_build_with_neon) {
diff --git a/common_audio/resampler/sinc_resampler.cc b/common_audio/resampler/sinc_resampler.cc
index 21707e9..831ce53 100644
--- a/common_audio/resampler/sinc_resampler.cc
+++ b/common_audio/resampler/sinc_resampler.cc
@@ -122,28 +122,22 @@
const size_t SincResampler::kKernelSize;
// If we know the minimum architecture at compile time, avoid CPU detection.
-#if defined(WEBRTC_ARCH_X86_FAMILY)
-#if defined(__SSE2__)
-#define CONVOLVE_FUNC Convolve_SSE
-void SincResampler::InitializeCPUSpecificFeatures() {}
-#else
-// x86 CPU detection required. Function will be set by
-// InitializeCPUSpecificFeatures().
-// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed.
-#define CONVOLVE_FUNC convolve_proc_
-
void SincResampler::InitializeCPUSpecificFeatures() {
- convolve_proc_ = WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C;
-}
-#endif
-#elif defined(WEBRTC_HAS_NEON)
-#define CONVOLVE_FUNC Convolve_NEON
-void SincResampler::InitializeCPUSpecificFeatures() {}
+#if defined(WEBRTC_HAS_NEON)
+ convolve_proc_ = Convolve_NEON;
+#elif defined(WEBRTC_ARCH_X86_FAMILY)
+ // Using AVX2 instead of SSE2 when AVX2 supported.
+ if (WebRtc_GetCPUInfo(kAVX2))
+ convolve_proc_ = Convolve_AVX2;
+ else if (WebRtc_GetCPUInfo(kSSE2))
+ convolve_proc_ = Convolve_SSE;
+ else
+ convolve_proc_ = Convolve_C;
#else
-// Unknown architecture.
-#define CONVOLVE_FUNC Convolve_C
-void SincResampler::InitializeCPUSpecificFeatures() {}
+ // Unknown architecture.
+ convolve_proc_ = Convolve_C;
#endif
+}
SincResampler::SincResampler(double io_sample_rate_ratio,
size_t request_frames,
@@ -152,24 +146,20 @@
read_cb_(read_cb),
request_frames_(request_frames),
input_buffer_size_(request_frames_ + kKernelSize),
- // Create input buffers with a 16-byte alignment for SSE optimizations.
+ // Create input buffers with a 32-byte alignment for SIMD optimizations.
kernel_storage_(static_cast<float*>(
- AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
+ AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
kernel_pre_sinc_storage_(static_cast<float*>(
- AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
+ AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
kernel_window_storage_(static_cast<float*>(
- AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
+ AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
input_buffer_(static_cast<float*>(
- AlignedMalloc(sizeof(float) * input_buffer_size_, 16))),
-#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
+ AlignedMalloc(sizeof(float) * input_buffer_size_, 32))),
convolve_proc_(nullptr),
-#endif
r1_(input_buffer_.get()),
r2_(input_buffer_.get() + kKernelSize / 2) {
-#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
InitializeCPUSpecificFeatures();
RTC_DCHECK(convolve_proc_);
-#endif
RTC_DCHECK_GT(request_frames_, 0);
Flush();
RTC_DCHECK_GT(block_size_, kKernelSize);
@@ -302,10 +292,10 @@
const float* const k1 = kernel_ptr + offset_idx * kKernelSize;
const float* const k2 = k1 + kKernelSize;
- // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be
- // true so long as kKernelSize is a multiple of 16.
- RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k1) % 16);
- RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k2) % 16);
+ // Ensure |k1|, |k2| are 32-byte aligned for SIMD usage. Should always be
+ // true so long as kKernelSize is a multiple of 32.
+ RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k1) % 32);
+ RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k2) % 32);
// Initialize input pointer based on quantized |virtual_source_idx_|.
const float* const input_ptr = r1_ + source_idx;
@@ -314,7 +304,7 @@
const double kernel_interpolation_factor =
virtual_offset_idx - offset_idx;
*destination++ =
- CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor);
+ convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor);
// Advance the virtual index.
virtual_source_idx_ += current_io_ratio;
diff --git a/common_audio/resampler/sinc_resampler.h b/common_audio/resampler/sinc_resampler.h
index 5181c18..a72a0c6 100644
--- a/common_audio/resampler/sinc_resampler.h
+++ b/common_audio/resampler/sinc_resampler.h
@@ -112,6 +112,10 @@
const float* k1,
const float* k2,
double kernel_interpolation_factor);
+ static float Convolve_AVX2(const float* input_ptr,
+ const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor);
#elif defined(WEBRTC_HAS_NEON)
static float Convolve_NEON(const float* input_ptr,
const float* k1,
@@ -155,13 +159,11 @@
// TODO(ajm): Move to using a global static which must only be initialized
// once by the user. We're not doing this initially, because we don't have
// e.g. a LazyInstance helper in webrtc.
-#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
typedef float (*ConvolveProc)(const float*,
const float*,
const float*,
double);
ConvolveProc convolve_proc_;
-#endif
// Pointers to the various regions inside |input_buffer_|. See the diagram at
// the top of the .cc file for more information.
diff --git a/common_audio/resampler/sinc_resampler_avx2.cc b/common_audio/resampler/sinc_resampler_avx2.cc
new file mode 100644
index 0000000..3eb5d4a
--- /dev/null
+++ b/common_audio/resampler/sinc_resampler_avx2.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <xmmintrin.h>
+
+#include "common_audio/resampler/sinc_resampler.h"
+
+namespace webrtc {
+
+float SincResampler::Convolve_AVX2(const float* input_ptr,
+ const float* k1,
+ const float* k2,
+ double kernel_interpolation_factor) {
+ __m256 m_input;
+ __m256 m_sums1 = _mm256_setzero_ps();
+ __m256 m_sums2 = _mm256_setzero_ps();
+
+ // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
+ // these loops has not been tested or benchmarked.
+ bool aligned_input = (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0;
+ if (!aligned_input) {
+ for (size_t i = 0; i < kKernelSize; i += 8) {
+ m_input = _mm256_loadu_ps(input_ptr + i);
+ m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
+ m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
+ }
+ } else {
+ for (size_t i = 0; i < kKernelSize; i += 8) {
+ m_input = _mm256_load_ps(input_ptr + i);
+ m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
+ m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
+ }
+ }
+
+ // Linearly interpolate the two "convolutions".
+ __m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0),
+ _mm256_extractf128_ps(m_sums1, 1));
+ __m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0),
+ _mm256_extractf128_ps(m_sums2, 1));
+ m128_sums1 = _mm_mul_ps(
+ m128_sums1,
+ _mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
+ m128_sums2 = _mm_mul_ps(
+ m128_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
+ m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2);
+
+ // Sum components together.
+ float result;
+ m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1);
+ _mm_store_ss(&result, _mm_add_ss(m128_sums2,
+ _mm_shuffle_ps(m128_sums2, m128_sums2, 1)));
+
+ return result;
+}
+
+} // namespace webrtc
diff --git a/common_audio/resampler/sinc_resampler_unittest.cc b/common_audio/resampler/sinc_resampler_unittest.cc
index b067b23..ece6af0 100644
--- a/common_audio/resampler/sinc_resampler_unittest.cc
+++ b/common_audio/resampler/sinc_resampler_unittest.cc
@@ -116,17 +116,9 @@
printf("SetRatio() took %.2fms.\n", total_time_c_us / 1000);
}
-// Define platform independent function name for Convolve* tests.
-#if defined(WEBRTC_ARCH_X86_FAMILY)
-#define CONVOLVE_FUNC Convolve_SSE
-#elif defined(WEBRTC_ARCH_ARM_V7)
-#define CONVOLVE_FUNC Convolve_NEON
-#endif
-
// Ensure various optimized Convolve() methods return the same value. Only run
// this test if other optimized methods exist, otherwise the default Convolve()
// will be tested by the parameterized SincResampler tests below.
-#if defined(CONVOLVE_FUNC)
TEST(SincResamplerTest, Convolve) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2));
@@ -148,7 +140,7 @@
double result = resampler.Convolve_C(
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
- double result2 = resampler.CONVOLVE_FUNC(
+ double result2 = resampler.convolve_proc_(
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
EXPECT_NEAR(result2, result, kEpsilon);
@@ -157,12 +149,11 @@
result = resampler.Convolve_C(
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
- result2 = resampler.CONVOLVE_FUNC(
+ result2 = resampler.convolve_proc_(
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
EXPECT_NEAR(result2, result, kEpsilon);
}
-#endif
// Benchmark for the various Convolve() methods. Make sure to build with
// branding=Chrome so that RTC_DCHECKs are compiled out when benchmarking.
@@ -190,7 +181,6 @@
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
printf("Convolve_C took %.2fms.\n", total_time_c_us / 1000);
-#if defined(CONVOLVE_FUNC)
#if defined(WEBRTC_ARCH_X86_FAMILY)
ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2));
#elif defined(WEBRTC_ARCH_ARM_V7)
@@ -200,36 +190,33 @@
// Benchmark with unaligned input pointer.
start = rtc::TimeNanos();
for (int j = 0; j < kConvolveIterations; ++j) {
- resampler.CONVOLVE_FUNC(
+ resampler.convolve_proc_(
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
}
double total_time_optimized_unaligned_us =
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
- printf(STRINGIZE(CONVOLVE_FUNC) "(unaligned) took %.2fms; which is %.2fx "
+ printf(STRINGIZE(convolve_proc_) "(unaligned) took %.2fms; which is %.2fx "
"faster than Convolve_C.\n", total_time_optimized_unaligned_us / 1000,
total_time_c_us / total_time_optimized_unaligned_us);
// Benchmark with aligned input pointer.
start = rtc::TimeNanos();
for (int j = 0; j < kConvolveIterations; ++j) {
- resampler.CONVOLVE_FUNC(
+ resampler.convolve_proc_(
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
}
double total_time_optimized_aligned_us =
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
- printf(STRINGIZE(CONVOLVE_FUNC) " (aligned) took %.2fms; which is %.2fx "
+ printf(STRINGIZE(convolve_proc_) " (aligned) took %.2fms; which is %.2fx "
"faster than Convolve_C and %.2fx faster than "
- STRINGIZE(CONVOLVE_FUNC) " (unaligned).\n",
+ STRINGIZE(convolve_proc_) " (unaligned).\n",
total_time_optimized_aligned_us / 1000,
total_time_c_us / total_time_optimized_aligned_us,
total_time_optimized_unaligned_us / total_time_optimized_aligned_us);
-#endif
}
-#undef CONVOLVE_FUNC
-
typedef std::tuple<int, int, double, double> SincResamplerTestData;
class SincResamplerTest
: public ::testing::TestWithParam<SincResamplerTestData> {
@@ -352,7 +339,7 @@
std::make_tuple(16000, 44100, kResamplingRMSError, -62.54),
std::make_tuple(22050, 44100, kResamplingRMSError, -73.53),
std::make_tuple(32000, 44100, kResamplingRMSError, -63.32),
- std::make_tuple(44100, 44100, kResamplingRMSError, -73.53),
+ std::make_tuple(44100, 44100, kResamplingRMSError, -73.52),
std::make_tuple(48000, 44100, -15.01, -64.04),
std::make_tuple(96000, 44100, -18.49, -25.51),
std::make_tuple(192000, 44100, -20.50, -13.31),
@@ -360,7 +347,7 @@
// To 48kHz
std::make_tuple(8000, 48000, kResamplingRMSError, -63.43),
std::make_tuple(11025, 48000, kResamplingRMSError, -62.61),
- std::make_tuple(16000, 48000, kResamplingRMSError, -63.96),
+ std::make_tuple(16000, 48000, kResamplingRMSError, -63.95),
std::make_tuple(22050, 48000, kResamplingRMSError, -62.42),
std::make_tuple(32000, 48000, kResamplingRMSError, -64.04),
std::make_tuple(44100, 48000, kResamplingRMSError, -62.63),
diff --git a/system_wrappers/include/cpu_features_wrapper.h b/system_wrappers/include/cpu_features_wrapper.h
index 739161a..02d54b4 100644
--- a/system_wrappers/include/cpu_features_wrapper.h
+++ b/system_wrappers/include/cpu_features_wrapper.h
@@ -18,7 +18,7 @@
#endif
// List of features in x86.
-typedef enum { kSSE2, kSSE3 } CPUFeature;
+typedef enum { kSSE2, kSSE3, kAVX2 } CPUFeature;
// List of features in ARM.
enum {
diff --git a/system_wrappers/source/cpu_features.cc b/system_wrappers/source/cpu_features.cc
index ebcb48c..40110ed 100644
--- a/system_wrappers/source/cpu_features.cc
+++ b/system_wrappers/source/cpu_features.cc
@@ -24,6 +24,22 @@
}
#if defined(WEBRTC_ARCH_X86_FAMILY)
+
+#if defined(WEBRTC_ENABLE_AVX2)
+// xgetbv returns the value of an Intel Extended Control Register (XCR).
+// Currently only XCR0 is defined by Intel so |xcr| should always be zero.
+static uint64_t xgetbv(uint32_t xcr) {
+#if defined(_MSC_VER)
+ return _xgetbv(xcr);
+#else
+ uint32_t eax, edx;
+
+ __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr));
+ return (static_cast<uint64_t>(edx) << 32) | eax;
+#endif // _MSC_VER
+}
+#endif // WEBRTC_ENABLE_AVX2
+
#ifndef _MSC_VER
// Intrinsic for "cpuid".
#if defined(__pic__) && defined(__i386__)
@@ -41,7 +57,7 @@
__asm__ volatile("cpuid\n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
"=d"(cpu_info[3])
- : "a"(info_type));
+ : "a"(info_type), "c"(0));
}
#endif
#endif // _MSC_VER
@@ -58,6 +74,30 @@
if (feature == kSSE3) {
return 0 != (cpu_info[2] & 0x00000001);
}
+#if defined(WEBRTC_ENABLE_AVX2)
+ if (feature == kAVX2) {
+ int cpu_info7[4];
+ __cpuid(cpu_info7, 0);
+ int num_ids = cpu_info7[0];
+ if (num_ids < 7) {
+ return 0;
+ }
+ // Interpret CPU feature information.
+ __cpuid(cpu_info7, 7);
+
+ // AVX instructions can be used when
+ // a) AVX are supported by the CPU,
+ // b) XSAVE is supported by the CPU,
+ // c) XSAVE is enabled by the kernel.
+ // See http://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
+ // AVX2 support needs (avx_support && (cpu_info7[1] & 0x00000020) != 0;).
+ return (cpu_info[2] & 0x10000000) != 0 &&
+ (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ &&
+ (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ &&
+ (xgetbv(0) & 0x00000006) == 6 /* XSAVE enabled by kernel */ &&
+ (cpu_info7[1] & 0x00000020) != 0;
+ }
+#endif // WEBRTC_ENABLE_AVX2
return 0;
}
#else
diff --git a/webrtc.gni b/webrtc.gni
index b3f9a714..17a66f9 100644
--- a/webrtc.gni
+++ b/webrtc.gni
@@ -242,6 +242,10 @@
# standalone WebRTC.
rtc_include_internal_audio_device = !build_with_chromium
+ # Set this to true to enable the avx2 support in webrtc.
+ # TODO(bugs.webrtc.org/11663): Default this to true and eventually remove.
+ rtc_enable_avx2 = false
+
# Include tests in standalone checkout.
rtc_include_tests = !build_with_chromium && !build_with_mozilla