APM: add HW-only denormal disabler

Denormal numbers (see [1]) may originate in APM when the input is zeroed
after a non-zero signal. In extreme cases, instructions involving
denormal operands may run as much as 100 times slower, which seems to
be the case (to some extent) in crbug.com/1227566.

This CL adds a class that disables denormals only via hardware on x86
and on ARM. The class is used in APM and it is an adaptation of [2].

Tested: appr.tc call on Chromium (Win, Mac)

[1] https://en.wikipedia.org/wiki/Denormal_number
[2] https://source.chromium.org/chromium/chromium/src/+/main:third_party/blink/renderer/platform/audio/denormal_disabler.h

Fixed: chromium:1227566
Change-Id: I0ed2eab55dc597529f09f93c26c7a01de051fdbe
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/227768
Reviewed-by: Magnus Flodman <mflodman@webrtc.org>
Reviewed-by: Per Åhgren <peah@webrtc.org>
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#34701}
diff --git a/modules/audio_processing/BUILD.gn b/modules/audio_processing/BUILD.gn
index a733612..506c821 100644
--- a/modules/audio_processing/BUILD.gn
+++ b/modules/audio_processing/BUILD.gn
@@ -184,6 +184,7 @@
     "../../rtc_base/synchronization:mutex",
     "../../rtc_base/system:rtc_export",
     "../../system_wrappers",
+    "../../system_wrappers:denormal_disabler",
     "../../system_wrappers:field_trial",
     "../../system_wrappers:metrics",
     "aec3",
@@ -373,6 +374,7 @@
         "../../rtc_base/system:arch",
         "../../rtc_base/system:file_wrapper",
         "../../system_wrappers",
+        "../../system_wrappers:denormal_disabler",
         "../../test:fileutils",
         "../../test:rtc_expect_death",
         "../../test:test_support",
diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc
index 5acf693..7facd25 100644
--- a/modules/audio_processing/audio_processing_impl.cc
+++ b/modules/audio_processing/audio_processing_impl.cc
@@ -35,6 +35,7 @@
 #include "rtc_base/ref_counted_object.h"
 #include "rtc_base/time_utils.h"
 #include "rtc_base/trace_event.h"
+#include "system_wrappers/include/denormal_disabler.h"
 #include "system_wrappers/include/field_trial.h"
 #include "system_wrappers/include/metrics.h"
 
@@ -254,6 +255,8 @@
           new ApmDataDumper(rtc::AtomicOps::Increment(&instance_count_))),
       use_setup_specific_default_aec3_config_(
           UseSetupSpecificDefaultAec3Congfig()),
+      use_denormal_disabler_(
+          !field_trial::IsEnabled("WebRTC-ApmDenormalDisablerKillSwitch")),
       capture_runtime_settings_(RuntimeSettingQueueSize()),
       render_runtime_settings_(RuntimeSettingQueueSize()),
       capture_runtime_settings_enqueuer_(&capture_runtime_settings_),
@@ -284,6 +287,9 @@
                    << !!submodules_.capture_post_processor
                    << "\nRender pre processor: "
                    << !!submodules_.render_pre_processor;
+  RTC_LOG(LS_INFO) << "Denormal disabler: "
+                   << (DenormalDisabler::IsSupported() ? "supported"
+                                                       : "unsupported");
 
   // Mark Echo Controller enabled if a factory is injected.
   capture_nonlocked_.echo_controller_enabled =
@@ -791,6 +797,7 @@
   RETURN_ON_ERR(MaybeInitializeCapture(input_config, output_config));
 
   MutexLock lock_capture(&mutex_capture_);
+  DenormalDisabler denormal_disabler(use_denormal_disabler_);
 
   if (aec_dump_) {
     RecordUnprocessedCaptureStream(src);
@@ -1080,6 +1087,7 @@
   RETURN_ON_ERR(MaybeInitializeCapture(input_config, output_config));
 
   MutexLock lock_capture(&mutex_capture_);
+  DenormalDisabler denormal_disabler(use_denormal_disabler_);
 
   if (aec_dump_) {
     RecordUnprocessedCaptureStream(src, input_config);
@@ -1109,6 +1117,7 @@
 int AudioProcessingImpl::ProcessCaptureStreamLocked() {
   EmptyQueuedRenderAudioLocked();
   HandleCaptureRuntimeSettings();
+  DenormalDisabler denormal_disabler(use_denormal_disabler_);
 
   // Ensure that not both the AEC and AECM are active at the same time.
   // TODO(peah): Simplify once the public API Enable functions for these
@@ -1436,6 +1445,8 @@
                                               float* const* dest) {
   TRACE_EVENT0("webrtc", "AudioProcessing::ProcessReverseStream_StreamConfig");
   MutexLock lock(&mutex_render_);
+  DenormalDisabler denormal_disabler(use_denormal_disabler_);
+
   RETURN_ON_ERR(AnalyzeReverseStreamLocked(src, input_config, output_config));
   if (submodule_states_.RenderMultiBandProcessingActive() ||
       submodule_states_.RenderFullBandProcessingActive()) {
@@ -1473,6 +1484,8 @@
   RTC_DCHECK_EQ(input_config.num_frames(),
                 formats_.api_format.reverse_input_stream().num_frames());
 
+  DenormalDisabler denormal_disabler(use_denormal_disabler_);
+
   if (aec_dump_) {
     const size_t channel_size =
         formats_.api_format.reverse_input_stream().num_frames();
@@ -1497,6 +1510,8 @@
   }
 
   MutexLock lock(&mutex_render_);
+  DenormalDisabler denormal_disabler(use_denormal_disabler_);
+
   ProcessingConfig processing_config = formats_.api_format;
   processing_config.reverse_input_stream().set_sample_rate_hz(
       input_config.sample_rate_hz());
@@ -1531,6 +1546,7 @@
   AudioBuffer* render_buffer = render_.render_audio.get();  // For brevity.
 
   HandleRenderRuntimeSettings();
+  DenormalDisabler denormal_disabler(use_denormal_disabler_);
 
   if (submodules_.render_pre_processor) {
     submodules_.render_pre_processor->Process(render_buffer);
diff --git a/modules/audio_processing/audio_processing_impl.h b/modules/audio_processing/audio_processing_impl.h
index 2c22536..27abbd4 100644
--- a/modules/audio_processing/audio_processing_impl.h
+++ b/modules/audio_processing/audio_processing_impl.h
@@ -187,6 +187,8 @@
   static int instance_count_;
   const bool use_setup_specific_default_aec3_config_;
 
+  const bool use_denormal_disabler_;
+
   SwapQueue<RuntimeSetting> capture_runtime_settings_;
   SwapQueue<RuntimeSetting> render_runtime_settings_;
 
diff --git a/system_wrappers/BUILD.gn b/system_wrappers/BUILD.gn
index 80088e0..fc956a5 100644
--- a/system_wrappers/BUILD.gn
+++ b/system_wrappers/BUILD.gn
@@ -108,11 +108,25 @@
   ]
 }
 
+rtc_library("denormal_disabler") {
+  visibility = [ "*" ]
+  public = [ "include/denormal_disabler.h" ]
+  sources = [ "source/denormal_disabler.cc" ]
+  deps = [
+    "../rtc_base:checks",
+    "../rtc_base/system:arch",
+  ]
+  if (is_clang) {  # Presumably needed as the members are unused if unsupported.
+    cflags_cc = [ "-Wno-unused-private-field" ]
+  }
+}
+
 if (rtc_include_tests && !build_with_chromium) {
   rtc_test("system_wrappers_unittests") {
     testonly = true
     sources = [
       "source/clock_unittest.cc",
+      "source/denormal_disabler_unittest.cc",
       "source/field_trial_unittest.cc",
       "source/metrics_default_unittest.cc",
       "source/metrics_unittest.cc",
@@ -121,6 +135,7 @@
     ]
 
     deps = [
+      ":denormal_disabler",
       ":field_trial",
       ":metrics",
       ":system_wrappers",
diff --git a/system_wrappers/include/denormal_disabler.h b/system_wrappers/include/denormal_disabler.h
new file mode 100644
index 0000000..ca1e19e
--- /dev/null
+++ b/system_wrappers/include/denormal_disabler.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef SYSTEM_WRAPPERS_INCLUDE_DENORMAL_DISABLER_H_
+#define SYSTEM_WRAPPERS_INCLUDE_DENORMAL_DISABLER_H_
+
+#include "rtc_base/system/arch.h"
+
+namespace webrtc {
+
+// Activates the hardware (HW) way to flush denormals (see [1]) to zero as they
+// can very seriously impact performance. At destruction time restores the
+// denormals handling state read by the ctor; hence, supports nested calls.
+// It is a no-op unless the target architecture is ARM, or x86 when the
+// compiler is clang.
+// [1] https://en.wikipedia.org/wiki/Denormal_number
+//
+// Example usage:
+//
+// void Foo() {
+//   DenormalDisabler d(/*enabled=*/true);
+//   ...
+// }
+class DenormalDisabler {
+ public:
+  // Ctor. If `enabled` is true, the platform is supported and denormals are
+  // currently enabled: stores the HW settings for denormals, disables them and
+  // sets `disabling_activated_` to true. Otherwise leaves the HW settings
+  // untouched and sets `disabling_activated_` to false.
+  explicit DenormalDisabler(bool enabled);
+  DenormalDisabler(const DenormalDisabler&) = delete;
+  DenormalDisabler& operator=(const DenormalDisabler&) = delete;
+  // Dtor. If `disabling_activated_` is true, restores the denormals HW settings
+  // read by the ctor before denormals were disabled. Otherwise it's a no-op.
+  ~DenormalDisabler();
+
+  // Returns true if architecture and compiler are supported.
+  static bool IsSupported();
+
+ private:
+  const int status_word_;  // Must precede `disabling_activated_` (init order).
+  const bool disabling_activated_;  // True iff the ctor disabled denormals.
+};
+
+}  // namespace webrtc
+
+#endif  // SYSTEM_WRAPPERS_INCLUDE_DENORMAL_DISABLER_H_
diff --git a/system_wrappers/source/denormal_disabler.cc b/system_wrappers/source/denormal_disabler.cc
new file mode 100644
index 0000000..fe8ec1a
--- /dev/null
+++ b/system_wrappers/source/denormal_disabler.cc
@@ -0,0 +1,107 @@
+/*
+ *  Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "system_wrappers/include/denormal_disabler.h"
+
+#include "rtc_base/checks.h"
+
+namespace webrtc {
+namespace {
+
+#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(__clang__)
+#define WEBRTC_DENORMAL_DISABLER_X86_SUPPORTED
+#endif  // x86 is supported only when compiling with clang.
+
+#if defined(WEBRTC_DENORMAL_DISABLER_X86_SUPPORTED) || \
+    defined(WEBRTC_ARCH_ARM_FAMILY)
+#define WEBRTC_DENORMAL_DISABLER_SUPPORTED
+#endif  // Supported: x86 with clang, or ARM (no compiler check is made).
+
+constexpr int kUnspecifiedStatusWord = -1;  // Sentinel: register was not read.
+
+#if defined(WEBRTC_DENORMAL_DISABLER_SUPPORTED)
+
+// Control register bit mask to disable denormals on the hardware.
+#if defined(WEBRTC_DENORMAL_DISABLER_X86_SUPPORTED)
+// On x86 two bits are used: flush-to-zero (FTZ) and denormals-are-zero (DAZ).
+constexpr int kDenormalBitMask = 0x8040;  // Bits 15 and 6 of MXCSR.
+#elif defined(WEBRTC_ARCH_ARM_FAMILY)
+// On ARM one bit is used: flush-to-zero (FTZ).
+constexpr int kDenormalBitMask = 1 << 24;  // Bit 24 of FPSCR / FPCR.
+#endif
+
+// Reads the CPU control register (MXCSR on x86, FPSCR on 32-bit ARM, FPCR on
+// 64-bit ARM) and returns it; `kUnspecifiedStatusWord` if none applies.
+int ReadStatusWord() {
+  int result = kUnspecifiedStatusWord;
+#if defined(WEBRTC_DENORMAL_DISABLER_X86_SUPPORTED)
+  asm volatile("stmxcsr %0" : "=m"(result));  // Stores MXCSR into `result`.
+#elif defined(WEBRTC_ARCH_ARM_FAMILY) && defined(WEBRTC_ARCH_32_BITS)
+  asm volatile("vmrs %[result], FPSCR" : [result] "=r"(result));
+#elif defined(WEBRTC_ARCH_ARM_FAMILY) && defined(WEBRTC_ARCH_64_BITS)
+  asm volatile("mrs %x[result], FPCR" : [result] "=r"(result));
+#endif
+  return result;
+}
+
+// Writes `status_word` to the same CPU control register that is read by
+// `ReadStatusWord()` (MXCSR, FPSCR or FPCR); a no-op if none applies.
+void SetStatusWord(int status_word) {
+#if defined(WEBRTC_DENORMAL_DISABLER_X86_SUPPORTED)
+  asm volatile("ldmxcsr %0" : : "m"(status_word));  // Loads MXCSR from memory.
+#elif defined(WEBRTC_ARCH_ARM_FAMILY) && defined(WEBRTC_ARCH_32_BITS)
+  asm volatile("vmsr FPSCR, %[src]" : : [src] "r"(status_word));
+#elif defined(WEBRTC_ARCH_ARM_FAMILY) && defined(WEBRTC_ARCH_64_BITS)
+  asm volatile("msr FPCR, %x[src]" : : [src] "r"(status_word));
+#endif
+}
+
+// True if denormals are enabled, i.e., some `kDenormalBitMask` bit is unset.
+constexpr bool DenormalsEnabled(int status_word) {
+  return (status_word & kDenormalBitMask) != kDenormalBitMask;
+}
+
+#endif  // defined(WEBRTC_DENORMAL_DISABLER_SUPPORTED)
+
+}  // namespace
+
+#if defined(WEBRTC_DENORMAL_DISABLER_SUPPORTED)
+DenormalDisabler::DenormalDisabler(bool enabled)
+    : status_word_(enabled ? ReadStatusWord() : kUnspecifiedStatusWord),
+      disabling_activated_(enabled && DenormalsEnabled(status_word_)) {
+  if (disabling_activated_) {  // Skip the write if already disabled.
+    RTC_DCHECK_NE(status_word_, kUnspecifiedStatusWord);
+    SetStatusWord(status_word_ | kDenormalBitMask);  // Set the mask bits.
+    RTC_DCHECK(!DenormalsEnabled(ReadStatusWord()));  // Verify the write.
+  }
+}
+
+bool DenormalDisabler::IsSupported() {
+  return true;  // Compiled only if WEBRTC_DENORMAL_DISABLER_SUPPORTED is set.
+}
+
+DenormalDisabler::~DenormalDisabler() {
+  if (disabling_activated_) {  // Only undo what the ctor actually changed.
+    RTC_DCHECK_NE(status_word_, kUnspecifiedStatusWord);
+    SetStatusWord(status_word_);  // Restore the value read by the ctor.
+  }
+}
+#else
+DenormalDisabler::DenormalDisabler(bool enabled)  // `enabled` is ignored.
+    : status_word_(kUnspecifiedStatusWord), disabling_activated_(false) {}
+
+bool DenormalDisabler::IsSupported() {
+  return false;  // Fallback build: unsupported architecture or compiler.
+}
+
+DenormalDisabler::~DenormalDisabler() = default;  // Nothing to restore.
+#endif
+
+}  // namespace webrtc
diff --git a/system_wrappers/source/denormal_disabler_unittest.cc b/system_wrappers/source/denormal_disabler_unittest.cc
new file mode 100644
index 0000000..32516db
--- /dev/null
+++ b/system_wrappers/source/denormal_disabler_unittest.cc
@@ -0,0 +1,146 @@
+/*
+ *  Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "system_wrappers/include/denormal_disabler.h"
+
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#include "rtc_base/checks.h"
+#include "test/gtest.h"
+
+namespace webrtc {
+namespace {
+
+constexpr float kSmallest = std::numeric_limits<float>::min();  // Min normal.
+
+// Divisors greater than 1 such that `kSmallest` divided by any of them is
+// either a denormal or zero, depending on whether denormals are enabled.
+constexpr float kDenormalDivisors[] = {123.125f, 97.0f, 32.0f, 5.0f, 1.5f};
+
+// Returns true if `dividend` / `divisor` is non-zero and smaller in magnitude
+// than `kSmallest`. `dividend` and `divisor` must not be denormals.
+bool DivisionIsDenormal(float dividend, float divisor) {
+  RTC_DCHECK_GE(std::fabsf(dividend), kSmallest);
+  RTC_DCHECK_GE(std::fabsf(divisor), kSmallest);
+  volatile float division = dividend / divisor;  // Presumably avoids folding.
+  return division != 0.0f && std::fabsf(division) < kSmallest;
+}
+
+}  // namespace
+
+class DenormalDisablerParametrization : public ::testing::TestWithParam<bool> {
+};  // The bool parameter is passed as the `enabled` ctor argument.
+
+// Checks that +inf and -inf are not zeroed, whether or not the disabler is
+// enabled and whether or not the platform is supported.
+TEST_P(DenormalDisablerParametrization, InfNotZeroed) {
+  DenormalDisabler denormal_disabler(/*enabled=*/GetParam());
+  constexpr float kMax = std::numeric_limits<float>::max();
+  for (float x : {-2.0f, 2.0f}) {
+    SCOPED_TRACE(x);
+    volatile float multiplication = kMax * x;  // Overflows to -inf / +inf.
+    EXPECT_TRUE(std::isinf(multiplication));
+  }
+}
+
+// Checks that a NaN is not zeroed, whether or not the disabler is enabled and
+// whether or not the platform is supported.
+TEST_P(DenormalDisablerParametrization, NanNotZeroed) {
+  DenormalDisabler denormal_disabler(/*enabled=*/GetParam());
+  volatile float kNan = std::sqrt(-1.0f);  // Square root of a negative: NaN.
+  EXPECT_TRUE(std::isnan(kNan));
+}
+
+INSTANTIATE_TEST_SUITE_P(DenormalDisabler,
+                         DenormalDisablerParametrization,
+                         ::testing::Values(false, true),  // `enabled` values.
+                         [](const ::testing::TestParamInfo<bool>& info) {
+                           return info.param ? "enabled" : "disabled";
+                         });
+
+// Checks that a `DenormalDisabler` constructed with `enabled` set to false
+// does not zero denormals on a supported platform.
+TEST(DenormalDisabler, DoNotZeroDenormalsIfDisabled) {
+  if (!DenormalDisabler::IsSupported()) {
+    GTEST_SKIP() << "Unsupported platform.";
+  }
+  ASSERT_TRUE(DivisionIsDenormal(kSmallest, kDenormalDivisors[0]))
+      << "Precondition not met: denormals must be enabled.";
+  DenormalDisabler denormal_disabler(/*enabled=*/false);  // Must be a no-op.
+  for (float x : kDenormalDivisors) {
+    SCOPED_TRACE(x);
+    EXPECT_TRUE(DivisionIsDenormal(-kSmallest, x));
+    EXPECT_TRUE(DivisionIsDenormal(kSmallest, x));
+  }
+}
+
+// Checks that denormals are zeroed while an enabled `DenormalDisabler` is in
+// scope on a supported platform.
+TEST(DenormalDisabler, ZeroDenormals) {
+  if (!DenormalDisabler::IsSupported()) {
+    GTEST_SKIP() << "Unsupported platform.";
+  }
+  DenormalDisabler denormal_disabler(/*enabled=*/true);
+  for (float x : kDenormalDivisors) {
+    SCOPED_TRACE(x);
+    EXPECT_FALSE(DivisionIsDenormal(-kSmallest, x));  // Negative dividend.
+    EXPECT_FALSE(DivisionIsDenormal(kSmallest, x));  // Positive dividend.
+  }
+}
+
+// Checks that the `DenormalDisabler` dtor re-enables denormals if they were
+// enabled before construction and the platform is supported.
+TEST(DenormalDisabler, RestoreDenormalsEnabled) {
+  if (!DenormalDisabler::IsSupported()) {
+    GTEST_SKIP() << "Unsupported platform.";
+  }
+  ASSERT_TRUE(DivisionIsDenormal(kSmallest, kDenormalDivisors[0]))
+      << "Precondition not met: denormals must be enabled.";
+  {
+    DenormalDisabler denormal_disabler(/*enabled=*/true);
+    ASSERT_FALSE(DivisionIsDenormal(kSmallest, kDenormalDivisors[0]));
+  }  // The dtor runs here and must restore the previous state.
+  EXPECT_TRUE(DivisionIsDenormal(kSmallest, kDenormalDivisors[0]));
+}
+
+// Checks that, on a supported platform, denormals stay disabled after the dtor
+// of a nested `DenormalDisabler` runs while an outer instance is still alive -
+// i.e., nested usage is supported.
+TEST(DenormalDisabler, ZeroDenormalsNested) {
+  if (!DenormalDisabler::IsSupported()) {
+    GTEST_SKIP() << "Unsupported platform.";
+  }
+  DenormalDisabler d1(/*enabled=*/true);
+  ASSERT_FALSE(DivisionIsDenormal(kSmallest, kDenormalDivisors[0]));
+  {
+    DenormalDisabler d2(/*enabled=*/true);
+    ASSERT_FALSE(DivisionIsDenormal(kSmallest, kDenormalDivisors[0]));
+  }  // `d2` is destroyed here; `d1` is still in scope.
+  EXPECT_FALSE(DivisionIsDenormal(kSmallest, kDenormalDivisors[0]));
+}
+
+// Checks that an enabled `DenormalDisabler` does not zero denormals if
+// architecture and compiler are not supported.
+TEST(DenormalDisabler, DoNotZeroDenormalsIfUnsupported) {
+  if (DenormalDisabler::IsSupported()) {
+    // TODO(bugs.webrtc.org/13057): Use GTEST_SKIP() once fixed.
+    return;
+  }
+  DenormalDisabler denormal_disabler(/*enabled=*/true);  // Expected no-op.
+  for (float x : kDenormalDivisors) {
+    SCOPED_TRACE(x);
+    EXPECT_TRUE(DivisionIsDenormal(-kSmallest, x));
+    EXPECT_TRUE(DivisionIsDenormal(kSmallest, x));
+  }
+}
+
+}  // namespace webrtc