Echo canceller 3 improvements for setups with headsets.

This CL improves the echo cancellation performance on setups where
headsets are used (systems with such low echo path gain
that no correlation between the render and capture signals
can be found) in 4 ways:
1) The echo path gain for systems with headsets is assumed to be
nonzero.
2) The stationary component of the render power is not included
in the nonlinear echo power estimate.
3) The behavior after echo path gain changes is made less cautious.
4) The detection of systems with headsets is made more rapid.

BUG=chromium:712651, webrtc:6018

Review-Url: https://codereview.webrtc.org/2823903003
Cr-Commit-Position: refs/heads/master@{#17768}
diff --git a/webrtc/modules/audio_processing/aec3/aec_state.cc b/webrtc/modules/audio_processing/aec3/aec_state.cc
index 01c3c44..de4a0c1 100644
--- a/webrtc/modules/audio_processing/aec3/aec_state.cc
+++ b/webrtc/modules/audio_processing/aec3/aec_state.cc
@@ -22,7 +22,7 @@
 namespace webrtc {
 namespace {
 
-constexpr size_t kEchoPathChangeConvergenceBlocks = 4 * kNumBlocksPerSecond;
+constexpr size_t kEchoPathChangeConvergenceBlocks = 2 * kNumBlocksPerSecond;
 constexpr size_t kSaturationLeakageBlocks = 20;
 
 // Computes delay of the adaptive filter.
@@ -89,7 +89,6 @@
     const EchoPathVariability& echo_path_variability) {
   if (echo_path_variability.AudioPathChanged()) {
     blocks_since_last_saturation_ = 0;
-    active_render_blocks_ = 0;
     usable_linear_estimate_ = false;
     echo_leakage_detected_ = false;
     capture_signal_saturation_ = false;
@@ -98,6 +97,8 @@
 
     if (echo_path_variability.delay_change) {
       force_zero_gain_counter_ = 0;
+      blocks_with_filter_adaptation_ = 0;
+      render_received_ = false;
       force_zero_gain_ = true;
       echo_path_change_counter_ = kEchoPathChangeCounterMax;
     }
@@ -121,7 +122,11 @@
   // Update counters.
   const float x_energy = std::inner_product(x.begin(), x.end(), x.begin(), 0.f);
   const bool active_render_block = x_energy > 10000.f * kFftLengthBy2;
-  active_render_blocks_ += active_render_block ? 1 : 0;
+  if (active_render_block) {
+    render_received_ = true;
+  }
+  blocks_with_filter_adaptation_ +=
+      (active_render_block && (!SaturatedCapture()) ? 1 : 0);
   --echo_path_change_counter_;
 
   // Force zero echo suppression gain after an echo path change to allow at
@@ -145,6 +150,8 @@
   }
 
   // Detect and flag echo saturation.
+  // TODO(peah): Add the delay in this computation to ensure that the render and
+  // capture signals are properly aligned.
   RTC_DCHECK_LT(0, x.size());
   const float max_sample = fabs(*std::max_element(
       x.begin(), x.end(), [](float a, float b) { return a * a < b * b; }));
@@ -160,14 +167,17 @@
   // Flag whether the linear filter estimate is usable.
   usable_linear_estimate_ =
       (!echo_saturation_) &&
-      active_render_blocks_ > kEchoPathChangeConvergenceBlocks &&
+      (!render_received_ ||
+       blocks_with_filter_adaptation_ > kEchoPathChangeConvergenceBlocks) &&
       filter_delay_ && echo_path_change_counter_ <= 0;
 
   // After an amount of active render samples for which an echo should have been
   // detected in the capture signal if the ERL was not infinite, flag that a
   // headset is used.
-  headset_detected_ = !external_delay_ && !filter_delay_ &&
-                      active_render_blocks_ >= kEchoPathChangeConvergenceBlocks;
+  headset_detected_ =
+      !external_delay_ && !filter_delay_ &&
+      (!render_received_ ||
+       blocks_with_filter_adaptation_ >= kEchoPathChangeConvergenceBlocks);
 }
 
 }  // namespace webrtc
diff --git a/webrtc/modules/audio_processing/aec3/aec_state.h b/webrtc/modules/audio_processing/aec3/aec_state.h
index 387c6ea..519665f 100644
--- a/webrtc/modules/audio_processing/aec3/aec_state.h
+++ b/webrtc/modules/audio_processing/aec3/aec_state.h
@@ -41,7 +41,8 @@
   bool EchoLeakageDetected() const { return echo_leakage_detected_; }
 
   // Returns whether the render signal is currently active.
-  bool ActiveRender() const { return active_render_blocks_ > 200; }
+  // TODO(peah): Deprecate this in an upcoming CL.
+  bool ActiveRender() const { return blocks_with_filter_adaptation_ > 200; }
 
   // Returns the ERLE.
   const std::array<float, kFftLengthBy2Plus1>& Erle() const {
@@ -99,7 +100,7 @@
   ErlEstimator erl_estimator_;
   ErleEstimator erle_estimator_;
   int echo_path_change_counter_;
-  size_t active_render_blocks_ = 0;
+  size_t blocks_with_filter_adaptation_ = 0;
   bool usable_linear_estimate_ = false;
   bool echo_leakage_detected_ = false;
   bool capture_signal_saturation_ = false;
@@ -107,6 +108,7 @@
   bool headset_detected_ = false;
   float previous_max_sample_ = 0.f;
   bool force_zero_gain_ = false;
+  bool render_received_ = false;
   size_t force_zero_gain_counter_ = 0;
   rtc::Optional<size_t> filter_delay_;
   rtc::Optional<size_t> external_delay_;
diff --git a/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc b/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc
index 0a9ecac..d36720f 100644
--- a/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc
+++ b/webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc
@@ -40,6 +40,43 @@
   });
 }
 
+constexpr int kNoiseFloorCounterMax = 50;
+constexpr float kNoiseFloorMin = 10.f * 10.f * 128.f * 128.f;
+
+// Updates the per-bin estimate of the stationary noise power in the render
+// signal. Both output arrays are updated in place and must not be null.
+void RenderNoisePower(
+    const RenderBuffer& render_buffer,
+    std::array<float, kFftLengthBy2Plus1>* X2_noise_floor,
+    std::array<int, kFftLengthBy2Plus1>* X2_noise_floor_counter) {
+  RTC_DCHECK(X2_noise_floor);
+  RTC_DCHECK(X2_noise_floor_counter);
+
+  const auto& render_power = render_buffer.Spectrum(0);  // Bind, do not copy.
+  RTC_DCHECK_EQ(X2_noise_floor->size(), render_power.size());
+  RTC_DCHECK_EQ(X2_noise_floor_counter->size(), render_power.size());
+
+  // Estimate the stationary noise power in a minimum statistics manner.
+  for (size_t k = 0; k < render_power.size(); ++k) {
+    // Follow any decrease in render power at once and restart the hold.
+    if (render_power[k] < (*X2_noise_floor)[k]) {
+      (*X2_noise_floor)[k] = render_power[k];
+      (*X2_noise_floor_counter)[k] = 0;
+    } else {
+      // Hold for kNoiseFloorCounterMax calls, then grow by 10% per call.
+      if ((*X2_noise_floor_counter)[k] >= kNoiseFloorCounterMax) {
+        (*X2_noise_floor)[k] =
+            std::max((*X2_noise_floor)[k] * 1.1f, kNoiseFloorMin);
+      } else {
+        ++(*X2_noise_floor_counter)[k];
+      }
+    }
+  }
+}
+
+// Assume a minimum echo path gain of -33 dB for headsets.
+constexpr float kHeadsetEchoPathGain = 0.0005f;
+
 }  // namespace
 
 ResidualEchoEstimator::ResidualEchoEstimator() {
@@ -57,28 +94,19 @@
     std::array<float, kFftLengthBy2Plus1>* R2) {
   RTC_DCHECK(R2);
 
-  // Return zero residual echo power when a headset is detected.
-  if (aec_state.HeadsetDetected()) {
-    if (!headset_detected_cached_) {
-      Reset();
-      headset_detected_cached_ = true;
-    }
-    R2->fill(0.f);
-    return;
-  } else {
-    headset_detected_cached_ = false;
-  }
-
   const rtc::Optional<size_t> delay =
       aec_state.FilterDelay()
           ? aec_state.FilterDelay()
           : (aec_state.ExternalDelay() ? aec_state.ExternalDelay()
                                        : rtc::Optional<size_t>());
 
+  // Estimate the power of the stationary noise in the render signal.
+  RenderNoisePower(render_buffer, &X2_noise_floor_, &X2_noise_floor_counter_);
+
   // Estimate the residual echo power.
   const bool use_linear_echo_power =
       aec_state.UsableLinearEstimate() && using_subtractor_output;
-  if (use_linear_echo_power) {
+  if (use_linear_echo_power && !aec_state.HeadsetDetected()) {
     RTC_DCHECK(aec_state.FilterDelay());
     const int filter_delay = *aec_state.FilterDelay();
     LinearEstimate(S2_linear, aec_state.Erle(), filter_delay, R2);
@@ -102,7 +130,15 @@
                           kResidualEchoPowerRenderWindowSize - 1, &X2);
     }
 
-    NonLinearEstimate(X2, Y2, R2);
+    // Subtract the stationary noise power to avoid stationary noise causing
+    // excessive echo suppression.
+    std::transform(
+        X2.begin(), X2.end(), X2_noise_floor_.begin(), X2.begin(),
+        [](float a, float b) { return std::max(0.f, a - 10.f * b); });
+
+    NonLinearEstimate(
+        aec_state.HeadsetDetected() ? kHeadsetEchoPathGain : kFixedEchoPathGain,
+        X2, Y2, R2);
     AddEchoReverb(*R2, aec_state.SaturatedEcho(),
                   std::min(static_cast<size_t>(kAdaptiveFilterLength),
                            delay.value_or(kAdaptiveFilterLength)),
@@ -119,6 +155,8 @@
 }
 
 void ResidualEchoEstimator::Reset() {
+  X2_noise_floor_counter_.fill(kNoiseFloorCounterMax);
+  X2_noise_floor_.fill(kNoiseFloorMin);
   R2_reverb_.fill(0.f);
   R2_old_.fill(0.f);
   R2_hold_counter_.fill(0.f);
@@ -141,14 +179,13 @@
 }
 
 void ResidualEchoEstimator::NonLinearEstimate(
+    float echo_path_gain,
     const std::array<float, kFftLengthBy2Plus1>& X2,
     const std::array<float, kFftLengthBy2Plus1>& Y2,
     std::array<float, kFftLengthBy2Plus1>* R2) {
   // Compute preliminary residual echo.
-  // TODO(peah): Try to make this adaptive. Currently the gain is hardcoded to
-  // 20 dB.
   std::transform(X2.begin(), X2.end(), R2->begin(),
-                 [](float a) { return a * kFixedEchoPathGain; });
+                 [echo_path_gain](float a) { return a * echo_path_gain; });
 
   for (size_t k = 0; k < R2->size(); ++k) {
     // Update hold counter.
diff --git a/webrtc/modules/audio_processing/aec3/residual_echo_estimator.h b/webrtc/modules/audio_processing/aec3/residual_echo_estimator.h
index 6c8a7b2..c8e6a28 100644
--- a/webrtc/modules/audio_processing/aec3/residual_echo_estimator.h
+++ b/webrtc/modules/audio_processing/aec3/residual_echo_estimator.h
@@ -48,7 +48,8 @@
 
   // Estimates the residual echo power based on the estimate of the echo path
   // gain.
-  void NonLinearEstimate(const std::array<float, kFftLengthBy2Plus1>& X2,
+  void NonLinearEstimate(float echo_path_gain,
+                         const std::array<float, kFftLengthBy2Plus1>& X2,
                          const std::array<float, kFftLengthBy2Plus1>& Y2,
                          std::array<float, kFftLengthBy2Plus1>* R2);
 
@@ -66,7 +67,8 @@
   int S2_old_index_ = 0;
   std::array<std::array<float, kFftLengthBy2Plus1>, kAdaptiveFilterLength>
       S2_old_;
-  bool headset_detected_cached_ = false;
+  std::array<float, kFftLengthBy2Plus1> X2_noise_floor_;
+  std::array<int, kFftLengthBy2Plus1> X2_noise_floor_counter_;
 
   RTC_DISALLOW_COPY_AND_ASSIGN(ResidualEchoEstimator);
 };