Revert "Remove unused APM voice activity detection sub-module"

This reverts commit b4e06d032e6f82a65c52ed0c5364ae9e7c0a0215.

Reason for revert: breaking downstream projects

Original change's description:
> Remove unused APM voice activity detection sub-module
>
> API changes:
> - webrtc::AudioProcessing::Config::VoiceDetection removed
> - webrtc::AudioProcessingStats::voice_detected deprecated
> - cricket::AudioOptions::typing_detection deprecated
> - webrtc::StatsReport::StatsValueName::
>   kStatsValueNameTypingNoiseState deprecated
>
> PSA: https://groups.google.com/g/discuss-webrtc/c/7X6uwmJarE0
>
> Bug: webrtc:11226,webrtc:11292
> Change-Id: I8d008b56708cf62961b9857ec052b59fda3b41bf
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/250666
> Reviewed-by: Harald Alvestrand <hta@webrtc.org>
> Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
> Reviewed-by: Sam Zackrisson <saza@webrtc.org>
> Reviewed-by: Björn Terelius <terelius@webrtc.org>
> Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
> Cr-Commit-Position: refs/heads/main@{#35975}

TBR=gustaf@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,terelius@webrtc.org,hta@webrtc.org,webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com

Change-Id: Iee01fdb874b4e0331277f3ffe60dacaabc3859a2
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Bug: webrtc:11226,webrtc:11292
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251600
Reviewed-by: Harald Alvestrand <hta@webrtc.org>
Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
Commit-Queue: Mirko Bonadei <mbonadei@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#35977}

commit: a751f167c68343f76528436defdbc61600a8d7b3 [log] [tgz]
author: Alessio Bazzica <alessiob@webrtc.org> Fri Feb 11 10:57:44 2022
committer: WebRTC LUCI CQ <webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com> Fri Feb 11 12:15:44 2022
tree: 78e584e842f2a9a81e5250901d6856544090ec33
parent: 9cc5fffee150b3160714de2c9b07ad3c28728d8e [diff]
diff --git a/api/audio_options.h b/api/audio_options.h
index 16aa9e4..48dd628 100644
--- a/api/audio_options.h
+++ b/api/audio_options.h

@@ -60,8 +60,6 @@
   absl::optional<int> audio_jitter_buffer_min_delay_ms;
   // Audio receiver jitter buffer (NetEq) should handle retransmitted packets.
   absl::optional<bool> audio_jitter_buffer_enable_rtx_handling;
-  // Deprecated.
-  // TODO(bugs.webrtc.org/11226): Remove.
   // Audio processing to detect typing.
   absl::optional<bool> typing_detection;
   absl::optional<bool> experimental_agc;

diff --git a/api/stats_types.cc b/api/stats_types.cc
index b044e4a..1090643 100644
--- a/api/stats_types.cc
+++ b/api/stats_types.cc

@@ -648,7 +648,6 @@
       return "googTrackId";
     case kStatsValueNameTimingFrameInfo:
       return "googTimingFrameInfo";
-    // TODO(bugs.webrtc.org/11226): Remove.
     case kStatsValueNameTypingNoiseState:
       return "googTypingNoiseState";
     case kStatsValueNameWritable:

diff --git a/api/stats_types.h b/api/stats_types.h
index e7dd528..c3e4451 100644
--- a/api/stats_types.h
+++ b/api/stats_types.h

@@ -235,7 +235,6 @@
     kStatsValueNameTrackId,
     kStatsValueNameTransmitBitrate,
     kStatsValueNameTransportType,
-    // TODO(bugs.webrtc.org/11226): Remove.
     kStatsValueNameTypingNoiseState,
     kStatsValueNameWritable,
     kStatsValueNameAudioDeviceUnderrunCounter,

diff --git a/audio/audio_transport_impl.cc b/audio/audio_transport_impl.cc
index 194f09c..a5c952f 100644
--- a/audio/audio_transport_impl.cc
+++ b/audio/audio_transport_impl.cc

@@ -165,6 +165,24 @@
                       audio_frame.get());
   audio_frame->set_absolute_capture_timestamp_ms(estimated_capture_time_ns /
                                                  1000000);
+  // Typing detection (utilizes the APM/VAD decision). We let the VAD determine
+  // if we're using this feature or not.
+  // TODO(solenberg): GetConfig() takes a lock. Work around that.
+  bool typing_detected = false;
+  if (audio_processing_ &&
+      audio_processing_->GetConfig().voice_detection.enabled) {
+    if (audio_frame->vad_activity_ != AudioFrame::kVadUnknown) {
+      bool vad_active = audio_frame->vad_activity_ == AudioFrame::kVadActive;
+      typing_detected = typing_detection_.Process(key_pressed, vad_active);
+    }
+  }
+
+  // Copy frame and push to each sending stream. The copy is required since an
+  // encoding task will be posted internally to each stream.
+  {
+    MutexLock lock(&capture_lock_);
+    typing_noise_detected_ = typing_detected;
+  }
 
   RTC_DCHECK_GT(audio_frame->samples_per_channel_, 0);
   if (async_audio_processing_)
@@ -272,4 +290,8 @@
   swap_stereo_channels_ = enable;
 }
 
+// Returns the stored typing-noise flag. The read is done while holding
+// `capture_lock_`, the same lock the flag is guarded by.
+bool AudioTransportImpl::typing_noise_detected() const {
+  MutexLock lock(&capture_lock_);
+  const bool detected = typing_noise_detected_;
+  return detected;
+}
 }  // namespace webrtc

diff --git a/audio/audio_transport_impl.h b/audio/audio_transport_impl.h
index 8999956..0b1406f 100644
--- a/audio/audio_transport_impl.h
+++ b/audio/audio_transport_impl.h

@@ -86,9 +86,7 @@
                           int send_sample_rate_hz,
                           size_t send_num_channels);
   void SetStereoChannelSwapping(bool enable);
-  // Deprecated.
-  // TODO(bugs.webrtc.org/11226): Remove.
-  bool typing_noise_detected() const { return false; }
+  bool typing_noise_detected() const;
 
  private:
   void SendProcessedData(std::unique_ptr<AudioFrame> audio_frame);
@@ -105,6 +103,7 @@
   std::vector<AudioSender*> audio_senders_ RTC_GUARDED_BY(capture_lock_);
   int send_sample_rate_hz_ RTC_GUARDED_BY(capture_lock_) = 8000;
   size_t send_num_channels_ RTC_GUARDED_BY(capture_lock_) = 1;
+  bool typing_noise_detected_ RTC_GUARDED_BY(capture_lock_) = false;
   bool swap_stereo_channels_ RTC_GUARDED_BY(capture_lock_) = false;
   PushResampler<int16_t> capture_resampler_;
   TypingDetection typing_detection_;

diff --git a/media/engine/webrtc_voice_engine.cc b/media/engine/webrtc_voice_engine.cc
index b7b0ad7..0640001 100644
--- a/media/engine/webrtc_voice_engine.cc
+++ b/media/engine/webrtc_voice_engine.cc

@@ -634,7 +634,9 @@
   }
 
   if (options.typing_detection) {
-    RTC_LOG(LS_WARNING) << "Typing detection is requested, but unsupported.";
+    RTC_LOG(LS_INFO) << "Typing detection is enabled? "
+                     << *options.typing_detection;
+    apm_config.voice_detection.enabled = *options.typing_detection;
   }
 
   ap->ApplyConfig(apm_config);

diff --git a/media/engine/webrtc_voice_engine_unittest.cc b/media/engine/webrtc_voice_engine_unittest.cc
index 40d5714..8d864ae 100644
--- a/media/engine/webrtc_voice_engine_unittest.cc
+++ b/media/engine/webrtc_voice_engine_unittest.cc

@@ -221,6 +221,11 @@
       // Default Options.
       VerifyEchoCancellationSettings(/*enabled=*/true);
       EXPECT_TRUE(IsHighPassFilterEnabled());
+#if defined(WEBRTC_ANDROID)
+      EXPECT_FALSE(IsTypingDetectionEnabled());
+#else
+      EXPECT_TRUE(IsTypingDetectionEnabled());
+#endif
       EXPECT_TRUE(apm_config_.noise_suppression.enabled);
       EXPECT_EQ(apm_config_.noise_suppression.level, kDefaultNsLevel);
       VerifyGainControlEnabledCorrectly();
@@ -788,6 +793,10 @@
     return apm_config_.high_pass_filter.enabled;
   }
 
+  // Reports whether voice detection is switched on in `apm_config_`.
+  bool IsTypingDetectionEnabled() {
+    const auto& vad_config = apm_config_.voice_detection;
+    return vad_config.enabled;
+  }
+
  protected:
   const bool use_null_apm_;
   std::unique_ptr<webrtc::TaskQueueFactory> task_queue_factory_;
@@ -3032,10 +3041,40 @@
   if (!use_null_apm_) {
     VerifyEchoCancellationSettings(/*enabled=*/true);
     EXPECT_TRUE(IsHighPassFilterEnabled());
+#if defined(WEBRTC_ANDROID)
+    EXPECT_FALSE(IsTypingDetectionEnabled());
+#else
+    EXPECT_TRUE(IsTypingDetectionEnabled());
+#endif
   }
   EXPECT_EQ(200u, GetRecvStreamConfig(kSsrcY).jitter_buffer_max_packets);
   EXPECT_FALSE(GetRecvStreamConfig(kSsrcY).jitter_buffer_fast_accelerate);
 
+  // Turn typing detection off.
+  send_parameters_.options.typing_detection = false;
+  SetSendParameters(send_parameters_);
+  if (!use_null_apm_) {
+    EXPECT_FALSE(IsTypingDetectionEnabled());
+  }
+
+  // Leave typing detection unchanged, but non-default.
+  send_parameters_.options.typing_detection = absl::nullopt;
+  SetSendParameters(send_parameters_);
+  if (!use_null_apm_) {
+    EXPECT_FALSE(IsTypingDetectionEnabled());
+  }
+
+  // Turn typing detection on.
+  send_parameters_.options.typing_detection = true;
+  SetSendParameters(send_parameters_);
+  if (!use_null_apm_) {
+#if defined(WEBRTC_ANDROID)
+    EXPECT_FALSE(IsTypingDetectionEnabled());
+#else
+    EXPECT_TRUE(IsTypingDetectionEnabled());
+#endif
+  }
+
   // Turn echo cancellation off
   send_parameters_.options.echo_cancellation = false;
   SetSendParameters(send_parameters_);

diff --git a/modules/audio_processing/BUILD.gn b/modules/audio_processing/BUILD.gn
index ee6b579..f32058d 100644
--- a/modules/audio_processing/BUILD.gn
+++ b/modules/audio_processing/BUILD.gn

@@ -168,6 +168,7 @@
     ":high_pass_filter",
     ":optionally_built_submodule_creators",
     ":rms_level",
+    ":voice_detection",
     "../../api:array_view",
     "../../api:function_view",
     "../../api/audio:aec3_config",
@@ -217,6 +218,20 @@
   }
 }
 
+rtc_library("voice_detection") {
+  sources = [
+    "voice_detection.cc",
+    "voice_detection.h",
+  ]
+  deps = [
+    ":api",
+    ":audio_buffer",
+    "../../api/audio:audio_frame_api",
+    "../../common_audio:common_audio_c",
+    "../../rtc_base:checks",
+  ]
+}
+
 rtc_library("residual_echo_detector") {
   poisonous = [ "default_echo_detector" ]
   configs += [ ":apm_debug_dump" ]
@@ -364,6 +379,7 @@
         ":gain_controller2",
         ":high_pass_filter",
         ":mocks",
+        ":voice_detection",
         "../../api:array_view",
         "../../api:scoped_refptr",
         "../../api/audio:aec3_config",
@@ -458,6 +474,7 @@
           "test/echo_canceller_test_tools_unittest.cc",
           "test/echo_control_mock.h",
           "test/test_utils.h",
+          "voice_detection_unittest.cc",
         ]
       }
     }

diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc
index 9a1aaee..8810efe 100644
--- a/modules/audio_processing/audio_processing_impl.cc
+++ b/modules/audio_processing/audio_processing_impl.cc

@@ -141,6 +141,7 @@
     bool gain_controller2_enabled,
     bool gain_adjustment_enabled,
     bool echo_controller_enabled,
+    bool voice_detector_enabled,
     bool transient_suppressor_enabled) {
   bool changed = false;
   changed |= (high_pass_filter_enabled != high_pass_filter_enabled_);
@@ -152,6 +153,7 @@
   changed |= (gain_controller2_enabled != gain_controller2_enabled_);
   changed |= (gain_adjustment_enabled != gain_adjustment_enabled_);
   changed |= (echo_controller_enabled != echo_controller_enabled_);
+  changed |= (voice_detector_enabled != voice_detector_enabled_);
   changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
   if (changed) {
     high_pass_filter_enabled_ = high_pass_filter_enabled;
@@ -161,6 +163,7 @@
     gain_controller2_enabled_ = gain_controller2_enabled;
     gain_adjustment_enabled_ = gain_adjustment_enabled;
     echo_controller_enabled_ = echo_controller_enabled;
+    voice_detector_enabled_ = voice_detector_enabled;
     transient_suppressor_enabled_ = transient_suppressor_enabled;
   }
 
@@ -171,7 +174,7 @@
 
 bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandSubModulesActive()
     const {
-  return CaptureMultiBandProcessingPresent();
+  return CaptureMultiBandProcessingPresent() || voice_detector_enabled_;
 }
 
 bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandProcessingPresent()
@@ -368,6 +371,7 @@
   InitializeGainController1();
   InitializeTransientSuppressor();
   InitializeHighPassFilter(true);
+  InitializeVoiceDetector();
   InitializeResidualEchoDetector();
   InitializeEchoController();
   InitializeGainController2(/*config_has_changed=*/true);
@@ -502,6 +506,9 @@
   const bool agc2_config_changed =
       config_.gain_controller2 != config.gain_controller2;
 
+  const bool voice_detection_config_changed =
+      config_.voice_detection.enabled != config.voice_detection.enabled;
+
   const bool ns_config_changed =
       config_.noise_suppression.enabled != config.noise_suppression.enabled ||
       config_.noise_suppression.level != config.noise_suppression.level;
@@ -550,6 +557,10 @@
     InitializeCaptureLevelsAdjuster();
   }
 
+  if (voice_detection_config_changed) {
+    InitializeVoiceDetector();
+  }
+
   // Reinitialization must happen after all submodule configuration to avoid
   // additional reinitializations on the next capture / render processing call.
   if (pipeline_config_changed) {
@@ -1204,6 +1215,13 @@
     }
   }
 
+  if (config_.voice_detection.enabled) {
+    capture_.stats.voice_detected =
+        submodules_.voice_detector->ProcessCaptureAudio(capture_buffer);
+  } else {
+    capture_.stats.voice_detected = absl::nullopt;
+  }
+
   if (submodules_.agc_manager) {
     submodules_.agc_manager->Process(capture_buffer);
 
@@ -1664,7 +1682,7 @@
       !!submodules_.gain_controller2,
       config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled,
       capture_nonlocked_.echo_controller_enabled,
-      !!submodules_.transient_suppressor);
+      config_.voice_detection.enabled, !!submodules_.transient_suppressor);
 }
 
 void AudioProcessingImpl::InitializeTransientSuppressor() {
@@ -1714,6 +1732,14 @@
   }
 }
 
+// (Re)creates or destroys the voice detector so that it matches
+// `config_.voice_detection.enabled`.
+void AudioProcessingImpl::InitializeVoiceDetector() {
+  // Disabled: drop any existing detector and stop here.
+  if (!config_.voice_detection.enabled) {
+    submodules_.voice_detector.reset();
+    return;
+  }
+  // Enabled: build a fresh detector for the current split-band sample rate.
+  submodules_.voice_detector = std::make_unique<VoiceDetection>(
+      proc_split_sample_rate_hz(), VoiceDetection::kVeryLowLikelihood);
+}
 void AudioProcessingImpl::InitializeEchoController() {
   bool use_echo_controller =
       echo_control_factory_ ||

diff --git a/modules/audio_processing/audio_processing_impl.h b/modules/audio_processing/audio_processing_impl.h
index 344b8c5..47dd62e 100644
--- a/modules/audio_processing/audio_processing_impl.h
+++ b/modules/audio_processing/audio_processing_impl.h

@@ -39,6 +39,7 @@
 #include "modules/audio_processing/render_queue_item_verifier.h"
 #include "modules/audio_processing/rms_level.h"
 #include "modules/audio_processing/transient/transient_suppressor.h"
+#include "modules/audio_processing/voice_detection.h"
 #include "rtc_base/gtest_prod_util.h"
 #include "rtc_base/ignore_wundef.h"
 #include "rtc_base/swap_queue.h"
@@ -207,6 +208,7 @@
                 bool gain_controller2_enabled,
                 bool gain_adjustment_enabled,
                 bool echo_controller_enabled,
+                bool voice_detector_enabled,
                 bool transient_suppressor_enabled);
     bool CaptureMultiBandSubModulesActive() const;
     bool CaptureMultiBandProcessingPresent() const;
@@ -229,6 +231,7 @@
     bool gain_controller2_enabled_ = false;
     bool gain_adjustment_enabled_ = false;
     bool echo_controller_enabled_ = false;
+    bool voice_detector_enabled_ = false;
     bool transient_suppressor_enabled_ = false;
     bool first_update_ = true;
   };
@@ -264,6 +267,7 @@
   // already acquired.
   void InitializeHighPassFilter(bool forced_reset)
       RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
+  void InitializeVoiceDetector() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
   void InitializeGainController1() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
   void InitializeTransientSuppressor()
       RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
@@ -396,6 +400,7 @@
     std::unique_ptr<EchoControlMobileImpl> echo_control_mobile;
     std::unique_ptr<NoiseSuppressor> noise_suppressor;
     std::unique_ptr<TransientSuppressor> transient_suppressor;
+    std::unique_ptr<VoiceDetection> voice_detector;
     std::unique_ptr<CaptureLevelsAdjuster> capture_levels_adjuster;
   } submodules_;
 

diff --git a/modules/audio_processing/audio_processing_impl_locking_unittest.cc b/modules/audio_processing/audio_processing_impl_locking_unittest.cc
index 7557e91..343f077 100644
--- a/modules/audio_processing/audio_processing_impl_locking_unittest.cc
+++ b/modules/audio_processing/audio_processing_impl_locking_unittest.cc

@@ -483,6 +483,7 @@
   apm_config.gain_controller1.mode =
       AudioProcessing::Config::GainController1::kAdaptiveDigital;
   apm_config.noise_suppression.enabled = true;
+  apm_config.voice_detection.enabled = true;
   return apm_config;
 }
 

diff --git a/modules/audio_processing/audio_processing_performance_unittest.cc b/modules/audio_processing/audio_processing_performance_unittest.cc
index 57655ae..c885293 100644
--- a/modules/audio_processing/audio_processing_performance_unittest.cc
+++ b/modules/audio_processing/audio_processing_performance_unittest.cc

@@ -441,6 +441,7 @@
       apm_config.gain_controller1.enabled = true;
       apm_config.gain_controller1.mode =
           AudioProcessing::Config::GainController1::kAdaptiveDigital;
+      apm_config.voice_detection.enabled = true;
       apm->ApplyConfig(apm_config);
     };
 
@@ -452,6 +453,7 @@
       apm_config.noise_suppression.enabled = true;
       apm_config.gain_controller1.mode =
           AudioProcessing::Config::GainController1::kAdaptiveDigital;
+      apm_config.voice_detection.enabled = true;
       apm->ApplyConfig(apm_config);
     };
 
@@ -462,6 +464,7 @@
       apm_config.echo_canceller.enabled = false;
       apm_config.gain_controller1.enabled = false;
       apm_config.noise_suppression.enabled = false;
+      apm_config.voice_detection.enabled = false;
       apm->ApplyConfig(apm_config);
     };
 

diff --git a/modules/audio_processing/audio_processing_unittest.cc b/modules/audio_processing/audio_processing_unittest.cc
index b21a022..96e2d84 100644
--- a/modules/audio_processing/audio_processing_unittest.cc
+++ b/modules/audio_processing/audio_processing_unittest.cc

@@ -190,6 +190,7 @@
   apm_config.noise_suppression.enabled = true;
 
   apm_config.high_pass_filter.enabled = true;
+  apm_config.voice_detection.enabled = true;
   apm_config.pipeline.maximum_internal_processing_rate = 48000;
   ap->ApplyConfig(apm_config);
 }
@@ -1225,6 +1226,7 @@
   EXPECT_FALSE(config.high_pass_filter.enabled);
   EXPECT_FALSE(config.gain_controller1.enabled);
   EXPECT_FALSE(config.noise_suppression.enabled);
+  EXPECT_FALSE(config.voice_detection.enabled);
 }
 
 TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) {
@@ -1365,6 +1367,48 @@
   EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
   apm_->ApplyConfig(apm_config);
 
+  // 3. Only GetStatistics-reporting VAD is enabled...
+  SetFrameTo(&frame_, 1000);
+  frame_copy.CopyFrom(frame_);
+  apm_config.voice_detection.enabled = true;
+  apm_->ApplyConfig(apm_config);
+  EXPECT_EQ(apm_->kNoError,
+            apm_->ProcessStream(
+                frame_.data.data(),
+                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
+                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
+                frame_.data.data()));
+  EXPECT_EQ(apm_->kNoError,
+            apm_->ProcessStream(
+                frame_.data.data(),
+                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
+                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
+                frame_.data.data()));
+  EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
+  apm_config.voice_detection.enabled = false;
+  apm_->ApplyConfig(apm_config);
+
+  // 4. The VAD is enabled...
+  SetFrameTo(&frame_, 1000);
+  frame_copy.CopyFrom(frame_);
+  apm_config.voice_detection.enabled = true;
+  apm_->ApplyConfig(apm_config);
+  EXPECT_EQ(apm_->kNoError,
+            apm_->ProcessStream(
+                frame_.data.data(),
+                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
+                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
+                frame_.data.data()));
+  EXPECT_EQ(apm_->kNoError,
+            apm_->ProcessStream(
+                frame_.data.data(),
+                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
+                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
+                frame_.data.data()));
+  EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
+  apm_config.voice_detection.enabled = false;
+  apm_->ApplyConfig(apm_config);
+
   // Check the test is valid. We should have distortion from the filter
   // when AEC is enabled (which won't affect the audio).
   apm_config.echo_canceller.enabled = true;
@@ -1692,6 +1736,7 @@
          static_cast<size_t>(test->num_reverse_channels()), true);
 
     int frame_count = 0;
+    int has_voice_count = 0;
     int analog_level = 127;
     int analog_level_average = 0;
     int max_output_average = 0;
@@ -1727,6 +1772,8 @@
       analog_level = apm_->recommended_stream_analog_level();
       analog_level_average += analog_level;
       AudioProcessingStats stats = apm_->GetStatistics();
+      EXPECT_TRUE(stats.voice_detected);
+      has_voice_count += *stats.voice_detected ? 1 : 0;
 
       size_t frame_size = frame_.samples_per_channel * frame_.num_channels;
       size_t write_count =
@@ -1782,23 +1829,33 @@
 
     if (!absl::GetFlag(FLAGS_write_apm_ref_data)) {
       const int kIntNear = 1;
-      // All numbers being consistently higher on N7 compare to the reference
-      // data.
+      // When running the test on a N7 we get a {2, 6} difference of
+      // `has_voice_count` and `max_output_average` is up to 18 higher.
+      // All numbers being consistently higher on N7 compare to ref_data.
       // TODO(bjornv): If we start getting more of these offsets on Android we
       // should consider a different approach. Either using one slack for all,
       // or generate a separate android reference.
 #if defined(WEBRTC_ANDROID) || defined(WEBRTC_IOS)
+      const int kHasVoiceCountOffset = 3;
+      const int kHasVoiceCountNear = 8;
       const int kMaxOutputAverageOffset = 9;
       const int kMaxOutputAverageNear = 26;
 #else
+      const int kHasVoiceCountOffset = 0;
+      const int kHasVoiceCountNear = kIntNear;
       const int kMaxOutputAverageOffset = 0;
       const int kMaxOutputAverageNear = kIntNear;
 #endif
+      EXPECT_NEAR(test->has_voice_count(),
+                  has_voice_count - kHasVoiceCountOffset, kHasVoiceCountNear);
+
       EXPECT_NEAR(test->analog_level_average(), analog_level_average, kIntNear);
       EXPECT_NEAR(test->max_output_average(),
                   max_output_average - kMaxOutputAverageOffset,
                   kMaxOutputAverageNear);
     } else {
+      test->set_has_voice_count(has_voice_count);
+
       test->set_analog_level_average(analog_level_average);
       test->set_max_output_average(max_output_average);
     }
@@ -2628,6 +2685,7 @@
   apm_config.echo_canceller.enabled = true;
   apm_config.echo_canceller.mobile_mode = mobile_aec;
   apm_config.noise_suppression.enabled = false;
+  apm_config.voice_detection.enabled = false;
   apm->ApplyConfig(apm_config);
   return apm;
 }
@@ -2736,9 +2794,10 @@
   EXPECT_FALSE(stats.echo_return_loss_enhancement.has_value());
 }
 
-TEST(ApmStatistics, DoNotReportVoiceDetectedStat) {
+TEST(ApmStatistics, ReportHasVoice) {
   ProcessingConfig processing_config = {
       {{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}};
+  AudioProcessing::Config config;
 
   // Set up an audioframe.
   Int16FrameData frame;
@@ -2755,14 +2814,37 @@
       AudioProcessingBuilderForTesting().Create();
   apm->Initialize(processing_config);
 
-  // No metric should be reported.
+  // If not enabled, no metric should be reported.
   EXPECT_EQ(
       apm->ProcessStream(frame.data.data(),
                          StreamConfig(frame.sample_rate_hz, frame.num_channels),
                          StreamConfig(frame.sample_rate_hz, frame.num_channels),
                          frame.data.data()),
       0);
-  EXPECT_FALSE(apm->GetStatistics().voice_detected.has_value());
+  EXPECT_FALSE(apm->GetStatistics().voice_detected);
+
+  // If enabled, metrics should be reported.
+  config.voice_detection.enabled = true;
+  apm->ApplyConfig(config);
+  EXPECT_EQ(
+      apm->ProcessStream(frame.data.data(),
+                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
+                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
+                         frame.data.data()),
+      0);
+  auto stats = apm->GetStatistics();
+  EXPECT_TRUE(stats.voice_detected);
+
+  // If re-disabled, the value is again not reported.
+  config.voice_detection.enabled = false;
+  apm->ApplyConfig(config);
+  EXPECT_EQ(
+      apm->ProcessStream(frame.data.data(),
+                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
+                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
+                         frame.data.data()),
+      0);
+  EXPECT_FALSE(apm->GetStatistics().voice_detected);
 }
 
 TEST(ApmStatistics, GetStatisticsReportsNoEchoDetectorStatsWhenDisabled) {

diff --git a/modules/audio_processing/include/audio_processing.cc b/modules/audio_processing/include/audio_processing.cc
index 86edaee..9643b6c 100644
--- a/modules/audio_processing/include/audio_processing.cc
+++ b/modules/audio_processing/include/audio_processing.cc

@@ -145,6 +145,7 @@
           << NoiseSuppressionLevelToString(noise_suppression.level)
           << " }, transient_suppression: { enabled: "
           << transient_suppression.enabled
+          << " }, voice_detection: { enabled: " << voice_detection.enabled
           << " }, gain_controller1: { enabled: " << gain_controller1.enabled
           << ", mode: " << GainController1ModeToString(gain_controller1.mode)
           << ", target_level_dbfs: " << gain_controller1.target_level_dbfs

diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h
index 9d6824c..8af5013 100644
--- a/modules/audio_processing/include/audio_processing.h
+++ b/modules/audio_processing/include/audio_processing.h

@@ -113,6 +113,8 @@
 //
 // config.high_pass_filter.enabled = true;
 //
+// config.voice_detection.enabled = true;
+//
 // apm->ApplyConfig(config)
 //
 // apm->noise_reduction()->set_level(kHighSuppression);
@@ -230,6 +232,11 @@
       bool enabled = false;
     } transient_suppression;
 
+    // Enables reporting of `voice_detected` in webrtc::AudioProcessingStats.
+    struct VoiceDetection {
+      bool enabled = false;
+    } voice_detection;
+
     // Enables automatic gain control (AGC) functionality.
     // The automatic gain control (AGC) component brings the signal to an
     // appropriate range. This is done by applying a digital gain directly and,

diff --git a/modules/audio_processing/include/audio_processing_statistics.h b/modules/audio_processing/include/audio_processing_statistics.h
index 3b43319..a31dafe 100644
--- a/modules/audio_processing/include/audio_processing_statistics.h
+++ b/modules/audio_processing/include/audio_processing_statistics.h

@@ -24,8 +24,6 @@
   AudioProcessingStats(const AudioProcessingStats& other);
   ~AudioProcessingStats();
 
-  // Deprecated.
-  // TODO(bugs.webrtc.org/11226): Remove.
   // True if voice is detected in the last capture frame, after processing.
   // It is conservative in flagging audio as speech, with low likelihood of
   // incorrectly flagging a frame as voice.

diff --git a/modules/audio_processing/test/audio_processing_simulator.cc b/modules/audio_processing/test/audio_processing_simulator.cc
index 4915648..b1edda1 100644
--- a/modules/audio_processing/test/audio_processing_simulator.cc
+++ b/modules/audio_processing/test/audio_processing_simulator.cc

@@ -543,6 +543,10 @@
     apm_config.high_pass_filter.enabled = *settings_.use_hpf;
   }
 
+  if (settings_.use_vad) {
+    apm_config.voice_detection.enabled = *settings_.use_vad;
+  }
+
   if (settings_.use_agc) {
     apm_config.gain_controller1.enabled = *settings_.use_agc;
   }

diff --git a/modules/audio_processing/test/audio_processing_simulator.h b/modules/audio_processing/test/audio_processing_simulator.h
index af76d7e..ae3cd4f 100644
--- a/modules/audio_processing/test/audio_processing_simulator.h
+++ b/modules/audio_processing/test/audio_processing_simulator.h

@@ -105,6 +105,7 @@
   absl::optional<bool> use_ns;
   absl::optional<int> use_ts;
   absl::optional<bool> use_analog_agc;
+  absl::optional<bool> use_vad;
   absl::optional<bool> use_all;
   absl::optional<bool> analog_agc_disable_digital_adaptive;
   absl::optional<int> agc_mode;

diff --git a/modules/audio_processing/test/audioproc_float_impl.cc b/modules/audio_processing/test/audioproc_float_impl.cc
index aab1881..d4697e4 100644
--- a/modules/audio_processing/test/audioproc_float_impl.cc
+++ b/modules/audio_processing/test/audioproc_float_impl.cc

@@ -117,6 +117,10 @@
           analog_agc,
           kParameterNotSpecifiedValue,
           "Activate (1) or deactivate (0) the analog AGC");
+ABSL_FLAG(int,
+          vad,
+          kParameterNotSpecifiedValue,
+          "Activate (1) or deactivate (0) the voice activity detector");
 ABSL_FLAG(bool,
           all_default,
           false,
@@ -361,6 +365,7 @@
 SimulationSettings CreateSettings() {
   SimulationSettings settings;
   if (absl::GetFlag(FLAGS_all_default)) {
+    settings.use_vad = true;
     settings.use_ts = true;
     settings.use_analog_agc = true;
     settings.use_ns = true;
@@ -412,6 +417,7 @@
   SetSettingIfSpecified(absl::GetFlag(FLAGS_ts), &settings.use_ts);
   SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc),
                       &settings.use_analog_agc);
+  SetSettingIfFlagSet(absl::GetFlag(FLAGS_vad), &settings.use_vad);
   SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc_disable_digital_adaptive),
                       &settings.analog_agc_disable_digital_adaptive);
   SetSettingIfSpecified(absl::GetFlag(FLAGS_agc_mode), &settings.agc_mode);

diff --git a/modules/audio_processing/voice_detection.cc b/modules/audio_processing/voice_detection.cc
new file mode 100644
index 0000000..1a633e2
--- /dev/null
+++ b/modules/audio_processing/voice_detection.cc

@@ -0,0 +1,92 @@
+/*
+ *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/voice_detection.h"
+
+#include "common_audio/vad/include/webrtc_vad.h"
+#include "modules/audio_processing/audio_buffer.h"
+#include "rtc_base/checks.h"
+
+namespace webrtc {
+// Owns one VAD instance: the handle is created and initialized in the
+// constructor and freed in the destructor.
+class VoiceDetection::Vad {
+ public:
+  Vad() : state_(WebRtcVad_Create()) {
+    // Every later VAD call goes through this handle, so creation must succeed.
+    RTC_CHECK(state_);
+    const int error = WebRtcVad_Init(state_);
+    RTC_DCHECK_EQ(0, error);
+  }
+  ~Vad() { WebRtcVad_Free(state_); }
+
+  // Not copyable: two objects holding the same handle would both free it.
+  Vad(const Vad&) = delete;
+  Vad& operator=(const Vad&) = delete;
+
+  // Returns the raw handle. Ownership stays with this object.
+  VadInst* state() { return state_; }
+
+ private:
+  VadInst* state_ = nullptr;
+};
+
+// Creates the underlying VAD and sets its mode from `likelihood`.
+// One frame is taken to be 1/100 of a second of audio at `sample_rate_hz`.
+VoiceDetection::VoiceDetection(int sample_rate_hz, Likelihood likelihood)
+    : sample_rate_hz_(sample_rate_hz),
+      frame_size_samples_(static_cast<size_t>(sample_rate_hz / 100)),
+      likelihood_(likelihood),
+      vad_(std::make_unique<Vad>()) {
+  // The likelihood scale runs opposite to the VAD mode number: the lowest
+  // likelihood selects mode 3 and the highest selects mode 0. The switch has
+  // no `default:` so the compiler can flag an enumerator added later.
+  const int mode = [likelihood] {
+    switch (likelihood) {
+      case kVeryLowLikelihood:
+        return 3;
+      case kLowLikelihood:
+        return 2;
+      case kModerateLikelihood:
+        return 1;
+      case kHighLikelihood:
+        return 0;
+    }
+    // Not a valid enumerator: hit the DCHECK and keep the fallback of mode 2.
+    RTC_DCHECK_NOTREACHED();
+    return 2;
+  }();
+  const int error = WebRtcVad_set_mode(vad_->state(), mode);
+  RTC_DCHECK_EQ(0, error);
+}
+
+// Defined in this file rather than the header: destroying `vad_` needs the
+// complete `Vad` type, which the header only forward-declares.
+VoiceDetection::~VoiceDetection() = default;
+
+// Runs the VAD on the `kBand0To8kHz` band of `audio` and returns true if voice
+// is detected in the current frame. With more than one channel, the channels
+// are averaged sample by sample into a single signal first.
+bool VoiceDetection::ProcessCaptureAudio(AudioBuffer* audio) {
+  const size_t num_frames = audio->num_frames_per_band();
+  RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength, num_frames);
+  // The VAD call below consumes `frame_size_samples_` samples, but only
+  // `num_frames` samples are written to the scratch buffer; they must agree.
+  RTC_DCHECK_EQ(frame_size_samples_, num_frames);
+
+  std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data;
+  if (audio->num_channels() == 1) {
+    FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz], num_frames,
+                  mixed_low_pass_data.data());
+  } else {
+    // Kept as a signed int so the division below stays a signed division.
+    const int num_channels = static_cast<int>(audio->num_channels());
+    const auto channels = audio->split_channels_const(kBand0To8kHz);
+    for (size_t i = 0; i < num_frames; ++i) {
+      // Accumulate in 32 bits; the sum of several int16 samples can exceed
+      // the int16 range before it is averaged.
+      int32_t sum = FloatS16ToS16(channels[0][i]);
+      for (int ch = 1; ch < num_channels; ++ch) {
+        sum += FloatS16ToS16(channels[ch][i]);
+      }
+      mixed_low_pass_data[i] = static_cast<int16_t>(sum / num_channels);
+    }
+  }
+
+  const int vad_ret =
+      WebRtcVad_Process(vad_->state(), sample_rate_hz_,
+                        mixed_low_pass_data.data(), frame_size_samples_);
+  RTC_DCHECK(vad_ret == 0 || vad_ret == 1);
+  // NOTE(review): any non-zero result is reported as voice, as before. That
+  // includes a value other than 1 (presumably a VAD error) when DCHECKs are
+  // compiled out -- confirm this is intended.
+  return vad_ret != 0;
+}
+}  // namespace webrtc

diff --git a/modules/audio_processing/voice_detection.h b/modules/audio_processing/voice_detection.h
new file mode 100644
index 0000000..79d44e6
--- /dev/null
+++ b/modules/audio_processing/voice_detection.h

@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
+#define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
+
+#include <stddef.h>
+
+#include <memory>
+
+#include "modules/audio_processing/include/audio_processing.h"
+
+namespace webrtc {
+
+class AudioBuffer;
+
+// Voice activity detection (VAD) component: analyzes the capture stream one
+// frame at a time to determine if voice is present. Not copyable.
+class VoiceDetection {
+ public:
+  // Specifies the likelihood that a frame will be declared to contain voice.
+  // A higher value makes it more likely that speech will not be clipped, at
+  // the expense of more noise being detected as voice.
+  enum Likelihood {
+    kVeryLowLikelihood,
+    kLowLikelihood,
+    kModerateLikelihood,
+    kHighLikelihood
+  };
+
+  VoiceDetection(int sample_rate_hz, Likelihood likelihood);
+  ~VoiceDetection();  // Out of line: `Vad` is incomplete in this header.
+
+  VoiceDetection(const VoiceDetection&) = delete;
+  VoiceDetection& operator=(const VoiceDetection&) = delete;
+
+  // Returns true if voice is detected in the current frame.
+  bool ProcessCaptureAudio(AudioBuffer* audio);
+
+  Likelihood likelihood() const { return likelihood_; }
+
+ private:
+  class Vad;  // Only forward-declared here; defined outside this header.
+
+  int sample_rate_hz_;         // Sample rate passed to the VAD per frame.
+  size_t frame_size_samples_;  // Frame length passed to the VAD per frame.
+  Likelihood likelihood_;      // Value returned by likelihood().
+  std::unique_ptr<Vad> vad_;   // Owns the Vad helper.
+};
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_

diff --git a/modules/audio_processing/voice_detection_unittest.cc b/modules/audio_processing/voice_detection_unittest.cc
new file mode 100644
index 0000000..e1117e4
--- /dev/null
+++ b/modules/audio_processing/voice_detection_unittest.cc

@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <vector>
+
+#include "api/array_view.h"
+#include "modules/audio_processing/audio_buffer.h"
+#include "modules/audio_processing/test/audio_buffer_tools.h"
+#include "modules/audio_processing/test/bitexactness_tools.h"
+#include "modules/audio_processing/voice_detection.h"
+#include "test/gtest.h"
+
+namespace webrtc {
+namespace {
+
+constexpr int kNumFramesToProcess = 1000;
+
+// Band-splits rates above kSampleRate16kHz, then runs VAD on one frame.
+bool ProcessOneFrame(int sample_rate_hz,
+                     AudioBuffer* audio_buffer,
+                     VoiceDetection* voice_detection) {
+  const bool split = sample_rate_hz > AudioProcessing::kSampleRate16kHz;
+  if (split) {
+    audio_buffer->SplitIntoFrequencyBands();
+  }
+  return voice_detection->ProcessCaptureAudio(audio_buffer);
+}
+
+// Feeds kNumFramesToProcess frames from the capture test vector through the
+// detector and checks the decision for the final frame against the reference.
+void RunBitexactnessTest(int sample_rate_hz,
+                         size_t num_channels,
+                         bool stream_has_voice_reference) {
+  // The detector itself is never configured above 16 kHz.
+  const int vad_rate_hz = sample_rate_hz < 16000 ? sample_rate_hz : 16000;
+  VoiceDetection voice_detection(vad_rate_hz, VoiceDetection::kLowLikelihood);
+
+  // A frame spans sample_rate_hz / 100 samples per channel, i.e. 10 ms.
+  const int frame_length = rtc::CheckedDivExact(sample_rate_hz, 100);
+  const StreamConfig capture_config(sample_rate_hz, num_channels);
+  const auto rate_hz = capture_config.sample_rate_hz();
+  const auto channels = capture_config.num_channels();
+  AudioBuffer capture_buffer(rate_hz, channels, rate_hz, channels, rate_hz,
+                             channels);
+  test::InputAudioFile capture_file(
+      test::GetApmCaptureTestVectorFileName(sample_rate_hz));
+  std::vector<float> capture_input(frame_length * num_channels);
+  // Overwritten every iteration, so only the last frame's decision is kept.
+  bool last_frame_has_voice = false;
+  for (int remaining = kNumFramesToProcess; remaining > 0; --remaining) {
+    ReadFloatSamplesFromStereoFile(frame_length, num_channels, &capture_file,
+                                   capture_input);
+    test::CopyVectorToAudioBuffer(capture_config, capture_input,
+                                  &capture_buffer);
+    last_frame_has_voice =
+        ProcessOneFrame(sample_rate_hz, &capture_buffer, &voice_detection);
+  }
+
+  EXPECT_EQ(stream_has_voice_reference, last_frame_has_voice);
+}
+
+constexpr bool kStreamHasVoiceReference = true;
+
+}  // namespace
+
+TEST(VoiceDetectionBitExactnessTest, Mono8kHz) {
+  RunBitexactnessTest(8000, /*num_channels=*/1, kStreamHasVoiceReference);
+}
+
+TEST(VoiceDetectionBitExactnessTest, Mono16kHz) {
+  RunBitexactnessTest(16000, /*num_channels=*/1, kStreamHasVoiceReference);
+}
+
+TEST(VoiceDetectionBitExactnessTest, Mono32kHz) {
+  RunBitexactnessTest(32000, /*num_channels=*/1, kStreamHasVoiceReference);
+}
+
+TEST(VoiceDetectionBitExactnessTest, Mono48kHz) {
+  RunBitexactnessTest(48000, /*num_channels=*/1, kStreamHasVoiceReference);
+}
+
+TEST(VoiceDetectionBitExactnessTest, Stereo8kHz) {
+  RunBitexactnessTest(8000, /*num_channels=*/2, kStreamHasVoiceReference);
+}
+
+TEST(VoiceDetectionBitExactnessTest, Stereo16kHz) {
+  RunBitexactnessTest(16000, /*num_channels=*/2, kStreamHasVoiceReference);
+}
+
+TEST(VoiceDetectionBitExactnessTest, Stereo32kHz) {
+  RunBitexactnessTest(32000, /*num_channels=*/2, kStreamHasVoiceReference);
+}
+
+TEST(VoiceDetectionBitExactnessTest, Stereo48kHz) {
+  RunBitexactnessTest(48000, /*num_channels=*/2, kStreamHasVoiceReference);
+}
+
+}  // namespace webrtc

diff --git a/test/fuzzers/audio_processing_configs_fuzzer.cc b/test/fuzzers/audio_processing_configs_fuzzer.cc
index f04ef77..54a43df 100644
--- a/test/fuzzers/audio_processing_configs_fuzzer.cc
+++ b/test/fuzzers/audio_processing_configs_fuzzer.cc

@@ -54,7 +54,7 @@
   bool use_agc = fuzz_data->ReadOrDefaultValue(true);
   bool use_ns = fuzz_data->ReadOrDefaultValue(true);
   static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
-  static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
+  bool use_vad = fuzz_data->ReadOrDefaultValue(true);
   bool use_agc_limiter = fuzz_data->ReadOrDefaultValue(true);
   bool use_agc2 = fuzz_data->ReadOrDefaultValue(true);
 
@@ -114,6 +114,7 @@
       use_agc2_adaptive_digital;
   apm_config.noise_suppression.enabled = use_ns;
   apm_config.transient_suppression.enabled = use_ts;
+  apm_config.voice_detection.enabled = use_vad;
 
   rtc::scoped_refptr<AudioProcessing> apm =
       AudioProcessingBuilderForTesting()
commit	a751f167c68343f76528436defdbc61600a8d7b3	[log] [tgz]
author	Alessio Bazzica <alessiob@webrtc.org>	Fri Feb 11 10:57:44 2022
committer	WebRTC LUCI CQ <webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com>	Fri Feb 11 12:15:44 2022
tree	78e584e842f2a9a81e5250901d6856544090ec33
parent	9cc5fffee150b3160714de2c9b07ad3c28728d8e [diff]