Reland "Remove unused APM voice activity detection sub-module"
This reverts commit a751f167c68343f76528436defdbc61600a8d7b3.
Reason for revert: dependency in a downstream project removed
Original change's description:
> Revert "Remove unused APM voice activity detection sub-module"
>
> This reverts commit b4e06d032e6f82a65c52ed0c5364ae9e7c0a0215.
>
> Reason for revert: breaking downstream projects
>
> Original change's description:
> > Remove unused APM voice activity detection sub-module
> >
> > API changes:
> > - webrtc::AudioProcessing::Config::VoiceDetection removed
> > - webrtc::AudioProcessingStats::voice_detected deprecated
> > - cricket::AudioOptions::typing_detection deprecated
> > - webrtc::StatsReport::StatsValueName::
> > kStatsValueNameTypingNoiseState deprecated
> >
> > PSA: https://groups.google.com/g/discuss-webrtc/c/7X6uwmJarE0
> >
> > Bug: webrtc:11226,webrtc:11292
> > Change-Id: I8d008b56708cf62961b9857ec052b59fda3b41bf
> > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/250666
> > Reviewed-by: Harald Alvestrand <hta@webrtc.org>
> > Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
> > Reviewed-by: Sam Zackrisson <saza@webrtc.org>
> > Reviewed-by: Björn Terelius <terelius@webrtc.org>
> > Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
> > Cr-Commit-Position: refs/heads/main@{#35975}
>
> TBR=gustaf@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,terelius@webrtc.org,hta@webrtc.org,webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com
>
> Change-Id: Iee01fdb874b4e0331277f3ffe60dacaabc3859a2
> No-Presubmit: true
> No-Tree-Checks: true
> No-Try: true
> Bug: webrtc:11226,webrtc:11292
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251600
> Reviewed-by: Harald Alvestrand <hta@webrtc.org>
> Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
> Commit-Queue: Mirko Bonadei <mbonadei@webrtc.org>
> Cr-Commit-Position: refs/heads/main@{#35977}
# Not skipping CQ checks because this is a reland.
Bug: webrtc:11226,webrtc:11292
Change-Id: I2fcbc5fdade16bfe6a0f0a02841a33a598d4f2ad
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251660
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Harald Alvestrand <hta@webrtc.org>
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#35984}
diff --git a/api/audio_options.h b/api/audio_options.h
index 48dd628..16aa9e4 100644
--- a/api/audio_options.h
+++ b/api/audio_options.h
@@ -60,6 +60,8 @@
absl::optional<int> audio_jitter_buffer_min_delay_ms;
// Audio receiver jitter buffer (NetEq) should handle retransmitted packets.
absl::optional<bool> audio_jitter_buffer_enable_rtx_handling;
+ // Deprecated.
+ // TODO(bugs.webrtc.org/11226): Remove.
// Audio processing to detect typing.
absl::optional<bool> typing_detection;
absl::optional<bool> experimental_agc;
diff --git a/api/stats_types.cc b/api/stats_types.cc
index 1090643..b044e4a 100644
--- a/api/stats_types.cc
+++ b/api/stats_types.cc
@@ -648,6 +648,7 @@
return "googTrackId";
case kStatsValueNameTimingFrameInfo:
return "googTimingFrameInfo";
+ // TODO(bugs.webrtc.org/11226): Remove.
case kStatsValueNameTypingNoiseState:
return "googTypingNoiseState";
case kStatsValueNameWritable:
diff --git a/api/stats_types.h b/api/stats_types.h
index c3e4451..e7dd528 100644
--- a/api/stats_types.h
+++ b/api/stats_types.h
@@ -235,6 +235,7 @@
kStatsValueNameTrackId,
kStatsValueNameTransmitBitrate,
kStatsValueNameTransportType,
+ // TODO(bugs.webrtc.org/11226): Remove.
kStatsValueNameTypingNoiseState,
kStatsValueNameWritable,
kStatsValueNameAudioDeviceUnderrunCounter,
diff --git a/audio/audio_transport_impl.cc b/audio/audio_transport_impl.cc
index a5c952f..194f09c 100644
--- a/audio/audio_transport_impl.cc
+++ b/audio/audio_transport_impl.cc
@@ -165,24 +165,6 @@
audio_frame.get());
audio_frame->set_absolute_capture_timestamp_ms(estimated_capture_time_ns /
1000000);
- // Typing detection (utilizes the APM/VAD decision). We let the VAD determine
- // if we're using this feature or not.
- // TODO(solenberg): GetConfig() takes a lock. Work around that.
- bool typing_detected = false;
- if (audio_processing_ &&
- audio_processing_->GetConfig().voice_detection.enabled) {
- if (audio_frame->vad_activity_ != AudioFrame::kVadUnknown) {
- bool vad_active = audio_frame->vad_activity_ == AudioFrame::kVadActive;
- typing_detected = typing_detection_.Process(key_pressed, vad_active);
- }
- }
-
- // Copy frame and push to each sending stream. The copy is required since an
- // encoding task will be posted internally to each stream.
- {
- MutexLock lock(&capture_lock_);
- typing_noise_detected_ = typing_detected;
- }
RTC_DCHECK_GT(audio_frame->samples_per_channel_, 0);
if (async_audio_processing_)
@@ -290,8 +272,4 @@
swap_stereo_channels_ = enable;
}
-bool AudioTransportImpl::typing_noise_detected() const {
- MutexLock lock(&capture_lock_);
- return typing_noise_detected_;
-}
} // namespace webrtc
diff --git a/audio/audio_transport_impl.h b/audio/audio_transport_impl.h
index 0b1406f..8999956 100644
--- a/audio/audio_transport_impl.h
+++ b/audio/audio_transport_impl.h
@@ -86,7 +86,9 @@
int send_sample_rate_hz,
size_t send_num_channels);
void SetStereoChannelSwapping(bool enable);
- bool typing_noise_detected() const;
+ // Deprecated.
+ // TODO(bugs.webrtc.org/11226): Remove.
+ bool typing_noise_detected() const { return false; }
private:
void SendProcessedData(std::unique_ptr<AudioFrame> audio_frame);
@@ -103,7 +105,6 @@
std::vector<AudioSender*> audio_senders_ RTC_GUARDED_BY(capture_lock_);
int send_sample_rate_hz_ RTC_GUARDED_BY(capture_lock_) = 8000;
size_t send_num_channels_ RTC_GUARDED_BY(capture_lock_) = 1;
- bool typing_noise_detected_ RTC_GUARDED_BY(capture_lock_) = false;
bool swap_stereo_channels_ RTC_GUARDED_BY(capture_lock_) = false;
PushResampler<int16_t> capture_resampler_;
TypingDetection typing_detection_;
diff --git a/media/engine/webrtc_voice_engine.cc b/media/engine/webrtc_voice_engine.cc
index 0640001..b7b0ad7 100644
--- a/media/engine/webrtc_voice_engine.cc
+++ b/media/engine/webrtc_voice_engine.cc
@@ -634,9 +634,7 @@
}
if (options.typing_detection) {
- RTC_LOG(LS_INFO) << "Typing detection is enabled? "
- << *options.typing_detection;
- apm_config.voice_detection.enabled = *options.typing_detection;
+ RTC_LOG(LS_WARNING) << "Typing detection is requested, but unsupported.";
}
ap->ApplyConfig(apm_config);
diff --git a/media/engine/webrtc_voice_engine_unittest.cc b/media/engine/webrtc_voice_engine_unittest.cc
index 8d864ae..40d5714 100644
--- a/media/engine/webrtc_voice_engine_unittest.cc
+++ b/media/engine/webrtc_voice_engine_unittest.cc
@@ -221,11 +221,6 @@
// Default Options.
VerifyEchoCancellationSettings(/*enabled=*/true);
EXPECT_TRUE(IsHighPassFilterEnabled());
-#if defined(WEBRTC_ANDROID)
- EXPECT_FALSE(IsTypingDetectionEnabled());
-#else
- EXPECT_TRUE(IsTypingDetectionEnabled());
-#endif
EXPECT_TRUE(apm_config_.noise_suppression.enabled);
EXPECT_EQ(apm_config_.noise_suppression.level, kDefaultNsLevel);
VerifyGainControlEnabledCorrectly();
@@ -793,10 +788,6 @@
return apm_config_.high_pass_filter.enabled;
}
- bool IsTypingDetectionEnabled() {
- return apm_config_.voice_detection.enabled;
- }
-
protected:
const bool use_null_apm_;
std::unique_ptr<webrtc::TaskQueueFactory> task_queue_factory_;
@@ -3041,40 +3032,10 @@
if (!use_null_apm_) {
VerifyEchoCancellationSettings(/*enabled=*/true);
EXPECT_TRUE(IsHighPassFilterEnabled());
-#if defined(WEBRTC_ANDROID)
- EXPECT_FALSE(IsTypingDetectionEnabled());
-#else
- EXPECT_TRUE(IsTypingDetectionEnabled());
-#endif
}
EXPECT_EQ(200u, GetRecvStreamConfig(kSsrcY).jitter_buffer_max_packets);
EXPECT_FALSE(GetRecvStreamConfig(kSsrcY).jitter_buffer_fast_accelerate);
- // Turn typing detection off.
- send_parameters_.options.typing_detection = false;
- SetSendParameters(send_parameters_);
- if (!use_null_apm_) {
- EXPECT_FALSE(IsTypingDetectionEnabled());
- }
-
- // Leave typing detection unchanged, but non-default.
- send_parameters_.options.typing_detection = absl::nullopt;
- SetSendParameters(send_parameters_);
- if (!use_null_apm_) {
- EXPECT_FALSE(IsTypingDetectionEnabled());
- }
-
- // Turn typing detection on.
- send_parameters_.options.typing_detection = true;
- SetSendParameters(send_parameters_);
- if (!use_null_apm_) {
-#if defined(WEBRTC_ANDROID)
- EXPECT_FALSE(IsTypingDetectionEnabled());
-#else
- EXPECT_TRUE(IsTypingDetectionEnabled());
-#endif
- }
-
// Turn echo cancellation off
send_parameters_.options.echo_cancellation = false;
SetSendParameters(send_parameters_);
diff --git a/modules/audio_processing/BUILD.gn b/modules/audio_processing/BUILD.gn
index f32058d..ee6b579 100644
--- a/modules/audio_processing/BUILD.gn
+++ b/modules/audio_processing/BUILD.gn
@@ -168,7 +168,6 @@
":high_pass_filter",
":optionally_built_submodule_creators",
":rms_level",
- ":voice_detection",
"../../api:array_view",
"../../api:function_view",
"../../api/audio:aec3_config",
@@ -218,20 +217,6 @@
}
}
-rtc_library("voice_detection") {
- sources = [
- "voice_detection.cc",
- "voice_detection.h",
- ]
- deps = [
- ":api",
- ":audio_buffer",
- "../../api/audio:audio_frame_api",
- "../../common_audio:common_audio_c",
- "../../rtc_base:checks",
- ]
-}
-
rtc_library("residual_echo_detector") {
poisonous = [ "default_echo_detector" ]
configs += [ ":apm_debug_dump" ]
@@ -379,7 +364,6 @@
":gain_controller2",
":high_pass_filter",
":mocks",
- ":voice_detection",
"../../api:array_view",
"../../api:scoped_refptr",
"../../api/audio:aec3_config",
@@ -474,7 +458,6 @@
"test/echo_canceller_test_tools_unittest.cc",
"test/echo_control_mock.h",
"test/test_utils.h",
- "voice_detection_unittest.cc",
]
}
}
diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc
index 8810efe..9a1aaee 100644
--- a/modules/audio_processing/audio_processing_impl.cc
+++ b/modules/audio_processing/audio_processing_impl.cc
@@ -141,7 +141,6 @@
bool gain_controller2_enabled,
bool gain_adjustment_enabled,
bool echo_controller_enabled,
- bool voice_detector_enabled,
bool transient_suppressor_enabled) {
bool changed = false;
changed |= (high_pass_filter_enabled != high_pass_filter_enabled_);
@@ -153,7 +152,6 @@
changed |= (gain_controller2_enabled != gain_controller2_enabled_);
changed |= (gain_adjustment_enabled != gain_adjustment_enabled_);
changed |= (echo_controller_enabled != echo_controller_enabled_);
- changed |= (voice_detector_enabled != voice_detector_enabled_);
changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
if (changed) {
high_pass_filter_enabled_ = high_pass_filter_enabled;
@@ -163,7 +161,6 @@
gain_controller2_enabled_ = gain_controller2_enabled;
gain_adjustment_enabled_ = gain_adjustment_enabled;
echo_controller_enabled_ = echo_controller_enabled;
- voice_detector_enabled_ = voice_detector_enabled;
transient_suppressor_enabled_ = transient_suppressor_enabled;
}
@@ -174,7 +171,7 @@
bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandSubModulesActive()
const {
- return CaptureMultiBandProcessingPresent() || voice_detector_enabled_;
+ return CaptureMultiBandProcessingPresent();
}
bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandProcessingPresent()
@@ -371,7 +368,6 @@
InitializeGainController1();
InitializeTransientSuppressor();
InitializeHighPassFilter(true);
- InitializeVoiceDetector();
InitializeResidualEchoDetector();
InitializeEchoController();
InitializeGainController2(/*config_has_changed=*/true);
@@ -506,9 +502,6 @@
const bool agc2_config_changed =
config_.gain_controller2 != config.gain_controller2;
- const bool voice_detection_config_changed =
- config_.voice_detection.enabled != config.voice_detection.enabled;
-
const bool ns_config_changed =
config_.noise_suppression.enabled != config.noise_suppression.enabled ||
config_.noise_suppression.level != config.noise_suppression.level;
@@ -557,10 +550,6 @@
InitializeCaptureLevelsAdjuster();
}
- if (voice_detection_config_changed) {
- InitializeVoiceDetector();
- }
-
// Reinitialization must happen after all submodule configuration to avoid
// additional reinitializations on the next capture / render processing call.
if (pipeline_config_changed) {
@@ -1215,13 +1204,6 @@
}
}
- if (config_.voice_detection.enabled) {
- capture_.stats.voice_detected =
- submodules_.voice_detector->ProcessCaptureAudio(capture_buffer);
- } else {
- capture_.stats.voice_detected = absl::nullopt;
- }
-
if (submodules_.agc_manager) {
submodules_.agc_manager->Process(capture_buffer);
@@ -1682,7 +1664,7 @@
!!submodules_.gain_controller2,
config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled,
capture_nonlocked_.echo_controller_enabled,
- config_.voice_detection.enabled, !!submodules_.transient_suppressor);
+ !!submodules_.transient_suppressor);
}
void AudioProcessingImpl::InitializeTransientSuppressor() {
@@ -1732,14 +1714,6 @@
}
}
-void AudioProcessingImpl::InitializeVoiceDetector() {
- if (config_.voice_detection.enabled) {
- submodules_.voice_detector = std::make_unique<VoiceDetection>(
- proc_split_sample_rate_hz(), VoiceDetection::kVeryLowLikelihood);
- } else {
- submodules_.voice_detector.reset();
- }
-}
void AudioProcessingImpl::InitializeEchoController() {
bool use_echo_controller =
echo_control_factory_ ||
diff --git a/modules/audio_processing/audio_processing_impl.h b/modules/audio_processing/audio_processing_impl.h
index 47dd62e..344b8c5 100644
--- a/modules/audio_processing/audio_processing_impl.h
+++ b/modules/audio_processing/audio_processing_impl.h
@@ -39,7 +39,6 @@
#include "modules/audio_processing/render_queue_item_verifier.h"
#include "modules/audio_processing/rms_level.h"
#include "modules/audio_processing/transient/transient_suppressor.h"
-#include "modules/audio_processing/voice_detection.h"
#include "rtc_base/gtest_prod_util.h"
#include "rtc_base/ignore_wundef.h"
#include "rtc_base/swap_queue.h"
@@ -208,7 +207,6 @@
bool gain_controller2_enabled,
bool gain_adjustment_enabled,
bool echo_controller_enabled,
- bool voice_detector_enabled,
bool transient_suppressor_enabled);
bool CaptureMultiBandSubModulesActive() const;
bool CaptureMultiBandProcessingPresent() const;
@@ -231,7 +229,6 @@
bool gain_controller2_enabled_ = false;
bool gain_adjustment_enabled_ = false;
bool echo_controller_enabled_ = false;
- bool voice_detector_enabled_ = false;
bool transient_suppressor_enabled_ = false;
bool first_update_ = true;
};
@@ -267,7 +264,6 @@
// already acquired.
void InitializeHighPassFilter(bool forced_reset)
RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
- void InitializeVoiceDetector() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
void InitializeGainController1() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
void InitializeTransientSuppressor()
RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
@@ -400,7 +396,6 @@
std::unique_ptr<EchoControlMobileImpl> echo_control_mobile;
std::unique_ptr<NoiseSuppressor> noise_suppressor;
std::unique_ptr<TransientSuppressor> transient_suppressor;
- std::unique_ptr<VoiceDetection> voice_detector;
std::unique_ptr<CaptureLevelsAdjuster> capture_levels_adjuster;
} submodules_;
diff --git a/modules/audio_processing/audio_processing_impl_locking_unittest.cc b/modules/audio_processing/audio_processing_impl_locking_unittest.cc
index 343f077..7557e91 100644
--- a/modules/audio_processing/audio_processing_impl_locking_unittest.cc
+++ b/modules/audio_processing/audio_processing_impl_locking_unittest.cc
@@ -483,7 +483,6 @@
apm_config.gain_controller1.mode =
AudioProcessing::Config::GainController1::kAdaptiveDigital;
apm_config.noise_suppression.enabled = true;
- apm_config.voice_detection.enabled = true;
return apm_config;
}
diff --git a/modules/audio_processing/audio_processing_performance_unittest.cc b/modules/audio_processing/audio_processing_performance_unittest.cc
index c885293..57655ae 100644
--- a/modules/audio_processing/audio_processing_performance_unittest.cc
+++ b/modules/audio_processing/audio_processing_performance_unittest.cc
@@ -441,7 +441,6 @@
apm_config.gain_controller1.enabled = true;
apm_config.gain_controller1.mode =
AudioProcessing::Config::GainController1::kAdaptiveDigital;
- apm_config.voice_detection.enabled = true;
apm->ApplyConfig(apm_config);
};
@@ -453,7 +452,6 @@
apm_config.noise_suppression.enabled = true;
apm_config.gain_controller1.mode =
AudioProcessing::Config::GainController1::kAdaptiveDigital;
- apm_config.voice_detection.enabled = true;
apm->ApplyConfig(apm_config);
};
@@ -464,7 +462,6 @@
apm_config.echo_canceller.enabled = false;
apm_config.gain_controller1.enabled = false;
apm_config.noise_suppression.enabled = false;
- apm_config.voice_detection.enabled = false;
apm->ApplyConfig(apm_config);
};
diff --git a/modules/audio_processing/audio_processing_unittest.cc b/modules/audio_processing/audio_processing_unittest.cc
index 96e2d84..b21a022 100644
--- a/modules/audio_processing/audio_processing_unittest.cc
+++ b/modules/audio_processing/audio_processing_unittest.cc
@@ -190,7 +190,6 @@
apm_config.noise_suppression.enabled = true;
apm_config.high_pass_filter.enabled = true;
- apm_config.voice_detection.enabled = true;
apm_config.pipeline.maximum_internal_processing_rate = 48000;
ap->ApplyConfig(apm_config);
}
@@ -1226,7 +1225,6 @@
EXPECT_FALSE(config.high_pass_filter.enabled);
EXPECT_FALSE(config.gain_controller1.enabled);
EXPECT_FALSE(config.noise_suppression.enabled);
- EXPECT_FALSE(config.voice_detection.enabled);
}
TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) {
@@ -1367,48 +1365,6 @@
EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
apm_->ApplyConfig(apm_config);
- // 3. Only GetStatistics-reporting VAD is enabled...
- SetFrameTo(&frame_, 1000);
- frame_copy.CopyFrom(frame_);
- apm_config.voice_detection.enabled = true;
- apm_->ApplyConfig(apm_config);
- EXPECT_EQ(apm_->kNoError,
- apm_->ProcessStream(
- frame_.data.data(),
- StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
- StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
- frame_.data.data()));
- EXPECT_EQ(apm_->kNoError,
- apm_->ProcessStream(
- frame_.data.data(),
- StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
- StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
- frame_.data.data()));
- EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
- apm_config.voice_detection.enabled = false;
- apm_->ApplyConfig(apm_config);
-
- // 4. The VAD is enabled...
- SetFrameTo(&frame_, 1000);
- frame_copy.CopyFrom(frame_);
- apm_config.voice_detection.enabled = true;
- apm_->ApplyConfig(apm_config);
- EXPECT_EQ(apm_->kNoError,
- apm_->ProcessStream(
- frame_.data.data(),
- StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
- StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
- frame_.data.data()));
- EXPECT_EQ(apm_->kNoError,
- apm_->ProcessStream(
- frame_.data.data(),
- StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
- StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
- frame_.data.data()));
- EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
- apm_config.voice_detection.enabled = false;
- apm_->ApplyConfig(apm_config);
-
// Check the test is valid. We should have distortion from the filter
// when AEC is enabled (which won't affect the audio).
apm_config.echo_canceller.enabled = true;
@@ -1736,7 +1692,6 @@
static_cast<size_t>(test->num_reverse_channels()), true);
int frame_count = 0;
- int has_voice_count = 0;
int analog_level = 127;
int analog_level_average = 0;
int max_output_average = 0;
@@ -1772,8 +1727,6 @@
analog_level = apm_->recommended_stream_analog_level();
analog_level_average += analog_level;
AudioProcessingStats stats = apm_->GetStatistics();
- EXPECT_TRUE(stats.voice_detected);
- has_voice_count += *stats.voice_detected ? 1 : 0;
size_t frame_size = frame_.samples_per_channel * frame_.num_channels;
size_t write_count =
@@ -1829,33 +1782,23 @@
if (!absl::GetFlag(FLAGS_write_apm_ref_data)) {
const int kIntNear = 1;
- // When running the test on a N7 we get a {2, 6} difference of
- // `has_voice_count` and `max_output_average` is up to 18 higher.
- // All numbers being consistently higher on N7 compare to ref_data.
+ // All numbers are consistently higher on N7 compared to the reference
+ // data.
// TODO(bjornv): If we start getting more of these offsets on Android we
// should consider a different approach. Either using one slack for all,
// or generate a separate android reference.
#if defined(WEBRTC_ANDROID) || defined(WEBRTC_IOS)
- const int kHasVoiceCountOffset = 3;
- const int kHasVoiceCountNear = 8;
const int kMaxOutputAverageOffset = 9;
const int kMaxOutputAverageNear = 26;
#else
- const int kHasVoiceCountOffset = 0;
- const int kHasVoiceCountNear = kIntNear;
const int kMaxOutputAverageOffset = 0;
const int kMaxOutputAverageNear = kIntNear;
#endif
- EXPECT_NEAR(test->has_voice_count(),
- has_voice_count - kHasVoiceCountOffset, kHasVoiceCountNear);
-
EXPECT_NEAR(test->analog_level_average(), analog_level_average, kIntNear);
EXPECT_NEAR(test->max_output_average(),
max_output_average - kMaxOutputAverageOffset,
kMaxOutputAverageNear);
} else {
- test->set_has_voice_count(has_voice_count);
-
test->set_analog_level_average(analog_level_average);
test->set_max_output_average(max_output_average);
}
@@ -2685,7 +2628,6 @@
apm_config.echo_canceller.enabled = true;
apm_config.echo_canceller.mobile_mode = mobile_aec;
apm_config.noise_suppression.enabled = false;
- apm_config.voice_detection.enabled = false;
apm->ApplyConfig(apm_config);
return apm;
}
@@ -2794,10 +2736,9 @@
EXPECT_FALSE(stats.echo_return_loss_enhancement.has_value());
}
-TEST(ApmStatistics, ReportHasVoice) {
+TEST(ApmStatistics, DoNotReportVoiceDetectedStat) {
ProcessingConfig processing_config = {
{{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}};
- AudioProcessing::Config config;
// Set up an audioframe.
Int16FrameData frame;
@@ -2814,37 +2755,14 @@
AudioProcessingBuilderForTesting().Create();
apm->Initialize(processing_config);
- // If not enabled, no metric should be reported.
+ // No metric should be reported.
EXPECT_EQ(
apm->ProcessStream(frame.data.data(),
StreamConfig(frame.sample_rate_hz, frame.num_channels),
StreamConfig(frame.sample_rate_hz, frame.num_channels),
frame.data.data()),
0);
- EXPECT_FALSE(apm->GetStatistics().voice_detected);
-
- // If enabled, metrics should be reported.
- config.voice_detection.enabled = true;
- apm->ApplyConfig(config);
- EXPECT_EQ(
- apm->ProcessStream(frame.data.data(),
- StreamConfig(frame.sample_rate_hz, frame.num_channels),
- StreamConfig(frame.sample_rate_hz, frame.num_channels),
- frame.data.data()),
- 0);
- auto stats = apm->GetStatistics();
- EXPECT_TRUE(stats.voice_detected);
-
- // If re-disabled, the value is again not reported.
- config.voice_detection.enabled = false;
- apm->ApplyConfig(config);
- EXPECT_EQ(
- apm->ProcessStream(frame.data.data(),
- StreamConfig(frame.sample_rate_hz, frame.num_channels),
- StreamConfig(frame.sample_rate_hz, frame.num_channels),
- frame.data.data()),
- 0);
- EXPECT_FALSE(apm->GetStatistics().voice_detected);
+ EXPECT_FALSE(apm->GetStatistics().voice_detected.has_value());
}
TEST(ApmStatistics, GetStatisticsReportsNoEchoDetectorStatsWhenDisabled) {
diff --git a/modules/audio_processing/include/audio_processing.cc b/modules/audio_processing/include/audio_processing.cc
index 9643b6c..86edaee 100644
--- a/modules/audio_processing/include/audio_processing.cc
+++ b/modules/audio_processing/include/audio_processing.cc
@@ -145,7 +145,6 @@
<< NoiseSuppressionLevelToString(noise_suppression.level)
<< " }, transient_suppression: { enabled: "
<< transient_suppression.enabled
- << " }, voice_detection: { enabled: " << voice_detection.enabled
<< " }, gain_controller1: { enabled: " << gain_controller1.enabled
<< ", mode: " << GainController1ModeToString(gain_controller1.mode)
<< ", target_level_dbfs: " << gain_controller1.target_level_dbfs
diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h
index 8af5013..9d6824c 100644
--- a/modules/audio_processing/include/audio_processing.h
+++ b/modules/audio_processing/include/audio_processing.h
@@ -113,8 +113,6 @@
//
// config.high_pass_filter.enabled = true;
//
-// config.voice_detection.enabled = true;
-//
// apm->ApplyConfig(config)
//
// apm->noise_reduction()->set_level(kHighSuppression);
@@ -232,11 +230,6 @@
bool enabled = false;
} transient_suppression;
- // Enables reporting of `voice_detected` in webrtc::AudioProcessingStats.
- struct VoiceDetection {
- bool enabled = false;
- } voice_detection;
-
// Enables automatic gain control (AGC) functionality.
// The automatic gain control (AGC) component brings the signal to an
// appropriate range. This is done by applying a digital gain directly and,
diff --git a/modules/audio_processing/include/audio_processing_statistics.h b/modules/audio_processing/include/audio_processing_statistics.h
index a31dafe..3b43319 100644
--- a/modules/audio_processing/include/audio_processing_statistics.h
+++ b/modules/audio_processing/include/audio_processing_statistics.h
@@ -24,6 +24,8 @@
AudioProcessingStats(const AudioProcessingStats& other);
~AudioProcessingStats();
+ // Deprecated.
+ // TODO(bugs.webrtc.org/11226): Remove.
// True if voice is detected in the last capture frame, after processing.
// It is conservative in flagging audio as speech, with low likelihood of
// incorrectly flagging a frame as voice.
diff --git a/modules/audio_processing/test/audio_processing_simulator.cc b/modules/audio_processing/test/audio_processing_simulator.cc
index b1edda1..4915648 100644
--- a/modules/audio_processing/test/audio_processing_simulator.cc
+++ b/modules/audio_processing/test/audio_processing_simulator.cc
@@ -543,10 +543,6 @@
apm_config.high_pass_filter.enabled = *settings_.use_hpf;
}
- if (settings_.use_vad) {
- apm_config.voice_detection.enabled = *settings_.use_vad;
- }
-
if (settings_.use_agc) {
apm_config.gain_controller1.enabled = *settings_.use_agc;
}
diff --git a/modules/audio_processing/test/audio_processing_simulator.h b/modules/audio_processing/test/audio_processing_simulator.h
index ae3cd4f..af76d7e 100644
--- a/modules/audio_processing/test/audio_processing_simulator.h
+++ b/modules/audio_processing/test/audio_processing_simulator.h
@@ -105,7 +105,6 @@
absl::optional<bool> use_ns;
absl::optional<int> use_ts;
absl::optional<bool> use_analog_agc;
- absl::optional<bool> use_vad;
absl::optional<bool> use_all;
absl::optional<bool> analog_agc_disable_digital_adaptive;
absl::optional<int> agc_mode;
diff --git a/modules/audio_processing/test/audioproc_float_impl.cc b/modules/audio_processing/test/audioproc_float_impl.cc
index d4697e4..aab1881 100644
--- a/modules/audio_processing/test/audioproc_float_impl.cc
+++ b/modules/audio_processing/test/audioproc_float_impl.cc
@@ -117,10 +117,6 @@
analog_agc,
kParameterNotSpecifiedValue,
"Activate (1) or deactivate (0) the analog AGC");
-ABSL_FLAG(int,
- vad,
- kParameterNotSpecifiedValue,
- "Activate (1) or deactivate (0) the voice activity detector");
ABSL_FLAG(bool,
all_default,
false,
@@ -365,7 +361,6 @@
SimulationSettings CreateSettings() {
SimulationSettings settings;
if (absl::GetFlag(FLAGS_all_default)) {
- settings.use_vad = true;
settings.use_ts = true;
settings.use_analog_agc = true;
settings.use_ns = true;
@@ -417,7 +412,6 @@
SetSettingIfSpecified(absl::GetFlag(FLAGS_ts), &settings.use_ts);
SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc),
&settings.use_analog_agc);
- SetSettingIfFlagSet(absl::GetFlag(FLAGS_vad), &settings.use_vad);
SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc_disable_digital_adaptive),
&settings.analog_agc_disable_digital_adaptive);
SetSettingIfSpecified(absl::GetFlag(FLAGS_agc_mode), &settings.agc_mode);
diff --git a/modules/audio_processing/voice_detection.cc b/modules/audio_processing/voice_detection.cc
deleted file mode 100644
index 1a633e2..0000000
--- a/modules/audio_processing/voice_detection.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "modules/audio_processing/voice_detection.h"
-
-#include "common_audio/vad/include/webrtc_vad.h"
-#include "modules/audio_processing/audio_buffer.h"
-#include "rtc_base/checks.h"
-
-namespace webrtc {
-class VoiceDetection::Vad {
- public:
- Vad() {
- state_ = WebRtcVad_Create();
- RTC_CHECK(state_);
- int error = WebRtcVad_Init(state_);
- RTC_DCHECK_EQ(0, error);
- }
- ~Vad() { WebRtcVad_Free(state_); }
-
- Vad(Vad&) = delete;
- Vad& operator=(Vad&) = delete;
-
- VadInst* state() { return state_; }
-
- private:
- VadInst* state_ = nullptr;
-};
-
-VoiceDetection::VoiceDetection(int sample_rate_hz, Likelihood likelihood)
- : sample_rate_hz_(sample_rate_hz),
- frame_size_samples_(static_cast<size_t>(sample_rate_hz_ / 100)),
- likelihood_(likelihood),
- vad_(new Vad()) {
- int mode = 2;
- switch (likelihood) {
- case VoiceDetection::kVeryLowLikelihood:
- mode = 3;
- break;
- case VoiceDetection::kLowLikelihood:
- mode = 2;
- break;
- case VoiceDetection::kModerateLikelihood:
- mode = 1;
- break;
- case VoiceDetection::kHighLikelihood:
- mode = 0;
- break;
- default:
- RTC_DCHECK_NOTREACHED();
- break;
- }
- int error = WebRtcVad_set_mode(vad_->state(), mode);
- RTC_DCHECK_EQ(0, error);
-}
-
-VoiceDetection::~VoiceDetection() {}
-
-bool VoiceDetection::ProcessCaptureAudio(AudioBuffer* audio) {
- RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength,
- audio->num_frames_per_band());
- std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data;
- rtc::ArrayView<const int16_t> mixed_low_pass(mixed_low_pass_data.data(),
- audio->num_frames_per_band());
- if (audio->num_channels() == 1) {
- FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz],
- audio->num_frames_per_band(), mixed_low_pass_data.data());
- } else {
- const int num_channels = static_cast<int>(audio->num_channels());
- for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
- int32_t value =
- FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[0][i]);
- for (int j = 1; j < num_channels; ++j) {
- value += FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[j][i]);
- }
- mixed_low_pass_data[i] = value / num_channels;
- }
- }
-
- int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_,
- mixed_low_pass.data(), frame_size_samples_);
- RTC_DCHECK(vad_ret == 0 || vad_ret == 1);
- return vad_ret == 0 ? false : true;
-}
-} // namespace webrtc
diff --git a/modules/audio_processing/voice_detection.h b/modules/audio_processing/voice_detection.h
deleted file mode 100644
index 79d44e6..0000000
--- a/modules/audio_processing/voice_detection.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
-#define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
-
-#include <stddef.h>
-
-#include <memory>
-
-#include "modules/audio_processing/include/audio_processing.h"
-
-namespace webrtc {
-
-class AudioBuffer;
-
-// The voice activity detection (VAD) component analyzes the stream to
-// determine if voice is present.
-class VoiceDetection {
- public:
- // Specifies the likelihood that a frame will be declared to contain voice.
- // A higher value makes it more likely that speech will not be clipped, at
- // the expense of more noise being detected as voice.
- enum Likelihood {
- kVeryLowLikelihood,
- kLowLikelihood,
- kModerateLikelihood,
- kHighLikelihood
- };
-
- VoiceDetection(int sample_rate_hz, Likelihood likelihood);
- ~VoiceDetection();
-
- VoiceDetection(VoiceDetection&) = delete;
- VoiceDetection& operator=(VoiceDetection&) = delete;
-
- // Returns true if voice is detected in the current frame.
- bool ProcessCaptureAudio(AudioBuffer* audio);
-
- Likelihood likelihood() const { return likelihood_; }
-
- private:
- class Vad;
-
- int sample_rate_hz_;
- size_t frame_size_samples_;
- Likelihood likelihood_;
- std::unique_ptr<Vad> vad_;
-};
-} // namespace webrtc
-
-#endif // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
diff --git a/modules/audio_processing/voice_detection_unittest.cc b/modules/audio_processing/voice_detection_unittest.cc
deleted file mode 100644
index e1117e4..0000000
--- a/modules/audio_processing/voice_detection_unittest.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include <vector>
-
-#include "api/array_view.h"
-#include "modules/audio_processing/audio_buffer.h"
-#include "modules/audio_processing/test/audio_buffer_tools.h"
-#include "modules/audio_processing/test/bitexactness_tools.h"
-#include "modules/audio_processing/voice_detection.h"
-#include "test/gtest.h"
-
-namespace webrtc {
-namespace {
-
-const int kNumFramesToProcess = 1000;
-
-// Process one frame of data and produce the output.
-bool ProcessOneFrame(int sample_rate_hz,
- AudioBuffer* audio_buffer,
- VoiceDetection* voice_detection) {
- if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) {
- audio_buffer->SplitIntoFrequencyBands();
- }
-
- return voice_detection->ProcessCaptureAudio(audio_buffer);
-}
-
-// Processes a specified amount of frames, verifies the results and reports
-// any errors.
-void RunBitexactnessTest(int sample_rate_hz,
- size_t num_channels,
- bool stream_has_voice_reference) {
- int sample_rate_to_use = std::min(sample_rate_hz, 16000);
- VoiceDetection voice_detection(sample_rate_to_use,
- VoiceDetection::kLowLikelihood);
-
- int samples_per_channel = rtc::CheckedDivExact(sample_rate_hz, 100);
- const StreamConfig capture_config(sample_rate_hz, num_channels);
- AudioBuffer capture_buffer(
- capture_config.sample_rate_hz(), capture_config.num_channels(),
- capture_config.sample_rate_hz(), capture_config.num_channels(),
- capture_config.sample_rate_hz(), capture_config.num_channels());
- test::InputAudioFile capture_file(
- test::GetApmCaptureTestVectorFileName(sample_rate_hz));
- std::vector<float> capture_input(samples_per_channel * num_channels);
- bool stream_has_voice = false;
- for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) {
- ReadFloatSamplesFromStereoFile(samples_per_channel, num_channels,
- &capture_file, capture_input);
-
- test::CopyVectorToAudioBuffer(capture_config, capture_input,
- &capture_buffer);
-
- stream_has_voice =
- ProcessOneFrame(sample_rate_hz, &capture_buffer, &voice_detection);
- }
-
- EXPECT_EQ(stream_has_voice_reference, stream_has_voice);
-}
-
-const bool kStreamHasVoiceReference = true;
-
-} // namespace
-
-TEST(VoiceDetectionBitExactnessTest, Mono8kHz) {
- RunBitexactnessTest(8000, 1, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Mono16kHz) {
- RunBitexactnessTest(16000, 1, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Mono32kHz) {
- RunBitexactnessTest(32000, 1, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Mono48kHz) {
- RunBitexactnessTest(48000, 1, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Stereo8kHz) {
- RunBitexactnessTest(8000, 2, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Stereo16kHz) {
- RunBitexactnessTest(16000, 2, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Stereo32kHz) {
- RunBitexactnessTest(32000, 2, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Stereo48kHz) {
- RunBitexactnessTest(48000, 2, kStreamHasVoiceReference);
-}
-
-} // namespace webrtc
diff --git a/test/fuzzers/audio_processing_configs_fuzzer.cc b/test/fuzzers/audio_processing_configs_fuzzer.cc
index 54a43df..f04ef77 100644
--- a/test/fuzzers/audio_processing_configs_fuzzer.cc
+++ b/test/fuzzers/audio_processing_configs_fuzzer.cc
@@ -54,7 +54,7 @@
bool use_agc = fuzz_data->ReadOrDefaultValue(true);
bool use_ns = fuzz_data->ReadOrDefaultValue(true);
static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
- bool use_vad = fuzz_data->ReadOrDefaultValue(true);
+ static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
bool use_agc_limiter = fuzz_data->ReadOrDefaultValue(true);
bool use_agc2 = fuzz_data->ReadOrDefaultValue(true);
@@ -114,7 +114,6 @@
use_agc2_adaptive_digital;
apm_config.noise_suppression.enabled = use_ns;
apm_config.transient_suppression.enabled = use_ts;
- apm_config.voice_detection.enabled = use_vad;
rtc::scoped_refptr<AudioProcessing> apm =
AudioProcessingBuilderForTesting()