Level estimation and saturation protection stub.
The level estimator (AdaptiveModeLevelEstimator) produces a biased
estimate of the speech level. In our model, we use another module
(the SaturationProtector) to compute the bias. This CL contains the
estimator and a stub of the saturation protector.
Bug: webrtc:7494
Change-Id: I0df736d0346063f544fa680b4cc84177ea548545
Reviewed-on: https://webrtc-review.googlesource.com/64820
Commit-Queue: Alex Loiko <aleloi@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#22641}
diff --git a/modules/audio_processing/BUILD.gn b/modules/audio_processing/BUILD.gn
index c8992b0..b36e76b 100644
--- a/modules/audio_processing/BUILD.gn
+++ b/modules/audio_processing/BUILD.gn
@@ -548,6 +548,7 @@
"../../test:test_support",
"../audio_coding:neteq_input_audio_tools",
"aec_dump:mock_aec_dump_unittests",
+ "agc2:adaptive_digital_unittests",
"agc2:fixed_digital_unittests",
"test/conversational_speech:unittest",
"vad:vad_unittests",
diff --git a/modules/audio_processing/agc2/BUILD.gn b/modules/audio_processing/agc2/BUILD.gn
index 61d17e7..df5ec6cf 100644
--- a/modules/audio_processing/agc2/BUILD.gn
+++ b/modules/audio_processing/agc2/BUILD.gn
@@ -25,6 +25,8 @@
"adaptive_mode_level_estimator.h",
"noise_level_estimator.cc",
"noise_level_estimator.h",
+ "saturation_protector.cc",
+ "saturation_protector.h",
]
configs += [ "..:apm_debug_dump" ]
@@ -126,3 +128,25 @@
"../../../rtc_base:rtc_base_tests_utils",
]
}
+
+rtc_source_set("adaptive_digital_unittests") {
+ testonly = true
+ configs += [ "..:apm_debug_dump" ]
+
+ sources = [
+ "adaptive_mode_level_estimator_unittest.cc",
+ ]
+ deps = [
+ ":adaptive_digital",
+ ":common",
+ ":test_utils",
+ "..:apm_logging",
+ "..:audio_frame_view",
+ "../../../api:array_view",
+ "../../../common_audio",
+ "../../../rtc_base:checks",
+ "../../../rtc_base:rtc_base_approved",
+ "../../../rtc_base:rtc_base_tests_utils",
+ "../vad:vad_with_level",
+ ]
+}
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
index e293bab..b190607 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
@@ -17,7 +17,9 @@
namespace webrtc {
AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
- ApmDataDumper* apm_data_dumper) {}
+ ApmDataDumper* apm_data_dumper)
+ : saturation_protector_(apm_data_dumper),
+ apm_data_dumper_(apm_data_dumper) {}
void AdaptiveModeLevelEstimator::UpdateEstimation(
const VadWithLevel::LevelAndProbability& vad_data) {
@@ -27,10 +29,40 @@
RTC_DCHECK_LT(vad_data.speech_peak_dbfs, 50.f);
RTC_DCHECK_GE(vad_data.speech_probability, 0.f);
RTC_DCHECK_LE(vad_data.speech_probability, 1.f);
+
+ if (vad_data.speech_probability < kVadConfidenceThreshold) {
+ DebugDumpEstimate();
+ return;
+ }
+
+ const bool buffer_is_full = buffer_size_ms_ >= kFullBufferSizeMs;
+ if (!buffer_is_full) {
+ buffer_size_ms_ += kFrameDurationMs;
+ }
+
+ const float leak_factor = buffer_is_full ? kFullBufferLeakFactor : 1.f;
+
+ estimate_numerator_ = estimate_numerator_ * leak_factor +
+ vad_data.speech_rms_dbfs * vad_data.speech_probability;
+ estimate_denominator_ =
+ estimate_denominator_ * leak_factor + vad_data.speech_probability;
+
+ last_estimate_with_offset_dbfs_ = estimate_numerator_ / estimate_denominator_;
+
+ saturation_protector_.UpdateMargin(vad_data, last_estimate_with_offset_dbfs_);
+ DebugDumpEstimate();
}
float AdaptiveModeLevelEstimator::LatestLevelEstimate() const {
- // TODO(webrtc:7494): This is a stub. Add implementation.
- return 0.f;
+ return rtc::SafeClamp<float>(
+ last_estimate_with_offset_dbfs_ + saturation_protector_.LastMargin(),
+ -90.f, 0.f);
+}
+
+void AdaptiveModeLevelEstimator::DebugDumpEstimate() {
+ apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_with_offset_dbfs",
+ last_estimate_with_offset_dbfs_);
+ apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_dbfs",
+ LatestLevelEstimate());
}
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
index b8dcf1a..dfcaa53 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
@@ -11,6 +11,7 @@
#ifndef MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_MODE_LEVEL_ESTIMATOR_H_
#define MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_MODE_LEVEL_ESTIMATOR_H_
+#include "modules/audio_processing/agc2/saturation_protector.h"
#include "modules/audio_processing/vad/vad_with_level.h"
namespace webrtc {
@@ -21,6 +22,16 @@
explicit AdaptiveModeLevelEstimator(ApmDataDumper* apm_data_dumper);
void UpdateEstimation(const VadWithLevel::LevelAndProbability& vad_data);
float LatestLevelEstimate() const;
+
+ private:
+ void DebugDumpEstimate();
+
+ int buffer_size_ms_ = 0;
+ float last_estimate_with_offset_dbfs_ = kInitialSpeechLevelEstimateDbfs;
+ float estimate_numerator_ = 0.f;
+ float estimate_denominator_ = 0.f;
+ SaturationProtector saturation_protector_;
+ ApmDataDumper* const apm_data_dumper_;
};
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
new file mode 100644
index 0000000..71909d0
--- /dev/null
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/adaptive_mode_level_estimator.h"
+
+#include "modules/audio_processing/agc2/agc2_common.h"
+#include "modules/audio_processing/logging/apm_data_dumper.h"
+#include "rtc_base/gunit.h"
+
+namespace webrtc {
+namespace {
+void RunOnConstantLevel(int num_iterations,
+ VadWithLevel::LevelAndProbability vad_data,
+ AdaptiveModeLevelEstimator* level_estimator) {
+ for (int i = 0; i < num_iterations; ++i) {
+ level_estimator->UpdateEstimation(vad_data); // By copy
+ }
+}
+} // namespace
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
+ EstimatorShouldNotCrash) {
+ ApmDataDumper apm_data_dumper(0);
+ AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+ VadWithLevel::LevelAndProbability vad_data(1.f, -20.f, -10.f);
+ level_estimator.UpdateEstimation(vad_data);
+ static_cast<void>(level_estimator.LatestLevelEstimate());
+}
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator, LevelShouldStabilize) {
+ ApmDataDumper apm_data_dumper(0);
+ AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+ constexpr float kSpeechRmsDbfs = -15.f;
+ RunOnConstantLevel(
+ 100,
+ VadWithLevel::LevelAndProbability(
+ 1.f, kSpeechRmsDbfs - kInitialSaturationMarginDb, kSpeechRmsDbfs),
+ &level_estimator);
+
+ EXPECT_NEAR(level_estimator.LatestLevelEstimate(), kSpeechRmsDbfs, 0.1f);
+}
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
+ EstimatorIgnoresZeroProbabilityFrames) {
+ ApmDataDumper apm_data_dumper(0);
+ AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+ // Run for one second of fake audio.
+ constexpr float kSpeechRmsDbfs = -25.f;
+ RunOnConstantLevel(
+ 100,
+ VadWithLevel::LevelAndProbability(
+ 1.f, kSpeechRmsDbfs - kInitialSaturationMarginDb, kSpeechRmsDbfs),
+ &level_estimator);
+
+ // Run for one more second, but mark as not speech.
+ constexpr float kNoiseRmsDbfs = 0.f;
+ RunOnConstantLevel(
+ 100, VadWithLevel::LevelAndProbability(0.f, kNoiseRmsDbfs, kNoiseRmsDbfs),
+ &level_estimator);
+
+ // Level should not have changed.
+ EXPECT_NEAR(level_estimator.LatestLevelEstimate(), kSpeechRmsDbfs, 0.1f);
+}
+
+TEST(AutomaticGainController2AdaptiveModeLevelEstimator, TimeToAdapt) {
+ ApmDataDumper apm_data_dumper(0);
+ AdaptiveModeLevelEstimator level_estimator(&apm_data_dumper);
+
+ // Run for one 'window size' interval.
+ constexpr float kInitialSpeechRmsDbfs = -30.f;
+ RunOnConstantLevel(
+ kFullBufferSizeMs / kFrameDurationMs,
+ VadWithLevel::LevelAndProbability(
+ 1.f, kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
+ kInitialSpeechRmsDbfs),
+ &level_estimator);
+
+ // Run for one half 'window size' interval. This should not be enough to
+ // adapt.
+ constexpr float kDifferentSpeechRmsDbfs = -10.f;
+ // Adaptation threshold: 25% of the size of the level change.
+ const float kMaxDifferenceDb =
+ 0.25 * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs);
+ RunOnConstantLevel(
+ static_cast<int>(kFullBufferSizeMs / kFrameDurationMs / 2),
+ VadWithLevel::LevelAndProbability(
+ 1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
+ kDifferentSpeechRmsDbfs),
+ &level_estimator);
+ EXPECT_GT(
+ std::abs(kDifferentSpeechRmsDbfs - level_estimator.LatestLevelEstimate()),
+ kMaxDifferenceDb);
+
+ // Run for some more time. Afterwards, we should have adapted.
+ RunOnConstantLevel(
+ static_cast<int>(3 * kFullBufferSizeMs / kFrameDurationMs),
+ VadWithLevel::LevelAndProbability(
+ 1.f, kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
+ kDifferentSpeechRmsDbfs),
+ &level_estimator);
+ EXPECT_NEAR(level_estimator.LatestLevelEstimate(), kDifferentSpeechRmsDbfs,
+ kMaxDifferenceDb);
+}
+
+} // namespace webrtc
diff --git a/modules/audio_processing/agc2/agc2_common.h b/modules/audio_processing/agc2/agc2_common.h
index ad0ab4e..d4aa3fb 100644
--- a/modules/audio_processing/agc2/agc2_common.h
+++ b/modules/audio_processing/agc2/agc2_common.h
@@ -27,6 +27,18 @@
constexpr float kAttackFilterConstant = 0.f;
+// Used in the Level Estimator for deciding when to update the speech
+// level estimate.
+constexpr float kVadConfidenceThreshold = 0.9f;
+
+// The amount of 'memory' of the Level Estimator. Decides leak factors.
+constexpr float kFullBufferSizeMs = 1000.f;
+constexpr float kFullBufferLeakFactor = 1.f - 1.f / kFullBufferSizeMs;
+
+constexpr float kInitialSpeechLevelEstimateDbfs = -30.f;
+
+constexpr float kInitialSaturationMarginDb = 17.f;
+
// This is computed from kDecayMs by
// 10 ** (-1/20 * subframe_duration / kDecayMs).
// |subframe_duration| is |kFrameDurationMs / kSubFramesInFrame|.
diff --git a/modules/audio_processing/agc2/noise_level_estimator.h b/modules/audio_processing/agc2/noise_level_estimator.h
index f22bfd8..f9e4abc 100644
--- a/modules/audio_processing/agc2/noise_level_estimator.h
+++ b/modules/audio_processing/agc2/noise_level_estimator.h
@@ -20,7 +20,7 @@
public:
NoiseLevelEstimator() {}
- // Returns the estimated noise level in DbFS.
+ // Returns the estimated noise level in dBFS.
float Analyze(AudioFrameView<const float> frame);
private:
diff --git a/modules/audio_processing/agc2/saturation_protector.cc b/modules/audio_processing/agc2/saturation_protector.cc
new file mode 100644
index 0000000..a6f1a83
--- /dev/null
+++ b/modules/audio_processing/agc2/saturation_protector.cc
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/saturation_protector.h"
+
+#include <algorithm>
+
+#include "modules/audio_processing/logging/apm_data_dumper.h"
+#include "rtc_base/numerics/safe_minmax.h"
+
+namespace webrtc {
+
+SaturationProtector::SaturationProtector(ApmDataDumper* apm_data_dumper) {}
+
+void SaturationProtector::UpdateMargin(
+ const VadWithLevel::LevelAndProbability& vad_data,
+ float last_speech_level_estimate) {}
+
+float SaturationProtector::LastMargin() const {
+ return kInitialSaturationMarginDb;
+}
+} // namespace webrtc
diff --git a/modules/audio_processing/agc2/saturation_protector.h b/modules/audio_processing/agc2/saturation_protector.h
new file mode 100644
index 0000000..dcf5184
--- /dev/null
+++ b/modules/audio_processing/agc2/saturation_protector.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
+#define MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
+
+#include <array>
+
+#include "modules/audio_processing/agc2/agc2_common.h"
+#include "modules/audio_processing/vad/vad_with_level.h"
+
+namespace webrtc {
+
+class ApmDataDumper;
+
+class SaturationProtector {
+ public:
+ explicit SaturationProtector(ApmDataDumper* apm_data_dumper);
+
+ // Updates the margin estimate. This method should be called
+ // whenever a frame is reliably classified as 'speech'.
+ //
+ // Nothing is returned; read the margin (in dB) via LastMargin().
+ void UpdateMargin(const VadWithLevel::LevelAndProbability& vad_data,
+ float last_speech_level_estimate_dbfs);
+
+ // Returns latest computed margin. Used in cases when speech is not
+ // detected.
+ float LastMargin() const;
+};
+
+} // namespace webrtc
+
+#endif // MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_