Divide SpeechLevelEstimator into interface and implementation
Bug: webrtc:42232605
Change-Id: I2112dccdadd163e62fa55614c2c23347a6fcd6d6
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/422061
Reviewed-by: Lionel Koenig <lionelk@webrtc.org>
Commit-Queue: Gustaf Ullberg <gustaf@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#46114}
diff --git a/modules/audio_processing/agc2/BUILD.gn b/modules/audio_processing/agc2/BUILD.gn
index 83f8889..5bcc7cd 100644
--- a/modules/audio_processing/agc2/BUILD.gn
+++ b/modules/audio_processing/agc2/BUILD.gn
@@ -12,6 +12,8 @@
sources = [
"speech_level_estimator.cc",
"speech_level_estimator.h",
+ "speech_level_estimator_impl.cc",
+ "speech_level_estimator_impl.h",
]
visibility = [
@@ -214,8 +216,6 @@
"../../../api:array_view",
"../../../api/audio:audio_processing",
"../../../rtc_base:checks",
- "../../../rtc_base:checks",
- "../../../rtc_base:gtest_prod",
"../../../rtc_base:gtest_prod",
"../../../rtc_base:logging",
"../../../rtc_base:safe_minmax",
diff --git a/modules/audio_processing/agc2/speech_level_estimator.cc b/modules/audio_processing/agc2/speech_level_estimator.cc
index 702f1fa..1e3cc02 100644
--- a/modules/audio_processing/agc2/speech_level_estimator.cc
+++ b/modules/audio_processing/agc2/speech_level_estimator.cc
@@ -10,162 +10,19 @@
#include "modules/audio_processing/agc2/speech_level_estimator.h"
+#include <memory>
+
#include "api/audio/audio_processing.h"
-#include "modules/audio_processing/agc2/agc2_common.h"
-#include "modules/audio_processing/logging/apm_data_dumper.h"
-#include "rtc_base/checks.h"
-#include "rtc_base/numerics/safe_minmax.h"
+#include "modules/audio_processing/agc2/speech_level_estimator_impl.h"
namespace webrtc {
-namespace {
-float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
- return SafeClamp<float>(level_estimate_dbfs, -90.0f, 30.0f);
-}
-
-// Returns the initial speech level estimate needed to apply the initial gain.
-float GetInitialSpeechLevelEstimateDbfs(
- const AudioProcessing::Config::GainController2::AdaptiveDigital& config) {
- return ClampLevelEstimateDbfs(-kSaturationProtectorInitialHeadroomDb -
- config.initial_gain_db - config.headroom_db);
-}
-
-} // namespace
-
-bool SpeechLevelEstimator::LevelEstimatorState::operator==(
- const SpeechLevelEstimator::LevelEstimatorState& b) const {
- return time_to_confidence_ms == b.time_to_confidence_ms &&
- level_dbfs.numerator == b.level_dbfs.numerator &&
- level_dbfs.denominator == b.level_dbfs.denominator;
-}
-
-float SpeechLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
- RTC_DCHECK_NE(denominator, 0.f);
- return numerator / denominator;
-}
-
-SpeechLevelEstimator::SpeechLevelEstimator(
+std::unique_ptr<SpeechLevelEstimator> SpeechLevelEstimator::Create(
ApmDataDumper* apm_data_dumper,
const AudioProcessing::Config::GainController2::AdaptiveDigital& config,
- int adjacent_speech_frames_threshold)
- : apm_data_dumper_(apm_data_dumper),
- initial_speech_level_dbfs_(GetInitialSpeechLevelEstimateDbfs(config)),
- adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
- level_dbfs_(initial_speech_level_dbfs_),
- // TODO(bugs.webrtc.org/7494): Remove init below when AGC2 input volume
- // controller temporal dependency removed.
- is_confident_(false) {
- RTC_DCHECK(apm_data_dumper_);
- RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
- Reset();
-}
-
-void SpeechLevelEstimator::Update(float rms_dbfs,
- float speech_probability) {
- RTC_DCHECK_GT(rms_dbfs, -150.0f);
- RTC_DCHECK_LT(rms_dbfs, 50.0f);
- RTC_DCHECK_GE(speech_probability, 0.0f);
- RTC_DCHECK_LE(speech_probability, 1.0f);
- if (speech_probability < kVadConfidenceThreshold) {
- // Not a speech frame.
- if (adjacent_speech_frames_threshold_ > 1) {
- // When two or more adjacent speech frames are required in order to update
- // the state, we need to decide whether to discard or confirm the updates
- // based on the speech sequence length.
- if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
- // First non-speech frame after a long enough sequence of speech frames.
- // Update the reliable state.
- reliable_state_ = preliminary_state_;
- } else if (num_adjacent_speech_frames_ > 0) {
- // First non-speech frame after a too short sequence of speech frames.
- // Reset to the last reliable state.
- preliminary_state_ = reliable_state_;
- }
- }
- num_adjacent_speech_frames_ = 0;
- } else {
- // Speech frame observed.
- num_adjacent_speech_frames_++;
-
- // Update preliminary level estimate.
- RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
- const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0;
- if (!buffer_is_full) {
- preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
- }
- // Weighted average of levels with speech probability as weight.
- RTC_DCHECK_GT(speech_probability, 0.0f);
- const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.0f;
- preliminary_state_.level_dbfs.numerator =
- preliminary_state_.level_dbfs.numerator * leak_factor +
- rms_dbfs * speech_probability;
- preliminary_state_.level_dbfs.denominator =
- preliminary_state_.level_dbfs.denominator * leak_factor +
- speech_probability;
-
- const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();
-
- if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
- // `preliminary_state_` is now reliable. Update the last level estimation.
- level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs);
- }
- }
- UpdateIsConfident();
- DumpDebugData();
-}
-
-void SpeechLevelEstimator::UpdateIsConfident() {
- if (adjacent_speech_frames_threshold_ == 1) {
- // Ignore `reliable_state_` when a single frame is enough to update the
- // level estimate (because it is not used).
- is_confident_ = preliminary_state_.time_to_confidence_ms == 0;
- return;
- }
- // Once confident, it remains confident.
- RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
- preliminary_state_.time_to_confidence_ms == 0);
- // During the first long enough speech sequence, `reliable_state_` must be
- // ignored since `preliminary_state_` is used.
- is_confident_ =
- reliable_state_.time_to_confidence_ms == 0 ||
- (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ &&
- preliminary_state_.time_to_confidence_ms == 0);
-}
-
-void SpeechLevelEstimator::Reset() {
- ResetLevelEstimatorState(preliminary_state_);
- ResetLevelEstimatorState(reliable_state_);
- level_dbfs_ = initial_speech_level_dbfs_;
- num_adjacent_speech_frames_ = 0;
-}
-
-void SpeechLevelEstimator::ResetLevelEstimatorState(
- LevelEstimatorState& state) const {
- state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
- state.level_dbfs.numerator = initial_speech_level_dbfs_;
- state.level_dbfs.denominator = 1.0f;
-}
-
-void SpeechLevelEstimator::DumpDebugData() const {
- if (!apm_data_dumper_)
- return;
- apm_data_dumper_->DumpRaw("agc2_speech_level_dbfs", level_dbfs_);
- apm_data_dumper_->DumpRaw("agc2_speech_level_is_confident", is_confident_);
- apm_data_dumper_->DumpRaw(
- "agc2_adaptive_level_estimator_num_adjacent_speech_frames",
- num_adjacent_speech_frames_);
- apm_data_dumper_->DumpRaw(
- "agc2_adaptive_level_estimator_preliminary_level_estimate_num",
- preliminary_state_.level_dbfs.numerator);
- apm_data_dumper_->DumpRaw(
- "agc2_adaptive_level_estimator_preliminary_level_estimate_den",
- preliminary_state_.level_dbfs.denominator);
- apm_data_dumper_->DumpRaw(
- "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
- preliminary_state_.time_to_confidence_ms);
- apm_data_dumper_->DumpRaw(
- "agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
- reliable_state_.time_to_confidence_ms);
+ int adjacent_speech_frames_threshold) {
+ return std::make_unique<SpeechLevelEstimatorImpl>(
+ apm_data_dumper, config, adjacent_speech_frames_threshold);
}
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/speech_level_estimator.h b/modules/audio_processing/agc2/speech_level_estimator.h
index 514b21c..50c7c1e 100644
--- a/modules/audio_processing/agc2/speech_level_estimator.h
+++ b/modules/audio_processing/agc2/speech_level_estimator.h
@@ -11,7 +11,7 @@
#ifndef MODULES_AUDIO_PROCESSING_AGC2_SPEECH_LEVEL_ESTIMATOR_H_
#define MODULES_AUDIO_PROCESSING_AGC2_SPEECH_LEVEL_ESTIMATOR_H_
-#include <type_traits>
+#include <memory>
#include "api/audio/audio_processing.h"
@@ -19,58 +19,23 @@
class ApmDataDumper;
// Active speech level estimator based on the analysis of the following
-// framewise properties: RMS level (dBFS), peak level (dBFS), speech
-// probability.
+// framewise properties: RMS level (dBFS), speech probability.
class SpeechLevelEstimator {
public:
- SpeechLevelEstimator(
+ virtual ~SpeechLevelEstimator() {}
+ // Updates the level estimation.
+ virtual void Update(float rms_dbfs, float speech_probability) = 0;
+ // Returns the estimated speech plus noise level.
+ virtual float GetLevelDbfs() const = 0;
+ // Returns true if the estimator is confident on its current estimate.
+ virtual bool IsConfident() const = 0;
+
+ virtual void Reset() = 0;
+
+ static std::unique_ptr<SpeechLevelEstimator> Create(
ApmDataDumper* apm_data_dumper,
const AudioProcessing::Config::GainController2::AdaptiveDigital& config,
int adjacent_speech_frames_threshold);
- SpeechLevelEstimator(const SpeechLevelEstimator&) = delete;
- SpeechLevelEstimator& operator=(const SpeechLevelEstimator&) = delete;
-
- // Updates the level estimation.
- void Update(float rms_dbfs, float speech_probability);
- // Returns the estimated speech plus noise level.
- float level_dbfs() const { return level_dbfs_; }
- // Returns true if the estimator is confident on its current estimate.
- bool is_confident() const { return is_confident_; }
-
- void Reset();
-
- private:
- // Part of the level estimator state used for check-pointing and restore ops.
- struct LevelEstimatorState {
- bool operator==(const LevelEstimatorState& s) const;
- inline bool operator!=(const LevelEstimatorState& s) const {
- return !(*this == s);
- }
- // TODO(bugs.webrtc.org/7494): Remove `time_to_confidence_ms` if redundant.
- int time_to_confidence_ms;
- struct Ratio {
- float numerator;
- float denominator;
- float GetRatio() const;
- } level_dbfs;
- };
- static_assert(std::is_trivially_copyable<LevelEstimatorState>::value, "");
-
- void UpdateIsConfident();
-
- void ResetLevelEstimatorState(LevelEstimatorState& state) const;
-
- void DumpDebugData() const;
-
- ApmDataDumper* const apm_data_dumper_;
-
- const float initial_speech_level_dbfs_;
- const int adjacent_speech_frames_threshold_;
- LevelEstimatorState preliminary_state_;
- LevelEstimatorState reliable_state_;
- float level_dbfs_;
- bool is_confident_;
- int num_adjacent_speech_frames_;
};
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/speech_level_estimator_impl.cc b/modules/audio_processing/agc2/speech_level_estimator_impl.cc
new file mode 100644
index 0000000..a802111
--- /dev/null
+++ b/modules/audio_processing/agc2/speech_level_estimator_impl.cc
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2025 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/speech_level_estimator_impl.h"
+
+#include "api/audio/audio_processing.h"
+#include "modules/audio_processing/agc2/agc2_common.h"
+#include "modules/audio_processing/logging/apm_data_dumper.h"
+#include "rtc_base/checks.h"
+#include "rtc_base/numerics/safe_minmax.h"
+
+namespace webrtc {
+namespace {
+
+float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
+ return SafeClamp<float>(level_estimate_dbfs, -90.0f, 30.0f);
+}
+
+// Returns the initial speech level estimate needed to apply the initial gain.
+float GetInitialSpeechLevelEstimateDbfs(
+ const AudioProcessing::Config::GainController2::AdaptiveDigital& config) {
+ return ClampLevelEstimateDbfs(-kSaturationProtectorInitialHeadroomDb -
+ config.initial_gain_db - config.headroom_db);
+}
+
+} // namespace
+
+float SpeechLevelEstimatorImpl::LevelEstimatorState::Ratio::GetRatio() const {
+ RTC_DCHECK_NE(denominator, 0.f);
+ return numerator / denominator;
+}
+
+SpeechLevelEstimatorImpl::SpeechLevelEstimatorImpl(
+ ApmDataDumper* apm_data_dumper,
+ const AudioProcessing::Config::GainController2::AdaptiveDigital& config,
+ int adjacent_speech_frames_threshold)
+ : apm_data_dumper_(apm_data_dumper),
+ initial_speech_level_dbfs_(GetInitialSpeechLevelEstimateDbfs(config)),
+ adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
+ level_dbfs_(initial_speech_level_dbfs_),
+ // TODO(bugs.webrtc.org/7494): Remove init below when AGC2 input volume
+ // controller temporal dependency removed.
+ is_confident_(false) {
+ RTC_DCHECK(apm_data_dumper_);
+ RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
+ Reset();
+}
+
+void SpeechLevelEstimatorImpl::Update(float rms_dbfs,
+ float speech_probability) {
+ RTC_DCHECK_GT(rms_dbfs, -150.0f);
+ RTC_DCHECK_LT(rms_dbfs, 50.0f);
+ RTC_DCHECK_GE(speech_probability, 0.0f);
+ RTC_DCHECK_LE(speech_probability, 1.0f);
+ if (speech_probability < kVadConfidenceThreshold) {
+ // Not a speech frame.
+ if (adjacent_speech_frames_threshold_ > 1) {
+ // When two or more adjacent speech frames are required in order to update
+ // the state, we need to decide whether to discard or confirm the updates
+ // based on the speech sequence length.
+ if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
+ // First non-speech frame after a long enough sequence of speech frames.
+ // Update the reliable state.
+ reliable_state_ = preliminary_state_;
+ } else if (num_adjacent_speech_frames_ > 0) {
+ // First non-speech frame after a too short sequence of speech frames.
+ // Reset to the last reliable state.
+ preliminary_state_ = reliable_state_;
+ }
+ }
+ num_adjacent_speech_frames_ = 0;
+ } else {
+ // Speech frame observed.
+ num_adjacent_speech_frames_++;
+
+ // Update preliminary level estimate.
+ RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
+ const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0;
+ if (!buffer_is_full) {
+ preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
+ }
+ // Weighted average of levels with speech probability as weight.
+ RTC_DCHECK_GT(speech_probability, 0.0f);
+ const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.0f;
+ preliminary_state_.level_dbfs.numerator =
+ preliminary_state_.level_dbfs.numerator * leak_factor +
+ rms_dbfs * speech_probability;
+ preliminary_state_.level_dbfs.denominator =
+ preliminary_state_.level_dbfs.denominator * leak_factor +
+ speech_probability;
+
+ const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();
+
+ if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
+ // `preliminary_state_` is now reliable. Update the last level estimation.
+ level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs);
+ }
+ }
+ UpdateIsConfident();
+ DumpDebugData();
+}
+
+void SpeechLevelEstimatorImpl::UpdateIsConfident() {
+ if (adjacent_speech_frames_threshold_ == 1) {
+ // Ignore `reliable_state_` when a single frame is enough to update the
+ // level estimate (because it is not used).
+ is_confident_ = preliminary_state_.time_to_confidence_ms == 0;
+ return;
+ }
+ // Once confident, it remains confident.
+ RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
+ preliminary_state_.time_to_confidence_ms == 0);
+ // During the first long enough speech sequence, `reliable_state_` must be
+ // ignored since `preliminary_state_` is used.
+ is_confident_ =
+ reliable_state_.time_to_confidence_ms == 0 ||
+ (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ &&
+ preliminary_state_.time_to_confidence_ms == 0);
+}
+
+void SpeechLevelEstimatorImpl::Reset() {
+ ResetLevelEstimatorState(preliminary_state_);
+ ResetLevelEstimatorState(reliable_state_);
+ level_dbfs_ = initial_speech_level_dbfs_;
+ num_adjacent_speech_frames_ = 0;
+}
+
+void SpeechLevelEstimatorImpl::ResetLevelEstimatorState(
+ LevelEstimatorState& state) const {
+ state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
+ state.level_dbfs.numerator = initial_speech_level_dbfs_;
+ state.level_dbfs.denominator = 1.0f;
+}
+
+void SpeechLevelEstimatorImpl::DumpDebugData() const {
+ if (!apm_data_dumper_)
+ return;
+ apm_data_dumper_->DumpRaw("agc2_speech_level_dbfs", level_dbfs_);
+ apm_data_dumper_->DumpRaw("agc2_speech_level_is_confident", is_confident_);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_level_estimator_num_adjacent_speech_frames",
+ num_adjacent_speech_frames_);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_level_estimator_preliminary_level_estimate_num",
+ preliminary_state_.level_dbfs.numerator);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_level_estimator_preliminary_level_estimate_den",
+ preliminary_state_.level_dbfs.denominator);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
+ preliminary_state_.time_to_confidence_ms);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
+ reliable_state_.time_to_confidence_ms);
+}
+
+} // namespace webrtc
diff --git a/modules/audio_processing/agc2/speech_level_estimator_impl.h b/modules/audio_processing/agc2/speech_level_estimator_impl.h
new file mode 100644
index 0000000..68c62d0
--- /dev/null
+++ b/modules/audio_processing/agc2/speech_level_estimator_impl.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2025 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_AGC2_SPEECH_LEVEL_ESTIMATOR_IMPL_H_
+#define MODULES_AUDIO_PROCESSING_AGC2_SPEECH_LEVEL_ESTIMATOR_IMPL_H_
+
+#include <type_traits>
+
+#include "api/audio/audio_processing.h"
+#include "modules/audio_processing/agc2/speech_level_estimator.h"
+
+namespace webrtc {
+class ApmDataDumper;
+
+class SpeechLevelEstimatorImpl : public SpeechLevelEstimator {
+ public:
+ SpeechLevelEstimatorImpl(
+ ApmDataDumper* apm_data_dumper,
+ const AudioProcessing::Config::GainController2::AdaptiveDigital& config,
+ int adjacent_speech_frames_threshold);
+ explicit SpeechLevelEstimatorImpl(const SpeechLevelEstimatorImpl&) = delete;
+ SpeechLevelEstimatorImpl& operator=(const SpeechLevelEstimatorImpl&) = delete;
+
+ // Updates the level estimation.
+ void Update(float rms_dbfs, float speech_probability) override;
+ // Returns the estimated speech plus noise level.
+ float GetLevelDbfs() const override { return level_dbfs_; }
+ // Returns true if the estimator is confident in its current estimate.
+ bool IsConfident() const override { return is_confident_; }
+
+ void Reset() override;
+
+ private:
+ // Part of the level estimator state used for check-pointing and restore ops.
+ struct LevelEstimatorState {
+ // TODO(bugs.webrtc.org/7494): Remove `time_to_confidence_ms` if redundant.
+ int time_to_confidence_ms;
+ struct Ratio {
+ float numerator;
+ float denominator;
+ float GetRatio() const;
+ } level_dbfs;
+ };
+ static_assert(std::is_trivially_copyable<LevelEstimatorState>::value, "");
+
+ void UpdateIsConfident();
+
+ void ResetLevelEstimatorState(LevelEstimatorState& state) const;
+
+ void DumpDebugData() const;
+
+ ApmDataDumper* const apm_data_dumper_;
+
+ const float initial_speech_level_dbfs_;
+ const int adjacent_speech_frames_threshold_;
+ LevelEstimatorState preliminary_state_;
+ LevelEstimatorState reliable_state_;
+ float level_dbfs_;
+ bool is_confident_;
+ int num_adjacent_speech_frames_;
+};
+
+} // namespace webrtc
+
+#endif // MODULES_AUDIO_PROCESSING_AGC2_SPEECH_LEVEL_ESTIMATOR_IMPL_H_
diff --git a/modules/audio_processing/agc2/speech_level_estimator_unittest.cc b/modules/audio_processing/agc2/speech_level_estimator_unittest.cc
index eb466d1..f9387f2c 100644
--- a/modules/audio_processing/agc2/speech_level_estimator_unittest.cc
+++ b/modules/audio_processing/agc2/speech_level_estimator_unittest.cc
@@ -8,12 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "modules/audio_processing/agc2/speech_level_estimator.h"
-
#include <memory>
#include "api/audio/audio_processing.h"
#include "modules/audio_processing/agc2/agc2_common.h"
+#include "modules/audio_processing/agc2/speech_level_estimator_impl.h"
#include "modules/audio_processing/logging/apm_data_dumper.h"
#include "rtc_base/checks.h"
#include "test/gtest.h"
@@ -36,7 +35,7 @@
void RunOnConstantLevel(int num_iterations,
float rms_dbfs,
float speech_probability,
- SpeechLevelEstimator& level_estimator) {
+ SpeechLevelEstimatorImpl& level_estimator) {
for (int i = 0; i < num_iterations; ++i) {
level_estimator.Update(rms_dbfs, speech_probability);
}
@@ -50,11 +49,11 @@
struct TestLevelEstimator {
explicit TestLevelEstimator(int adjacent_speech_frames_threshold)
: data_dumper(0),
- estimator(std::make_unique<SpeechLevelEstimator>(
+ estimator(std::make_unique<SpeechLevelEstimatorImpl>(
&data_dumper,
AdaptiveDigitalConfig{},
adjacent_speech_frames_threshold)),
- initial_speech_level_dbfs(estimator->level_dbfs()),
+ initial_speech_level_dbfs(estimator->GetLevelDbfs()),
level_rms_dbfs(initial_speech_level_dbfs / 2.0f),
level_peak_dbfs(initial_speech_level_dbfs / 3.0f) {
RTC_DCHECK_LT(level_rms_dbfs, level_peak_dbfs);
@@ -64,7 +63,7 @@
"level is wide enough for the tests";
}
ApmDataDumper data_dumper;
- std::unique_ptr<SpeechLevelEstimator> estimator;
+ std::unique_ptr<SpeechLevelEstimatorImpl> estimator;
const float initial_speech_level_dbfs;
const float level_rms_dbfs;
const float level_peak_dbfs;
@@ -76,10 +75,10 @@
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
level_estimator.level_rms_dbfs, kMaxSpeechProbability,
*level_estimator.estimator);
- const float estimated_level_dbfs = level_estimator.estimator->level_dbfs();
+ const float estimated_level_dbfs = level_estimator.estimator->GetLevelDbfs();
RunOnConstantLevel(/*num_iterations=*/1, level_estimator.level_rms_dbfs,
kMaxSpeechProbability, *level_estimator.estimator);
- EXPECT_NEAR(level_estimator.estimator->level_dbfs(), estimated_level_dbfs,
+ EXPECT_NEAR(level_estimator.estimator->GetLevelDbfs(), estimated_level_dbfs,
0.1f);
}
@@ -90,7 +89,7 @@
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence / 2,
level_estimator.level_rms_dbfs, kMaxSpeechProbability,
*level_estimator.estimator);
- EXPECT_FALSE(level_estimator.estimator->is_confident());
+ EXPECT_FALSE(level_estimator.estimator->IsConfident());
}
// Checks that the level controller becomes confident when enough speech frames
@@ -100,7 +99,7 @@
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
level_estimator.level_rms_dbfs, kMaxSpeechProbability,
*level_estimator.estimator);
- EXPECT_TRUE(level_estimator.estimator->is_confident());
+ EXPECT_TRUE(level_estimator.estimator->IsConfident());
}
// Checks that the estimated level is not affected by the level of non-speech
@@ -111,13 +110,13 @@
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
level_estimator.level_rms_dbfs, kMaxSpeechProbability,
*level_estimator.estimator);
- const float estimated_level_dbfs = level_estimator.estimator->level_dbfs();
+ const float estimated_level_dbfs = level_estimator.estimator->GetLevelDbfs();
// Simulate full-scale non-speech.
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
/*rms_dbfs=*/0.0f, kNoSpeechProbability,
*level_estimator.estimator);
// No estimated level change is expected.
- EXPECT_FLOAT_EQ(level_estimator.estimator->level_dbfs(),
+ EXPECT_FLOAT_EQ(level_estimator.estimator->GetLevelDbfs(),
estimated_level_dbfs);
}
@@ -127,7 +126,7 @@
RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
level_estimator.level_rms_dbfs, kMaxSpeechProbability,
*level_estimator.estimator);
- EXPECT_NEAR(level_estimator.estimator->level_dbfs(),
+ EXPECT_NEAR(level_estimator.estimator->GetLevelDbfs(),
level_estimator.level_rms_dbfs,
kConvergenceSpeedTestsLevelTolerance);
}
@@ -141,9 +140,9 @@
/*rms_dbfs=*/level_estimator.initial_speech_level_dbfs,
kMaxSpeechProbability, *level_estimator.estimator);
// No estimate change should occur, but confidence is achieved.
- ASSERT_FLOAT_EQ(level_estimator.estimator->level_dbfs(),
+ ASSERT_FLOAT_EQ(level_estimator.estimator->GetLevelDbfs(),
level_estimator.initial_speech_level_dbfs);
- ASSERT_TRUE(level_estimator.estimator->is_confident());
+ ASSERT_TRUE(level_estimator.estimator->IsConfident());
// After confidence.
constexpr float kConvergenceTimeAfterConfidenceNumFrames = 700; // 7 seconds.
static_assert(
@@ -152,7 +151,7 @@
/*num_iterations=*/kConvergenceTimeAfterConfidenceNumFrames,
level_estimator.level_rms_dbfs, kMaxSpeechProbability,
*level_estimator.estimator);
- EXPECT_NEAR(level_estimator.estimator->level_dbfs(),
+ EXPECT_NEAR(level_estimator.estimator->GetLevelDbfs(),
level_estimator.level_rms_dbfs,
kConvergenceSpeedTestsLevelTolerance);
}
@@ -165,28 +164,28 @@
TEST_P(SpeechLevelEstimatorParametrization, DoNotAdaptToShortSpeechSegments) {
TestLevelEstimator level_estimator(adjacent_speech_frames_threshold());
- const float initial_level = level_estimator.estimator->level_dbfs();
+ const float initial_level = level_estimator.estimator->GetLevelDbfs();
ASSERT_LT(initial_level, level_estimator.level_peak_dbfs);
for (int i = 0; i < adjacent_speech_frames_threshold() - 1; ++i) {
SCOPED_TRACE(i);
level_estimator.estimator->Update(level_estimator.level_rms_dbfs,
kMaxSpeechProbability);
- EXPECT_EQ(initial_level, level_estimator.estimator->level_dbfs());
+ EXPECT_EQ(initial_level, level_estimator.estimator->GetLevelDbfs());
}
level_estimator.estimator->Update(level_estimator.level_rms_dbfs,
kLowSpeechProbability);
- EXPECT_EQ(initial_level, level_estimator.estimator->level_dbfs());
+ EXPECT_EQ(initial_level, level_estimator.estimator->GetLevelDbfs());
}
TEST_P(SpeechLevelEstimatorParametrization, AdaptToEnoughSpeechSegments) {
TestLevelEstimator level_estimator(adjacent_speech_frames_threshold());
- const float initial_level = level_estimator.estimator->level_dbfs();
+ const float initial_level = level_estimator.estimator->GetLevelDbfs();
ASSERT_LT(initial_level, level_estimator.level_peak_dbfs);
for (int i = 0; i < adjacent_speech_frames_threshold(); ++i) {
level_estimator.estimator->Update(level_estimator.level_rms_dbfs,
kMaxSpeechProbability);
}
- EXPECT_LT(initial_level, level_estimator.estimator->level_dbfs());
+ EXPECT_LT(initial_level, level_estimator.estimator->GetLevelDbfs());
}
INSTANTIATE_TEST_SUITE_P(GainController2,
diff --git a/modules/audio_processing/gain_controller2.cc b/modules/audio_processing/gain_controller2.cc
index 98deda4..ba4b61d 100644
--- a/modules/audio_processing/gain_controller2.cc
+++ b/modules/audio_processing/gain_controller2.cc
@@ -117,7 +117,7 @@
if (config.input_volume_controller.enabled ||
config.adaptive_digital.enabled) {
// Create dependencies.
- speech_level_estimator_ = std::make_unique<SpeechLevelEstimator>(
+ speech_level_estimator_ = SpeechLevelEstimator::Create(
&data_dumper_, config.adaptive_digital, kAdjacentSpeechFramesThreshold);
if (use_internal_vad)
vad_ = std::make_unique<VoiceActivityDetectorWrapper>(
@@ -218,8 +218,8 @@
if (speech_level_estimator_) {
speech_level_estimator_->Update(audio_levels.rms_dbfs, speech_probability);
speech_level =
- SpeechLevel{.is_confident = speech_level_estimator_->is_confident(),
- .rms_dbfs = speech_level_estimator_->level_dbfs()};
+ SpeechLevel{.is_confident = speech_level_estimator_->IsConfident(),
+ .rms_dbfs = speech_level_estimator_->GetLevelDbfs()};
}
// Update the recommended input volume.