AGC2: retuning and large refactoring
- Bug fix: the desired initial gain quickly dropped to 0 dB, causing
calls to start at too low a level
- New tuning to make AGC2 more robust to VAD mistakes
- Smarter max gain increase speed: with a higher threshold of adjacent
speech frames, more time is spent waiting for enough speech frames in a
row to be observed; to compensate, the gain applier temporarily allows
a faster gain increase once that threshold is reached
- Saturation protector isolated from `AdaptiveModeLevelEstimator` to
simplify the unit tests for the latter (non bit-exact change)
- AGC2 adaptive digital config: unnecessary params deprecated
- Code readability improvements
- Data dumps clean-up and better naming
Bug: webrtc:7494
Change-Id: I4e36059bdf2566cc2a7e1a7e95b7430ba9ae9844
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/215140
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Jesus de Vicente Pena <devicentepena@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#33736}
diff --git a/modules/audio_processing/agc2/BUILD.gn b/modules/audio_processing/agc2/BUILD.gn
index 910b58c..4c6cfab 100644
--- a/modules/audio_processing/agc2/BUILD.gn
+++ b/modules/audio_processing/agc2/BUILD.gn
@@ -25,6 +25,8 @@
"adaptive_mode_level_estimator.h",
"saturation_protector.cc",
"saturation_protector.h",
+ "saturation_protector_buffer.cc",
+ "saturation_protector_buffer.h",
]
configs += [ "..:apm_debug_dump" ]
@@ -177,6 +179,7 @@
"adaptive_digital_gain_applier_unittest.cc",
"adaptive_mode_level_estimator_unittest.cc",
"gain_applier_unittest.cc",
+ "saturation_protector_buffer_unittest.cc",
"saturation_protector_unittest.cc",
]
deps = [
diff --git a/modules/audio_processing/agc2/adaptive_agc.cc b/modules/audio_processing/agc2/adaptive_agc.cc
index 37f11d2..8bf192e 100644
--- a/modules/audio_processing/agc2/adaptive_agc.cc
+++ b/modules/audio_processing/agc2/adaptive_agc.cc
@@ -25,15 +25,6 @@
using NoiseEstimatorType =
AudioProcessing::Config::GainController2::NoiseEstimator;
-void DumpDebugData(const AdaptiveDigitalGainApplier::FrameInfo& info,
- ApmDataDumper& dumper) {
- dumper.DumpRaw("agc2_vad_probability", info.vad_result.speech_probability);
- dumper.DumpRaw("agc2_vad_rms_dbfs", info.vad_result.rms_dbfs);
- dumper.DumpRaw("agc2_vad_peak_dbfs", info.vad_result.peak_dbfs);
- dumper.DumpRaw("agc2_noise_estimate_dbfs", info.input_noise_level_dbfs);
- dumper.DumpRaw("agc2_last_limiter_audio_level", info.limiter_envelope_dbfs);
-}
-
constexpr int kGainApplierAdjacentSpeechFramesThreshold = 1;
constexpr float kMaxGainChangePerSecondDb = 3.0f;
constexpr float kMaxOutputNoiseLevelDbfs = -50.0f;
@@ -72,36 +63,42 @@
AdaptiveAgc::AdaptiveAgc(ApmDataDumper* apm_data_dumper)
: speech_level_estimator_(apm_data_dumper),
- gain_applier_(apm_data_dumper,
- kGainApplierAdjacentSpeechFramesThreshold,
- kMaxGainChangePerSecondDb,
- kMaxOutputNoiseLevelDbfs),
+ gain_controller_(apm_data_dumper,
+ kGainApplierAdjacentSpeechFramesThreshold,
+ kMaxGainChangePerSecondDb,
+ kMaxOutputNoiseLevelDbfs),
apm_data_dumper_(apm_data_dumper),
noise_level_estimator_(
CreateNoiseLevelEstimator(kDefaultNoiseLevelEstimatorType,
+ apm_data_dumper)),
+ saturation_protector_(
+ CreateSaturationProtector(kSaturationProtectorInitialHeadroomDb,
+ kSaturationProtectorExtraHeadroomDb,
+ kGainApplierAdjacentSpeechFramesThreshold,
apm_data_dumper)) {
RTC_DCHECK(apm_data_dumper);
}
AdaptiveAgc::AdaptiveAgc(ApmDataDumper* apm_data_dumper,
const AdaptiveDigitalConfig& config)
- : speech_level_estimator_(
- apm_data_dumper,
- config.level_estimator,
- config.level_estimator_adjacent_speech_frames_threshold,
- config.initial_saturation_margin_db,
- config.extra_saturation_margin_db),
- vad_(config.vad_reset_period_ms,
- config.vad_probability_attack,
- GetAllowedCpuFeatures(config)),
- gain_applier_(apm_data_dumper,
- config.gain_applier_adjacent_speech_frames_threshold,
- config.max_gain_change_db_per_second,
- config.max_output_noise_level_dbfs),
+ : speech_level_estimator_(apm_data_dumper,
+ config.adjacent_speech_frames_threshold),
+ vad_(config.vad_reset_period_ms, GetAllowedCpuFeatures(config)),
+ gain_controller_(apm_data_dumper,
+ config.adjacent_speech_frames_threshold,
+ config.max_gain_change_db_per_second,
+ config.max_output_noise_level_dbfs),
apm_data_dumper_(apm_data_dumper),
noise_level_estimator_(
- CreateNoiseLevelEstimator(config.noise_estimator, apm_data_dumper)) {
+ CreateNoiseLevelEstimator(config.noise_estimator, apm_data_dumper)),
+ saturation_protector_(
+ CreateSaturationProtector(kSaturationProtectorInitialHeadroomDb,
+ kSaturationProtectorExtraHeadroomDb,
+ config.adjacent_speech_frames_threshold,
+ apm_data_dumper)) {
RTC_DCHECK(apm_data_dumper);
+ RTC_DCHECK(noise_level_estimator_);
+ RTC_DCHECK(saturation_protector_);
if (!config.use_saturation_protector) {
RTC_LOG(LS_WARNING) << "The saturation protector cannot be disabled.";
}
@@ -111,19 +108,39 @@
void AdaptiveAgc::Process(AudioFrameView<float> frame, float limiter_envelope) {
AdaptiveDigitalGainApplier::FrameInfo info;
- info.vad_result = vad_.AnalyzeFrame(frame);
- speech_level_estimator_.Update(info.vad_result);
- info.input_level_dbfs = speech_level_estimator_.level_dbfs();
- info.input_noise_level_dbfs = noise_level_estimator_->Analyze(frame);
- info.limiter_envelope_dbfs =
- limiter_envelope > 0 ? FloatS16ToDbfs(limiter_envelope) : -90.0f;
- info.estimate_is_confident = speech_level_estimator_.IsConfident();
- DumpDebugData(info, *apm_data_dumper_);
- gain_applier_.Process(info, frame);
+
+ VadLevelAnalyzer::Result vad_result = vad_.AnalyzeFrame(frame);
+ info.speech_probability = vad_result.speech_probability;
+ apm_data_dumper_->DumpRaw("agc2_speech_probability",
+ vad_result.speech_probability);
+ apm_data_dumper_->DumpRaw("agc2_input_rms_dbfs", vad_result.rms_dbfs);
+ apm_data_dumper_->DumpRaw("agc2_input_peak_dbfs", vad_result.peak_dbfs);
+
+ speech_level_estimator_.Update(vad_result);
+ info.speech_level_dbfs = speech_level_estimator_.level_dbfs();
+ info.speech_level_reliable = speech_level_estimator_.IsConfident();
+ apm_data_dumper_->DumpRaw("agc2_speech_level_dbfs", info.speech_level_dbfs);
+ apm_data_dumper_->DumpRaw("agc2_speech_level_reliable",
+ info.speech_level_reliable);
+
+ info.noise_rms_dbfs = noise_level_estimator_->Analyze(frame);
+ apm_data_dumper_->DumpRaw("agc2_noise_rms_dbfs", info.noise_rms_dbfs);
+
+ saturation_protector_->Analyze(info.speech_probability, vad_result.peak_dbfs,
+ info.speech_level_dbfs);
+ info.headroom_db = saturation_protector_->HeadroomDb();
+ apm_data_dumper_->DumpRaw("agc2_headroom_db", info.headroom_db);
+
+ info.limiter_envelope_dbfs = FloatS16ToDbfs(limiter_envelope);
+ apm_data_dumper_->DumpRaw("agc2_limiter_envelope_dbfs",
+ info.limiter_envelope_dbfs);
+
+ gain_controller_.Process(info, frame);
}
-void AdaptiveAgc::Reset() {
+void AdaptiveAgc::HandleInputGainChange() {
speech_level_estimator_.Reset();
+ saturation_protector_->Reset();
}
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/adaptive_agc.h b/modules/audio_processing/agc2/adaptive_agc.h
index 525cab7..fe81444 100644
--- a/modules/audio_processing/agc2/adaptive_agc.h
+++ b/modules/audio_processing/agc2/adaptive_agc.h
@@ -16,6 +16,7 @@
#include "modules/audio_processing/agc2/adaptive_digital_gain_applier.h"
#include "modules/audio_processing/agc2/adaptive_mode_level_estimator.h"
#include "modules/audio_processing/agc2/noise_level_estimator.h"
+#include "modules/audio_processing/agc2/saturation_protector.h"
#include "modules/audio_processing/agc2/vad_with_level.h"
#include "modules/audio_processing/include/audio_frame_view.h"
#include "modules/audio_processing/include/audio_processing.h"
@@ -38,14 +39,17 @@
// account the envelope measured by the limiter.
// TODO(crbug.com/webrtc/7494): Make the class depend on the limiter.
void Process(AudioFrameView<float> frame, float limiter_envelope);
- void Reset();
+
+ // Handles a gain change applied to the input signal (e.g., analog gain).
+ void HandleInputGainChange();
private:
AdaptiveModeLevelEstimator speech_level_estimator_;
VadLevelAnalyzer vad_;
- AdaptiveDigitalGainApplier gain_applier_;
+ AdaptiveDigitalGainApplier gain_controller_;
ApmDataDumper* const apm_data_dumper_;
std::unique_ptr<NoiseLevelEstimator> noise_level_estimator_;
+ std::unique_ptr<SaturationProtector> saturation_protector_;
};
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc b/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc
index 8a56464..8a8a7fd 100644
--- a/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc
+++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier.cc
@@ -23,6 +23,9 @@
namespace webrtc {
namespace {
+constexpr int kHeadroomHistogramMin = 0;
+constexpr int kHeadroomHistogramMax = 50;
+
// This function maps input level to desired applied gain. We want to
// boost the signal so that peaks are at -kHeadroomDbfs. We can't
// apply more than kMaxGainDb gain.
@@ -31,17 +34,13 @@
if (input_level_dbfs < -(kHeadroomDbfs + kMaxGainDb)) {
return kMaxGainDb;
}
-
// We expect to end up here most of the time: the level is below
// -headroom, but we can boost it to -headroom.
if (input_level_dbfs < -kHeadroomDbfs) {
return -kHeadroomDbfs - input_level_dbfs;
}
-
- // Otherwise, the level is too high and we can't boost. The
- // LevelEstimator is responsible for not reporting bogus gain
- // values.
- RTC_DCHECK_LE(input_level_dbfs, 0.f);
+ // Otherwise, the level is too high and we can't boost.
+ RTC_DCHECK_GE(input_level_dbfs, -kHeadroomDbfs);
return 0.f;
}
@@ -52,10 +51,11 @@
float input_noise_level_dbfs,
float max_output_noise_level_dbfs,
ApmDataDumper& apm_data_dumper) {
- const float noise_headroom_db =
+ const float max_allowed_gain_db =
max_output_noise_level_dbfs - input_noise_level_dbfs;
- apm_data_dumper.DumpRaw("agc2_noise_headroom_db", noise_headroom_db);
- return std::min(target_gain, std::max(noise_headroom_db, 0.f));
+ apm_data_dumper.DumpRaw("agc2_adaptive_gain_applier_max_allowed_gain_db",
+ max_allowed_gain_db);
+ return std::min(target_gain, std::max(max_allowed_gain_db, 0.f));
}
float LimitGainByLowConfidence(float target_gain,
@@ -68,8 +68,8 @@
}
const float limiter_level_before_gain = limiter_audio_level_dbfs - last_gain;
- // Compute a new gain so that limiter_level_before_gain + new_gain <=
- // kLimiterThreshold.
+ // Compute a new gain so that `limiter_level_before_gain` + `new_target_gain`
+ // is not greater than `kLimiterThresholdForAgcGainDbfs`.
const float new_target_gain = std::max(
kLimiterThresholdForAgcGainDbfs - limiter_level_before_gain, 0.f);
return std::min(new_target_gain, target_gain);
@@ -80,13 +80,16 @@
float ComputeGainChangeThisFrameDb(float target_gain_db,
float last_gain_db,
bool gain_increase_allowed,
- float max_gain_change_db) {
+ float max_gain_decrease_db,
+ float max_gain_increase_db) {
+ RTC_DCHECK_GT(max_gain_decrease_db, 0);
+ RTC_DCHECK_GT(max_gain_increase_db, 0);
float target_gain_difference_db = target_gain_db - last_gain_db;
if (!gain_increase_allowed) {
target_gain_difference_db = std::min(target_gain_difference_db, 0.f);
}
- return rtc::SafeClamp(target_gain_difference_db, -max_gain_change_db,
- max_gain_change_db);
+ return rtc::SafeClamp(target_gain_difference_db, -max_gain_decrease_db,
+ max_gain_increase_db);
}
} // namespace
@@ -115,7 +118,7 @@
void AdaptiveDigitalGainApplier::Process(const FrameInfo& info,
AudioFrameView<float> frame) {
- RTC_DCHECK_GE(info.input_level_dbfs, -150.f);
+ RTC_DCHECK_GE(info.speech_level_dbfs, -150.f);
RTC_DCHECK_GE(frame.num_channels(), 1);
RTC_DCHECK(
frame.samples_per_channel() == 80 || frame.samples_per_channel() == 160 ||
@@ -123,30 +126,46 @@
<< "`frame` does not look like a 10 ms frame for an APM supported sample "
"rate";
+ // Compute the input level used to select the desired gain.
+ RTC_DCHECK_GT(info.headroom_db, 0.0f);
+ const float input_level_dbfs = info.speech_level_dbfs + info.headroom_db;
+
const float target_gain_db = LimitGainByLowConfidence(
- LimitGainByNoise(ComputeGainDb(std::min(info.input_level_dbfs, 0.f)),
- info.input_noise_level_dbfs,
+ LimitGainByNoise(ComputeGainDb(input_level_dbfs), info.noise_rms_dbfs,
max_output_noise_level_dbfs_, *apm_data_dumper_),
- last_gain_db_, info.limiter_envelope_dbfs, info.estimate_is_confident);
+ last_gain_db_, info.limiter_envelope_dbfs, info.speech_level_reliable);
// Forbid increasing the gain until enough adjacent speech frames are
// observed.
- if (info.vad_result.speech_probability < kVadConfidenceThreshold) {
+ bool first_confident_speech_frame = false;
+ if (info.speech_probability < kVadConfidenceThreshold) {
frames_to_gain_increase_allowed_ = adjacent_speech_frames_threshold_;
} else if (frames_to_gain_increase_allowed_ > 0) {
frames_to_gain_increase_allowed_--;
+ first_confident_speech_frame = frames_to_gain_increase_allowed_ == 0;
}
- apm_data_dumper_->DumpRaw("agc2_frames_to_gain_increase_allowed",
- frames_to_gain_increase_allowed_);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_gain_applier_frames_to_gain_increase_allowed",
+ frames_to_gain_increase_allowed_);
+
+ const bool gain_increase_allowed = frames_to_gain_increase_allowed_ == 0;
+
+ float max_gain_increase_db = max_gain_change_db_per_10ms_;
+ if (first_confident_speech_frame) {
+ // No gain increase happened while waiting for a long enough speech
+ // sequence. Therefore, temporarily allow a faster gain increase.
+ RTC_DCHECK(gain_increase_allowed);
+ max_gain_increase_db *= adjacent_speech_frames_threshold_;
+ }
const float gain_change_this_frame_db = ComputeGainChangeThisFrameDb(
- target_gain_db, last_gain_db_,
- /*gain_increase_allowed=*/frames_to_gain_increase_allowed_ == 0,
- max_gain_change_db_per_10ms_);
+ target_gain_db, last_gain_db_, gain_increase_allowed,
+ /*max_gain_decrease_db=*/max_gain_change_db_per_10ms_,
+ max_gain_increase_db);
- apm_data_dumper_->DumpRaw("agc2_want_to_change_by_db",
+ apm_data_dumper_->DumpRaw("agc2_adaptive_gain_applier_want_to_change_by_db",
target_gain_db - last_gain_db_);
- apm_data_dumper_->DumpRaw("agc2_will_change_by_db",
+ apm_data_dumper_->DumpRaw("agc2_adaptive_gain_applier_will_change_by_db",
gain_change_this_frame_db);
// Optimization: avoid calling math functions if gain does not
@@ -159,23 +178,29 @@
// Remember that the gain has changed for the next iteration.
last_gain_db_ = last_gain_db_ + gain_change_this_frame_db;
- apm_data_dumper_->DumpRaw("agc2_applied_gain_db", last_gain_db_);
+ apm_data_dumper_->DumpRaw("agc2_adaptive_gain_applier_applied_gain_db",
+ last_gain_db_);
// Log every 10 seconds.
calls_since_last_gain_log_++;
if (calls_since_last_gain_log_ == 1000) {
calls_since_last_gain_log_ = 0;
+ RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.EstimatedSpeechLevel",
+ -info.speech_level_dbfs, 0, 100, 101);
+ RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.EstimatedNoiseLevel",
+ -info.noise_rms_dbfs, 0, 100, 101);
+ RTC_HISTOGRAM_COUNTS_LINEAR(
+ "WebRTC.Audio.Agc2.Headroom", info.headroom_db, kHeadroomHistogramMin,
+ kHeadroomHistogramMax,
+ kHeadroomHistogramMax - kHeadroomHistogramMin + 1);
RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.DigitalGainApplied",
last_gain_db_, 0, kMaxGainDb, kMaxGainDb + 1);
- RTC_HISTOGRAM_COUNTS_LINEAR(
- "WebRTC.Audio.Agc2.EstimatedSpeechPlusNoiseLevel",
- -info.input_level_dbfs, 0, 100, 101);
- RTC_HISTOGRAM_COUNTS_LINEAR("WebRTC.Audio.Agc2.EstimatedNoiseLevel",
- -info.input_noise_level_dbfs, 0, 100, 101);
RTC_LOG(LS_INFO) << "AGC2 adaptive digital"
- << " | speech_plus_noise_dbfs: " << info.input_level_dbfs
- << " | noise_dbfs: " << info.input_noise_level_dbfs
+ << " | speech_dbfs: " << info.speech_level_dbfs
+ << " | noise_dbfs: " << info.noise_rms_dbfs
+ << " | headroom_db: " << info.headroom_db
<< " | gain_db: " << last_gain_db_;
}
}
+
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier.h b/modules/audio_processing/agc2/adaptive_digital_gain_applier.h
index a65379f..74220fa 100644
--- a/modules/audio_processing/agc2/adaptive_digital_gain_applier.h
+++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier.h
@@ -12,33 +12,32 @@
#define MODULES_AUDIO_PROCESSING_AGC2_ADAPTIVE_DIGITAL_GAIN_APPLIER_H_
#include "modules/audio_processing/agc2/gain_applier.h"
-#include "modules/audio_processing/agc2/vad_with_level.h"
#include "modules/audio_processing/include/audio_frame_view.h"
namespace webrtc {
class ApmDataDumper;
-// Part of the adaptive digital controller that applies a digital adaptive gain.
-// The gain is updated towards a target. The logic decides when gain updates are
-// allowed, it controls the adaptation speed and caps the target based on the
-// estimated noise level and the speech level estimate confidence.
+// TODO(bugs.webrtc.org/7494): Split into `GainAdaptor` and `GainApplier`.
+// Selects the target digital gain, decides when and how quickly to adapt to the
+// target and applies the current gain to 10 ms frames.
class AdaptiveDigitalGainApplier {
public:
// Information about a frame to process.
struct FrameInfo {
- float input_level_dbfs; // Estimated speech plus noise level.
- float input_noise_level_dbfs; // Estimated noise level.
- VadLevelAnalyzer::Result vad_result;
- float limiter_envelope_dbfs; // Envelope level from the limiter.
- bool estimate_is_confident;
+ float speech_probability; // Probability of speech in the [0, 1] range.
+ float speech_level_dbfs; // Estimated speech level (dBFS).
+ bool speech_level_reliable; // True with reliable speech level estimation.
+ float noise_rms_dbfs; // Estimated noise RMS level (dBFS).
+ float headroom_db; // Headroom (dB).
+ float limiter_envelope_dbfs; // Envelope level from the limiter (dBFS).
};
- // Ctor.
- // `adjacent_speech_frames_threshold` indicates how many speech frames are
- // required before a gain increase is allowed. `max_gain_change_db_per_second`
- // limits the adaptation speed (uniformly operated across frames).
- // `max_output_noise_level_dbfs` limits the output noise level.
+ // Ctor. `adjacent_speech_frames_threshold` indicates how many adjacent speech
+ // frames must be observed in order to consider the sequence as speech.
+ // `max_gain_change_db_per_second` limits the adaptation speed (uniformly
+ // operated across frames). `max_output_noise_level_dbfs` limits the output
+ // noise level.
AdaptiveDigitalGainApplier(ApmDataDumper* apm_data_dumper,
int adjacent_speech_frames_threshold,
float max_gain_change_db_per_second,
diff --git a/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc b/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc
index e2df700..ee9cb02 100644
--- a/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc
+++ b/modules/audio_processing/agc2/adaptive_digital_gain_applier_unittest.cc
@@ -11,6 +11,7 @@
#include "modules/audio_processing/agc2/adaptive_digital_gain_applier.h"
#include <algorithm>
+#include <memory>
#include "common_audio/include/audio_util.h"
#include "modules/audio_processing/agc2/agc2_common.h"
@@ -26,104 +27,75 @@
constexpr int kFrameLen10ms8kHz = 80;
constexpr int kFrameLen10ms48kHz = 480;
+constexpr float kMaxSpeechProbability = 1.0f;
+
// Constants used in place of estimated noise levels.
-constexpr float kNoNoiseDbfs = -90.f;
+constexpr float kNoNoiseDbfs = kMinLevelDbfs;
constexpr float kWithNoiseDbfs = -20.f;
-static_assert(std::is_trivially_destructible<VadLevelAnalyzer::Result>::value,
- "");
-constexpr VadLevelAnalyzer::Result kVadSpeech{1.f, -20.f, 0.f};
-constexpr float kMaxGainChangePerSecondDb = 3.f;
+constexpr float kMaxGainChangePerSecondDb = 3.0f;
constexpr float kMaxGainChangePerFrameDb =
- kMaxGainChangePerSecondDb * kFrameDurationMs / 1000.f;
-constexpr float kMaxOutputNoiseLevelDbfs = -50.f;
+ kMaxGainChangePerSecondDb * kFrameDurationMs / 1000.0f;
+constexpr float kMaxOutputNoiseLevelDbfs = -50.0f;
-// Helper to instance `AdaptiveDigitalGainApplier`.
+// Helper to create initialized `AdaptiveDigitalGainApplier` objects.
struct GainApplierHelper {
GainApplierHelper()
: GainApplierHelper(/*adjacent_speech_frames_threshold=*/1) {}
explicit GainApplierHelper(int adjacent_speech_frames_threshold)
: apm_data_dumper(0),
- gain_applier(&apm_data_dumper,
- adjacent_speech_frames_threshold,
- kMaxGainChangePerSecondDb,
- kMaxOutputNoiseLevelDbfs) {}
+ gain_applier(std::make_unique<AdaptiveDigitalGainApplier>(
+ &apm_data_dumper,
+ adjacent_speech_frames_threshold,
+ kMaxGainChangePerSecondDb,
+ kMaxOutputNoiseLevelDbfs)) {}
ApmDataDumper apm_data_dumper;
- AdaptiveDigitalGainApplier gain_applier;
+ std::unique_ptr<AdaptiveDigitalGainApplier> gain_applier;
};
-// Runs gain applier and returns the applied gain in linear scale.
-float RunOnConstantLevel(int num_iterations,
- VadLevelAnalyzer::Result vad_level,
- float input_level_dbfs,
- AdaptiveDigitalGainApplier* gain_applier) {
- float gain_linear = 0.f;
-
- for (int i = 0; i < num_iterations; ++i) {
- VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.f);
- AdaptiveDigitalGainApplier::FrameInfo info;
- info.input_level_dbfs = input_level_dbfs;
- info.input_noise_level_dbfs = kNoNoiseDbfs;
- info.vad_result = vad_level;
- info.limiter_envelope_dbfs = -2.f;
- info.estimate_is_confident = true;
- gain_applier->Process(info, fake_audio.float_frame_view());
- gain_linear = fake_audio.float_frame_view().channel(0)[0];
- }
- return gain_linear;
-}
-
// Voice on, no noise, low limiter, confident level.
+static_assert(std::is_trivially_destructible<
+ AdaptiveDigitalGainApplier::FrameInfo>::value,
+ "");
constexpr AdaptiveDigitalGainApplier::FrameInfo kFrameInfo{
- /*input_level_dbfs=*/-1.f,
- /*input_noise_level_dbfs=*/kNoNoiseDbfs,
- /*vad_result=*/kVadSpeech,
- /*limiter_envelope_dbfs=*/-2.f,
- /*estimate_is_confident=*/true};
+ /*speech_probability=*/kMaxSpeechProbability,
+ /*speech_level_dbfs=*/kInitialSpeechLevelEstimateDbfs,
+ /*speech_level_reliable=*/true,
+ /*noise_rms_dbfs=*/kNoNoiseDbfs,
+ /*headroom_db=*/kSaturationProtectorInitialHeadroomDb,
+ /*limiter_envelope_dbfs=*/-2.0f};
-TEST(AutomaticGainController2AdaptiveGainApplier, GainApplierShouldNotCrash) {
+TEST(GainController2AdaptiveGainApplier, GainApplierShouldNotCrash) {
GainApplierHelper helper;
// Make one call with reasonable audio level values and settings.
- VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f);
+ VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.0f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
- info.input_level_dbfs = -5.0;
- helper.gain_applier.Process(kFrameInfo, fake_audio.float_frame_view());
+ info.speech_level_dbfs = -5.0f;
+ helper.gain_applier->Process(kFrameInfo, fake_audio.float_frame_view());
}
-// Check that the output is -kHeadroom dBFS.
-TEST(AutomaticGainController2AdaptiveGainApplier, TargetLevelIsReached) {
- GainApplierHelper helper;
-
- constexpr float initial_level_dbfs = -5.f;
-
- const float applied_gain = RunOnConstantLevel(
- 200, kVadSpeech, initial_level_dbfs, &helper.gain_applier);
-
- EXPECT_NEAR(applied_gain, DbToRatio(-kHeadroomDbfs - initial_level_dbfs),
- 0.1f);
-}
-
-// Check that the output is -kHeadroom dBFS
-TEST(AutomaticGainController2AdaptiveGainApplier, GainApproachesMaxGain) {
- GainApplierHelper helper;
-
- constexpr float initial_level_dbfs = -kHeadroomDbfs - kMaxGainDb - 10.f;
- // A few extra frames for safety.
+// Checks that the maximum allowed gain is applied.
+TEST(GainController2AdaptiveGainApplier, MaxGainApplied) {
constexpr int kNumFramesToAdapt =
static_cast<int>(kMaxGainDb / kMaxGainChangePerFrameDb) + 10;
- const float applied_gain = RunOnConstantLevel(
- kNumFramesToAdapt, kVadSpeech, initial_level_dbfs, &helper.gain_applier);
- EXPECT_NEAR(applied_gain, DbToRatio(kMaxGainDb), 0.1f);
-
- const float applied_gain_db = 20.f * std::log10(applied_gain);
+ GainApplierHelper helper;
+ AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
+ info.speech_level_dbfs = -60.0f;
+ float applied_gain;
+ for (int i = 0; i < kNumFramesToAdapt; ++i) {
+ VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f);
+ helper.gain_applier->Process(info, fake_audio.float_frame_view());
+ applied_gain = fake_audio.float_frame_view().channel(0)[0];
+ }
+ const float applied_gain_db = 20.0f * std::log10f(applied_gain);
EXPECT_NEAR(applied_gain_db, kMaxGainDb, 0.1f);
}
-TEST(AutomaticGainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
+TEST(GainController2AdaptiveGainApplier, GainDoesNotChangeFast) {
GainApplierHelper helper;
- constexpr float initial_level_dbfs = -25.f;
+ constexpr float initial_level_dbfs = -25.0f;
// A few extra frames for safety.
constexpr int kNumFramesToAdapt =
static_cast<int>(initial_level_dbfs / kMaxGainChangePerFrameDb) + 10;
@@ -133,10 +105,10 @@
float last_gain_linear = 1.f;
for (int i = 0; i < kNumFramesToAdapt; ++i) {
SCOPED_TRACE(i);
- VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.f);
+ VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
- info.input_level_dbfs = initial_level_dbfs;
- helper.gain_applier.Process(info, fake_audio.float_frame_view());
+ info.speech_level_dbfs = initial_level_dbfs;
+ helper.gain_applier->Process(info, fake_audio.float_frame_view());
float current_gain_linear = fake_audio.float_frame_view().channel(0)[0];
EXPECT_LE(std::abs(current_gain_linear - last_gain_linear),
kMaxChangePerFrameLinear);
@@ -146,10 +118,10 @@
// Check that the same is true when gain decreases as well.
for (int i = 0; i < kNumFramesToAdapt; ++i) {
SCOPED_TRACE(i);
- VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.f);
+ VectorFloatFrame fake_audio(kMono, kFrameLen10ms8kHz, 1.0f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
- info.input_level_dbfs = 0.f;
- helper.gain_applier.Process(info, fake_audio.float_frame_view());
+ info.speech_level_dbfs = 0.f;
+ helper.gain_applier->Process(info, fake_audio.float_frame_view());
float current_gain_linear = fake_audio.float_frame_view().channel(0)[0];
EXPECT_LE(std::abs(current_gain_linear - last_gain_linear),
kMaxChangePerFrameLinear);
@@ -157,17 +129,17 @@
}
}
-TEST(AutomaticGainController2AdaptiveGainApplier, GainIsRampedInAFrame) {
+TEST(GainController2AdaptiveGainApplier, GainIsRampedInAFrame) {
GainApplierHelper helper;
- constexpr float initial_level_dbfs = -25.f;
+ constexpr float initial_level_dbfs = -25.0f;
- VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
+ VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
- info.input_level_dbfs = initial_level_dbfs;
- helper.gain_applier.Process(info, fake_audio.float_frame_view());
- float maximal_difference = 0.f;
- float current_value = 1.f * DbToRatio(kInitialAdaptiveDigitalGainDb);
+ info.speech_level_dbfs = initial_level_dbfs;
+ helper.gain_applier->Process(info, fake_audio.float_frame_view());
+ float maximal_difference = 0.0f;
+ float current_value = 1.0f * DbToRatio(kInitialAdaptiveDigitalGainDb);
for (const auto& x : fake_audio.float_frame_view().channel(0)) {
const float difference = std::abs(x - current_value);
maximal_difference = std::max(maximal_difference, difference);
@@ -181,10 +153,10 @@
EXPECT_LE(maximal_difference, kMaxChangePerSample);
}
-TEST(AutomaticGainController2AdaptiveGainApplier, NoiseLimitsGain) {
+TEST(GainController2AdaptiveGainApplier, NoiseLimitsGain) {
GainApplierHelper helper;
- constexpr float initial_level_dbfs = -25.f;
+ constexpr float initial_level_dbfs = -25.0f;
constexpr int num_initial_frames =
kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb;
constexpr int num_frames = 50;
@@ -193,11 +165,11 @@
<< "kWithNoiseDbfs is too low";
for (int i = 0; i < num_initial_frames + num_frames; ++i) {
- VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
+ VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
- info.input_level_dbfs = initial_level_dbfs;
- info.input_noise_level_dbfs = kWithNoiseDbfs;
- helper.gain_applier.Process(info, fake_audio.float_frame_view());
+ info.speech_level_dbfs = initial_level_dbfs;
+ info.noise_rms_dbfs = kWithNoiseDbfs;
+ helper.gain_applier->Process(info, fake_audio.float_frame_view());
// Wait so that the adaptive gain applier has time to lower the gain.
if (i > num_initial_frames) {
@@ -205,25 +177,25 @@
*std::max_element(fake_audio.float_frame_view().channel(0).begin(),
fake_audio.float_frame_view().channel(0).end());
- EXPECT_NEAR(maximal_ratio, 1.f, 0.001f);
+ EXPECT_NEAR(maximal_ratio, 1.0f, 0.001f);
}
}
}
-TEST(AutomaticGainController2GainApplier, CanHandlePositiveSpeechLevels) {
+TEST(GainController2GainApplier, CanHandlePositiveSpeechLevels) {
GainApplierHelper helper;
// Make one call with positive audio level values and settings.
- VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.f);
+ VectorFloatFrame fake_audio(kStereo, kFrameLen10ms48kHz, 10000.0f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
- info.input_level_dbfs = 5.f;
- helper.gain_applier.Process(info, fake_audio.float_frame_view());
+ info.speech_level_dbfs = 5.0f;
+ helper.gain_applier->Process(info, fake_audio.float_frame_view());
}
-TEST(AutomaticGainController2GainApplier, AudioLevelLimitsGain) {
+TEST(GainController2GainApplier, AudioLevelLimitsGain) {
GainApplierHelper helper;
- constexpr float initial_level_dbfs = -25.f;
+ constexpr float initial_level_dbfs = -25.0f;
constexpr int num_initial_frames =
kInitialAdaptiveDigitalGainDb / kMaxGainChangePerFrameDb;
constexpr int num_frames = 50;
@@ -232,12 +204,12 @@
<< "kWithNoiseDbfs is too low";
for (int i = 0; i < num_initial_frames + num_frames; ++i) {
- VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.f);
+ VectorFloatFrame fake_audio(kMono, kFrameLen10ms48kHz, 1.0f);
AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
- info.input_level_dbfs = initial_level_dbfs;
- info.limiter_envelope_dbfs = 1.f;
- info.estimate_is_confident = false;
- helper.gain_applier.Process(info, fake_audio.float_frame_view());
+ info.speech_level_dbfs = initial_level_dbfs;
+ info.limiter_envelope_dbfs = 1.0f;
+ info.speech_level_reliable = false;
+ helper.gain_applier->Process(info, fake_audio.float_frame_view());
// Wait so that the adaptive gain applier has time to lower the gain.
if (i > num_initial_frames) {
@@ -245,7 +217,7 @@
*std::max_element(fake_audio.float_frame_view().channel(0).begin(),
fake_audio.float_frame_view().channel(0).end());
- EXPECT_NEAR(maximal_ratio, 1.f, 0.001f);
+ EXPECT_NEAR(maximal_ratio, 1.0f, 0.001f);
}
}
}
@@ -260,14 +232,11 @@
const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold();
GainApplierHelper helper(adjacent_speech_frames_threshold);
- AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
- info.input_level_dbfs = -25.0;
-
- float prev_gain = 0.f;
+ float prev_gain = 0.0f;
for (int i = 0; i < adjacent_speech_frames_threshold; ++i) {
SCOPED_TRACE(i);
- VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f);
- helper.gain_applier.Process(info, audio.float_frame_view());
+ VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f);
+ helper.gain_applier->Process(kFrameInfo, audio.float_frame_view());
const float gain = audio.float_frame_view().channel(0)[0];
if (i > 0) {
EXPECT_EQ(prev_gain, gain); // No gain increase.
@@ -280,25 +249,23 @@
const int adjacent_speech_frames_threshold = AdjacentSpeechFramesThreshold();
GainApplierHelper helper(adjacent_speech_frames_threshold);
- AdaptiveDigitalGainApplier::FrameInfo info = kFrameInfo;
- info.input_level_dbfs = -25.0;
-
- float prev_gain = 0.f;
+ float prev_gain = 0.0f;
for (int i = 0; i < adjacent_speech_frames_threshold; ++i) {
- VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f);
- helper.gain_applier.Process(info, audio.float_frame_view());
+ SCOPED_TRACE(i);
+ VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f);
+ helper.gain_applier->Process(kFrameInfo, audio.float_frame_view());
prev_gain = audio.float_frame_view().channel(0)[0];
}
// Process one more speech frame.
- VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.f);
- helper.gain_applier.Process(info, audio.float_frame_view());
+ VectorFloatFrame audio(kMono, kFrameLen10ms48kHz, 1.0f);
+ helper.gain_applier->Process(kFrameInfo, audio.float_frame_view());
// The gain has increased.
EXPECT_GT(audio.float_frame_view().channel(0)[0], prev_gain);
}
-INSTANTIATE_TEST_SUITE_P(AutomaticGainController2,
+INSTANTIATE_TEST_SUITE_P(GainController2,
AdaptiveDigitalGainApplierTest,
::testing::Values(1, 7, 31));
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
index 9857471..507aa12 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.cc
@@ -22,37 +22,17 @@
using LevelEstimatorType =
AudioProcessing::Config::GainController2::LevelEstimator;
-// Combines a level estimation with the saturation protector margins.
-float ComputeLevelEstimateDbfs(float level_estimate_dbfs,
- float saturation_margin_db,
- float extra_saturation_margin_db) {
- return rtc::SafeClamp<float>(
- level_estimate_dbfs + saturation_margin_db + extra_saturation_margin_db,
- -90.f, 30.f);
-}
-
-// Returns the level of given type from `vad_level`.
-float GetLevel(const VadLevelAnalyzer::Result& vad_level,
- LevelEstimatorType type) {
- switch (type) {
- case LevelEstimatorType::kRms:
- return vad_level.rms_dbfs;
- break;
- case LevelEstimatorType::kPeak:
- return vad_level.peak_dbfs;
- break;
- }
- RTC_CHECK_NOTREACHED();
+float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
+ return rtc::SafeClamp<float>(level_estimate_dbfs, -90.f, 30.f);
}
} // namespace
bool AdaptiveModeLevelEstimator::LevelEstimatorState::operator==(
const AdaptiveModeLevelEstimator::LevelEstimatorState& b) const {
- return time_to_full_buffer_ms == b.time_to_full_buffer_ms &&
+ return time_to_confidence_ms == b.time_to_confidence_ms &&
level_dbfs.numerator == b.level_dbfs.numerator &&
- level_dbfs.denominator == b.level_dbfs.denominator &&
- saturation_protector == b.saturation_protector;
+ level_dbfs.denominator == b.level_dbfs.denominator;
}
float AdaptiveModeLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
@@ -64,25 +44,14 @@
ApmDataDumper* apm_data_dumper)
: AdaptiveModeLevelEstimator(
apm_data_dumper,
- AudioProcessing::Config::GainController2::LevelEstimator::kRms,
- kDefaultLevelEstimatorAdjacentSpeechFramesThreshold,
- kDefaultInitialSaturationMarginDb,
- kDefaultExtraSaturationMarginDb) {}
+ kDefaultLevelEstimatorAdjacentSpeechFramesThreshold) {}
AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
ApmDataDumper* apm_data_dumper,
- AudioProcessing::Config::GainController2::LevelEstimator level_estimator,
- int adjacent_speech_frames_threshold,
- float initial_saturation_margin_db,
- float extra_saturation_margin_db)
+ int adjacent_speech_frames_threshold)
: apm_data_dumper_(apm_data_dumper),
- level_estimator_type_(level_estimator),
adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
- initial_saturation_margin_db_(initial_saturation_margin_db),
- extra_saturation_margin_db_(extra_saturation_margin_db),
- level_dbfs_(ComputeLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs,
- initial_saturation_margin_db_,
- extra_saturation_margin_db_)) {
+ level_dbfs_(ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs)) {
RTC_DCHECK(apm_data_dumper_);
RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
Reset();
@@ -96,8 +65,6 @@
RTC_DCHECK_LT(vad_level.peak_dbfs, 50.f);
RTC_DCHECK_GE(vad_level.speech_probability, 0.f);
RTC_DCHECK_LE(vad_level.speech_probability, 1.f);
- DumpDebugData();
-
if (vad_level.speech_probability < kVadConfidenceThreshold) {
// Not a speech frame.
if (adjacent_speech_frames_threshold_ > 1) {
@@ -115,89 +82,82 @@
}
}
num_adjacent_speech_frames_ = 0;
- return;
+ } else {
+ // Speech frame observed.
+ num_adjacent_speech_frames_++;
+
+ // Update preliminary level estimate.
+ RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
+ const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0;
+ if (!buffer_is_full) {
+ preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
+ }
+ // Weighted average of levels with speech probability as weight.
+ RTC_DCHECK_GT(vad_level.speech_probability, 0.f);
+ const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.f;
+ preliminary_state_.level_dbfs.numerator =
+ preliminary_state_.level_dbfs.numerator * leak_factor +
+ vad_level.rms_dbfs * vad_level.speech_probability;
+ preliminary_state_.level_dbfs.denominator =
+ preliminary_state_.level_dbfs.denominator * leak_factor +
+ vad_level.speech_probability;
+
+ const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();
+
+ if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
+ // `preliminary_state_` is now reliable. Update the last level estimation.
+ level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs);
+ }
}
-
- // Speech frame observed.
- num_adjacent_speech_frames_++;
-
- // Update preliminary level estimate.
- RTC_DCHECK_GE(preliminary_state_.time_to_full_buffer_ms, 0);
- const bool buffer_is_full = preliminary_state_.time_to_full_buffer_ms == 0;
- if (!buffer_is_full) {
- preliminary_state_.time_to_full_buffer_ms -= kFrameDurationMs;
- }
- // Weighted average of levels with speech probability as weight.
- RTC_DCHECK_GT(vad_level.speech_probability, 0.f);
- const float leak_factor = buffer_is_full ? kFullBufferLeakFactor : 1.f;
- preliminary_state_.level_dbfs.numerator =
- preliminary_state_.level_dbfs.numerator * leak_factor +
- GetLevel(vad_level, level_estimator_type_) * vad_level.speech_probability;
- preliminary_state_.level_dbfs.denominator =
- preliminary_state_.level_dbfs.denominator * leak_factor +
- vad_level.speech_probability;
-
- const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();
-
- UpdateSaturationProtectorState(vad_level.peak_dbfs, level_dbfs,
- preliminary_state_.saturation_protector);
-
- if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
- // `preliminary_state_` is now reliable. Update the last level estimation.
- level_dbfs_ = ComputeLevelEstimateDbfs(
- level_dbfs, preliminary_state_.saturation_protector.margin_db,
- extra_saturation_margin_db_);
- }
+ DumpDebugData();
}
bool AdaptiveModeLevelEstimator::IsConfident() const {
if (adjacent_speech_frames_threshold_ == 1) {
// Ignore `reliable_state_` when a single frame is enough to update the
// level estimate (because it is not used).
- return preliminary_state_.time_to_full_buffer_ms == 0;
+ return preliminary_state_.time_to_confidence_ms == 0;
}
// Once confident, it remains confident.
- RTC_DCHECK(reliable_state_.time_to_full_buffer_ms != 0 ||
- preliminary_state_.time_to_full_buffer_ms == 0);
+ RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
+ preliminary_state_.time_to_confidence_ms == 0);
// During the first long enough speech sequence, `reliable_state_` must be
// ignored since `preliminary_state_` is used.
- return reliable_state_.time_to_full_buffer_ms == 0 ||
+ return reliable_state_.time_to_confidence_ms == 0 ||
(num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ &&
- preliminary_state_.time_to_full_buffer_ms == 0);
+ preliminary_state_.time_to_confidence_ms == 0);
}
void AdaptiveModeLevelEstimator::Reset() {
ResetLevelEstimatorState(preliminary_state_);
ResetLevelEstimatorState(reliable_state_);
- level_dbfs_ = ComputeLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs,
- initial_saturation_margin_db_,
- extra_saturation_margin_db_);
+ level_dbfs_ = ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs);
num_adjacent_speech_frames_ = 0;
}
void AdaptiveModeLevelEstimator::ResetLevelEstimatorState(
LevelEstimatorState& state) const {
- state.time_to_full_buffer_ms = kFullBufferSizeMs;
- state.level_dbfs.numerator = 0.f;
- state.level_dbfs.denominator = 0.f;
- ResetSaturationProtectorState(initial_saturation_margin_db_,
- state.saturation_protector);
+ state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
+ state.level_dbfs.numerator = kInitialSpeechLevelEstimateDbfs;
+ state.level_dbfs.denominator = 1.0f;
}
void AdaptiveModeLevelEstimator::DumpDebugData() const {
- apm_data_dumper_->DumpRaw("agc2_adaptive_level_estimate_dbfs", level_dbfs_);
- apm_data_dumper_->DumpRaw("agc2_adaptive_num_adjacent_speech_frames",
- num_adjacent_speech_frames_);
- apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_level_estimate_num",
- preliminary_state_.level_dbfs.numerator);
- apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_level_estimate_den",
- preliminary_state_.level_dbfs.denominator);
- apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_saturation_margin_db",
- preliminary_state_.saturation_protector.margin_db);
- apm_data_dumper_->DumpRaw("agc2_adaptive_preliminary_time_to_full_buffer_ms",
- preliminary_state_.time_to_full_buffer_ms);
- apm_data_dumper_->DumpRaw("agc2_adaptive_reliable_time_to_full_buffer_ms",
- reliable_state_.time_to_full_buffer_ms);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_level_estimator_num_adjacent_speech_frames",
+ num_adjacent_speech_frames_);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_level_estimator_preliminary_level_estimate_num",
+ preliminary_state_.level_dbfs.numerator);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_level_estimator_preliminary_level_estimate_den",
+ preliminary_state_.level_dbfs.denominator);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
+ preliminary_state_.time_to_confidence_ms);
+ apm_data_dumper_->DumpRaw(
+ "agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
+ reliable_state_.time_to_confidence_ms);
}
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
index 213fc0f..6d44938 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator.h
@@ -15,7 +15,6 @@
#include <type_traits>
#include "modules/audio_processing/agc2/agc2_common.h"
-#include "modules/audio_processing/agc2/saturation_protector.h"
#include "modules/audio_processing/agc2/vad_with_level.h"
#include "modules/audio_processing/include/audio_processing.h"
@@ -29,12 +28,8 @@
AdaptiveModeLevelEstimator(const AdaptiveModeLevelEstimator&) = delete;
AdaptiveModeLevelEstimator& operator=(const AdaptiveModeLevelEstimator&) =
delete;
- AdaptiveModeLevelEstimator(
- ApmDataDumper* apm_data_dumper,
- AudioProcessing::Config::GainController2::LevelEstimator level_estimator,
- int adjacent_speech_frames_threshold,
- float initial_saturation_margin_db,
- float extra_saturation_margin_db);
+ AdaptiveModeLevelEstimator(ApmDataDumper* apm_data_dumper,
+ int adjacent_speech_frames_threshold);
// Updates the level estimation.
void Update(const VadLevelAnalyzer::Result& vad_data);
@@ -57,10 +52,9 @@
float denominator;
float GetRatio() const;
};
- // TODO(crbug.com/webrtc/7494): Remove time_to_full_buffer_ms if redundant.
- int time_to_full_buffer_ms;
+ // TODO(crbug.com/webrtc/7494): Remove time_to_confidence_ms if redundant.
+ int time_to_confidence_ms;
Ratio level_dbfs;
- SaturationProtectorState saturation_protector;
};
static_assert(std::is_trivially_copyable<LevelEstimatorState>::value, "");
@@ -70,11 +64,7 @@
ApmDataDumper* const apm_data_dumper_;
- const AudioProcessing::Config::GainController2::LevelEstimator
- level_estimator_type_;
const int adjacent_speech_frames_threshold_;
- const float initial_saturation_margin_db_;
- const float extra_saturation_margin_db_;
LevelEstimatorState preliminary_state_;
LevelEstimatorState reliable_state_;
float level_dbfs_;
diff --git a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
index ea35797..c55950a 100644
--- a/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
+++ b/modules/audio_processing/agc2/adaptive_mode_level_estimator_unittest.cc
@@ -19,22 +19,34 @@
namespace webrtc {
namespace {
-constexpr float kInitialSaturationMarginDb = 20.f;
-constexpr float kExtraSaturationMarginDb = 2.f;
+// Number of speech frames that the level estimator must observe in order to
+// become confident about the estimated level.
+constexpr int kNumFramesToConfidence =
+ kLevelEstimatorTimeToConfidenceMs / kFrameDurationMs;
+static_assert(kNumFramesToConfidence > 0, "");
-static_assert(kInitialSpeechLevelEstimateDbfs < 0.f, "");
-constexpr float kVadLevelRms = kInitialSpeechLevelEstimateDbfs / 2.f;
-constexpr float kVadLevelPeak = kInitialSpeechLevelEstimateDbfs / 3.f;
+// Fake levels and speech probabilities used in the tests.
+static_assert(kInitialSpeechLevelEstimateDbfs < 0.0f, "");
+constexpr float kVadLevelRms = kInitialSpeechLevelEstimateDbfs / 2.0f;
+constexpr float kVadLevelPeak = kInitialSpeechLevelEstimateDbfs / 3.0f;
+static_assert(kVadLevelRms < kVadLevelPeak, "");
+static_assert(kVadLevelRms > kInitialSpeechLevelEstimateDbfs, "");
+static_assert(kVadLevelRms - kInitialSpeechLevelEstimateDbfs > 5.0f,
+ "Adjust `kVadLevelRms` so that the difference from the initial "
+ "level is wide enough for the tests.");
-constexpr VadLevelAnalyzer::Result kVadDataSpeech{/*speech_probability=*/1.f,
+constexpr VadLevelAnalyzer::Result kVadDataSpeech{/*speech_probability=*/1.0f,
kVadLevelRms, kVadLevelPeak};
constexpr VadLevelAnalyzer::Result kVadDataNonSpeech{
- /*speech_probability=*/kVadConfidenceThreshold / 2.f, kVadLevelRms,
+ /*speech_probability=*/kVadConfidenceThreshold / 2.0f, kVadLevelRms,
kVadLevelPeak};
-constexpr float kMinSpeechProbability = 0.f;
-constexpr float kMaxSpeechProbability = 1.f;
+constexpr float kMinSpeechProbability = 0.0f;
+constexpr float kMaxSpeechProbability = 1.0f;
+constexpr float kConvergenceSpeedTestsLevelTolerance = 0.5f;
+
+// Provides the `vad_level` value `num_iterations` times to `level_estimator`.
void RunOnConstantLevel(int num_iterations,
const VadLevelAnalyzer::Result& vad_level,
AdaptiveModeLevelEstimator& level_estimator) {
@@ -43,172 +55,125 @@
}
}
+// Level estimator with data dumper.
struct TestLevelEstimator {
TestLevelEstimator()
: data_dumper(0),
estimator(std::make_unique<AdaptiveModeLevelEstimator>(
&data_dumper,
- AudioProcessing::Config::GainController2::LevelEstimator::kRms,
- /*adjacent_speech_frames_threshold=*/1,
- kInitialSaturationMarginDb,
- kExtraSaturationMarginDb)) {}
+ /*adjacent_speech_frames_threshold=*/1)) {}
ApmDataDumper data_dumper;
std::unique_ptr<AdaptiveModeLevelEstimator> estimator;
};
-TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
- EstimatorShouldNotCrash) {
+// Checks the initially estimated level.
+TEST(GainController2AdaptiveModeLevelEstimator, CheckInitialEstimate) {
TestLevelEstimator level_estimator;
-
- VadLevelAnalyzer::Result vad_level{kMaxSpeechProbability, /*rms_dbfs=*/-20.f,
- /*peak_dbfs=*/-10.f};
- level_estimator.estimator->Update(vad_level);
- static_cast<void>(level_estimator.estimator->level_dbfs());
+ EXPECT_FLOAT_EQ(level_estimator.estimator->level_dbfs(),
+ kInitialSpeechLevelEstimateDbfs);
}
-TEST(AutomaticGainController2AdaptiveModeLevelEstimator, LevelShouldStabilize) {
+// Checks that the level estimator converges to a constant input speech level.
+TEST(GainController2AdaptiveModeLevelEstimator, LevelStabilizes) {
TestLevelEstimator level_estimator;
-
- constexpr float kSpeechPeakDbfs = -15.f;
- RunOnConstantLevel(100,
- VadLevelAnalyzer::Result{kMaxSpeechProbability,
- /*rms_dbfs=*/kSpeechPeakDbfs -
- kInitialSaturationMarginDb,
- kSpeechPeakDbfs},
+ RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech,
*level_estimator.estimator);
-
- EXPECT_NEAR(
- level_estimator.estimator->level_dbfs() - kExtraSaturationMarginDb,
- kSpeechPeakDbfs, 0.1f);
+ const float estimated_level_dbfs = level_estimator.estimator->level_dbfs();
+ RunOnConstantLevel(/*num_iterations=*/1, kVadDataSpeech,
+ *level_estimator.estimator);
+ EXPECT_NEAR(level_estimator.estimator->level_dbfs(), estimated_level_dbfs,
+ 0.1f);
}
-TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
- EstimatorIgnoresZeroProbabilityFrames) {
+// Checks that the level estimator does not become confident when too few
+// speech frames are observed.
+TEST(GainController2AdaptiveModeLevelEstimator, IsNotConfident) {
TestLevelEstimator level_estimator;
+ RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence / 2,
+ kVadDataSpeech, *level_estimator.estimator);
+ EXPECT_FALSE(level_estimator.estimator->IsConfident());
+}
- // Run for one second of fake audio.
- constexpr float kSpeechRmsDbfs = -25.f;
- RunOnConstantLevel(100,
- VadLevelAnalyzer::Result{kMaxSpeechProbability,
- /*rms_dbfs=*/kSpeechRmsDbfs -
- kInitialSaturationMarginDb,
- /*peak_dbfs=*/kSpeechRmsDbfs},
+// Checks that the level estimator becomes confident when enough speech frames
+// are observed.
+TEST(GainController2AdaptiveModeLevelEstimator, IsConfident) {
+ TestLevelEstimator level_estimator;
+ RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech,
*level_estimator.estimator);
+ EXPECT_TRUE(level_estimator.estimator->IsConfident());
+}
- // Run for one more second, but mark as not speech.
- constexpr float kNoiseRmsDbfs = 0.f;
- RunOnConstantLevel(100,
+// Checks that the estimated level is not affected by the level of non-speech
+// frames.
+TEST(GainController2AdaptiveModeLevelEstimator,
+ EstimatorIgnoresNonSpeechFrames) {
+ TestLevelEstimator level_estimator;
+ // Simulate speech.
+ RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech,
+ *level_estimator.estimator);
+ const float estimated_level_dbfs = level_estimator.estimator->level_dbfs();
+ // Simulate full-scale non-speech.
+ RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
VadLevelAnalyzer::Result{kMinSpeechProbability,
- /*rms_dbfs=*/kNoiseRmsDbfs,
- /*peak_dbfs=*/kNoiseRmsDbfs},
+ /*rms_dbfs=*/0.0f,
+ /*peak_dbfs=*/0.0f},
*level_estimator.estimator);
-
- // Level should not have changed.
- EXPECT_NEAR(
- level_estimator.estimator->level_dbfs() - kExtraSaturationMarginDb,
- kSpeechRmsDbfs, 0.1f);
+ // No estimated level change is expected.
+ EXPECT_FLOAT_EQ(level_estimator.estimator->level_dbfs(),
+ estimated_level_dbfs);
}
-TEST(AutomaticGainController2AdaptiveModeLevelEstimator, TimeToAdapt) {
+// Checks the convergence speed of the estimator before it becomes confident.
+TEST(GainController2AdaptiveModeLevelEstimator,
+ ConvergenceSpeedBeforeConfidence) {
TestLevelEstimator level_estimator;
-
- // Run for one 'window size' interval.
- constexpr float kInitialSpeechRmsDbfs = -30.f;
- RunOnConstantLevel(
- kFullBufferSizeMs / kFrameDurationMs,
- VadLevelAnalyzer::Result{
- kMaxSpeechProbability,
- /*rms_dbfs=*/kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
- /*peak_dbfs=*/kInitialSpeechRmsDbfs},
- *level_estimator.estimator);
-
- // Run for one half 'window size' interval. This should not be enough to
- // adapt.
- constexpr float kDifferentSpeechRmsDbfs = -10.f;
- // It should at most differ by 25% after one half 'window size' interval.
- // TODO(crbug.com/webrtc/7494): Add constexpr for repeated expressions.
- const float kMaxDifferenceDb =
- 0.25f * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs);
- RunOnConstantLevel(
- static_cast<int>(kFullBufferSizeMs / kFrameDurationMs / 2),
- VadLevelAnalyzer::Result{
- kMaxSpeechProbability,
- /*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
- /*peak_dbfs=*/kDifferentSpeechRmsDbfs},
- *level_estimator.estimator);
- EXPECT_GT(std::abs(kDifferentSpeechRmsDbfs -
- level_estimator.estimator->level_dbfs()),
- kMaxDifferenceDb);
-
- // Run for some more time. Afterwards, we should have adapted.
- RunOnConstantLevel(
- static_cast<int>(3 * kFullBufferSizeMs / kFrameDurationMs),
- VadLevelAnalyzer::Result{
- kMaxSpeechProbability,
- /*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
- /*peak_dbfs=*/kDifferentSpeechRmsDbfs},
- *level_estimator.estimator);
- EXPECT_NEAR(
- level_estimator.estimator->level_dbfs() - kExtraSaturationMarginDb,
- kDifferentSpeechRmsDbfs, kMaxDifferenceDb * 0.5f);
+ RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence, kVadDataSpeech,
+ *level_estimator.estimator);
+ EXPECT_NEAR(level_estimator.estimator->level_dbfs(), kVadDataSpeech.rms_dbfs,
+ kConvergenceSpeedTestsLevelTolerance);
}
-TEST(AutomaticGainController2AdaptiveModeLevelEstimator,
- ResetGivesFastAdaptation) {
+// Checks the convergence speed of the estimator after it becomes confident.
+TEST(GainController2AdaptiveModeLevelEstimator,
+ ConvergenceSpeedAfterConfidence) {
TestLevelEstimator level_estimator;
-
- // Run the level estimator for one window size interval. This gives time to
- // adapt.
- constexpr float kInitialSpeechRmsDbfs = -30.f;
+ // Reach confidence using the initial level estimate.
RunOnConstantLevel(
- kFullBufferSizeMs / kFrameDurationMs,
+ /*num_iterations=*/kNumFramesToConfidence,
VadLevelAnalyzer::Result{
kMaxSpeechProbability,
- /*rms_dbfs=*/kInitialSpeechRmsDbfs - kInitialSaturationMarginDb,
- /*peak_dbfs=*/kInitialSpeechRmsDbfs},
+ /*rms_dbfs=*/kInitialSpeechLevelEstimateDbfs,
+ /*peak_dbfs=*/kInitialSpeechLevelEstimateDbfs + 6.0f},
*level_estimator.estimator);
-
- constexpr float kDifferentSpeechRmsDbfs = -10.f;
- // Reset and run one half window size interval.
- level_estimator.estimator->Reset();
-
+ // No estimate change should occur, but confidence is achieved.
+ ASSERT_FLOAT_EQ(level_estimator.estimator->level_dbfs(),
+ kInitialSpeechLevelEstimateDbfs);
+ ASSERT_TRUE(level_estimator.estimator->IsConfident());
+ // After confidence.
+ constexpr float kConvergenceTimeAfterConfidenceNumFrames = 600; // 6 seconds.
+ static_assert(
+ kConvergenceTimeAfterConfidenceNumFrames > kNumFramesToConfidence, "");
RunOnConstantLevel(
- kFullBufferSizeMs / kFrameDurationMs / 2,
- VadLevelAnalyzer::Result{
- kMaxSpeechProbability,
- /*rms_dbfs=*/kDifferentSpeechRmsDbfs - kInitialSaturationMarginDb,
- /*peak_dbfs=*/kDifferentSpeechRmsDbfs},
- *level_estimator.estimator);
-
- // The level should be close to 'kDifferentSpeechRmsDbfs'.
- const float kMaxDifferenceDb =
- 0.1f * std::abs(kDifferentSpeechRmsDbfs - kInitialSpeechRmsDbfs);
- EXPECT_LT(std::abs(kDifferentSpeechRmsDbfs -
- (level_estimator.estimator->level_dbfs() -
- kExtraSaturationMarginDb)),
- kMaxDifferenceDb);
+ /*num_iterations=*/kConvergenceTimeAfterConfidenceNumFrames,
+ kVadDataSpeech, *level_estimator.estimator);
+ EXPECT_NEAR(level_estimator.estimator->level_dbfs(), kVadDataSpeech.rms_dbfs,
+ kConvergenceSpeedTestsLevelTolerance);
}
-struct TestConfig {
- int min_consecutive_speech_frames;
- float initial_saturation_margin_db;
- float extra_saturation_margin_db;
+class AdaptiveModeLevelEstimatorParametrization
+ : public ::testing::TestWithParam<int> {
+ protected:
+ int adjacent_speech_frames_threshold() const { return GetParam(); }
};
-class AdaptiveModeLevelEstimatorTest
- : public ::testing::TestWithParam<TestConfig> {};
-
-TEST_P(AdaptiveModeLevelEstimatorTest, DoNotAdaptToShortSpeechSegments) {
- const auto params = GetParam();
+TEST_P(AdaptiveModeLevelEstimatorParametrization,
+ DoNotAdaptToShortSpeechSegments) {
ApmDataDumper apm_data_dumper(0);
AdaptiveModeLevelEstimator level_estimator(
- &apm_data_dumper,
- AudioProcessing::Config::GainController2::LevelEstimator::kRms,
- params.min_consecutive_speech_frames, params.initial_saturation_margin_db,
- params.extra_saturation_margin_db);
+ &apm_data_dumper, adjacent_speech_frames_threshold());
const float initial_level = level_estimator.level_dbfs();
- ASSERT_LT(initial_level, kVadDataSpeech.rms_dbfs);
- for (int i = 0; i < params.min_consecutive_speech_frames - 1; ++i) {
+ ASSERT_LT(initial_level, kVadDataSpeech.peak_dbfs);
+ for (int i = 0; i < adjacent_speech_frames_threshold() - 1; ++i) {
SCOPED_TRACE(i);
level_estimator.Update(kVadDataSpeech);
EXPECT_EQ(initial_level, level_estimator.level_dbfs());
@@ -217,26 +182,21 @@
EXPECT_EQ(initial_level, level_estimator.level_dbfs());
}
-TEST_P(AdaptiveModeLevelEstimatorTest, AdaptToEnoughSpeechSegments) {
- const auto params = GetParam();
+TEST_P(AdaptiveModeLevelEstimatorParametrization, AdaptToEnoughSpeechSegments) {
ApmDataDumper apm_data_dumper(0);
AdaptiveModeLevelEstimator level_estimator(
- &apm_data_dumper,
- AudioProcessing::Config::GainController2::LevelEstimator::kRms,
- params.min_consecutive_speech_frames, params.initial_saturation_margin_db,
- params.extra_saturation_margin_db);
+ &apm_data_dumper, adjacent_speech_frames_threshold());
const float initial_level = level_estimator.level_dbfs();
- ASSERT_LT(initial_level, kVadDataSpeech.rms_dbfs);
- for (int i = 0; i < params.min_consecutive_speech_frames; ++i) {
+ ASSERT_LT(initial_level, kVadDataSpeech.peak_dbfs);
+ for (int i = 0; i < adjacent_speech_frames_threshold(); ++i) {
level_estimator.Update(kVadDataSpeech);
}
EXPECT_LT(initial_level, level_estimator.level_dbfs());
}
-INSTANTIATE_TEST_SUITE_P(AutomaticGainController2,
- AdaptiveModeLevelEstimatorTest,
- ::testing::Values(TestConfig{1, 0.f, 0.f},
- TestConfig{9, 0.f, 0.f}));
+INSTANTIATE_TEST_SUITE_P(GainController2,
+ AdaptiveModeLevelEstimatorParametrization,
+ ::testing::Values(1, 9, 17));
} // namespace
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/agc2_common.h b/modules/audio_processing/agc2/agc2_common.h
index ccd04bc..0f806d3 100644
--- a/modules/audio_processing/agc2/agc2_common.h
+++ b/modules/audio_processing/agc2/agc2_common.h
@@ -11,20 +11,19 @@
#ifndef MODULES_AUDIO_PROCESSING_AGC2_AGC2_COMMON_H_
#define MODULES_AUDIO_PROCESSING_AGC2_AGC2_COMMON_H_
-#include <stddef.h>
-
namespace webrtc {
constexpr float kMinFloatS16Value = -32768.0f;
constexpr float kMaxFloatS16Value = 32767.0f;
constexpr float kMaxAbsFloatS16Value = 32768.0f;
+// Minimum audio level in dBFS scale for S16 samples.
+constexpr float kMinLevelDbfs = -90.31f;
+
constexpr int kFrameDurationMs = 10;
constexpr int kSubFramesInFrame = 20;
constexpr int kMaximalNumberOfSamplesPerChannel = 480;
-constexpr float kAttackFilterConstant = 0.0f;
-
// Adaptive digital gain applier settings below.
constexpr float kHeadroomDbfs = 1.0f;
constexpr float kMaxGainDb = 30.0f;
@@ -37,43 +36,29 @@
// gain reduction.
constexpr float kVadConfidenceThreshold = 0.95f;
-// The amount of 'memory' of the Level Estimator. Decides leak factors.
-constexpr int kFullBufferSizeMs = 1200;
-constexpr float kFullBufferLeakFactor = 1.0f - 1.0f / kFullBufferSizeMs;
-
-constexpr float kInitialSpeechLevelEstimateDbfs = -30.0f;
+// Adaptive digital level estimator parameters.
+// Number of milliseconds of speech frames to observe to make the estimator
+// confident.
+constexpr float kLevelEstimatorTimeToConfidenceMs = 400;
+constexpr float kLevelEstimatorLeakFactor =
+ 1.0f - 1.0f / kLevelEstimatorTimeToConfidenceMs;
// Robust VAD probability and speech decisions.
constexpr int kDefaultVadRnnResetPeriodMs = 1500;
static_assert(kDefaultVadRnnResetPeriodMs % kFrameDurationMs == 0, "");
-constexpr float kDefaultSmoothedVadProbabilityAttack = 1.0f;
-constexpr int kDefaultLevelEstimatorAdjacentSpeechFramesThreshold = 1;
+constexpr int kDefaultLevelEstimatorAdjacentSpeechFramesThreshold = 12;
// Saturation Protector settings.
-constexpr float kDefaultInitialSaturationMarginDb = 20.0f;
-constexpr float kDefaultExtraSaturationMarginDb = 2.0f;
+constexpr float kSaturationProtectorInitialHeadroomDb = 20.0f;
+constexpr float kSaturationProtectorExtraHeadroomDb = 5.0f;
+constexpr int kSaturationProtectorBufferSize = 4;
-constexpr int kPeakEnveloperSuperFrameLengthMs = 400;
-static_assert(kFullBufferSizeMs % kPeakEnveloperSuperFrameLengthMs == 0,
- "Full buffer size should be a multiple of super frame length for "
- "optimal Saturation Protector performance.");
-
-constexpr int kPeakEnveloperBufferSize =
- kFullBufferSizeMs / kPeakEnveloperSuperFrameLengthMs + 1;
-
-// This value is 10 ** (-1/20 * frame_size_ms / satproc_attack_ms),
-// where satproc_attack_ms is 5000.
-constexpr float kSaturationProtectorAttackConstant = 0.9988493699365052f;
-
-// This value is 10 ** (-1/20 * frame_size_ms / satproc_decay_ms),
-// where satproc_decay_ms is 1000.
-constexpr float kSaturationProtectorDecayConstant = 0.9997697679981565f;
-
-// This is computed from kDecayMs by
-// 10 ** (-1/20 * subframe_duration / kDecayMs).
-// |subframe_duration| is |kFrameDurationMs / kSubFramesInFrame|.
-// kDecayMs is defined in agc2_testing_common.h
-constexpr float kDecayFilterConstant = 0.9998848773724686f;
+// Set the initial speech level estimate so that `kInitialAdaptiveDigitalGainDb`
+// is applied at the beginning of the call.
+constexpr float kInitialSpeechLevelEstimateDbfs =
+ -kSaturationProtectorExtraHeadroomDb -
+ kSaturationProtectorInitialHeadroomDb - kInitialAdaptiveDigitalGainDb -
+ kHeadroomDbfs;
// Number of interpolation points for each region of the limiter.
// These values have been tuned to limit the interpolated gain curve error given
diff --git a/modules/audio_processing/agc2/agc2_testing_common_unittest.cc b/modules/audio_processing/agc2/agc2_testing_common_unittest.cc
index f52ea3c..79c3cc9 100644
--- a/modules/audio_processing/agc2/agc2_testing_common_unittest.cc
+++ b/modules/audio_processing/agc2/agc2_testing_common_unittest.cc
@@ -14,7 +14,7 @@
namespace webrtc {
-TEST(AutomaticGainController2Common, TestLinSpace) {
+TEST(GainController2TestingCommon, LinSpace) {
std::vector<double> points1 = test::LinSpace(-1.0, 2.0, 4);
const std::vector<double> expected_points1{{-1.0, 0.0, 1.0, 2.0}};
EXPECT_EQ(expected_points1, points1);
diff --git a/modules/audio_processing/agc2/fixed_digital_level_estimator.cc b/modules/audio_processing/agc2/fixed_digital_level_estimator.cc
index 9636136..3e9bb2e 100644
--- a/modules/audio_processing/agc2/fixed_digital_level_estimator.cc
+++ b/modules/audio_processing/agc2/fixed_digital_level_estimator.cc
@@ -22,6 +22,14 @@
constexpr float kInitialFilterStateLevel = 0.f;
+// Instant attack.
+constexpr float kAttackFilterConstant = 0.f;
+// Computed from `kDecayMs` as
+// 10 ** (-1/20 * subframe_duration / kDecayMs),
+// where `subframe_duration` is `kFrameDurationMs / kSubFramesInFrame` and
+// `kDecayMs` is defined in agc2_testing_common.h.
+constexpr float kDecayFilterConstant = 0.9998848773724686f;
+
} // namespace
FixedDigitalLevelEstimator::FixedDigitalLevelEstimator(
diff --git a/modules/audio_processing/agc2/fixed_digital_level_estimator_unittest.cc b/modules/audio_processing/agc2/fixed_digital_level_estimator_unittest.cc
index 7547f8e..97b421d 100644
--- a/modules/audio_processing/agc2/fixed_digital_level_estimator_unittest.cc
+++ b/modules/audio_processing/agc2/fixed_digital_level_estimator_unittest.cc
@@ -101,25 +101,25 @@
}
} // namespace
-TEST(AutomaticGainController2LevelEstimator, EstimatorShouldNotCrash) {
+TEST(GainController2FixedDigitalLevelEstimator, EstimatorShouldNotCrash) {
TestLevelEstimator(8000, 1, 0, std::numeric_limits<float>::lowest(),
std::numeric_limits<float>::max());
}
-TEST(AutomaticGainController2LevelEstimator,
+TEST(GainController2FixedDigitalLevelEstimator,
EstimatorShouldEstimateConstantLevel) {
TestLevelEstimator(10000, 1, kInputLevel, kInputLevel * 0.99,
kInputLevel * 1.01);
}
-TEST(AutomaticGainController2LevelEstimator,
+TEST(GainController2FixedDigitalLevelEstimator,
EstimatorShouldEstimateConstantLevelForManyChannels) {
constexpr size_t num_channels = 10;
TestLevelEstimator(20000, num_channels, kInputLevel, kInputLevel * 0.99,
kInputLevel * 1.01);
}
-TEST(AutomaticGainController2LevelEstimator, TimeToDecreaseForLowLevel) {
+TEST(GainController2FixedDigitalLevelEstimator, TimeToDecreaseForLowLevel) {
constexpr float kLevelReductionDb = 25;
constexpr float kInitialLowLevel = -40;
constexpr float kExpectedTime = kLevelReductionDb * test::kDecayMs;
@@ -131,7 +131,8 @@
EXPECT_LE(time_to_decrease, kExpectedTime * 1.1);
}
-TEST(AutomaticGainController2LevelEstimator, TimeToDecreaseForFullScaleLevel) {
+TEST(GainController2FixedDigitalLevelEstimator,
+ TimeToDecreaseForFullScaleLevel) {
constexpr float kLevelReductionDb = 25;
constexpr float kExpectedTime = kLevelReductionDb * test::kDecayMs;
@@ -142,7 +143,7 @@
EXPECT_LE(time_to_decrease, kExpectedTime * 1.1);
}
-TEST(AutomaticGainController2LevelEstimator,
+TEST(GainController2FixedDigitalLevelEstimator,
TimeToDecreaseForMultipleChannels) {
constexpr float kLevelReductionDb = 25;
constexpr float kExpectedTime = kLevelReductionDb * test::kDecayMs;
diff --git a/modules/audio_processing/agc2/interpolated_gain_curve.h b/modules/audio_processing/agc2/interpolated_gain_curve.h
index 69652c5..af99320 100644
--- a/modules/audio_processing/agc2/interpolated_gain_curve.h
+++ b/modules/audio_processing/agc2/interpolated_gain_curve.h
@@ -75,7 +75,7 @@
private:
// For comparing 'approximation_params_*_' with ones computed by
// ComputeInterpolatedGainCurve.
- FRIEND_TEST_ALL_PREFIXES(AutomaticGainController2InterpolatedGainCurve,
+ FRIEND_TEST_ALL_PREFIXES(GainController2InterpolatedGainCurve,
CheckApproximationParams);
struct RegionLogger {
diff --git a/modules/audio_processing/agc2/interpolated_gain_curve_unittest.cc b/modules/audio_processing/agc2/interpolated_gain_curve_unittest.cc
index 67d34e5..7861ae9 100644
--- a/modules/audio_processing/agc2/interpolated_gain_curve_unittest.cc
+++ b/modules/audio_processing/agc2/interpolated_gain_curve_unittest.cc
@@ -34,7 +34,7 @@
} // namespace
-TEST(AutomaticGainController2InterpolatedGainCurve, CreateUse) {
+TEST(GainController2InterpolatedGainCurve, CreateUse) {
InterpolatedGainCurve igc(&apm_data_dumper, "");
const auto levels = test::LinSpace(
@@ -44,7 +44,7 @@
}
}
-TEST(AutomaticGainController2InterpolatedGainCurve, CheckValidOutput) {
+TEST(GainController2InterpolatedGainCurve, CheckValidOutput) {
InterpolatedGainCurve igc(&apm_data_dumper, "");
const auto levels = test::LinSpace(
@@ -57,7 +57,7 @@
}
}
-TEST(AutomaticGainController2InterpolatedGainCurve, CheckMonotonicity) {
+TEST(GainController2InterpolatedGainCurve, CheckMonotonicity) {
InterpolatedGainCurve igc(&apm_data_dumper, "");
const auto levels = test::LinSpace(
@@ -71,7 +71,7 @@
}
}
-TEST(AutomaticGainController2InterpolatedGainCurve, CheckApproximation) {
+TEST(GainController2InterpolatedGainCurve, CheckApproximation) {
InterpolatedGainCurve igc(&apm_data_dumper, "");
const auto levels = test::LinSpace(
@@ -84,7 +84,7 @@
}
}
-TEST(AutomaticGainController2InterpolatedGainCurve, CheckRegionBoundaries) {
+TEST(GainController2InterpolatedGainCurve, CheckRegionBoundaries) {
InterpolatedGainCurve igc(&apm_data_dumper, "");
const std::vector<double> levels{
@@ -102,7 +102,7 @@
EXPECT_EQ(1ul, stats.look_ups_saturation_region);
}
-TEST(AutomaticGainController2InterpolatedGainCurve, CheckIdentityRegion) {
+TEST(GainController2InterpolatedGainCurve, CheckIdentityRegion) {
constexpr size_t kNumSteps = 10;
InterpolatedGainCurve igc(&apm_data_dumper, "");
@@ -120,8 +120,7 @@
EXPECT_EQ(0ul, stats.look_ups_saturation_region);
}
-TEST(AutomaticGainController2InterpolatedGainCurve,
- CheckNoOverApproximationKnee) {
+TEST(GainController2InterpolatedGainCurve, CheckNoOverApproximationKnee) {
constexpr size_t kNumSteps = 10;
InterpolatedGainCurve igc(&apm_data_dumper, "");
@@ -142,8 +141,7 @@
EXPECT_EQ(0ul, stats.look_ups_saturation_region);
}
-TEST(AutomaticGainController2InterpolatedGainCurve,
- CheckNoOverApproximationBeyondKnee) {
+TEST(GainController2InterpolatedGainCurve, CheckNoOverApproximationBeyondKnee) {
constexpr size_t kNumSteps = 10;
InterpolatedGainCurve igc(&apm_data_dumper, "");
@@ -164,7 +162,7 @@
EXPECT_EQ(0ul, stats.look_ups_saturation_region);
}
-TEST(AutomaticGainController2InterpolatedGainCurve,
+TEST(GainController2InterpolatedGainCurve,
CheckNoOverApproximationWithSaturation) {
constexpr size_t kNumSteps = 3;
InterpolatedGainCurve igc(&apm_data_dumper, "");
@@ -184,7 +182,7 @@
EXPECT_EQ(kNumSteps, stats.look_ups_saturation_region);
}
-TEST(AutomaticGainController2InterpolatedGainCurve, CheckApproximationParams) {
+TEST(GainController2InterpolatedGainCurve, CheckApproximationParams) {
test::InterpolatedParameters parameters =
test::ComputeInterpolatedGainCurveApproximationParams();
diff --git a/modules/audio_processing/agc2/noise_level_estimator.cc b/modules/audio_processing/agc2/noise_level_estimator.cc
index ae8a501..10e8437 100644
--- a/modules/audio_processing/agc2/noise_level_estimator.cc
+++ b/modules/audio_processing/agc2/noise_level_estimator.cc
@@ -184,7 +184,7 @@
const float frame_energy = FrameEnergy(frame);
if (frame_energy <= min_noise_energy_) {
// Ignore frames when muted or below the minimum measurable energy.
- data_dumper_->DumpRaw("agc2_noise_floor_preliminary_level",
+ data_dumper_->DumpRaw("agc2_noise_floor_estimator_preliminary_level",
noise_energy_);
return EnergyToDbfs(noise_energy_, frame.samples_per_channel());
}
@@ -196,7 +196,7 @@
preliminary_noise_energy_ = frame_energy;
preliminary_noise_energy_set_ = true;
}
- data_dumper_->DumpRaw("agc2_noise_floor_preliminary_level",
+ data_dumper_->DumpRaw("agc2_noise_floor_estimator_preliminary_level",
preliminary_noise_energy_);
if (counter_ == 0) {
diff --git a/modules/audio_processing/agc2/saturation_protector.cc b/modules/audio_processing/agc2/saturation_protector.cc
index b64fcdb..d6f21ef 100644
--- a/modules/audio_processing/agc2/saturation_protector.cc
+++ b/modules/audio_processing/agc2/saturation_protector.cc
@@ -10,84 +10,59 @@
#include "modules/audio_processing/agc2/saturation_protector.h"
+#include <memory>
+
+#include "modules/audio_processing/agc2/agc2_common.h"
+#include "modules/audio_processing/agc2/saturation_protector_buffer.h"
#include "modules/audio_processing/logging/apm_data_dumper.h"
+#include "rtc_base/checks.h"
#include "rtc_base/numerics/safe_minmax.h"
namespace webrtc {
namespace {
-constexpr float kMinLevelDbfs = -90.f;
+constexpr int kPeakEnveloperSuperFrameLengthMs = 400;
+constexpr float kMinMarginDb = 12.0f;
+constexpr float kMaxMarginDb = 25.0f;
+constexpr float kAttack = 0.9988493699365052f;
+constexpr float kDecay = 0.9997697679981565f;
-// Min/max margins are based on speech crest-factor.
-constexpr float kMinMarginDb = 12.f;
-constexpr float kMaxMarginDb = 25.f;
-
-using saturation_protector_impl::RingBuffer;
-
-} // namespace
-
-bool RingBuffer::operator==(const RingBuffer& b) const {
- RTC_DCHECK_LE(size_, buffer_.size());
- RTC_DCHECK_LE(b.size_, b.buffer_.size());
- if (size_ != b.size_) {
- return false;
+// Saturation protector state. Defined outside of `SaturationProtectorImpl` to
+// implement check-point and restore ops.
+struct SaturationProtectorState {
+ bool operator==(const SaturationProtectorState& s) const {
+ return headroom_db == s.headroom_db &&
+ peak_delay_buffer == s.peak_delay_buffer &&
+ max_peaks_dbfs == s.max_peaks_dbfs &&
+ time_since_push_ms == s.time_since_push_ms;
}
- for (int i = 0, i0 = FrontIndex(), i1 = b.FrontIndex(); i < size_;
- ++i, ++i0, ++i1) {
- if (buffer_[i0 % buffer_.size()] != b.buffer_[i1 % b.buffer_.size()]) {
- return false;
- }
+ inline bool operator!=(const SaturationProtectorState& s) const {
+ return !(*this == s);
}
- return true;
-}
-void RingBuffer::Reset() {
- next_ = 0;
- size_ = 0;
-}
+ float headroom_db;
+ SaturationProtectorBuffer peak_delay_buffer;
+ float max_peaks_dbfs;
+ int time_since_push_ms; // Time since the last ring buffer push operation.
+};
-void RingBuffer::PushBack(float v) {
- RTC_DCHECK_GE(next_, 0);
- RTC_DCHECK_GE(size_, 0);
- RTC_DCHECK_LT(next_, buffer_.size());
- RTC_DCHECK_LE(size_, buffer_.size());
- buffer_[next_++] = v;
- if (rtc::SafeEq(next_, buffer_.size())) {
- next_ = 0;
- }
- if (rtc::SafeLt(size_, buffer_.size())) {
- size_++;
- }
-}
-
-absl::optional<float> RingBuffer::Front() const {
- if (size_ == 0) {
- return absl::nullopt;
- }
- RTC_DCHECK_LT(FrontIndex(), buffer_.size());
- return buffer_[FrontIndex()];
-}
-
-bool SaturationProtectorState::operator==(
- const SaturationProtectorState& b) const {
- return margin_db == b.margin_db && peak_delay_buffer == b.peak_delay_buffer &&
- max_peaks_dbfs == b.max_peaks_dbfs &&
- time_since_push_ms == b.time_since_push_ms;
-}
-
-void ResetSaturationProtectorState(float initial_margin_db,
+// Resets the saturation protector state.
+void ResetSaturationProtectorState(float initial_headroom_db,
SaturationProtectorState& state) {
- state.margin_db = initial_margin_db;
+ state.headroom_db = initial_headroom_db;
state.peak_delay_buffer.Reset();
state.max_peaks_dbfs = kMinLevelDbfs;
state.time_since_push_ms = 0;
}
-void UpdateSaturationProtectorState(float speech_peak_dbfs,
+// Updates `state` by analyzing the estimated speech level `speech_level_dbfs`
+// and the peak level `peak_dbfs` for an observed frame. `state` must not be
+// modified without calling this function.
+void UpdateSaturationProtectorState(float peak_dbfs,
float speech_level_dbfs,
SaturationProtectorState& state) {
// Get the max peak over `kPeakEnveloperSuperFrameLengthMs` ms.
- state.max_peaks_dbfs = std::max(state.max_peaks_dbfs, speech_peak_dbfs);
+ state.max_peaks_dbfs = std::max(state.max_peaks_dbfs, peak_dbfs);
state.time_since_push_ms += kFrameDurationMs;
if (rtc::SafeGt(state.time_since_push_ms, kPeakEnveloperSuperFrameLengthMs)) {
// Push `max_peaks_dbfs` back into the ring buffer.
@@ -97,25 +72,117 @@
state.time_since_push_ms = 0;
}
- // Update margin by comparing the estimated speech level and the delayed max
- // speech peak power.
- // TODO(alessiob): Check with aleloi@ why we use a delay and how to tune it.
+ // Update the headroom by comparing the estimated speech level and the delayed
+ // max speech peak.
const float delayed_peak_dbfs =
state.peak_delay_buffer.Front().value_or(state.max_peaks_dbfs);
const float difference_db = delayed_peak_dbfs - speech_level_dbfs;
- if (difference_db > state.margin_db) {
+ if (difference_db > state.headroom_db) {
// Attack.
- state.margin_db =
- state.margin_db * kSaturationProtectorAttackConstant +
- difference_db * (1.f - kSaturationProtectorAttackConstant);
+ state.headroom_db =
+ state.headroom_db * kAttack + difference_db * (1.0f - kAttack);
} else {
// Decay.
- state.margin_db = state.margin_db * kSaturationProtectorDecayConstant +
- difference_db * (1.f - kSaturationProtectorDecayConstant);
+ state.headroom_db =
+ state.headroom_db * kDecay + difference_db * (1.0f - kDecay);
}
- state.margin_db =
- rtc::SafeClamp<float>(state.margin_db, kMinMarginDb, kMaxMarginDb);
+ state.headroom_db =
+ rtc::SafeClamp<float>(state.headroom_db, kMinMarginDb, kMaxMarginDb);
+}
+
+// Saturation protector which recommends a headroom based on the recent peaks.
+class SaturationProtectorImpl : public SaturationProtector {
+ public:
+ explicit SaturationProtectorImpl(float initial_headroom_db,
+ float extra_headroom_db,
+ int adjacent_speech_frames_threshold,
+ ApmDataDumper* apm_data_dumper)
+ : apm_data_dumper_(apm_data_dumper),
+ initial_headroom_db_(initial_headroom_db),
+ extra_headroom_db_(extra_headroom_db),
+ adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold) {
+ Reset();
+ }
+ SaturationProtectorImpl(const SaturationProtectorImpl&) = delete;
+ SaturationProtectorImpl& operator=(const SaturationProtectorImpl&) = delete;
+ ~SaturationProtectorImpl() = default;
+
+ float HeadroomDb() override { return headroom_db_; }
+
+ void Analyze(float speech_probability,
+ float peak_dbfs,
+ float speech_level_dbfs) override {
+ if (speech_probability < kVadConfidenceThreshold) {
+ // Not a speech frame.
+ if (adjacent_speech_frames_threshold_ > 1) {
+ // When two or more adjacent speech frames are required in order to
+ // update the state, we need to decide whether to discard or confirm the
+ // updates based on the speech sequence length.
+ if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
+ // First non-speech frame after a long enough sequence of speech
+ // frames. Update the reliable state.
+ reliable_state_ = preliminary_state_;
+ } else if (num_adjacent_speech_frames_ > 0) {
+ // First non-speech frame after a too short sequence of speech frames.
+ // Reset to the last reliable state.
+ preliminary_state_ = reliable_state_;
+ }
+ }
+ num_adjacent_speech_frames_ = 0;
+ } else {
+ // Speech frame observed.
+ num_adjacent_speech_frames_++;
+
+ // Update the preliminary saturation protector state.
+ UpdateSaturationProtectorState(peak_dbfs, speech_level_dbfs,
+ preliminary_state_);
+
+ if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
+ // `preliminary_state_` is now reliable. Update the headroom.
+ headroom_db_ = preliminary_state_.headroom_db + extra_headroom_db_;
+ }
+ }
+ DumpDebugData();
+ }
+
+ void Reset() override {
+ num_adjacent_speech_frames_ = 0;
+ headroom_db_ = initial_headroom_db_ + extra_headroom_db_;
+ ResetSaturationProtectorState(initial_headroom_db_, preliminary_state_);
+ ResetSaturationProtectorState(initial_headroom_db_, reliable_state_);
+ }
+
+ private:
+ void DumpDebugData() {
+ apm_data_dumper_->DumpRaw(
+ "agc2_saturation_protector_preliminary_max_peak_dbfs",
+ preliminary_state_.max_peaks_dbfs);
+ apm_data_dumper_->DumpRaw(
+ "agc2_saturation_protector_reliable_max_peak_dbfs",
+ reliable_state_.max_peaks_dbfs);
+ }
+
+ ApmDataDumper* const apm_data_dumper_;
+ const float initial_headroom_db_;
+ const float extra_headroom_db_;
+ const int adjacent_speech_frames_threshold_;
+ int num_adjacent_speech_frames_;
+ float headroom_db_;
+ SaturationProtectorState preliminary_state_;
+ SaturationProtectorState reliable_state_;
+};
+
+} // namespace
+
+std::unique_ptr<SaturationProtector> CreateSaturationProtector(
+ float initial_headroom_db,
+ float extra_headroom_db,
+ int adjacent_speech_frames_threshold,
+ ApmDataDumper* apm_data_dumper) {
+ return std::make_unique<SaturationProtectorImpl>(
+ initial_headroom_db, extra_headroom_db, adjacent_speech_frames_threshold,
+ apm_data_dumper);
}
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/saturation_protector.h b/modules/audio_processing/agc2/saturation_protector.h
index 88be91a..0c384f1 100644
--- a/modules/audio_processing/agc2/saturation_protector.h
+++ b/modules/audio_processing/agc2/saturation_protector.h
@@ -11,71 +11,36 @@
#ifndef MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
#define MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_H_
-#include <array>
-
-#include "absl/types/optional.h"
-#include "modules/audio_processing/agc2/agc2_common.h"
-#include "rtc_base/numerics/safe_compare.h"
+#include <memory>
namespace webrtc {
-namespace saturation_protector_impl {
+class ApmDataDumper;
-// Ring buffer which only supports (i) push back and (ii) read oldest item.
-class RingBuffer {
+// Saturation protector. Analyzes peak levels and recommends a headroom to
+// reduce the chances of clipping.
+class SaturationProtector {
public:
- bool operator==(const RingBuffer& b) const;
- inline bool operator!=(const RingBuffer& b) const { return !(*this == b); }
+ virtual ~SaturationProtector() = default;
- // Maximum number of values that the buffer can contain.
- int Capacity() const { return buffer_.size(); }
- // Number of values in the buffer.
- int Size() const { return size_; }
+ // Returns the recommended headroom in dB.
+ virtual float HeadroomDb() = 0;
- void Reset();
- // Pushes back `v`. If the buffer is full, the oldest value is replaced.
- void PushBack(float v);
- // Returns the oldest item in the buffer. Returns an empty value if the
- // buffer is empty.
- absl::optional<float> Front() const;
+ // Analyzes the peak level of a 10 ms frame along with its speech probability
+ // and the current speech level estimate to update the recommended headroom.
+ virtual void Analyze(float speech_probability,
+ float peak_dbfs,
+ float speech_level_dbfs) = 0;
- private:
- inline int FrontIndex() const {
- return rtc::SafeEq(size_, buffer_.size()) ? next_ : 0;
- }
- // `buffer_` has `size_` elements (up to the size of `buffer_`) and `next_` is
- // the position where the next new value is written in `buffer_`.
- std::array<float, kPeakEnveloperBufferSize> buffer_;
- int next_ = 0;
- int size_ = 0;
+ // Resets the internal state.
+ virtual void Reset() = 0;
};
-} // namespace saturation_protector_impl
-
-// Saturation protector state. Exposed publicly for check-pointing and restore
-// ops.
-struct SaturationProtectorState {
- bool operator==(const SaturationProtectorState& s) const;
- inline bool operator!=(const SaturationProtectorState& s) const {
- return !(*this == s);
- }
-
- float margin_db; // Recommended margin.
- saturation_protector_impl::RingBuffer peak_delay_buffer;
- float max_peaks_dbfs;
- int time_since_push_ms; // Time since the last ring buffer push operation.
-};
-
-// Resets the saturation protector state.
-void ResetSaturationProtectorState(float initial_margin_db,
- SaturationProtectorState& state);
-
-// Updates `state` by analyzing the estimated speech level `speech_level_dbfs`
-// and the peak power `speech_peak_dbfs` for an observed frame which is
-// reliably classified as "speech". `state` must not be modified without calling
-// this function.
-void UpdateSaturationProtectorState(float speech_peak_dbfs,
- float speech_level_dbfs,
- SaturationProtectorState& state);
+// Creates a saturation protector that starts at `initial_headroom_db`.
+std::unique_ptr<SaturationProtector> CreateSaturationProtector(
+ float initial_headroom_db,
+ float extra_headroom_db,
+ int adjacent_speech_frames_threshold,
+ ApmDataDumper* apm_data_dumper);
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/saturation_protector_buffer.cc b/modules/audio_processing/agc2/saturation_protector_buffer.cc
new file mode 100644
index 0000000..41efdad
--- /dev/null
+++ b/modules/audio_processing/agc2/saturation_protector_buffer.cc
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/saturation_protector_buffer.h"
+
+#include "rtc_base/checks.h"
+#include "rtc_base/numerics/safe_compare.h"
+
+namespace webrtc {
+
+SaturationProtectorBuffer::SaturationProtectorBuffer() = default;
+
+SaturationProtectorBuffer::~SaturationProtectorBuffer() = default;
+
+bool SaturationProtectorBuffer::operator==(
+ const SaturationProtectorBuffer& b) const {
+ RTC_DCHECK_LE(size_, buffer_.size());
+ RTC_DCHECK_LE(b.size_, b.buffer_.size());
+ if (size_ != b.size_) {
+ return false;
+ }
+ for (int i = 0, i0 = FrontIndex(), i1 = b.FrontIndex(); i < size_;
+ ++i, ++i0, ++i1) {
+ if (buffer_[i0 % buffer_.size()] != b.buffer_[i1 % b.buffer_.size()]) {
+ return false;
+ }
+ }
+ return true;
+}
+
+int SaturationProtectorBuffer::Capacity() const {
+ return buffer_.size();
+}
+
+int SaturationProtectorBuffer::Size() const {
+ return size_;
+}
+
+void SaturationProtectorBuffer::Reset() {
+ next_ = 0;
+ size_ = 0;
+}
+
+void SaturationProtectorBuffer::PushBack(float v) {
+ RTC_DCHECK_GE(next_, 0);
+ RTC_DCHECK_GE(size_, 0);
+ RTC_DCHECK_LT(next_, buffer_.size());
+ RTC_DCHECK_LE(size_, buffer_.size());
+ buffer_[next_++] = v;
+ if (rtc::SafeEq(next_, buffer_.size())) {
+ next_ = 0;
+ }
+ if (rtc::SafeLt(size_, buffer_.size())) {
+ size_++;
+ }
+}
+
+absl::optional<float> SaturationProtectorBuffer::Front() const {
+ if (size_ == 0) {
+ return absl::nullopt;
+ }
+ RTC_DCHECK_LT(FrontIndex(), buffer_.size());
+ return buffer_[FrontIndex()];
+}
+
+int SaturationProtectorBuffer::FrontIndex() const {
+ return rtc::SafeEq(size_, buffer_.size()) ? next_ : 0;
+}
+
+} // namespace webrtc
diff --git a/modules/audio_processing/agc2/saturation_protector_buffer.h b/modules/audio_processing/agc2/saturation_protector_buffer.h
new file mode 100644
index 0000000..e17d099
--- /dev/null
+++ b/modules/audio_processing/agc2/saturation_protector_buffer.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_BUFFER_H_
+#define MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_BUFFER_H_
+
+#include <array>
+
+#include "absl/types/optional.h"
+#include "modules/audio_processing/agc2/agc2_common.h"
+
+namespace webrtc {
+
+// Ring buffer for the saturation protector which only supports (i) push back
+// and (ii) read oldest item.
+class SaturationProtectorBuffer {
+ public:
+ SaturationProtectorBuffer();
+ ~SaturationProtectorBuffer();
+
+ bool operator==(const SaturationProtectorBuffer& b) const;
+ inline bool operator!=(const SaturationProtectorBuffer& b) const {
+ return !(*this == b);
+ }
+
+ // Maximum number of values that the buffer can contain.
+ int Capacity() const;
+
+ // Number of values in the buffer.
+ int Size() const;
+
+ void Reset();
+
+ // Pushes back `v`. If the buffer is full, the oldest value is replaced.
+ void PushBack(float v);
+
+ // Returns the oldest item in the buffer. Returns an empty value if the
+ // buffer is empty.
+ absl::optional<float> Front() const;
+
+ private:
+ int FrontIndex() const;
+ // `buffer_` has `size_` elements (up to the size of `buffer_`) and `next_` is
+ // the position where the next new value is written in `buffer_`.
+ std::array<float, kSaturationProtectorBufferSize> buffer_;
+ int next_ = 0;
+ int size_ = 0;
+};
+
+} // namespace webrtc
+
+#endif // MODULES_AUDIO_PROCESSING_AGC2_SATURATION_PROTECTOR_BUFFER_H_
diff --git a/modules/audio_processing/agc2/saturation_protector_buffer_unittest.cc b/modules/audio_processing/agc2/saturation_protector_buffer_unittest.cc
new file mode 100644
index 0000000..22187bf
--- /dev/null
+++ b/modules/audio_processing/agc2/saturation_protector_buffer_unittest.cc
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_processing/agc2/saturation_protector_buffer.h"
+
+#include "test/gmock.h"
+#include "test/gtest.h"
+
+namespace webrtc {
+namespace {
+
+using ::testing::Eq;
+using ::testing::Optional;
+
+TEST(GainController2SaturationProtectorBuffer, Init) {
+ SaturationProtectorBuffer b;
+ EXPECT_EQ(b.Size(), 0);
+ EXPECT_FALSE(b.Front().has_value());
+}
+
+TEST(GainController2SaturationProtectorBuffer, PushBack) {
+ SaturationProtectorBuffer b;
+ constexpr float kValue = 123.0f;
+ b.PushBack(kValue);
+ EXPECT_EQ(b.Size(), 1);
+ EXPECT_THAT(b.Front(), Optional(Eq(kValue)));
+}
+
+TEST(GainController2SaturationProtectorBuffer, Reset) {
+ SaturationProtectorBuffer b;
+ b.PushBack(123.0f);
+ b.Reset();
+ EXPECT_EQ(b.Size(), 0);
+ EXPECT_FALSE(b.Front().has_value());
+}
+
+// Checks that the front value does not change until the ring buffer gets full.
+TEST(GainController2SaturationProtectorBuffer, FrontUntilBufferIsFull) {
+ SaturationProtectorBuffer b;
+ constexpr float kValue = 123.0f;
+ b.PushBack(kValue);
+ for (int i = 1; i < b.Capacity(); ++i) {
+ SCOPED_TRACE(i);
+ EXPECT_THAT(b.Front(), Optional(Eq(kValue)));
+ b.PushBack(kValue + i);
+ }
+}
+
+// Checks that when the buffer is full it behaves as a shift register.
+TEST(GainController2SaturationProtectorBuffer, FrontIsDelayed) {
+ SaturationProtectorBuffer b;
+ // Fill the buffer.
+ for (int i = 0; i < b.Capacity(); ++i) {
+ b.PushBack(i);
+ }
+ // The ring buffer should now behave as a shift register with a delay equal to
+ // its capacity.
+ for (int i = b.Capacity(); i < 2 * b.Capacity() + 1; ++i) {
+ SCOPED_TRACE(i);
+ EXPECT_THAT(b.Front(), Optional(Eq(i - b.Capacity())));
+ b.PushBack(i);
+ }
+}
+
+} // namespace
+} // namespace webrtc
diff --git a/modules/audio_processing/agc2/saturation_protector_unittest.cc b/modules/audio_processing/agc2/saturation_protector_unittest.cc
index 2c5ee5b..dc16dc2 100644
--- a/modules/audio_processing/agc2/saturation_protector_unittest.cc
+++ b/modules/audio_processing/agc2/saturation_protector_unittest.cc
@@ -10,181 +10,166 @@
#include "modules/audio_processing/agc2/saturation_protector.h"
-#include <algorithm>
-
#include "modules/audio_processing/agc2/agc2_common.h"
#include "modules/audio_processing/logging/apm_data_dumper.h"
#include "rtc_base/gunit.h"
-#include "test/gmock.h"
namespace webrtc {
namespace {
-constexpr float kInitialMarginDb = 20.f;
+constexpr float kInitialHeadroomDb = 20.0f;
+constexpr float kNoExtraHeadroomDb = 0.0f;
+constexpr int kNoAdjacentSpeechFramesRequired = 1;
+constexpr float kMaxSpeechProbability = 1.0f;
-using saturation_protector_impl::RingBuffer;
-
-SaturationProtectorState CreateSaturationProtectorState() {
- SaturationProtectorState state;
- ResetSaturationProtectorState(kInitialMarginDb, state);
- return state;
-}
-
-// Updates `state` for `num_iterations` times with constant speech level and
-// peak powers and returns the maximum margin.
+// Calls `Analyze(speech_probability, peak_dbfs, speech_level_dbfs)`
+// `num_iterations` times on `saturation_protector` and returns the largest
+// headroom difference between two consecutive calls.
float RunOnConstantLevel(int num_iterations,
- float speech_peak_dbfs,
+ float speech_probability,
+ float peak_dbfs,
float speech_level_dbfs,
- SaturationProtectorState& state) {
- float last_margin = state.margin_db;
- float max_difference = 0.f;
+ SaturationProtector& saturation_protector) {
+ float last_headroom = saturation_protector.HeadroomDb();
+ float max_difference = 0.0f;
for (int i = 0; i < num_iterations; ++i) {
- UpdateSaturationProtectorState(speech_peak_dbfs, speech_level_dbfs, state);
- const float new_margin = state.margin_db;
+ saturation_protector.Analyze(speech_probability, peak_dbfs,
+ speech_level_dbfs);
+ const float new_headroom = saturation_protector.HeadroomDb();
max_difference =
- std::max(max_difference, std::abs(new_margin - last_margin));
- last_margin = new_margin;
+ std::max(max_difference, std::fabs(new_headroom - last_headroom));
+ last_headroom = new_headroom;
}
return max_difference;
}
-} // namespace
-
-TEST(AutomaticGainController2SaturationProtector, RingBufferInit) {
- RingBuffer b;
- EXPECT_EQ(b.Size(), 0);
- EXPECT_FALSE(b.Front().has_value());
-}
-
-TEST(AutomaticGainController2SaturationProtector, RingBufferPushBack) {
- RingBuffer b;
- constexpr float kValue = 123.f;
- b.PushBack(kValue);
- EXPECT_EQ(b.Size(), 1);
- ASSERT_TRUE(b.Front().has_value());
- EXPECT_EQ(b.Front().value(), kValue);
-}
-
-TEST(AutomaticGainController2SaturationProtector, RingBufferReset) {
- RingBuffer b;
- b.PushBack(123.f);
- b.Reset();
- EXPECT_EQ(b.Size(), 0);
- EXPECT_FALSE(b.Front().has_value());
-}
-
-// Checks that the front value does not change until the ring buffer gets full.
-TEST(AutomaticGainController2SaturationProtector,
- RingBufferFrontUntilBufferIsFull) {
- RingBuffer b;
- constexpr float kValue = 123.f;
- b.PushBack(kValue);
- for (int i = 1; i < b.Capacity(); ++i) {
- EXPECT_EQ(b.Front().value(), kValue);
- b.PushBack(kValue + i);
- }
-}
-
-// Checks that when the buffer is full it behaves as a shift register.
-TEST(AutomaticGainController2SaturationProtector,
- FullRingBufferFrontIsDelayed) {
- RingBuffer b;
- // Fill the buffer.
- for (int i = 0; i < b.Capacity(); ++i) {
- b.PushBack(i);
- }
- // The ring buffer should now behave as a shift register with a delay equal to
- // its capacity.
- for (int i = b.Capacity(); i < 2 * b.Capacity() + 1; ++i) {
- EXPECT_EQ(b.Front().value(), i - b.Capacity());
- b.PushBack(i);
- }
-}
-
-// Checks that a state after reset equals a state after construction.
-TEST(AutomaticGainController2SaturationProtector, ResetState) {
- SaturationProtectorState init_state;
- ResetSaturationProtectorState(kInitialMarginDb, init_state);
-
- SaturationProtectorState state;
- ResetSaturationProtectorState(kInitialMarginDb, state);
- RunOnConstantLevel(/*num_iterations=*/10, /*speech_level_dbfs=*/-20.f,
- /*speech_peak_dbfs=*/-10.f, state);
- ASSERT_NE(init_state, state); // Make sure that there are side-effects.
- ResetSaturationProtectorState(kInitialMarginDb, state);
-
- EXPECT_EQ(init_state, state);
+// Checks that the returned headroom value is correctly reset.
+TEST(GainController2SaturationProtector, Reset) {
+ ApmDataDumper apm_data_dumper(0);
+ auto saturation_protector = CreateSaturationProtector(
+ kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired,
+ &apm_data_dumper);
+ const float initial_headroom_db = saturation_protector->HeadroomDb();
+ RunOnConstantLevel(/*num_iterations=*/10, kMaxSpeechProbability,
+ /*peak_dbfs=*/0.0f,
+ /*speech_level_dbfs=*/-10.0f, *saturation_protector);
+ // Make sure that there are side-effects.
+ ASSERT_NE(initial_headroom_db, saturation_protector->HeadroomDb());
+ saturation_protector->Reset();
+ EXPECT_EQ(initial_headroom_db, saturation_protector->HeadroomDb());
}
// Checks that the estimate converges to the ratio between peaks and level
// estimator values after a while.
-TEST(AutomaticGainController2SaturationProtector,
- ProtectorEstimatesCrestRatio) {
+TEST(GainController2SaturationProtector, EstimatesCrestRatio) {
constexpr int kNumIterations = 2000;
- constexpr float kPeakLevel = -20.f;
- constexpr float kCrestFactor = kInitialMarginDb + 1.f;
- constexpr float kSpeechLevel = kPeakLevel - kCrestFactor;
- const float kMaxDifference = 0.5f * std::abs(kInitialMarginDb - kCrestFactor);
+ constexpr float kPeakLevelDbfs = -20.0f;
+ constexpr float kCrestFactorDb = kInitialHeadroomDb + 1.0f;
+ constexpr float kSpeechLevelDbfs = kPeakLevelDbfs - kCrestFactorDb;
+ const float kMaxDifferenceDb =
+ 0.5f * std::fabs(kInitialHeadroomDb - kCrestFactorDb);
- auto state = CreateSaturationProtectorState();
- RunOnConstantLevel(kNumIterations, kPeakLevel, kSpeechLevel, state);
-
- EXPECT_NEAR(state.margin_db, kCrestFactor, kMaxDifference);
+ ApmDataDumper apm_data_dumper(0);
+ auto saturation_protector = CreateSaturationProtector(
+ kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired,
+ &apm_data_dumper);
+ RunOnConstantLevel(kNumIterations, kMaxSpeechProbability, kPeakLevelDbfs,
+ kSpeechLevelDbfs, *saturation_protector);
+ EXPECT_NEAR(saturation_protector->HeadroomDb(), kCrestFactorDb,
+ kMaxDifferenceDb);
}
-// Checks that the margin does not change too quickly.
-TEST(AutomaticGainController2SaturationProtector, ChangeSlowly) {
+// Checks that the extra headroom is applied.
+TEST(GainController2SaturationProtector, ExtraHeadroomApplied) {
+ constexpr float kExtraHeadroomDb = 5.1234f;
+ constexpr int kNumIterations = 10;
+ constexpr float kPeakLevelDbfs = -20.0f;
+ constexpr float kSpeechLevelDbfs = kPeakLevelDbfs - 15.0f;
+
+ ApmDataDumper apm_data_dumper(0);
+
+ auto saturation_protector_no_extra = CreateSaturationProtector(
+ kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired,
+ &apm_data_dumper);
+ for (int i = 0; i < kNumIterations; ++i) {
+ saturation_protector_no_extra->Analyze(kMaxSpeechProbability,
+ kPeakLevelDbfs, kSpeechLevelDbfs);
+ }
+
+ auto saturation_protector_extra = CreateSaturationProtector(
+ kInitialHeadroomDb, kExtraHeadroomDb, kNoAdjacentSpeechFramesRequired,
+ &apm_data_dumper);
+ for (int i = 0; i < kNumIterations; ++i) {
+ saturation_protector_extra->Analyze(kMaxSpeechProbability, kPeakLevelDbfs,
+ kSpeechLevelDbfs);
+ }
+
+ EXPECT_EQ(saturation_protector_no_extra->HeadroomDb() + kExtraHeadroomDb,
+ saturation_protector_extra->HeadroomDb());
+}
+
+// Checks that the headroom does not change too quickly.
+TEST(GainController2SaturationProtector, ChangeSlowly) {
constexpr int kNumIterations = 1000;
- constexpr float kPeakLevel = -20.f;
- constexpr float kCrestFactor = kInitialMarginDb - 5.f;
- constexpr float kOtherCrestFactor = kInitialMarginDb;
- constexpr float kSpeechLevel = kPeakLevel - kCrestFactor;
- constexpr float kOtherSpeechLevel = kPeakLevel - kOtherCrestFactor;
+ constexpr float kPeakLevelDbfs = -20.f;
+ constexpr float kCrestFactorDb = kInitialHeadroomDb - 5.f;
+ constexpr float kOtherCrestFactorDb = kInitialHeadroomDb;
+ constexpr float kSpeechLevelDbfs = kPeakLevelDbfs - kCrestFactorDb;
+ constexpr float kOtherSpeechLevelDbfs = kPeakLevelDbfs - kOtherCrestFactorDb;
- auto state = CreateSaturationProtectorState();
- float max_difference =
- RunOnConstantLevel(kNumIterations, kPeakLevel, kSpeechLevel, state);
- max_difference = std::max(
- RunOnConstantLevel(kNumIterations, kPeakLevel, kOtherSpeechLevel, state),
- max_difference);
-
+ ApmDataDumper apm_data_dumper(0);
+ auto saturation_protector = CreateSaturationProtector(
+ kInitialHeadroomDb, kNoExtraHeadroomDb, kNoAdjacentSpeechFramesRequired,
+ &apm_data_dumper);
+ float max_difference_db =
+ RunOnConstantLevel(kNumIterations, kMaxSpeechProbability, kPeakLevelDbfs,
+ kSpeechLevelDbfs, *saturation_protector);
+ max_difference_db = std::max(
+ RunOnConstantLevel(kNumIterations, kMaxSpeechProbability, kPeakLevelDbfs,
+ kOtherSpeechLevelDbfs, *saturation_protector),
+ max_difference_db);
constexpr float kMaxChangeSpeedDbPerSecond = 0.5f; // 1 db / 2 seconds.
- EXPECT_LE(max_difference,
+ EXPECT_LE(max_difference_db,
kMaxChangeSpeedDbPerSecond / 1000 * kFrameDurationMs);
}
-// Checks that there is a delay between input change and margin adaptations.
-TEST(AutomaticGainController2SaturationProtector, AdaptToDelayedChanges) {
- constexpr int kDelayIterations = kFullBufferSizeMs / kFrameDurationMs;
- constexpr float kInitialSpeechLevelDbfs = -30.f;
- constexpr float kLaterSpeechLevelDbfs = -15.f;
+class SaturationProtectorParametrization
+ : public ::testing::TestWithParam<int> {
+ protected:
+ int adjacent_speech_frames_threshold() const { return GetParam(); }
+};
- auto state = CreateSaturationProtectorState();
- // First run on initial level.
- float max_difference = RunOnConstantLevel(
- kDelayIterations, kInitialSpeechLevelDbfs + kInitialMarginDb,
- kInitialSpeechLevelDbfs, state);
- // Then peak changes, but not RMS.
- max_difference =
- std::max(RunOnConstantLevel(kDelayIterations,
- kLaterSpeechLevelDbfs + kInitialMarginDb,
- kInitialSpeechLevelDbfs, state),
- max_difference);
- // Then both change.
- max_difference =
- std::max(RunOnConstantLevel(kDelayIterations,
- kLaterSpeechLevelDbfs + kInitialMarginDb,
- kLaterSpeechLevelDbfs, state),
- max_difference);
-
- // The saturation protector expects that the RMS changes roughly
- // 'kFullBufferSizeMs' after peaks change. This is to account for delay
- // introduced by the level estimator. Therefore, the input above is 'normal'
- // and 'expected', and shouldn't influence the margin by much.
- const float total_difference = std::abs(state.margin_db - kInitialMarginDb);
-
- EXPECT_LE(total_difference, 0.05f);
- EXPECT_LE(max_difference, 0.01f);
+TEST_P(SaturationProtectorParametrization, DoNotAdaptToShortSpeechSegments) {
+ ApmDataDumper apm_data_dumper(0);
+ auto saturation_protector = CreateSaturationProtector(
+ kInitialHeadroomDb, kNoExtraHeadroomDb,
+ adjacent_speech_frames_threshold(), &apm_data_dumper);
+ const float initial_headroom_db = saturation_protector->HeadroomDb();
+ RunOnConstantLevel(/*num_iterations=*/adjacent_speech_frames_threshold() - 1,
+ kMaxSpeechProbability,
+ /*peak_dbfs=*/0.0f,
+ /*speech_level_dbfs=*/-10.0f, *saturation_protector);
+ // No adaptation expected.
+ EXPECT_EQ(initial_headroom_db, saturation_protector->HeadroomDb());
}
+TEST_P(SaturationProtectorParametrization, AdaptToEnoughSpeechSegments) {
+ ApmDataDumper apm_data_dumper(0);
+ auto saturation_protector = CreateSaturationProtector(
+ kInitialHeadroomDb, kNoExtraHeadroomDb,
+ adjacent_speech_frames_threshold(), &apm_data_dumper);
+ const float initial_headroom_db = saturation_protector->HeadroomDb();
+ RunOnConstantLevel(/*num_iterations=*/adjacent_speech_frames_threshold() + 1,
+ kMaxSpeechProbability,
+ /*peak_dbfs=*/0.0f,
+ /*speech_level_dbfs=*/-10.0f, *saturation_protector);
+ // Adaptation expected.
+ EXPECT_NE(initial_headroom_db, saturation_protector->HeadroomDb());
+}
+
+INSTANTIATE_TEST_SUITE_P(GainController2,
+ SaturationProtectorParametrization,
+ ::testing::Values(2, 9, 17));
+
+} // namespace
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/vad_with_level.cc b/modules/audio_processing/agc2/vad_with_level.cc
index 597c09c..034f2b6 100644
--- a/modules/audio_processing/agc2/vad_with_level.cc
+++ b/modules/audio_processing/agc2/vad_with_level.cc
@@ -65,43 +65,23 @@
rnn_vad::RnnVad rnn_vad_;
};
-// Returns an updated version of `p_old` by using instant decay and the given
-// `attack` on a new VAD probability value `p_new`.
-float SmoothedVadProbability(float p_old, float p_new, float attack) {
- RTC_DCHECK_GT(attack, 0.0f);
- RTC_DCHECK_LE(attack, 1.0f);
- if (p_new < p_old || attack == 1.0f) {
- // Instant decay (or no smoothing).
- return p_new;
- } else {
- // Attack phase.
- return attack * p_new + (1.0f - attack) * p_old;
- }
-}
-
} // namespace
VadLevelAnalyzer::VadLevelAnalyzer()
- : VadLevelAnalyzer(kDefaultVadRnnResetPeriodMs,
- kDefaultSmoothedVadProbabilityAttack,
- GetAvailableCpuFeatures()) {}
+ : VadLevelAnalyzer(kDefaultVadRnnResetPeriodMs, GetAvailableCpuFeatures()) {
+}
VadLevelAnalyzer::VadLevelAnalyzer(int vad_reset_period_ms,
- float vad_probability_attack,
const AvailableCpuFeatures& cpu_features)
: VadLevelAnalyzer(vad_reset_period_ms,
- vad_probability_attack,
std::make_unique<Vad>(cpu_features)) {}
VadLevelAnalyzer::VadLevelAnalyzer(int vad_reset_period_ms,
- float vad_probability_attack,
std::unique_ptr<VoiceActivityDetector> vad)
: vad_(std::move(vad)),
vad_reset_period_frames_(
rtc::CheckedDivExact(vad_reset_period_ms, kFrameDurationMs)),
- vad_probability_attack_(vad_probability_attack),
- time_to_vad_reset_(vad_reset_period_frames_),
- vad_probability_(0.0f) {
+ time_to_vad_reset_(vad_reset_period_frames_) {
RTC_DCHECK(vad_);
RTC_DCHECK_GT(vad_reset_period_frames_, 1);
}
@@ -123,11 +103,7 @@
peak = std::max(std::fabs(x), peak);
rms += x * x;
}
- // Compute smoothed speech probability.
- vad_probability_ = SmoothedVadProbability(
- /*p_old=*/vad_probability_, /*p_new=*/vad_->ComputeProbability(frame),
- vad_probability_attack_);
- return {vad_probability_,
+ return {vad_->ComputeProbability(frame),
FloatS16ToDbfs(std::sqrt(rms / frame.samples_per_channel())),
FloatS16ToDbfs(peak)};
}
diff --git a/modules/audio_processing/agc2/vad_with_level.h b/modules/audio_processing/agc2/vad_with_level.h
index 386f162..7cd93d6 100644
--- a/modules/audio_processing/agc2/vad_with_level.h
+++ b/modules/audio_processing/agc2/vad_with_level.h
@@ -37,18 +37,15 @@
virtual float ComputeProbability(AudioFrameView<const float> frame) = 0;
};
- // Ctor. Uses the default VAD.
+ // Ctor. Uses the default VAD with the default settings.
VadLevelAnalyzer();
// Ctor. `vad_reset_period_ms` indicates the period in milliseconds to call
// `VadLevelAnalyzer::Reset()`; it must be equal to or greater than the
- // duration of two frames. `vad_probability_attack` is a number in (0,1] used
- // to smooth the speech probability (instant decay, slow attack).
+ // duration of two frames. Uses `cpu_features` to instantiate the default VAD.
VadLevelAnalyzer(int vad_reset_period_ms,
- float vad_probability_attack,
const AvailableCpuFeatures& cpu_features);
// Ctor. Uses a custom `vad`.
VadLevelAnalyzer(int vad_reset_period_ms,
- float vad_probability_attack,
std::unique_ptr<VoiceActivityDetector> vad);
VadLevelAnalyzer(const VadLevelAnalyzer&) = delete;
@@ -61,9 +58,7 @@
private:
std::unique_ptr<VoiceActivityDetector> vad_;
const int vad_reset_period_frames_;
- const float vad_probability_attack_;
int time_to_vad_reset_;
- float vad_probability_;
};
} // namespace webrtc
diff --git a/modules/audio_processing/agc2/vad_with_level_unittest.cc b/modules/audio_processing/agc2/vad_with_level_unittest.cc
index fd8265e..99b0136 100644
--- a/modules/audio_processing/agc2/vad_with_level_unittest.cc
+++ b/modules/audio_processing/agc2/vad_with_level_unittest.cc
@@ -29,9 +29,6 @@
constexpr int kNoVadPeriodicReset =
kFrameDurationMs * (std::numeric_limits<int>::max() / kFrameDurationMs);
-constexpr float kInstantAttack = 1.0f;
-constexpr float kSlowAttack = 0.1f;
-
constexpr int kSampleRateHz = 8000;
class MockVad : public VadLevelAnalyzer::VoiceActivityDetector {
@@ -48,7 +45,6 @@
// restart from the beginning.
std::unique_ptr<VadLevelAnalyzer> CreateVadLevelAnalyzerWithMockVad(
int vad_reset_period_ms,
- float vad_probability_attack,
const std::vector<float>& speech_probabilities,
int expected_vad_reset_calls = 0) {
auto vad = std::make_unique<MockVad>();
@@ -58,8 +54,8 @@
if (expected_vad_reset_calls >= 0) {
EXPECT_CALL(*vad, Reset).Times(expected_vad_reset_calls);
}
- return std::make_unique<VadLevelAnalyzer>(
- vad_reset_period_ms, vad_probability_attack, std::move(vad));
+ return std::make_unique<VadLevelAnalyzer>(vad_reset_period_ms,
+ std::move(vad));
}
// 10 ms mono frame.
@@ -75,7 +71,7 @@
const AudioFrameView<const float> view;
};
-TEST(AutomaticGainController2VadLevelAnalyzer, PeakLevelGreaterThanRmsLevel) {
+TEST(GainController2VadLevelAnalyzer, PeakLevelGreaterThanRmsLevel) {
// Handcrafted frame so that the average is lower than the peak value.
FrameWithView frame(1000.0f); // Constant frame.
frame.samples[10] = 2000.0f; // Except for one peak value.
@@ -88,14 +84,13 @@
EXPECT_LT(levels_and_vad_prob.rms_dbfs, levels_and_vad_prob.peak_dbfs);
}
-// Checks that the unprocessed and the smoothed speech probabilities match when
-// instant attack is used.
-TEST(AutomaticGainController2VadLevelAnalyzer, NoSpeechProbabilitySmoothing) {
+// Checks that the expected VAD probabilities are returned.
+TEST(GainController2VadLevelAnalyzer, NoSpeechProbabilitySmoothing) {
const std::vector<float> speech_probabilities{0.709f, 0.484f, 0.882f, 0.167f,
0.44f, 0.525f, 0.858f, 0.314f,
0.653f, 0.965f, 0.413f, 0.0f};
- auto analyzer = CreateVadLevelAnalyzerWithMockVad(
- kNoVadPeriodicReset, kInstantAttack, speech_probabilities);
+ auto analyzer = CreateVadLevelAnalyzerWithMockVad(kNoVadPeriodicReset,
+ speech_probabilities);
FrameWithView frame;
for (int i = 0; rtc::SafeLt(i, speech_probabilities.size()); ++i) {
SCOPED_TRACE(i);
@@ -104,45 +99,11 @@
}
}
-// Checks that the smoothed speech probability does not instantly converge to
-// the unprocessed one when slow attack is used.
-TEST(AutomaticGainController2VadLevelAnalyzer,
- SlowAttackSpeechProbabilitySmoothing) {
- const std::vector<float> speech_probabilities{0.0f, 0.0f, 1.0f,
- 1.0f, 1.0f, 1.0f};
- auto analyzer = CreateVadLevelAnalyzerWithMockVad(
- kNoVadPeriodicReset, kSlowAttack, speech_probabilities);
- FrameWithView frame;
- float prev_probability = 0.0f;
- for (int i = 0; rtc::SafeLt(i, speech_probabilities.size()); ++i) {
- SCOPED_TRACE(i);
- const float smoothed_probability =
- analyzer->AnalyzeFrame(frame.view).speech_probability;
- EXPECT_LT(smoothed_probability, 1.0f); // Not enough time to reach 1.
- EXPECT_LE(prev_probability, smoothed_probability); // Converge towards 1.
- prev_probability = smoothed_probability;
- }
-}
-
-// Checks that the smoothed speech probability instantly decays to the
-// unprocessed one when slow attack is used.
-TEST(AutomaticGainController2VadLevelAnalyzer, SpeechProbabilityInstantDecay) {
- const std::vector<float> speech_probabilities{1.0f, 1.0f, 1.0f,
- 1.0f, 1.0f, 0.0f};
- auto analyzer = CreateVadLevelAnalyzerWithMockVad(
- kNoVadPeriodicReset, kSlowAttack, speech_probabilities);
- FrameWithView frame;
- for (int i = 0; rtc::SafeLt(i, speech_probabilities.size() - 1); ++i) {
- analyzer->AnalyzeFrame(frame.view);
- }
- EXPECT_EQ(0.0f, analyzer->AnalyzeFrame(frame.view).speech_probability);
-}
-
// Checks that the VAD is not periodically reset.
-TEST(AutomaticGainController2VadLevelAnalyzer, VadNoPeriodicReset) {
+TEST(GainController2VadLevelAnalyzer, VadNoPeriodicReset) {
constexpr int kNumFrames = 19;
auto analyzer = CreateVadLevelAnalyzerWithMockVad(
- kNoVadPeriodicReset, kSlowAttack, /*speech_probabilities=*/{1.0f},
+ kNoVadPeriodicReset, /*speech_probabilities=*/{1.0f},
/*expected_vad_reset_calls=*/0);
FrameWithView frame;
for (int i = 0; i < kNumFrames; ++i) {
@@ -161,7 +122,7 @@
TEST_P(VadPeriodResetParametrization, VadPeriodicReset) {
auto analyzer = CreateVadLevelAnalyzerWithMockVad(
/*vad_reset_period_ms=*/vad_reset_period_frames() * kFrameDurationMs,
- kSlowAttack, /*speech_probabilities=*/{1.0f},
+ /*speech_probabilities=*/{1.0f},
/*expected_vad_reset_calls=*/num_frames() / vad_reset_period_frames());
FrameWithView frame;
for (int i = 0; i < num_frames(); ++i) {
@@ -169,7 +130,7 @@
}
}
-INSTANTIATE_TEST_SUITE_P(AutomaticGainController2VadLevelAnalyzer,
+INSTANTIATE_TEST_SUITE_P(GainController2VadLevelAnalyzer,
VadPeriodResetParametrization,
::testing::Combine(::testing::Values(1, 19, 123),
::testing::Values(2, 5, 20, 53)));
diff --git a/modules/audio_processing/gain_controller2.cc b/modules/audio_processing/gain_controller2.cc
index 6c5e24e..9e3e8e7 100644
--- a/modules/audio_processing/gain_controller2.cc
+++ b/modules/audio_processing/gain_controller2.cc
@@ -73,7 +73,7 @@
void GainController2::NotifyAnalogLevel(int level) {
if (analog_level_ != level && adaptive_agc_) {
- adaptive_agc_->Reset();
+ adaptive_agc_->HandleInputGainChange();
}
analog_level_ = level;
}
diff --git a/modules/audio_processing/gain_controller2_unittest.cc b/modules/audio_processing/gain_controller2_unittest.cc
index 274c821..815d58e 100644
--- a/modules/audio_processing/gain_controller2_unittest.cc
+++ b/modules/audio_processing/gain_controller2_unittest.cc
@@ -11,6 +11,7 @@
#include "modules/audio_processing/gain_controller2.h"
#include <algorithm>
+#include <cmath>
#include <memory>
#include "api/array_view.h"
@@ -68,7 +69,8 @@
return agc2;
}
-float GainAfterProcessingFile(GainController2* gain_controller) {
+float GainDbAfterProcessingFile(GainController2& gain_controller,
+ int max_duration_ms) {
// Set up an AudioBuffer to be filled from the speech file.
constexpr size_t kStereo = 2u;
const StreamConfig capture_config(AudioProcessing::kSampleRate48kHz, kStereo,
@@ -82,24 +84,29 @@
std::vector<float> capture_input(capture_config.num_frames() *
capture_config.num_channels());
- // The file should contain at least this many frames. Every iteration, we put
- // a frame through the gain controller.
- const int kNumFramesToProcess = 100;
- for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) {
+ // Process the input file which must be long enough to cover
+ // `max_duration_ms`.
+ RTC_DCHECK_GT(max_duration_ms, 0);
+ const int num_frames = rtc::CheckedDivExact(max_duration_ms, 10);
+ for (int i = 0; i < num_frames; ++i) {
ReadFloatSamplesFromStereoFile(capture_config.num_frames(),
capture_config.num_channels(), &capture_file,
capture_input);
-
test::CopyVectorToAudioBuffer(capture_config, capture_input, &ab);
- gain_controller->Process(&ab);
+ gain_controller.Process(&ab);
}
- // Send in a last frame with values constant 1 (It's low enough to detect high
- // gain, and for ease of computation). The applied gain is the result.
+ // Send in a last frame with minimum dBFS level.
constexpr float sample_value = 1.f;
SetAudioBufferSamples(sample_value, &ab);
- gain_controller->Process(&ab);
- return ab.channels()[0][0];
+ gain_controller.Process(&ab);
+ // Measure the RMS level after processing.
+ float rms = 0.0f;
+ for (size_t i = 0; i < capture_config.num_frames(); ++i) {
+ rms += ab.channels()[0][i] * ab.channels()[0][i];
+ }
+ // Return the applied gain in dB.
+ return 20.0f * std::log10(std::sqrt(rms / capture_config.num_frames()));
}
} // namespace
@@ -324,34 +331,20 @@
48000,
true)));
-TEST(GainController2, UsageSaturationMargin) {
+// Checks that the gain applied at the end of a PCM samples file is close to the
+// expected value.
+TEST(GainController2, CheckGainAdaptiveDigital) {
+ constexpr float kExpectedGainDb = 4.3f;
+ constexpr float kToleranceDb = 0.5f;
GainController2 gain_controller2;
gain_controller2.Initialize(AudioProcessing::kSampleRate48kHz);
-
AudioProcessing::Config::GainController2 config;
- // Check that samples are not amplified as much when extra margin is
- // high. They should not be amplified at all, but only after convergence. GC2
- // starts with a gain, and it takes time until it's down to 0 dB.
config.fixed_digital.gain_db = 0.f;
config.adaptive_digital.enabled = true;
- config.adaptive_digital.extra_saturation_margin_db = 50.f;
gain_controller2.ApplyConfig(config);
-
- EXPECT_LT(GainAfterProcessingFile(&gain_controller2), 2.f);
-}
-
-TEST(GainController2, UsageNoSaturationMargin) {
- GainController2 gain_controller2;
- gain_controller2.Initialize(AudioProcessing::kSampleRate48kHz);
-
- AudioProcessing::Config::GainController2 config;
- // Check that some gain is applied if there is no margin.
- config.fixed_digital.gain_db = 0.f;
- config.adaptive_digital.enabled = true;
- config.adaptive_digital.extra_saturation_margin_db = 0.f;
- gain_controller2.ApplyConfig(config);
-
- EXPECT_GT(GainAfterProcessingFile(&gain_controller2), 1.9f);
+ EXPECT_NEAR(
+ GainDbAfterProcessingFile(gain_controller2, /*max_duration_ms=*/2000),
+ kExpectedGainDb, kToleranceDb);
}
} // namespace test
diff --git a/modules/audio_processing/include/audio_processing.cc b/modules/audio_processing/include/audio_processing.cc
index 790b1a7..fa45230 100644
--- a/modules/audio_processing/include/audio_processing.cc
+++ b/modules/audio_processing/include/audio_processing.cc
@@ -46,17 +46,6 @@
RTC_CHECK_NOTREACHED();
}
-std::string GainController2LevelEstimatorToString(
- const Agc2Config::LevelEstimator& level) {
- switch (level) {
- case Agc2Config::LevelEstimator::kRms:
- return "Rms";
- case Agc2Config::LevelEstimator::kPeak:
- return "Peak";
- }
- RTC_CHECK_NOTREACHED();
-}
-
std::string GainController2NoiseEstimatorToString(
const Agc2Config::NoiseEstimator& type) {
switch (type) {
@@ -174,20 +163,10 @@
<< gain_controller2.adaptive_digital.enabled << ", noise_estimator: "
<< GainController2NoiseEstimatorToString(
gain_controller2.adaptive_digital.noise_estimator)
- << ", level_estimator: { vad_probability_attack: "
- << gain_controller2.adaptive_digital.vad_probability_attack << ", type: "
- << GainController2LevelEstimatorToString(
- gain_controller2.adaptive_digital.level_estimator)
+ << ", vad_reset_period_ms: "
+ << gain_controller2.adaptive_digital.vad_reset_period_ms
<< ", adjacent_speech_frames_threshold: "
- << gain_controller2.adaptive_digital
- .level_estimator_adjacent_speech_frames_threshold
- << ", initial_saturation_margin_db: "
- << gain_controller2.adaptive_digital.initial_saturation_margin_db
- << ", extra_saturation_margin_db: "
- << gain_controller2.adaptive_digital.extra_saturation_margin_db
- << " }, gain_applier: { adjacent_speech_frames_threshold: "
- << gain_controller2.adaptive_digital
- .gain_applier_adjacent_speech_frames_threshold
+ << gain_controller2.adaptive_digital.adjacent_speech_frames_threshold
<< ", max_gain_change_db_per_second: "
<< gain_controller2.adaptive_digital.max_gain_change_db_per_second
<< ", max_output_noise_level_dbfs: "
@@ -195,7 +174,7 @@
<< ", sse2_allowed: " << gain_controller2.adaptive_digital.sse2_allowed
<< ", avx2_allowed: " << gain_controller2.adaptive_digital.avx2_allowed
<< ", neon_allowed: " << gain_controller2.adaptive_digital.neon_allowed
- << " }}}, residual_echo_detector: { enabled: "
+ << "}}, residual_echo_detector: { enabled: "
<< residual_echo_detector.enabled
<< " }, level_estimation: { enabled: " << level_estimation.enabled
<< " }}";
diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h
index 781b17e..01bb7c3 100644
--- a/modules/audio_processing/include/audio_processing.h
+++ b/modules/audio_processing/include/audio_processing.h
@@ -349,6 +349,7 @@
return !(*this == rhs);
}
+ // TODO(crbug.com/webrtc/7494): Remove `LevelEstimator`.
enum LevelEstimator { kRms, kPeak };
enum NoiseEstimator { kStationaryNoise, kNoiseFloor };
bool enabled = false;
@@ -359,19 +360,20 @@
bool enabled = false;
NoiseEstimator noise_estimator = kNoiseFloor;
int vad_reset_period_ms = 1500;
- float vad_probability_attack = 0.9f;
- LevelEstimator level_estimator = kRms;
- int level_estimator_adjacent_speech_frames_threshold = 11;
- // TODO(crbug.com/webrtc/7494): Remove `use_saturation_protector`.
- bool use_saturation_protector = true;
- float initial_saturation_margin_db = 20.0f;
- float extra_saturation_margin_db = 5.0f;
- int gain_applier_adjacent_speech_frames_threshold = 11;
+ int adjacent_speech_frames_threshold = 12;
float max_gain_change_db_per_second = 3.0f;
- float max_output_noise_level_dbfs = -55.0f;
+ float max_output_noise_level_dbfs = -50.0f;
bool sse2_allowed = true;
bool avx2_allowed = true;
bool neon_allowed = true;
+ // TODO(crbug.com/webrtc/7494): Remove deprecated settings below.
+ float vad_probability_attack = 1.0f;
+ LevelEstimator level_estimator = kRms;
+ int level_estimator_adjacent_speech_frames_threshold = 12;
+ bool use_saturation_protector = true;
+ float initial_saturation_margin_db = 25.0f;
+ float extra_saturation_margin_db = 5.0f;
+ int gain_applier_adjacent_speech_frames_threshold = 12;
} adaptive_digital;
} gain_controller2;