modules/audio_processing/agc2/adaptive_mode_level_estimator.cc - src - Git at Google

 /*
  *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include "modules/audio_processing/agc2/adaptive_mode_level_estimator.h"

 #include "modules/audio_processing/agc2/agc2_common.h"
 #include "modules/audio_processing/logging/apm_data_dumper.h"
 #include "rtc_base/checks.h"
 #include "rtc_base/logging.h"
 #include "rtc_base/numerics/safe_minmax.h"

 namespace webrtc {
 namespace {

 // Shorthand for the `LevelEstimator` type nested in the `GainController2`
 // section of the `AudioProcessing` config.
 // NOTE(review): this alias does not appear to be referenced in the visible
 // part of this file - confirm whether it is still needed.
 using LevelEstimatorType =
     AudioProcessing::Config::GainController2::LevelEstimator;

 // Restricts `level_estimate_dbfs` to the interval [-90, 30] using
 // `rtc::SafeClamp` and returns the restricted value.
 float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
   constexpr float kLowerBoundDbfs = -90.f;
   constexpr float kUpperBoundDbfs = 30.f;
   return rtc::SafeClamp<float>(level_estimate_dbfs, kLowerBoundDbfs,
                                kUpperBoundDbfs);
 }

 }  // namespace

 // Two states are equal when the remaining time to confidence and both terms
 // of the level ratio (numerator and denominator) compare equal.
 bool AdaptiveModeLevelEstimator::LevelEstimatorState::operator==(
     const AdaptiveModeLevelEstimator::LevelEstimatorState& b) const {
   if (time_to_confidence_ms != b.time_to_confidence_ms) {
     return false;
   }
   if (level_dbfs.numerator != b.level_dbfs.numerator) {
     return false;
   }
   return level_dbfs.denominator == b.level_dbfs.denominator;
 }

 // Returns `numerator` divided by `denominator`. A zero denominator is a
 // precondition violation flagged by `RTC_DCHECK_NE`.
 float AdaptiveModeLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
   RTC_DCHECK_NE(denominator, 0.f);
   const float quotient = numerator / denominator;
   return quotient;
 }

 // Convenience constructor: delegates to the two-argument constructor, passing
 // `kDefaultLevelEstimatorAdjacentSpeechFramesThreshold` as the adjacent speech
 // frames threshold.
 AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
     ApmDataDumper* apm_data_dumper)
     : AdaptiveModeLevelEstimator(
           apm_data_dumper,
           kDefaultLevelEstimatorAdjacentSpeechFramesThreshold) {}

 // Constructs the estimator.
 // `apm_data_dumper` is stored and later used by `DumpDebugData()`; it must be
 // non-null (RTC_DCHECK below).
 // `adjacent_speech_frames_threshold` is the number of consecutive speech
 // frames `Update()` requires before it refreshes the level estimate; it must
 // be at least 1 (RTC_DCHECK below).
 AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
     ApmDataDumper* apm_data_dumper,
     int adjacent_speech_frames_threshold)
     : apm_data_dumper_(apm_data_dumper),
       adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
       level_dbfs_(ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs)) {
   RTC_DCHECK(apm_data_dumper_);
   RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
   // Initializes both estimator states and the speech frame counter. Note that
   // `Reset()` also re-assigns `level_dbfs_` to the same clamped initial value
   // used in the initializer list above.
   Reset();
 }

 // Feeds the VAD/level analysis of one frame into the estimator.
 //
 // A frame whose speech probability is not below `kVadConfidenceThreshold`
 // counts as speech: it extends the current run of adjacent speech frames and
 // is folded into the preliminary weighted level average. Any other frame ends
 // the run and, when more than one adjacent speech frame is required, either
 // confirms the preliminary state (run long enough) or rolls it back to the
 // last reliable state (run too short). Debug data is dumped on every call.
 void AdaptiveModeLevelEstimator::Update(
     const VadLevelAnalyzer::Result& vad_level) {
   // Input range checks.
   RTC_DCHECK_GT(vad_level.rms_dbfs, -150.f);
   RTC_DCHECK_LT(vad_level.rms_dbfs, 50.f);
   RTC_DCHECK_GT(vad_level.peak_dbfs, -150.f);
   RTC_DCHECK_LT(vad_level.peak_dbfs, 50.f);
   RTC_DCHECK_GE(vad_level.speech_probability, 0.f);
   RTC_DCHECK_LE(vad_level.speech_probability, 1.f);

   const bool is_speech_frame =
       !(vad_level.speech_probability < kVadConfidenceThreshold);
   if (is_speech_frame) {
     ++num_adjacent_speech_frames_;

     // Count down the time left before the preliminary state is confident.
     // Once it has reached zero, past contributions start to leak.
     RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
     const bool confidence_reached =
         preliminary_state_.time_to_confidence_ms == 0;
     if (!confidence_reached) {
       preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
     }

     // Accumulate the frame level into the running weighted average; the
     // speech probability of the frame is its weight.
     RTC_DCHECK_GT(vad_level.speech_probability, 0.f);
     const float decay = confidence_reached ? kLevelEstimatorLeakFactor : 1.f;
     auto& weighted_level = preliminary_state_.level_dbfs;
     weighted_level.numerator =
         weighted_level.numerator * decay +
         vad_level.rms_dbfs * vad_level.speech_probability;
     weighted_level.denominator =
         weighted_level.denominator * decay +
         vad_level.speech_probability;

     const float estimate_dbfs = weighted_level.GetRatio();

     // Only publish the estimate once the speech run is long enough for the
     // preliminary state to be trusted.
     if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
       level_dbfs_ = ClampLevelEstimateDbfs(estimate_dbfs);
     }
   } else {
     // Non-speech frame: the current speech run (if any) ends here. The
     // confirm/discard decision is only needed when two or more adjacent
     // speech frames are required to update the state.
     const bool needs_speech_run = adjacent_speech_frames_threshold_ > 1;
     const bool run_long_enough =
         num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_;
     if (needs_speech_run && run_long_enough) {
       // The run was long enough: promote the preliminary state.
       reliable_state_ = preliminary_state_;
     } else if (needs_speech_run && num_adjacent_speech_frames_ > 0) {
       // The run was too short: discard the preliminary updates.
       preliminary_state_ = reliable_state_;
     }
     num_adjacent_speech_frames_ = 0;
   }
   DumpDebugData();
 }

 // Returns true when the level estimate is backed by a state whose time to
 // confidence has reached zero.
 bool AdaptiveModeLevelEstimator::IsConfident() const {
   const bool preliminary_is_confident =
       preliminary_state_.time_to_confidence_ms == 0;
   if (adjacent_speech_frames_threshold_ == 1) {
     // With a single-frame threshold `reliable_state_` is not used, so only
     // the preliminary state matters.
     return preliminary_is_confident;
   }
   // Once confident, it remains confident.
   RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
              preliminary_state_.time_to_confidence_ms == 0);
   if (reliable_state_.time_to_confidence_ms == 0) {
     return true;
   }
   // `reliable_state_` is not confident yet. During the first long enough
   // speech sequence the preliminary state is the one in use, so rely on it.
   const bool in_long_speech_run =
       num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_;
   return in_long_speech_run && preliminary_is_confident;
 }

 // Puts the estimator back into its initial configuration: no adjacent speech
 // frames observed, the clamped initial level estimate, and both the reliable
 // and the preliminary state re-initialized.
 void AdaptiveModeLevelEstimator::Reset() {
   num_adjacent_speech_frames_ = 0;
   level_dbfs_ = ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs);
   ResetLevelEstimatorState(reliable_state_);
   ResetLevelEstimatorState(preliminary_state_);
 }

 // Re-initializes `state` in place: the level ratio becomes
 // `kInitialSpeechLevelEstimateDbfs` / 1 and the time to confidence is set to
 // `kLevelEstimatorTimeToConfidenceMs`.
 void AdaptiveModeLevelEstimator::ResetLevelEstimatorState(
     LevelEstimatorState& state) const {
   state.level_dbfs.denominator = 1.0f;
   state.level_dbfs.numerator = kInitialSpeechLevelEstimateDbfs;
   state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
 }

 void AdaptiveModeLevelEstimator::DumpDebugData() const {
   apm_data_dumper_->DumpRaw(
       "agc2_adaptive_level_estimator_num_adjacent_speech_frames",
       num_adjacent_speech_frames_);
   apm_data_dumper_->DumpRaw(
       "agc2_adaptive_level_estimator_preliminary_level_estimate_num",
       preliminary_state_.level_dbfs.numerator);
   apm_data_dumper_->DumpRaw(
       "agc2_adaptive_level_estimator_preliminary_level_estimate_den",
       preliminary_state_.level_dbfs.denominator);
   apm_data_dumper_->DumpRaw(
       "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
       preliminary_state_.time_to_confidence_ms);
   apm_data_dumper_->DumpRaw(
       "agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
       reliable_state_.time_to_confidence_ms);
 }

 }  // namespace webrtc
	/*
	* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license
	* that can be found in the LICENSE file in the root of the source
	* tree. An additional intellectual property rights grant can be found
	* in the file PATENTS. All contributing project authors may
	* be found in the AUTHORS file in the root of the source tree.
	*/

	#include "modules/audio_processing/agc2/adaptive_mode_level_estimator.h"

	#include "modules/audio_processing/agc2/agc2_common.h"
	#include "modules/audio_processing/logging/apm_data_dumper.h"
	#include "rtc_base/checks.h"
	#include "rtc_base/logging.h"
	#include "rtc_base/numerics/safe_minmax.h"

	namespace webrtc {
	namespace {

	// Shorthand for the `LevelEstimator` type nested in the `GainController2`
	// section of the `AudioProcessing` config.
	// NOTE(review): this alias does not appear to be referenced in the visible
	// part of this file - confirm whether it is still needed.
	using LevelEstimatorType =
	AudioProcessing::Config::GainController2::LevelEstimator;

	// Restricts `level_estimate_dbfs` to the interval [-90, 30] using
	// `rtc::SafeClamp` and returns the restricted value.
	float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
	  constexpr float kLowerBoundDbfs = -90.f;
	  constexpr float kUpperBoundDbfs = 30.f;
	  return rtc::SafeClamp<float>(level_estimate_dbfs, kLowerBoundDbfs,
	                               kUpperBoundDbfs);
	}

	} // namespace

	// Two states are equal when the remaining time to confidence and both terms
	// of the level ratio (numerator and denominator) compare equal.
	bool AdaptiveModeLevelEstimator::LevelEstimatorState::operator==(
	    const AdaptiveModeLevelEstimator::LevelEstimatorState& b) const {
	  if (time_to_confidence_ms != b.time_to_confidence_ms) {
	    return false;
	  }
	  if (level_dbfs.numerator != b.level_dbfs.numerator) {
	    return false;
	  }
	  return level_dbfs.denominator == b.level_dbfs.denominator;
	}

	// Returns `numerator` divided by `denominator`. A zero denominator is a
	// precondition violation flagged by `RTC_DCHECK_NE`.
	float AdaptiveModeLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
	  RTC_DCHECK_NE(denominator, 0.f);
	  const float quotient = numerator / denominator;
	  return quotient;
	}

	// Convenience constructor: delegates to the two-argument constructor, passing
	// `kDefaultLevelEstimatorAdjacentSpeechFramesThreshold` as the adjacent speech
	// frames threshold.
	AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
	ApmDataDumper* apm_data_dumper)
	: AdaptiveModeLevelEstimator(
	apm_data_dumper,
	kDefaultLevelEstimatorAdjacentSpeechFramesThreshold) {}

	// Constructs the estimator.
	// `apm_data_dumper` is stored and later used by `DumpDebugData()`; it must be
	// non-null (RTC_DCHECK below).
	// `adjacent_speech_frames_threshold` is the number of consecutive speech
	// frames `Update()` requires before it refreshes the level estimate; it must
	// be at least 1 (RTC_DCHECK below).
	AdaptiveModeLevelEstimator::AdaptiveModeLevelEstimator(
	ApmDataDumper* apm_data_dumper,
	int adjacent_speech_frames_threshold)
	: apm_data_dumper_(apm_data_dumper),
	adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
	level_dbfs_(ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs)) {
	RTC_DCHECK(apm_data_dumper_);
	RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
	// Initializes both estimator states and the speech frame counter. Note that
	// `Reset()` also re-assigns `level_dbfs_` to the same clamped initial value
	// used in the initializer list above.
	Reset();
	}

	// Feeds the VAD/level analysis of one frame into the estimator.
	//
	// A frame whose speech probability is not below `kVadConfidenceThreshold`
	// counts as speech: it extends the current run of adjacent speech frames and
	// is folded into the preliminary weighted level average. Any other frame ends
	// the run and, when more than one adjacent speech frame is required, either
	// confirms the preliminary state (run long enough) or rolls it back to the
	// last reliable state (run too short). Debug data is dumped on every call.
	void AdaptiveModeLevelEstimator::Update(
	    const VadLevelAnalyzer::Result& vad_level) {
	  // Input range checks.
	  RTC_DCHECK_GT(vad_level.rms_dbfs, -150.f);
	  RTC_DCHECK_LT(vad_level.rms_dbfs, 50.f);
	  RTC_DCHECK_GT(vad_level.peak_dbfs, -150.f);
	  RTC_DCHECK_LT(vad_level.peak_dbfs, 50.f);
	  RTC_DCHECK_GE(vad_level.speech_probability, 0.f);
	  RTC_DCHECK_LE(vad_level.speech_probability, 1.f);

	  const bool is_speech_frame =
	      !(vad_level.speech_probability < kVadConfidenceThreshold);
	  if (is_speech_frame) {
	    ++num_adjacent_speech_frames_;

	    // Count down the time left before the preliminary state is confident.
	    // Once it has reached zero, past contributions start to leak.
	    RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
	    const bool confidence_reached =
	        preliminary_state_.time_to_confidence_ms == 0;
	    if (!confidence_reached) {
	      preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
	    }

	    // Accumulate the frame level into the running weighted average; the
	    // speech probability of the frame is its weight.
	    RTC_DCHECK_GT(vad_level.speech_probability, 0.f);
	    const float decay = confidence_reached ? kLevelEstimatorLeakFactor : 1.f;
	    auto& weighted_level = preliminary_state_.level_dbfs;
	    weighted_level.numerator =
	        weighted_level.numerator * decay +
	        vad_level.rms_dbfs * vad_level.speech_probability;
	    weighted_level.denominator =
	        weighted_level.denominator * decay +
	        vad_level.speech_probability;

	    const float estimate_dbfs = weighted_level.GetRatio();

	    // Only publish the estimate once the speech run is long enough for the
	    // preliminary state to be trusted.
	    if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
	      level_dbfs_ = ClampLevelEstimateDbfs(estimate_dbfs);
	    }
	  } else {
	    // Non-speech frame: the current speech run (if any) ends here. The
	    // confirm/discard decision is only needed when two or more adjacent
	    // speech frames are required to update the state.
	    const bool needs_speech_run = adjacent_speech_frames_threshold_ > 1;
	    const bool run_long_enough =
	        num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_;
	    if (needs_speech_run && run_long_enough) {
	      // The run was long enough: promote the preliminary state.
	      reliable_state_ = preliminary_state_;
	    } else if (needs_speech_run && num_adjacent_speech_frames_ > 0) {
	      // The run was too short: discard the preliminary updates.
	      preliminary_state_ = reliable_state_;
	    }
	    num_adjacent_speech_frames_ = 0;
	  }
	  DumpDebugData();
	}

	// Returns true when the level estimate is backed by a state whose time to
	// confidence has reached zero.
	//
	// With a threshold of one adjacent speech frame only `preliminary_state_` is
	// consulted. Otherwise the estimator is confident if `reliable_state_` is
	// confident, or if the current speech sequence is long enough and
	// `preliminary_state_` is confident.
	bool AdaptiveModeLevelEstimator::IsConfident() const {
	  if (adjacent_speech_frames_threshold_ == 1) {
	    // Ignore `reliable_state_` when a single frame is enough to update the
	    // level estimate (because it is not used).
	    return preliminary_state_.time_to_confidence_ms == 0;
	  }
	  // Once confident, it remains confident.
	  RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
	             preliminary_state_.time_to_confidence_ms == 0);
	  // During the first long enough speech sequence, `reliable_state_` must be
	  // ignored since `preliminary_state_` is used.
	  return reliable_state_.time_to_confidence_ms == 0 ||
	         (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ &&
	          preliminary_state_.time_to_confidence_ms == 0);
	}

	// Puts the estimator back into its initial configuration: no adjacent speech
	// frames observed, the clamped initial level estimate, and both the reliable
	// and the preliminary state re-initialized.
	void AdaptiveModeLevelEstimator::Reset() {
	  num_adjacent_speech_frames_ = 0;
	  level_dbfs_ = ClampLevelEstimateDbfs(kInitialSpeechLevelEstimateDbfs);
	  ResetLevelEstimatorState(reliable_state_);
	  ResetLevelEstimatorState(preliminary_state_);
	}

	// Re-initializes `state` in place: the level ratio becomes
	// `kInitialSpeechLevelEstimateDbfs` / 1 and the time to confidence is set to
	// `kLevelEstimatorTimeToConfidenceMs`.
	void AdaptiveModeLevelEstimator::ResetLevelEstimatorState(
	    LevelEstimatorState& state) const {
	  state.level_dbfs.denominator = 1.0f;
	  state.level_dbfs.numerator = kInitialSpeechLevelEstimateDbfs;
	  state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
	}

	void AdaptiveModeLevelEstimator::DumpDebugData() const {
	apm_data_dumper_->DumpRaw(
	"agc2_adaptive_level_estimator_num_adjacent_speech_frames",
	num_adjacent_speech_frames_);
	apm_data_dumper_->DumpRaw(
	"agc2_adaptive_level_estimator_preliminary_level_estimate_num",
	preliminary_state_.level_dbfs.numerator);
	apm_data_dumper_->DumpRaw(
	"agc2_adaptive_level_estimator_preliminary_level_estimate_den",
	preliminary_state_.level_dbfs.denominator);
	apm_data_dumper_->DumpRaw(
	"agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
	preliminary_state_.time_to_confidence_ms);
	apm_data_dumper_->DumpRaw(
	"agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
	reliable_state_.time_to_confidence_ms);
	}

	} // namespace webrtc