modules/audio_processing/vad/voice_activity_detector.cc - src.git - Git at Google

 /*
  *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include "modules/audio_processing/vad/voice_activity_detector.h"

 #include <algorithm>

 #include "rtc_base/checks.h"

 namespace webrtc {
 namespace {

 // Largest chunk size, in samples, that ProcessChunk() accepts (DCHECKed
 // against its |length| argument).
 constexpr size_t kMaxLength = 320;
 // Channel count passed to the resampler when it is (re)configured.
 constexpr size_t kNumChannels = 1;

 // Value |last_voice_probability_| holds before any chunk has been processed.
 constexpr double kDefaultVoiceValue = 1.0;
 // Starting probability written for every frame of a non-silent chunk before
 // the VADs are run on it.
 constexpr double kNeutralProbability = 0.5;
 // Probability written for every frame of a chunk flagged as silence.
 constexpr double kLowProbability = 0.01;

 }  // namespace

 // Builds a detector whose last reported voice probability starts at
 // kDefaultVoiceValue and whose standalone VAD comes from
 // StandaloneVad::Create().
 VoiceActivityDetector::VoiceActivityDetector()
     : last_voice_probability_(kDefaultVoiceValue),
       standalone_vad_(StandaloneVad::Create()) {}

 // Compiler-generated destruction, defined out of line in this file.
 // NOTE(review): presumably kept out of the header so member types only need
 // to be complete here - confirm against voice_activity_detector.h.
 VoiceActivityDetector::~VoiceActivityDetector() = default;

 // Processes one chunk of audio and refreshes the per-chunk results.
 //
 // |audio| points to |length| samples captured at |sample_rate_hz|; |length|
 // must equal |sample_rate_hz| / 100 and must not exceed kMaxLength (both are
 // DCHECKed). Input that is not already at kSampleRateHz is resampled first.
 //
 // Because ISAC has a different chunk length, |chunkwise_voice_probabilities_|
 // and |chunkwise_rms_| are only filled when feature extraction reports new
 // frames. Otherwise they are resized to zero, i.e. cleared.
 // |last_voice_probability_| is only updated when at least one frame was
 // produced.
 void VoiceActivityDetector::ProcessChunk(const int16_t* audio,
                                          size_t length,
                                          int sample_rate_hz) {
   RTC_DCHECK_EQ(length, sample_rate_hz / 100);
   RTC_DCHECK_LE(length, kMaxLength);
   // Resample to the required rate.
   const int16_t* resampled_ptr = audio;
   if (sample_rate_hz != kSampleRateHz) {
     RTC_CHECK_EQ(
         resampler_.ResetIfNeeded(sample_rate_hz, kSampleRateHz, kNumChannels),
         0);
     // Check the resampler's status like every other call in this function: a
     // failed Push() would otherwise leave stale data in |resampled_| that is
     // then fed to the VADs. |length| is passed as the output-length argument
     // and re-validated right below.
     RTC_CHECK_EQ(
         resampler_.Push(audio, length, resampled_, kLength10Ms, length), 0);
     resampled_ptr = resampled_;
   }
   RTC_DCHECK_EQ(length, kLength10Ms);

   // Each chunk needs to be passed into |standalone_vad_|, because internally it
   // buffers the audio and processes it all at once when GetActivity() is
   // called.
   RTC_CHECK_EQ(standalone_vad_->AddAudio(resampled_ptr, length), 0);

   audio_processing_.ExtractFeatures(resampled_ptr, length, &features_);

   // Size both outputs to the number of frames produced by this call; zero
   // frames leaves them empty.
   chunkwise_voice_probabilities_.resize(features_.num_frames);
   chunkwise_rms_.resize(features_.num_frames);
   std::copy(features_.rms, features_.rms + chunkwise_rms_.size(),
             chunkwise_rms_.begin());
   if (features_.num_frames > 0) {
     if (features_.silence) {
       // The other features are invalid, so set the voice probabilities to an
       // arbitrary low value.
       std::fill(chunkwise_voice_probabilities_.begin(),
                 chunkwise_voice_probabilities_.end(), kLowProbability);
     } else {
       // Start from a neutral value, then let the standalone VAD and the
       // pitch-based VAD each update the probabilities in place.
       std::fill(chunkwise_voice_probabilities_.begin(),
                 chunkwise_voice_probabilities_.end(), kNeutralProbability);
       RTC_CHECK_GE(
           standalone_vad_->GetActivity(chunkwise_voice_probabilities_.data(),
                                        chunkwise_voice_probabilities_.size()),
           0);
       RTC_CHECK_GE(pitch_based_vad_.VoicingProbability(
                        features_, chunkwise_voice_probabilities_.data()),
                    0);
     }
     // Non-empty here because num_frames > 0, so back() is valid.
     last_voice_probability_ = chunkwise_voice_probabilities_.back();
   }
 }

 }  // namespace webrtc
	/*
	* Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license
	* that can be found in the LICENSE file in the root of the source
	* tree. An additional intellectual property rights grant can be found
	* in the file PATENTS. All contributing project authors may
	* be found in the AUTHORS file in the root of the source tree.
	*/

	#include "modules/audio_processing/vad/voice_activity_detector.h"

	#include <algorithm>

	#include "rtc_base/checks.h"

	namespace webrtc {
	namespace {

	// Largest chunk size, in samples, that ProcessChunk() accepts (DCHECKed
	// against its |length| argument).
	constexpr size_t kMaxLength = 320;
	// Channel count passed to the resampler when it is (re)configured.
	constexpr size_t kNumChannels = 1;

	// Value |last_voice_probability_| holds before any chunk has been processed.
	constexpr double kDefaultVoiceValue = 1.0;
	// Starting probability written for every frame of a non-silent chunk before
	// the VADs are run on it.
	constexpr double kNeutralProbability = 0.5;
	// Probability written for every frame of a chunk flagged as silence.
	constexpr double kLowProbability = 0.01;

	}  // namespace

	// Builds a detector whose last reported voice probability starts at
	// kDefaultVoiceValue and whose standalone VAD comes from
	// StandaloneVad::Create().
	VoiceActivityDetector::VoiceActivityDetector()
	    : last_voice_probability_(kDefaultVoiceValue),
	      standalone_vad_(StandaloneVad::Create()) {}

	// Compiler-generated destruction, defined out of line in this file.
	// NOTE(review): presumably kept out of the header so member types only need
	// to be complete here - confirm against voice_activity_detector.h.
	VoiceActivityDetector::~VoiceActivityDetector() = default;

	// Processes one chunk of audio and refreshes the per-chunk results.
	//
	// |audio| points to |length| samples captured at |sample_rate_hz|; |length|
	// must equal |sample_rate_hz| / 100 and must not exceed kMaxLength (both are
	// DCHECKed). Input that is not already at kSampleRateHz is resampled first.
	//
	// Because ISAC has a different chunk length, |chunkwise_voice_probabilities_|
	// and |chunkwise_rms_| are only filled when feature extraction reports new
	// frames. Otherwise they are resized to zero, i.e. cleared.
	// |last_voice_probability_| is only updated when at least one frame was
	// produced.
	void VoiceActivityDetector::ProcessChunk(const int16_t* audio,
	                                         size_t length,
	                                         int sample_rate_hz) {
	  RTC_DCHECK_EQ(length, sample_rate_hz / 100);
	  RTC_DCHECK_LE(length, kMaxLength);
	  // Resample to the required rate.
	  const int16_t* resampled_ptr = audio;
	  if (sample_rate_hz != kSampleRateHz) {
	    RTC_CHECK_EQ(
	        resampler_.ResetIfNeeded(sample_rate_hz, kSampleRateHz, kNumChannels),
	        0);
	    // Check the resampler's status like every other call in this function: a
	    // failed Push() would otherwise leave stale data in |resampled_| that is
	    // then fed to the VADs. |length| is passed as the output-length argument
	    // and re-validated right below.
	    RTC_CHECK_EQ(
	        resampler_.Push(audio, length, resampled_, kLength10Ms, length), 0);
	    resampled_ptr = resampled_;
	  }
	  RTC_DCHECK_EQ(length, kLength10Ms);

	  // Each chunk needs to be passed into |standalone_vad_|, because internally it
	  // buffers the audio and processes it all at once when GetActivity() is
	  // called.
	  RTC_CHECK_EQ(standalone_vad_->AddAudio(resampled_ptr, length), 0);

	  audio_processing_.ExtractFeatures(resampled_ptr, length, &features_);

	  // Size both outputs to the number of frames produced by this call; zero
	  // frames leaves them empty.
	  chunkwise_voice_probabilities_.resize(features_.num_frames);
	  chunkwise_rms_.resize(features_.num_frames);
	  std::copy(features_.rms, features_.rms + chunkwise_rms_.size(),
	            chunkwise_rms_.begin());
	  if (features_.num_frames > 0) {
	    if (features_.silence) {
	      // The other features are invalid, so set the voice probabilities to an
	      // arbitrary low value.
	      std::fill(chunkwise_voice_probabilities_.begin(),
	                chunkwise_voice_probabilities_.end(), kLowProbability);
	    } else {
	      // Start from a neutral value, then let the standalone VAD and the
	      // pitch-based VAD each update the probabilities in place.
	      std::fill(chunkwise_voice_probabilities_.begin(),
	                chunkwise_voice_probabilities_.end(), kNeutralProbability);
	      RTC_CHECK_GE(
	          standalone_vad_->GetActivity(chunkwise_voice_probabilities_.data(),
	                                       chunkwise_voice_probabilities_.size()),
	          0);
	      RTC_CHECK_GE(pitch_based_vad_.VoicingProbability(
	                       features_, chunkwise_voice_probabilities_.data()),
	                   0);
	    }
	    // Non-empty here because num_frames > 0, so back() is valid.
	    last_voice_probability_ = chunkwise_voice_probabilities_.back();
	  }
	}

	} // namespace webrtc