|  | /* | 
|  | *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 
|  | * | 
|  | *  Use of this source code is governed by a BSD-style license | 
|  | *  that can be found in the LICENSE file in the root of the source | 
|  | *  tree. An additional intellectual property rights grant can be found | 
|  | *  in the file PATENTS.  All contributing project authors may | 
|  | *  be found in the AUTHORS file in the root of the source tree. | 
|  | */ | 
|  |  | 
|  | #include "modules/audio_processing/vad/vad_audio_proc.h" | 
|  |  | 
|  | #include <math.h> | 
|  | #include <stdio.h> | 
|  | #include <string.h> | 
|  |  | 
|  | #include "common_audio/third_party/ooura/fft_size_256/fft4g.h" | 
|  | #include "modules/audio_processing/vad/pitch_internal.h" | 
|  | #include "modules/audio_processing/vad/pole_zero_filter.h" | 
|  | #include "modules/audio_processing/vad/vad_audio_proc_internal.h" | 
|  | #include "rtc_base/checks.h" | 
|  | extern "C" { | 
|  | #include "modules/audio_coding/codecs/isac/main/source/filter_functions.h" | 
|  | #include "modules/audio_coding/codecs/isac/main/source/isac_vad.h" | 
|  | #include "modules/audio_coding/codecs/isac/main/source/pitch_estimator.h" | 
|  | #include "modules/audio_coding/codecs/isac/main/source/structs.h" | 
|  | } | 
|  |  | 
|  | namespace webrtc { | 
|  |  | 
|  | // The following structures are declared anonymous in iSAC's structs.h. To | 
|  | // forward declare them, we use this derived class trick. | 
|  | struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {}; | 
|  | struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {}; | 
|  |  | 
|  | static constexpr float kFrequencyResolution = | 
|  | kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize); | 
|  | static constexpr int kSilenceRms = 5; | 
|  |  | 
|  | // TODO(turajs): Make a Create or Init for VadAudioProc. | 
|  | VadAudioProc::VadAudioProc() | 
|  | : audio_buffer_(), | 
|  | num_buffer_samples_(kNumPastSignalSamples), | 
|  | log_old_gain_(-2), | 
|  | old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples). | 
|  | pitch_analysis_handle_(new PitchAnalysisStruct), | 
|  | pre_filter_handle_(new PreFiltBankstr), | 
|  | high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator, | 
|  | kFilterOrder, | 
|  | kCoeffDenominator, | 
|  | kFilterOrder)) { | 
|  | static_assert(kNumPastSignalSamples + kNumSubframeSamples == | 
|  | sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]), | 
|  | "lpc analysis window incorrect size"); | 
|  | static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]), | 
|  | "correlation weight incorrect size"); | 
|  |  | 
|  | // TODO(turajs): Are we doing too much in the constructor? | 
|  | float data[kDftSize]; | 
|  | // Make FFT to initialize. | 
|  | ip_[0] = 0; | 
|  | WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); | 
|  | // TODO(turajs): Need to initialize high-pass filter. | 
|  |  | 
|  | // Initialize iSAC components. | 
|  | WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get()); | 
|  | WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get()); | 
|  | } | 
|  |  | 
|  | VadAudioProc::~VadAudioProc() {} | 
|  |  | 
|  | void VadAudioProc::ResetBuffer() { | 
|  | memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess], | 
|  | sizeof(audio_buffer_[0]) * kNumPastSignalSamples); | 
|  | num_buffer_samples_ = kNumPastSignalSamples; | 
|  | } | 
|  |  | 
|  | int VadAudioProc::ExtractFeatures(const int16_t* frame, | 
|  | size_t length, | 
|  | AudioFeatures* features) { | 
|  | features->num_frames = 0; | 
|  | if (length != kNumSubframeSamples) { | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | // High-pass filter to remove the DC component and very low frequency content. | 
|  | // We have experienced that this high-pass filtering improves voice/non-voiced | 
|  | // classification. | 
|  | if (high_pass_filter_->Filter(frame, kNumSubframeSamples, | 
|  | &audio_buffer_[num_buffer_samples_]) != 0) { | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | num_buffer_samples_ += kNumSubframeSamples; | 
|  | if (num_buffer_samples_ < kBufferLength) { | 
|  | return 0; | 
|  | } | 
|  | RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength); | 
|  | features->num_frames = kNum10msSubframes; | 
|  | features->silence = false; | 
|  |  | 
|  | Rms(features->rms, kMaxNumFrames); | 
|  | for (size_t i = 0; i < kNum10msSubframes; ++i) { | 
|  | if (features->rms[i] < kSilenceRms) { | 
|  | // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence. | 
|  | // Bail out here instead. | 
|  | features->silence = true; | 
|  | ResetBuffer(); | 
|  | return 0; | 
|  | } | 
|  | } | 
|  |  | 
|  | PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz, | 
|  | kMaxNumFrames); | 
|  | FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames); | 
|  | ResetBuffer(); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | // Computes |kLpcOrder + 1| correlation coefficients. | 
|  | void VadAudioProc::SubframeCorrelation(double* corr, | 
|  | size_t length_corr, | 
|  | size_t subframe_index) { | 
|  | RTC_DCHECK_GE(length_corr, kLpcOrder + 1); | 
|  | double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples]; | 
|  | size_t buffer_index = subframe_index * kNumSubframeSamples; | 
|  |  | 
|  | for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++) | 
|  | windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n]; | 
|  |  | 
|  | WebRtcIsac_AutoCorr(corr, windowed_audio, | 
|  | kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder); | 
|  | } | 
|  |  | 
|  | // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input. | 
|  | // The analysis window is 15 ms long and it is centered on the first half of | 
|  | // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the | 
|  | // first half of each 10 ms subframe. | 
|  | void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) { | 
|  | RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1)); | 
|  | double corr[kLpcOrder + 1]; | 
|  | double reflec_coeff[kLpcOrder]; | 
|  | for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes; | 
|  | i++, offset_lpc += kLpcOrder + 1) { | 
|  | SubframeCorrelation(corr, kLpcOrder + 1, i); | 
|  | corr[0] *= 1.0001; | 
|  | // This makes Lev-Durb a bit more stable. | 
|  | for (size_t k = 0; k < kLpcOrder + 1; k++) { | 
|  | corr[k] *= kCorrWeight[k]; | 
|  | } | 
|  | WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder); | 
|  | } | 
|  | } | 
|  |  | 
|  | // Fit a second order curve to these 3 points and find the location of the | 
|  | // extremum. The points are inverted before curve fitting. | 
|  | static float QuadraticInterpolation(float prev_val, | 
|  | float curr_val, | 
|  | float next_val) { | 
|  | // Doing the interpolation in |1 / A(z)|^2. | 
|  | float fractional_index = 0; | 
|  | next_val = 1.0f / next_val; | 
|  | prev_val = 1.0f / prev_val; | 
|  | curr_val = 1.0f / curr_val; | 
|  |  | 
|  | fractional_index = | 
|  | -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val); | 
|  | RTC_DCHECK_LT(fabs(fractional_index), 1); | 
|  | return fractional_index; | 
|  | } | 
|  |  | 
|  | // 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope | 
|  | // of the input signal. The local maximum of the spectral envelope corresponds | 
|  | // with the local minimum of A(z). It saves complexity, as we save one | 
|  | // inversion. Furthermore, we find the first local maximum of magnitude squared, | 
|  | // to save on one square root. | 
|  | void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, | 
|  | size_t length_f_peak) { | 
|  | RTC_DCHECK_GE(length_f_peak, kNum10msSubframes); | 
|  | double lpc[kNum10msSubframes * (kLpcOrder + 1)]; | 
|  | // For all sub-frames. | 
|  | GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1)); | 
|  |  | 
|  | const size_t kNumDftCoefficients = kDftSize / 2 + 1; | 
|  | float data[kDftSize]; | 
|  |  | 
|  | for (size_t i = 0; i < kNum10msSubframes; i++) { | 
|  | // Convert to float with zero pad. | 
|  | memset(data, 0, sizeof(data)); | 
|  | for (size_t n = 0; n < kLpcOrder + 1; n++) { | 
|  | data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]); | 
|  | } | 
|  | // Transform to frequency domain. | 
|  | WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); | 
|  |  | 
|  | size_t index_peak = 0; | 
|  | float prev_magn_sqr = data[0] * data[0]; | 
|  | float curr_magn_sqr = data[2] * data[2] + data[3] * data[3]; | 
|  | float next_magn_sqr; | 
|  | bool found_peak = false; | 
|  | for (size_t n = 2; n < kNumDftCoefficients - 1; n++) { | 
|  | next_magn_sqr = | 
|  | data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1]; | 
|  | if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { | 
|  | found_peak = true; | 
|  | index_peak = n - 1; | 
|  | break; | 
|  | } | 
|  | prev_magn_sqr = curr_magn_sqr; | 
|  | curr_magn_sqr = next_magn_sqr; | 
|  | } | 
|  | float fractional_index = 0; | 
|  | if (!found_peak) { | 
|  | // Checking if |kNumDftCoefficients - 1| is the local minimum. | 
|  | next_magn_sqr = data[1] * data[1]; | 
|  | if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { | 
|  | index_peak = kNumDftCoefficients - 1; | 
|  | } | 
|  | } else { | 
|  | // A peak is found, do a simple quadratic interpolation to get a more | 
|  | // accurate estimate of the peak location. | 
|  | fractional_index = | 
|  | QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr); | 
|  | } | 
|  | f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Using iSAC functions to estimate pitch gains & lags. | 
|  | void VadAudioProc::PitchAnalysis(double* log_pitch_gains, | 
|  | double* pitch_lags_hz, | 
|  | size_t length) { | 
|  | // TODO(turajs): This can be "imported" from iSAC & and the next two | 
|  | // constants. | 
|  | RTC_DCHECK_GE(length, kNum10msSubframes); | 
|  | const int kNumPitchSubframes = 4; | 
|  | double gains[kNumPitchSubframes]; | 
|  | double lags[kNumPitchSubframes]; | 
|  |  | 
|  | const int kNumSubbandFrameSamples = 240; | 
|  | const int kNumLookaheadSamples = 24; | 
|  |  | 
|  | float lower[kNumSubbandFrameSamples]; | 
|  | float upper[kNumSubbandFrameSamples]; | 
|  | double lower_lookahead[kNumSubbandFrameSamples]; | 
|  | double upper_lookahead[kNumSubbandFrameSamples]; | 
|  | double lower_lookahead_pre_filter[kNumSubbandFrameSamples + | 
|  | kNumLookaheadSamples]; | 
|  |  | 
|  | // Split signal to lower and upper bands | 
|  | WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower, | 
|  | upper, lower_lookahead, upper_lookahead, | 
|  | pre_filter_handle_.get()); | 
|  | WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter, | 
|  | pitch_analysis_handle_.get(), lags, gains); | 
|  |  | 
|  | // Lags are computed on lower-band signal with sampling rate half of the | 
|  | // input signal. | 
|  | GetSubframesPitchParameters( | 
|  | kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes, | 
|  | &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz); | 
|  | } | 
|  |  | 
|  | void VadAudioProc::Rms(double* rms, size_t length_rms) { | 
|  | RTC_DCHECK_GE(length_rms, kNum10msSubframes); | 
|  | size_t offset = kNumPastSignalSamples; | 
|  | for (size_t i = 0; i < kNum10msSubframes; i++) { | 
|  | rms[i] = 0; | 
|  | for (size_t n = 0; n < kNumSubframeSamples; n++, offset++) | 
|  | rms[i] += audio_buffer_[offset] * audio_buffer_[offset]; | 
|  | rms[i] = sqrt(rms[i] / kNumSubframeSamples); | 
|  | } | 
|  | } | 
|  |  | 
|  | }  // namespace webrtc |