/*
 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/vad/vad_audio_proc.h"

#include <math.h>
#include <stdio.h>
#include <string.h>

#include "common_audio/third_party/fft4g/fft4g.h"
#include "modules/audio_processing/vad/pitch_internal.h"
#include "modules/audio_processing/vad/pole_zero_filter.h"
#include "modules/audio_processing/vad/vad_audio_proc_internal.h"
#include "rtc_base/checks.h"
extern "C" {
#include "modules/audio_coding/codecs/isac/main/source/filter_functions.h"
#include "modules/audio_coding/codecs/isac/main/source/isac_vad.h"
#include "modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
#include "modules/audio_coding/codecs/isac/main/source/structs.h"
}

namespace webrtc {

// The following structures are declared anonymous in iSAC's structs.h. To
// forward declare them, we use this derived class trick.
struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};

static constexpr float kFrequencyResolution =
    kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
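// Sub-frames whose RMS falls below this value are treated as silence; see
// ExtractFeatures().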
static constexpr int kSilenceRms = 5;

// TODO(turajs): Make a Create or Init for VadAudioProc.
VadAudioProc::VadAudioProc()
    : audio_buffer_(),
      num_buffer_samples_(kNumPastSignalSamples),
      log_old_gain_(-2),
      old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples).
      pitch_analysis_handle_(new PitchAnalysisStruct),
      pre_filter_handle_(new PreFiltBankstr),
      high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
                                               kFilterOrder,
                                               kCoeffDenominator,
                                               kFilterOrder)) {
  static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
                    sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
                "lpc analysis window incorrect size");
  static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
                "correlation weight incorrect size");

  // TODO(turajs): Are we doing too much in the constructor?
  float data[kDftSize] = {0.f};  // Only used to prime the FFT below.
  // Run the FFT once with ip_[0] == 0 so that WebRtc_rdft initializes its
  // internal tables (|ip_| and |w_fft_|); the transform output is discarded.
  ip_[0] = 0;
  WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
  // TODO(turajs): Need to initialize high-pass filter.

  // Initialize iSAC components.
  WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
  WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
}

VadAudioProc::~VadAudioProc() {}

void VadAudioProc::ResetBuffer() {
  memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
         sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
  num_buffer_samples_ = kNumPastSignalSamples;
}
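
// Usage sketch for ExtractFeatures() below. Illustrative only: the frame
// source (Get10msFrame) and the surrounding loop are assumptions, not part of
// this module.
//
//   VadAudioProc audio_proc;
//   AudioFeatures features;
//   const int16_t* frame;
//   while (Get10msFrame(&frame)) {  // Hypothetical 10 ms frame provider.
//     if (audio_proc.ExtractFeatures(frame, VadAudioProc::kNumSubframeSamples,
//                                    &features) != 0) {
//       break;  // Invalid frame length or filtering error.
//     }
//     if (features.num_frames > 0 && !features.silence) {
//       // features.rms, log_pitch_gain, pitch_lag_hz and spectral_peak each
//       // hold features.num_frames values.
//     }
//   }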

int VadAudioProc::ExtractFeatures(const int16_t* frame,
                                  size_t length,
                                  AudioFeatures* features) {
  features->num_frames = 0;
  if (length != kNumSubframeSamples) {
    return -1;
  }

  // High-pass filter to remove the DC component and very low frequency
  // content. We have found that this high-pass filtering improves
  // voice/non-voice classification.
  if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
                                &audio_buffer_[num_buffer_samples_]) != 0) {
    return -1;
  }

  num_buffer_samples_ += kNumSubframeSamples;
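  // Keep buffering until |audio_buffer_| holds a full |kBufferLength| chunk,
  // i.e. |kNumPastSignalSamples| of history followed by |kNum10msSubframes|
  // new sub-frames; features are only extracted on complete chunks.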
  if (num_buffer_samples_ < kBufferLength) {
    return 0;
  }
  RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength);
  features->num_frames = kNum10msSubframes;
  features->silence = false;

  Rms(features->rms, kMaxNumFrames);
  for (size_t i = 0; i < kNum10msSubframes; ++i) {
    if (features->rms[i] < kSilenceRms) {
      // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
      // Bail out here instead.
      features->silence = true;
      ResetBuffer();
      return 0;
    }
  }

  PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
                kMaxNumFrames);
  FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
  ResetBuffer();
  return 0;
}

// Computes |kLpcOrder + 1| correlation coefficients.
void VadAudioProc::SubframeCorrelation(double* corr,
                                       size_t length_corr,
                                       size_t subframe_index) {
  RTC_DCHECK_GE(length_corr, kLpcOrder + 1);
  double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
  size_t buffer_index = subframe_index * kNumSubframeSamples;
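
  // The LPC analysis window starts |kNumPastSignalSamples| samples before the
  // sub-frame itself, so each window covers the sub-frame plus its immediate
  // history.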
  for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
    windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];

  WebRtcIsac_AutoCorr(corr, windowed_audio,
                      kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
}

// Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
// The analysis window is 15 ms long and is centered on the first half of each
// 10 ms sub-frame. This is equivalent to computing LPC coefficients for the
// first half of each 10 ms sub-frame.
void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
  RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1));
  double corr[kLpcOrder + 1];
  double reflec_coeff[kLpcOrder];
  for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
       i++, offset_lpc += kLpcOrder + 1) {
    SubframeCorrelation(corr, kLpcOrder + 1, i);
    corr[0] *= 1.0001;  // This makes Lev-Durb a bit more stable.
    for (size_t k = 0; k < kLpcOrder + 1; k++) {
      corr[k] *= kCorrWeight[k];
    }
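    // Levinson-Durbin converts the weighted autocorrelation into LPC
    // coefficients for this sub-frame.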
    WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
  }
}

// Fit a second order curve to these 3 points and find the location of the
// extremum. The points are inverted before curve fitting.
static float QuadraticInterpolation(float prev_val,
                                    float curr_val,
                                    float next_val) {
  // Doing the interpolation in |1 / A(z)|^2.
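  // The three inverted values are treated as samples of a parabola at
  // x = -1, 0 and 1; the return value is the x-coordinate of its vertex,
  // -b / (2a), with a = (prev_val + next_val) / 2 - curr_val and
  // b = (next_val - prev_val) / 2.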
  float fractional_index = 0;
  next_val = 1.0f / next_val;
  prev_val = 1.0f / prev_val;
  curr_val = 1.0f / curr_val;

  fractional_index =
      -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
  RTC_DCHECK_LT(fabs(fractional_index), 1);
  return fractional_index;
}

// 1 / A(z), where A(z) is defined by |lpc|, is a model of the spectral
// envelope of the input signal. The local maxima of the spectral envelope
// coincide with the local minima of |A(z)|, so we search |A(z)|^2 for its
// first local minimum instead; this saves one inversion and, by working with
// squared magnitudes, one square root per bin.
void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
                                          size_t length_f_peak) {
  RTC_DCHECK_GE(length_f_peak, kNum10msSubframes);
  double lpc[kNum10msSubframes * (kLpcOrder + 1)];
  // Compute one LPC polynomial per sub-frame.
  GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));

  const size_t kNumDftCoefficients = kDftSize / 2 + 1;
  float data[kDftSize];

  for (size_t i = 0; i < kNum10msSubframes; i++) {
    // Copy the LPC coefficients into |data|, zero-padded to |kDftSize|.
    memset(data, 0, sizeof(data));
    for (size_t n = 0; n < kLpcOrder + 1; n++) {
      data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
    }
    // Transform to frequency domain.
    WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
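
    // WebRtc_rdft packs the spectrum as data[0] = DC bin (real only),
    // data[1] = Nyquist bin (real only), and data[2 * n], data[2 * n + 1] =
    // real and imaginary parts of bin n for 0 < n < kDftSize / 2. The loop
    // below searches for the first interior local minimum of |A(z)|^2, i.e.
    // the first local maximum of the spectral envelope.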
    size_t index_peak = 0;
    float prev_magn_sqr = data[0] * data[0];
    float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
    float next_magn_sqr;
    bool found_peak = false;
    for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
      next_magn_sqr =
          data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
        found_peak = true;
        index_peak = n - 1;
        break;
      }
      prev_magn_sqr = curr_magn_sqr;
      curr_magn_sqr = next_magn_sqr;
    }
    float fractional_index = 0;
    if (!found_peak) {
      // Check whether the Nyquist bin, |kNumDftCoefficients - 1| (stored in
      // data[1]), is the local minimum.
      next_magn_sqr = data[1] * data[1];
      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
        index_peak = kNumDftCoefficients - 1;
      }
    } else {
      // A peak was found; do a simple quadratic interpolation to get a more
      // accurate estimate of its location.
      fractional_index =
          QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
    }
    f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
  }
}

// Using iSAC functions to estimate pitch gains & lags.
void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
                                 double* pitch_lags_hz,
                                 size_t length) {
  // TODO(turajs): This, together with the next two constants, can be
  // "imported" from iSAC.
  RTC_DCHECK_GE(length, kNum10msSubframes);
  const int kNumPitchSubframes = 4;
  double gains[kNumPitchSubframes];
  double lags[kNumPitchSubframes];

  const int kNumSubbandFrameSamples = 240;
  const int kNumLookaheadSamples = 24;
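  // 240 samples is one frame of each half-rate band and 24 samples is the
  // pitch-analysis look-ahead; both appear to mirror iSAC's internal framing
  // (see the TODO above).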

  float lower[kNumSubbandFrameSamples];
  float upper[kNumSubbandFrameSamples];
  double lower_lookahead[kNumSubbandFrameSamples];
  double upper_lookahead[kNumSubbandFrameSamples];
  double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
                                    kNumLookaheadSamples];

  // Split the signal into lower and upper bands.
  WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
                                 upper, lower_lookahead, upper_lookahead,
                                 pre_filter_handle_.get());
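  // Run iSAC's pitch analysis on the lower band; it produces
  // |kNumPitchSubframes| gain and lag estimates per call.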
  WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
                           pitch_analysis_handle_.get(), lags, gains);

  // Lags are computed on the lower-band signal, whose sample rate is half
  // that of the input signal.
  GetSubframesPitchParameters(
      kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
      &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
}

void VadAudioProc::Rms(double* rms, size_t length_rms) {
  RTC_DCHECK_GE(length_rms, kNum10msSubframes);
  size_t offset = kNumPastSignalSamples;
  for (size_t i = 0; i < kNum10msSubframes; i++) {
    rms[i] = 0;
    for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
      rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
    rms[i] = sqrt(rms[i] / kNumSubframeSamples);
  }
}

}  // namespace webrtc