| /* | 
 |  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | 
 |  * | 
 |  *  Use of this source code is governed by a BSD-style license | 
 |  *  that can be found in the LICENSE file in the root of the source | 
 |  *  tree. An additional intellectual property rights grant can be found | 
 |  *  in the file PATENTS.  All contributing project authors may | 
 |  *  be found in the AUTHORS file in the root of the source tree. | 
 |  */ | 
 |  | 
 | #include "webrtc/modules/audio_coding/neteq/time_stretch.h" | 
 |  | 
#include <algorithm>  // min, max
#include <cassert>    // assert
#include <memory>
 |  | 
 | #include "webrtc/base/safe_conversions.h" | 
 | #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" | 
 | #include "webrtc/modules/audio_coding/neteq/background_noise.h" | 
 | #include "webrtc/modules/audio_coding/neteq/cross_correlation.h" | 
 | #include "webrtc/modules/audio_coding/neteq/dsp_helper.h" | 
 |  | 
 | namespace webrtc { | 
 |  | 
 | TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input, | 
 |                                               size_t input_len, | 
 |                                               bool fast_mode, | 
 |                                               AudioMultiVector* output, | 
 |                                               size_t* length_change_samples) { | 
 |   // Pre-calculate common multiplication with |fs_mult_|. | 
 |   size_t fs_mult_120 = | 
 |       static_cast<size_t>(fs_mult_ * 120);  // Corresponds to 15 ms. | 
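  // (|fs_mult_| is the sample rate divided by 8000; e.g., at 32 kHz,
  // fs_mult_ == 4 and fs_mult_120 == 480 samples == 480 / 32000 s == 15 ms.)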
 |  | 
 |   const int16_t* signal; | 
 |   std::unique_ptr<int16_t[]> signal_array; | 
 |   size_t signal_len; | 
 |   if (num_channels_ == 1) { | 
 |     signal = input; | 
 |     signal_len = input_len; | 
 |   } else { | 
    // We want |signal| to be only the master channel of |input|, which is
    // interleaved. Thus, we start at sample |master_channel_|, skip forward
    // |num_channels_| samples, and continue like that.
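    // For example, with stereo input (num_channels_ == 2) and
    // master_channel_ == 0, samples 0, 2, 4, ... are copied.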
 |     signal_len = input_len / num_channels_; | 
 |     signal_array.reset(new int16_t[signal_len]); | 
 |     signal = signal_array.get(); | 
 |     size_t j = master_channel_; | 
 |     for (size_t i = 0; i < signal_len; ++i) { | 
 |       signal_array[i] = input[j]; | 
 |       j += num_channels_; | 
 |     } | 
 |   } | 
 |  | 
 |   // Find maximum absolute value of input signal. | 
 |   max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len); | 
 |  | 
 |   // Downsample to 4 kHz sample rate and calculate auto-correlation. | 
 |   DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen, | 
                              sample_rate_hz_, true /* compensate delay */,
 |                               downsampled_input_); | 
 |   AutoCorrelation(); | 
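  // The pitch range of interest in speech lies well below the 2 kHz Nyquist
  // frequency of the down-sampled signal, so the correlation search can run
  // at 4 kHz at a fraction of the full-rate cost.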
 |  | 
 |   // Find the strongest correlation peak. | 
 |   static const size_t kNumPeaks = 1; | 
 |   size_t peak_index; | 
 |   int16_t peak_value; | 
 |   DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks, | 
 |                            fs_mult_, &peak_index, &peak_value); | 
 |   // Assert that |peak_index| stays within boundaries. | 
 |   assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_); | 
 |  | 
  // Compensate |peak_index| for the displaced starting position. The
  // displacement happens in AutoCorrelation(). Here, |kMinLag| is in the
  // down-sampled 4 kHz domain, while |peak_index| is in the original sample
  // rate; hence the multiplication by fs_mult_ * 2.
 |   peak_index += kMinLag * fs_mult_ * 2; | 
 |   // Assert that |peak_index| stays within boundaries. | 
 |   assert(peak_index >= static_cast<size_t>(20 * fs_mult_)); | 
 |   assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_); | 
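  // The added offset is kMinLag lags in the 4 kHz domain, i.e.,
  // kMinLag * 2 * fs_mult_ samples at the original rate. Together with the
  // lower assert bound of 20 * fs_mult_, this implies kMinLag == 10, or
  // 2.5 ms at 4 kHz, putting a 400 Hz upper limit on the detected pitch.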
 |  | 
 |   // Calculate scaling to ensure that |peak_index| samples can be square-summed | 
 |   // without overflowing. | 
 |   int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) - | 
 |       WebRtcSpl_NormW32(static_cast<int32_t>(peak_index)); | 
 |   scaling = std::max(0, scaling); | 
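  // Worked example: with max_input_value_ == 8000 (max^2 is a 26-bit value,
  // so WebRtcSpl_NormW32 returns 5) and peak_index == 100 (NormW32 returns
  // 24), scaling == 31 - 5 - 24 == 2, and the worst-case square-sum
  // 100 * 8000^2 >> 2 == 1.6e9 stays below 2^31.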
 |  | 
 |   // |vec1| starts at 15 ms minus one pitch period. | 
 |   const int16_t* vec1 = &signal[fs_mult_120 - peak_index]; | 
  // |vec2| starts at 15 ms.
 |   const int16_t* vec2 = &signal[fs_mult_120]; | 
 |   // Calculate energies for |vec1| and |vec2|, assuming they both contain | 
 |   // |peak_index| samples. | 
 |   int32_t vec1_energy = | 
 |       WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling); | 
 |   int32_t vec2_energy = | 
 |       WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling); | 
 |  | 
 |   // Calculate cross-correlation between |vec1| and |vec2|. | 
 |   int32_t cross_corr = | 
 |       WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling); | 
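  // |vec1| lags |vec2| by exactly one estimated pitch period, so a high
  // normalized correlation between them indicates a periodic signal from
  // which one period can be added or removed with little audible artifact.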
 |  | 
 |   // Check if the signal seems to be active speech or not (simple VAD). | 
 |   bool active_speech = SpeechDetection(vec1_energy, vec2_energy, peak_index, | 
 |                                        scaling); | 
 |  | 
 |   int16_t best_correlation; | 
 |   if (!active_speech) { | 
 |     SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index); | 
 |   } else { | 
 |     // Calculate correlation: | 
 |     // cross_corr / sqrt(vec1_energy * vec2_energy). | 
 |  | 
 |     // Start with calculating scale values. | 
 |     int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy)); | 
 |     int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy)); | 
 |  | 
 |     // Make sure total scaling is even (to simplify scale factor after sqrt). | 
 |     if ((energy1_scale + energy2_scale) & 1) { | 
 |       // The sum is odd. | 
 |       energy1_scale += 1; | 
 |     } | 
 |  | 
 |     // Scale energies to int16_t. | 
 |     int16_t vec1_energy_int16 = | 
 |         static_cast<int16_t>(vec1_energy >> energy1_scale); | 
 |     int16_t vec2_energy_int16 = | 
 |         static_cast<int16_t>(vec2_energy >> energy2_scale); | 
 |  | 
 |     // Calculate square-root of energy product. | 
 |     int16_t sqrt_energy_prod = WebRtcSpl_SqrtFloor(vec1_energy_int16 * | 
 |                                                    vec2_energy_int16); | 
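    // Note that |cross_corr| and both energies were computed with the same
    // |scaling|, so that factor cancels in the ratio below; only the
    // (energy1_scale + energy2_scale) / 2 shift now folded into
    // |sqrt_energy_prod| needs compensating.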
 |  | 
 |     // Calculate cross_corr / sqrt(en1*en2) in Q14. | 
 |     int temp_scale = 14 - (energy1_scale + energy2_scale) / 2; | 
 |     cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale); | 
 |     cross_corr = std::max(0, cross_corr);  // Don't use if negative. | 
 |     best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod); | 
    // Make sure |best_correlation| is no larger than 1 in Q14
    // (16384 == 1 << 14).
 |     best_correlation = std::min(static_cast<int16_t>(16384), best_correlation); | 
 |   } | 
 |  | 
 |   // Check accelerate criteria and stretch the signal. | 
 |   ReturnCodes return_value = | 
 |       CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation, | 
 |                               active_speech, fast_mode, output); | 
  switch (return_value) {
    case kSuccess:
    case kSuccessLowEnergy:
      *length_change_samples = peak_index;
      break;
    case kNoStretch:
    case kError:
      *length_change_samples = 0;
      break;
  }
 |   return return_value; | 
 | } | 
 |  | 
 | void TimeStretch::AutoCorrelation() { | 
  // Calculate the correlation from lag kMinLag to lag kMaxLag in the 4 kHz
  // domain.
 |   int32_t auto_corr[kCorrelationLen]; | 
 |   CrossCorrelationWithAutoShift( | 
 |       &downsampled_input_[kMaxLag], &downsampled_input_[kMaxLag - kMinLag], | 
 |       kCorrelationLen, kMaxLag - kMinLag, -1, auto_corr); | 
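  // The second sequence starts kMinLag samples behind the first, and the
  // step of -1 moves it one sample further back per iteration, so result
  // bin i holds the correlation at lag kMinLag + i.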
 |  | 
 |   // Normalize correlation to 14 bits and write to |auto_correlation_|. | 
 |   int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen); | 
 |   int scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr)); | 
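  // Shifting right by max(0, 17 - WebRtcSpl_NormW32(max_corr)) leaves the
  // largest magnitude with its highest set bit at position 13 or below, so
  // all values fit comfortably in int16_t.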
 |   WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen, | 
 |                                    auto_corr, scaling); | 
 | } | 
 |  | 
 | bool TimeStretch::SpeechDetection(int32_t vec1_energy, int32_t vec2_energy, | 
 |                                   size_t peak_index, int scaling) const { | 
 |   // Check if the signal seems to be active speech or not (simple VAD). | 
 |   // If (vec1_energy + vec2_energy) / (2 * peak_index) <= | 
 |   // 8 * background_noise_energy, then we say that the signal contains no | 
 |   // active speech. | 
 |   // Rewrite the inequality as: | 
 |   // (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy. | 
 |   // The two sides of the inequality will be denoted |left_side| and | 
 |   // |right_side|. | 
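  // (Multiply both sides of the first inequality by 2 * peak_index and
  // divide by 16 to obtain the second form.)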
 |   int32_t left_side = (vec1_energy + vec2_energy) / 16; | 
 |   int32_t right_side; | 
 |   if (background_noise_.initialized()) { | 
 |     right_side = background_noise_.Energy(master_channel_); | 
 |   } else { | 
 |     // If noise parameters have not been estimated, use a fixed threshold. | 
 |     right_side = 75000; | 
 |   } | 
 |   int right_scale = 16 - WebRtcSpl_NormW32(right_side); | 
 |   right_scale = std::max(0, right_scale); | 
 |   left_side = left_side >> right_scale; | 
 |   right_side = | 
 |       rtc::checked_cast<int32_t>(peak_index) * (right_side >> right_scale); | 
 |  | 
 |   // Scale |left_side| properly before comparing with |right_side|. | 
 |   // (|scaling| is the scale factor before energy calculation, thus the scale | 
 |   // factor for the energy is 2 * scaling.) | 
 |   if (WebRtcSpl_NormW32(left_side) < 2 * scaling) { | 
 |     // Cannot scale only |left_side|, must scale |right_side| too. | 
 |     int temp_scale = WebRtcSpl_NormW32(left_side); | 
 |     left_side = left_side << temp_scale; | 
 |     right_side = right_side >> (2 * scaling - temp_scale); | 
 |   } else { | 
 |     left_side = left_side << 2 * scaling; | 
 |   } | 
 |   return left_side > right_side; | 
 | } | 
 |  | 
 | }  // namespace webrtc |