// modules/audio_coding/neteq/time_stretch.cc - src - Git at Google

 /*
  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #include "modules/audio_coding/neteq/time_stretch.h"

 #include <algorithm>  // min, max
 #include <memory>

 #include "common_audio/signal_processing/include/signal_processing_library.h"
 #include "modules/audio_coding/neteq/background_noise.h"
 #include "modules/audio_coding/neteq/cross_correlation.h"
 #include "modules/audio_coding/neteq/dsp_helper.h"
 #include "rtc_base/numerics/safe_conversions.h"

 namespace webrtc {

 // Analyzes |input| for a pitch period and hands the result over to
 // CheckCriteriaAndStretch().
 //
 // Processing steps:
 //   1. Pick out the reference channel (|kRefChannel|) of the interleaved
 //      input.
 //   2. Downsample it to 4 kHz, auto-correlate and locate the strongest
 //      correlation peak. The peak position, expressed in the original sample
 //      rate, is the pitch period candidate |peak_index|.
 //   3. Compute the energies of, and the cross-correlation between, the two
 //      |peak_index|-sample segments located just before and just after the
 //      15 ms point of the signal.
 //   4. Run a simple VAD (SpeechDetection()). For active speech the normalized
 //      correlation is calculated in Q14; otherwise
 //      SetParametersForPassiveSpeech() provides the parameters.
 //   5. Call CheckCriteriaAndStretch() and report the resulting length change.
 //
 // |input| points to |input_len| interleaved samples. |fast_mode| and |output|
 // are forwarded unchanged to CheckCriteriaAndStretch().
 // |*length_change_samples| is set to |peak_index| for kSuccess and
 // kSuccessLowEnergy, and to 0 for kNoStretch and kError.
 // Returns the code produced by CheckCriteriaAndStretch().
 TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
                                               size_t input_len,
                                               bool fast_mode,
                                               AudioMultiVector* output,
                                               size_t* length_change_samples) {
   // Pre-calculate common multiplication with |fs_mult_|.
   const size_t fs_mult_120 =
       static_cast<size_t>(fs_mult_ * 120);  // Corresponds to 15 ms.

   const int16_t* signal;
   std::unique_ptr<int16_t[]> signal_array;
   size_t signal_len;
   if (num_channels_ == 1) {
     // Mono: analyze the input buffer in place.
     signal = input;
     signal_len = input_len;
   } else {
     // We want |signal| to hold only the reference channel of |input|, which
     // is interleaved. Thus, we start at sample |kRefChannel|, skip forward
     // |num_channels_| samples, and continue like that.
     signal_len = input_len / num_channels_;
     signal_array.reset(new int16_t[signal_len]);
     signal = signal_array.get();
     size_t j = kRefChannel;
     for (size_t i = 0; i < signal_len; ++i) {
       signal_array[i] = input[j];
       j += num_channels_;
     }
   }

   // Find maximum absolute value of input signal.
   max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);

   // Downsample to 4 kHz sample rate and calculate auto-correlation.
   DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
                               sample_rate_hz_, true /* compensate delay*/,
                               downsampled_input_);
   AutoCorrelation();

   // Find the strongest correlation peak.
   static const size_t kNumPeaks = 1;
   size_t peak_index;
   int16_t peak_value;
   DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,
                            fs_mult_, &peak_index, &peak_value);
   // Assert that |peak_index| stays within boundaries.
   assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_);

   // Compensate peak_index for displaced starting position. The displacement
   // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz
   // domain, while the |peak_index| is in the original sample rate; hence, the
   // multiplication by fs_mult_ * 2.
   peak_index += kMinLag * fs_mult_ * 2;
   // Assert that |peak_index| stays within boundaries.
   assert(peak_index >= static_cast<size_t>(20 * fs_mult_));
   assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);

   // Calculate scaling to ensure that |peak_index| samples can be square-summed
   // without overflowing.
   int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) -
                 WebRtcSpl_NormW32(static_cast<int32_t>(peak_index));
   scaling = std::max(0, scaling);

   // |vec1| starts at 15 ms minus one pitch period.
   const int16_t* vec1 = &signal[fs_mult_120 - peak_index];
   // |vec2| starts at 15 ms.
   const int16_t* vec2 = &signal[fs_mult_120];
   // Calculate energies for |vec1| and |vec2|, assuming they both contain
   // |peak_index| samples.
   const int32_t vec1_energy =
       WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);
   const int32_t vec2_energy =
       WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);

   // Calculate cross-correlation between |vec1| and |vec2|.
   int32_t cross_corr =
       WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);

   // Check if the signal seems to be active speech or not (simple VAD).
   const bool active_speech =
       SpeechDetection(vec1_energy, vec2_energy, peak_index, scaling);

   int16_t best_correlation;
   if (!active_speech) {
     SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);
   } else {
     // Calculate correlation:
     // cross_corr / sqrt(vec1_energy * vec2_energy).

     // Start with calculating scale values.
     int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));
     const int energy2_scale =
         std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));

     // Make sure total scaling is even (to simplify scale factor after sqrt).
     if ((energy1_scale + energy2_scale) & 1) {
       // The sum is odd.
       energy1_scale += 1;
     }

     // Scale energies to int16_t.
     const int16_t vec1_energy_int16 =
         static_cast<int16_t>(vec1_energy >> energy1_scale);
     const int16_t vec2_energy_int16 =
         static_cast<int16_t>(vec2_energy >> energy2_scale);

     // Calculate square-root of energy product.
     const int16_t sqrt_energy_prod =
         WebRtcSpl_SqrtFloor(vec1_energy_int16 * vec2_energy_int16);

     // Calculate cross_corr / sqrt(en1*en2) in Q14.
     const int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;
     cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);
     cross_corr = std::max(0, cross_corr);  // Don't use if negative.
     if (sqrt_energy_prod == 0) {
       // The normalized correlation is undefined when the scaled energy
       // product is zero. The division helper is expected to signal a zero
       // divisor with a saturated 32-bit value, which must not be narrowed to
       // int16_t; treat the two segments as uncorrelated instead.
       best_correlation = 0;
     } else {
       // Make sure |best_correlation| is no larger than 1 in Q14. The clamp is
       // applied while the quotient is still 32 bits wide, so an out-of-range
       // value saturates instead of wrapping when it is narrowed.
       const int32_t correlation_q14 =
           WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);
       best_correlation =
           static_cast<int16_t>(std::min<int32_t>(16384, correlation_q14));
     }
   }

   // Check accelerate criteria and stretch the signal.
   const ReturnCodes return_value =
       CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,
                               active_speech, fast_mode, output);
   switch (return_value) {
     case kSuccess:
     case kSuccessLowEnergy:
       // The signal length changed by one pitch period.
       *length_change_samples = peak_index;
       break;
     case kNoStretch:
     case kError:
       *length_change_samples = 0;
       break;
   }
   return return_value;
 }

 void TimeStretch::AutoCorrelation() {
   // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
   int32_t auto_corr[kCorrelationLen];
   CrossCorrelationWithAutoShift(
       &downsampled_input_[kMaxLag], &downsampled_input_[kMaxLag - kMinLag],
       kCorrelationLen, kMaxLag - kMinLag, -1, auto_corr);

   // Normalize correlation to 14 bits and write to |auto_correlation_|.
   int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
   int scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
   WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
                                    auto_corr, scaling);
 }

 // Simple VAD: returns true if the signal seems to contain active speech.
 //
 // The signal is considered to hold no active speech when
 //   (vec1_energy + vec2_energy) / (2 * peak_index) <=
 //       8 * background_noise_energy,
 // which is evaluated in the rearranged form
 //   (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
 // |scaling| is the scale factor that was applied before the energies were
 // calculated; the energies themselves are therefore scaled by 2 * |scaling|.
 bool TimeStretch::SpeechDetection(int32_t vec1_energy,
                                   int32_t vec2_energy,
                                   size_t peak_index,
                                   int scaling) const {
   // Signal side of the inequality. The sum is formed in 64 bits and saturated
   // back to 32 bits.
   const int64_t energy_sum = static_cast<int64_t>(vec1_energy) + vec2_energy;
   int32_t signal_term = rtc::saturated_cast<int32_t>(energy_sum / 16);

   // Noise side of the inequality. If noise parameters have not been
   // estimated, use a fixed threshold.
   int32_t noise_term = background_noise_.initialized()
                            ? background_noise_.Energy(kRefChannel)
                            : 75000;

   // Scale both sides down by the same amount before the noise term is
   // multiplied by |peak_index|.
   const int noise_shift = std::max(0, 16 - WebRtcSpl_NormW32(noise_term));
   signal_term >>= noise_shift;
   noise_term =
       rtc::dchecked_cast<int32_t>(peak_index) * (noise_term >> noise_shift);

   // Compensate the signal term for the energy scaling before comparing it
   // with the noise term.
   const int energy_shift = 2 * scaling;
   const int headroom = WebRtcSpl_NormW32(signal_term);
   if (headroom >= energy_shift) {
     signal_term <<= energy_shift;
   } else {
     // Cannot scale only the signal term; shift it as far as possible and
     // scale the noise term down by the remainder.
     signal_term <<= headroom;
     noise_term >>= (energy_shift - headroom);
   }
   return signal_term > noise_term;
 }

 }  // namespace webrtc
	/*
	* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license
	* that can be found in the LICENSE file in the root of the source
	* tree. An additional intellectual property rights grant can be found
	* in the file PATENTS. All contributing project authors may
	* be found in the AUTHORS file in the root of the source tree.
	*/

	#include "modules/audio_coding/neteq/time_stretch.h"

	#include <algorithm> // min, max
	#include <memory>

	#include "common_audio/signal_processing/include/signal_processing_library.h"
	#include "modules/audio_coding/neteq/background_noise.h"
	#include "modules/audio_coding/neteq/cross_correlation.h"
	#include "modules/audio_coding/neteq/dsp_helper.h"
	#include "rtc_base/numerics/safe_conversions.h"

	namespace webrtc {

	// Analyzes |input| for a pitch period and hands the result over to
	// CheckCriteriaAndStretch().
	//
	// Processing steps:
	//   1. Pick out the reference channel (|kRefChannel|) of the interleaved
	//      input.
	//   2. Downsample it to 4 kHz, auto-correlate and locate the strongest
	//      correlation peak. The peak position, expressed in the original sample
	//      rate, is the pitch period candidate |peak_index|.
	//   3. Compute the energies of, and the cross-correlation between, the two
	//      |peak_index|-sample segments located just before and just after the
	//      15 ms point of the signal.
	//   4. Run a simple VAD (SpeechDetection()). For active speech the normalized
	//      correlation is calculated in Q14; otherwise
	//      SetParametersForPassiveSpeech() provides the parameters.
	//   5. Call CheckCriteriaAndStretch() and report the resulting length change.
	//
	// |input| points to |input_len| interleaved samples. |fast_mode| and |output|
	// are forwarded unchanged to CheckCriteriaAndStretch().
	// |*length_change_samples| is set to |peak_index| for kSuccess and
	// kSuccessLowEnergy, and to 0 for kNoStretch and kError.
	// Returns the code produced by CheckCriteriaAndStretch().
	TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
	                                              size_t input_len,
	                                              bool fast_mode,
	                                              AudioMultiVector* output,
	                                              size_t* length_change_samples) {
	  // Pre-calculate common multiplication with |fs_mult_|.
	  const size_t fs_mult_120 =
	      static_cast<size_t>(fs_mult_ * 120);  // Corresponds to 15 ms.

	  const int16_t* signal;
	  std::unique_ptr<int16_t[]> signal_array;
	  size_t signal_len;
	  if (num_channels_ == 1) {
	    // Mono: analyze the input buffer in place.
	    signal = input;
	    signal_len = input_len;
	  } else {
	    // We want |signal| to hold only the reference channel of |input|, which
	    // is interleaved. Thus, we start at sample |kRefChannel|, skip forward
	    // |num_channels_| samples, and continue like that.
	    signal_len = input_len / num_channels_;
	    signal_array.reset(new int16_t[signal_len]);
	    signal = signal_array.get();
	    size_t j = kRefChannel;
	    for (size_t i = 0; i < signal_len; ++i) {
	      signal_array[i] = input[j];
	      j += num_channels_;
	    }
	  }

	  // Find maximum absolute value of input signal.
	  max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);

	  // Downsample to 4 kHz sample rate and calculate auto-correlation.
	  DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
	                              sample_rate_hz_, true /* compensate delay*/,
	                              downsampled_input_);
	  AutoCorrelation();

	  // Find the strongest correlation peak.
	  static const size_t kNumPeaks = 1;
	  size_t peak_index;
	  int16_t peak_value;
	  DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,
	                           fs_mult_, &peak_index, &peak_value);
	  // Assert that |peak_index| stays within boundaries.
	  assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_);

	  // Compensate peak_index for displaced starting position. The displacement
	  // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz
	  // domain, while the |peak_index| is in the original sample rate; hence, the
	  // multiplication by fs_mult_ * 2.
	  peak_index += kMinLag * fs_mult_ * 2;
	  // Assert that |peak_index| stays within boundaries.
	  assert(peak_index >= static_cast<size_t>(20 * fs_mult_));
	  assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);

	  // Calculate scaling to ensure that |peak_index| samples can be square-summed
	  // without overflowing.
	  int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) -
	                WebRtcSpl_NormW32(static_cast<int32_t>(peak_index));
	  scaling = std::max(0, scaling);

	  // |vec1| starts at 15 ms minus one pitch period.
	  const int16_t* vec1 = &signal[fs_mult_120 - peak_index];
	  // |vec2| starts at 15 ms.
	  const int16_t* vec2 = &signal[fs_mult_120];
	  // Calculate energies for |vec1| and |vec2|, assuming they both contain
	  // |peak_index| samples.
	  const int32_t vec1_energy =
	      WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);
	  const int32_t vec2_energy =
	      WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);

	  // Calculate cross-correlation between |vec1| and |vec2|.
	  int32_t cross_corr =
	      WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);

	  // Check if the signal seems to be active speech or not (simple VAD).
	  const bool active_speech =
	      SpeechDetection(vec1_energy, vec2_energy, peak_index, scaling);

	  int16_t best_correlation;
	  if (!active_speech) {
	    SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);
	  } else {
	    // Calculate correlation:
	    // cross_corr / sqrt(vec1_energy * vec2_energy).

	    // Start with calculating scale values.
	    int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));
	    const int energy2_scale =
	        std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));

	    // Make sure total scaling is even (to simplify scale factor after sqrt).
	    if ((energy1_scale + energy2_scale) & 1) {
	      // The sum is odd.
	      energy1_scale += 1;
	    }

	    // Scale energies to int16_t.
	    const int16_t vec1_energy_int16 =
	        static_cast<int16_t>(vec1_energy >> energy1_scale);
	    const int16_t vec2_energy_int16 =
	        static_cast<int16_t>(vec2_energy >> energy2_scale);

	    // Calculate square-root of energy product.
	    const int16_t sqrt_energy_prod =
	        WebRtcSpl_SqrtFloor(vec1_energy_int16 * vec2_energy_int16);

	    // Calculate cross_corr / sqrt(en1*en2) in Q14.
	    const int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;
	    cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);
	    cross_corr = std::max(0, cross_corr);  // Don't use if negative.
	    if (sqrt_energy_prod == 0) {
	      // The normalized correlation is undefined when the scaled energy
	      // product is zero. The division helper is expected to signal a zero
	      // divisor with a saturated 32-bit value, which must not be narrowed to
	      // int16_t; treat the two segments as uncorrelated instead.
	      best_correlation = 0;
	    } else {
	      // Make sure |best_correlation| is no larger than 1 in Q14. The clamp is
	      // applied while the quotient is still 32 bits wide, so an out-of-range
	      // value saturates instead of wrapping when it is narrowed.
	      const int32_t correlation_q14 =
	          WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);
	      best_correlation =
	          static_cast<int16_t>(std::min<int32_t>(16384, correlation_q14));
	    }
	  }

	  // Check accelerate criteria and stretch the signal.
	  const ReturnCodes return_value =
	      CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,
	                              active_speech, fast_mode, output);
	  switch (return_value) {
	    case kSuccess:
	    case kSuccessLowEnergy:
	      // The signal length changed by one pitch period.
	      *length_change_samples = peak_index;
	      break;
	    case kNoStretch:
	    case kError:
	      *length_change_samples = 0;
	      break;
	  }
	  return return_value;
	}

	void TimeStretch::AutoCorrelation() {
	// Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
	int32_t auto_corr[kCorrelationLen];
	CrossCorrelationWithAutoShift(
	&downsampled_input_[kMaxLag], &downsampled_input_[kMaxLag - kMinLag],
	kCorrelationLen, kMaxLag - kMinLag, -1, auto_corr);

	// Normalize correlation to 14 bits and write to \|auto_correlation_\|.
	int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
	int scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
	WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
	auto_corr, scaling);
	}

	// Simple VAD: returns true if the signal seems to contain active speech.
	//
	// The signal is considered to hold no active speech when
	//   (vec1_energy + vec2_energy) / (2 * peak_index) <=
	//       8 * background_noise_energy,
	// which is evaluated in the rearranged form
	//   (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
	// |scaling| is the scale factor that was applied before the energies were
	// calculated; the energies themselves are therefore scaled by 2 * |scaling|.
	bool TimeStretch::SpeechDetection(int32_t vec1_energy,
	                                  int32_t vec2_energy,
	                                  size_t peak_index,
	                                  int scaling) const {
	  // Signal side of the inequality. The sum is formed in 64 bits and saturated
	  // back to 32 bits.
	  const int64_t energy_sum = static_cast<int64_t>(vec1_energy) + vec2_energy;
	  int32_t signal_term = rtc::saturated_cast<int32_t>(energy_sum / 16);

	  // Noise side of the inequality. If noise parameters have not been
	  // estimated, use a fixed threshold.
	  int32_t noise_term = background_noise_.initialized()
	                           ? background_noise_.Energy(kRefChannel)
	                           : 75000;

	  // Scale both sides down by the same amount before the noise term is
	  // multiplied by |peak_index|.
	  const int noise_shift = std::max(0, 16 - WebRtcSpl_NormW32(noise_term));
	  signal_term >>= noise_shift;
	  noise_term =
	      rtc::dchecked_cast<int32_t>(peak_index) * (noise_term >> noise_shift);

	  // Compensate the signal term for the energy scaling before comparing it
	  // with the noise term.
	  const int energy_shift = 2 * scaling;
	  const int headroom = WebRtcSpl_NormW32(signal_term);
	  if (headroom >= energy_shift) {
	    signal_term <<= energy_shift;
	  } else {
	    // Cannot scale only the signal term; shift it as far as possible and
	    // scale the noise term down by the remainder.
	    signal_term <<= headroom;
	    noise_term >>= (energy_shift - headroom);
	  }
	  return signal_term > noise_term;
	}

	} // namespace webrtc