RNN VAD: Opus band spectral analysis refactoring
This CL refactors the computation of band energy and spectral
cross-correlation coefficients by moving and optimizing
the code from ComputeBandCoefficients, ComputeBandEnergies and
ComputeSpectralCrossCorrelation into a single class (named
SpectralCorrelator).
This change will also help replace the FFT library in the RNN VAD.
Bug: webrtc:10480
Change-Id: I6cefa23e8f3bc8de6eb09d3ea434699d5e19124e
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/129726
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Per Åhgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#27535}
diff --git a/modules/audio_processing/agc2/rnn_vad/BUILD.gn b/modules/audio_processing/agc2/rnn_vad/BUILD.gn
index 237c809..cd9a7a2 100644
--- a/modules/audio_processing/agc2/rnn_vad/BUILD.gn
+++ b/modules/audio_processing/agc2/rnn_vad/BUILD.gn
@@ -38,7 +38,6 @@
deps = [
"..:biquad_filter",
"../../../../api:array_view",
- "../../../../api:function_view",
"../../../../rtc_base:checks",
"../../../../rtc_base:rtc_base_approved",
"../../utility:pffft_wrapper",
diff --git a/modules/audio_processing/agc2/rnn_vad/common.h b/modules/audio_processing/agc2/rnn_vad/common.h
index 2f16cd4..4fef3ab 100644
--- a/modules/audio_processing/agc2/rnn_vad/common.h
+++ b/modules/audio_processing/agc2/rnn_vad/common.h
@@ -52,17 +52,13 @@
constexpr size_t kMinPitch48kHz = kMinPitch24kHz * 2;
constexpr size_t kMaxPitch48kHz = kMaxPitch24kHz * 2;
-// Sub-band frequency boundaries.
+// Spectral features.
+constexpr size_t kFftSizeBy2Plus1 = kFrameSize20ms24kHz / 2 + 1;
constexpr size_t kNumBands = 22;
-constexpr int kBandFrequencyBoundaries[kNumBands] = {
- 0, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 2000, 2400,
- 2800, 3200, 4000, 4800, 5600, 6800, 8000, 9600, 12000, 15600, 20000};
-
-// Feature extraction parameters.
constexpr size_t kNumLowerBands = 6;
static_assert((0 < kNumLowerBands) && (kNumLowerBands < kNumBands), "");
-constexpr size_t kSpectralCoeffsHistorySize = 8;
-static_assert(kSpectralCoeffsHistorySize > 2,
+constexpr size_t kCepstralCoeffsHistorySize = 8;
+static_assert(kCepstralCoeffsHistorySize > 2,
"The history size must at least be 3 to compute first and second "
"derivatives.");
diff --git a/modules/audio_processing/agc2/rnn_vad/features_extraction.cc b/modules/audio_processing/agc2/rnn_vad/features_extraction.cc
index 8f472a5..e935179 100644
--- a/modules/audio_processing/agc2/rnn_vad/features_extraction.cc
+++ b/modules/audio_processing/agc2/rnn_vad/features_extraction.cc
@@ -78,12 +78,12 @@
// and write the feature vector.
return spectral_features_extractor_.CheckSilenceComputeFeatures(
reference_frame_view_, {lagged_frame.data(), kFrameSize20ms24kHz},
- {{feature_vector.data() + kNumLowerBands, kNumBands - kNumLowerBands},
- {feature_vector.data(), kNumLowerBands},
- {feature_vector.data() + kNumBands, kNumLowerBands},
- {feature_vector.data() + kNumBands + kNumLowerBands, kNumLowerBands},
- {feature_vector.data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
- &feature_vector[kFeatureVectorSize - 1]});
+ {feature_vector.data() + kNumLowerBands, kNumBands - kNumLowerBands},
+ {feature_vector.data(), kNumLowerBands},
+ {feature_vector.data() + kNumBands, kNumLowerBands},
+ {feature_vector.data() + kNumBands + kNumLowerBands, kNumLowerBands},
+ {feature_vector.data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
+ &feature_vector[kFeatureVectorSize - 1]);
}
} // namespace rnn_vad
diff --git a/modules/audio_processing/agc2/rnn_vad/fft_util.cc b/modules/audio_processing/agc2/rnn_vad/fft_util.cc
index 4825e2b..4cc3ed9 100644
--- a/modules/audio_processing/agc2/rnn_vad/fft_util.cc
+++ b/modules/audio_processing/agc2/rnn_vad/fft_util.cc
@@ -35,16 +35,16 @@
} // namespace
-BandAnalysisFft::BandAnalysisFft()
+FftUtil::FftUtil()
: half_window_(ComputeHalfVorbisWindow()),
fft_(static_cast<int>(input_buf_.size())) {}
-BandAnalysisFft::~BandAnalysisFft() = default;
+FftUtil::~FftUtil() = default;
-void BandAnalysisFft::ForwardFft(rtc::ArrayView<const float> samples,
- rtc::ArrayView<std::complex<float>> dst) {
+void FftUtil::WindowedFft(rtc::ArrayView<const float> samples,
+ rtc::ArrayView<std::complex<float>> dst) {
RTC_DCHECK_EQ(samples.size(), kFrameSize20ms24kHz);
- RTC_DCHECK_EQ(dst.size(), kFrameSize20ms24kHz / 2 + 1);
+ RTC_DCHECK_EQ(dst.size(), kFftSizeBy2Plus1);
// Apply windowing.
RTC_DCHECK_EQ(input_buf_.size(), 2 * half_window_.size());
for (size_t i = 0; i < input_buf_.size() / 2; ++i) {
diff --git a/modules/audio_processing/agc2/rnn_vad/fft_util.h b/modules/audio_processing/agc2/rnn_vad/fft_util.h
index c744ff6..e38b0ff 100644
--- a/modules/audio_processing/agc2/rnn_vad/fft_util.h
+++ b/modules/audio_processing/agc2/rnn_vad/fft_util.h
@@ -21,32 +21,31 @@
namespace webrtc {
namespace rnn_vad {
-// TODO(alessiob): Switch to PFFFT using its own wrapper.
-// TODO(alessiob): Delete this class when switching to PFFFT.
+// TODO(alessiob): Switch to PFFFT and remove this class.
// FFT implementation wrapper for the band-wise analysis step in which 20 ms
// frames at 24 kHz are analyzed in the frequency domain. The goal of this class
// are (i) making easy to switch to another FFT implementation, (ii) own the
// input buffer for the FFT and (iii) apply a windowing function before
// computing the FFT.
-class BandAnalysisFft {
+class FftUtil {
public:
- BandAnalysisFft();
- BandAnalysisFft(const BandAnalysisFft&) = delete;
- BandAnalysisFft& operator=(const BandAnalysisFft&) = delete;
- ~BandAnalysisFft();
+ FftUtil();
+ FftUtil(const FftUtil&) = delete;
+ FftUtil& operator=(const FftUtil&) = delete;
+ ~FftUtil();
// Applies a windowing function to |samples|, computes the real forward FFT
// and writes the result in |dst|.
// The size of |samples| must be 480 (20 ms at 24 kHz).
// The size of |dst| must be 241 since the complex conjugate is not written.
- void ForwardFft(rtc::ArrayView<const float> samples,
- rtc::ArrayView<std::complex<float>> dst);
+ void WindowedFft(rtc::ArrayView<const float> samples,
+ rtc::ArrayView<std::complex<float>> dst);
private:
static_assert((kFrameSize20ms24kHz & 1) == 0,
"kFrameSize20ms24kHz must be even.");
const std::array<float, kFrameSize20ms24kHz / 2> half_window_;
- std::array<std::complex<float>, kFrameSize20ms24kHz> input_buf_{};
- std::array<std::complex<float>, kFrameSize20ms24kHz> output_buf_{};
+ std::array<std::complex<float>, kFrameSize20ms24kHz> input_buf_;
+ std::array<std::complex<float>, kFrameSize20ms24kHz> output_buf_;
rnnoise::KissFft fft_;
};
diff --git a/modules/audio_processing/agc2/rnn_vad/fft_util_unittest.cc b/modules/audio_processing/agc2/rnn_vad/fft_util_unittest.cc
index 28f56bd..a7efa1e 100644
--- a/modules/audio_processing/agc2/rnn_vad/fft_util_unittest.cc
+++ b/modules/audio_processing/agc2/rnn_vad/fft_util_unittest.cc
@@ -39,16 +39,16 @@
} // namespace
-TEST(RnnVadTest, BandAnalysisFftTest) {
+TEST(RnnVadTest, FftUtilTest) {
for (float frequency_hz : {200.f, 450.f, 1500.f}) {
SCOPED_TRACE(frequency_hz);
auto x = CreateSine(
/*amplitude=*/1000.f, frequency_hz,
/*duration_s=*/0.02f,
/*sample_rate_hz=*/kSampleRate24kHz);
- BandAnalysisFft analyzer;
+ FftUtil analyzer;
std::vector<std::complex<float>> x_fft(x.size() / 2 + 1);
- analyzer.ForwardFft(x, x_fft);
+ analyzer.WindowedFft(x, x_fft);
int peak_fft_bin_index = std::distance(
x_fft.begin(),
std::max_element(x_fft.begin(), x_fft.end(),
diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features.cc b/modules/audio_processing/agc2/rnn_vad/spectral_features.cc
index 84db2df..8235579 100644
--- a/modules/audio_processing/agc2/rnn_vad/spectral_features.cc
+++ b/modules/audio_processing/agc2/rnn_vad/spectral_features.cc
@@ -15,7 +15,6 @@
#include <limits>
#include <numeric>
-#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
#include "rtc_base/checks.h"
namespace webrtc {
@@ -24,21 +23,21 @@
constexpr float kSilenceThreshold = 0.04f;
-// Computes the new spectral difference stats and pushes them into the passed
+// Computes the new cepstral difference stats and pushes them into the passed
// symmetric matrix buffer.
-void UpdateSpectralDifferenceStats(
- rtc::ArrayView<const float, kNumBands> new_spectral_coeffs,
- const RingBuffer<float, kNumBands, kSpectralCoeffsHistorySize>& ring_buf,
- SymmetricMatrixBuffer<float, kSpectralCoeffsHistorySize>* sym_matrix_buf) {
+void UpdateCepstralDifferenceStats(
+ rtc::ArrayView<const float, kNumBands> new_cepstral_coeffs,
+ const RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>& ring_buf,
+ SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize>* sym_matrix_buf) {
RTC_DCHECK(sym_matrix_buf);
- // Compute the new spectral distance stats.
- std::array<float, kSpectralCoeffsHistorySize - 1> distances;
- for (size_t i = 0; i < kSpectralCoeffsHistorySize - 1; ++i) {
+ // Compute the new cepstral distance stats.
+ std::array<float, kCepstralCoeffsHistorySize - 1> distances;
+ for (size_t i = 0; i < kCepstralCoeffsHistorySize - 1; ++i) {
const size_t delay = i + 1;
- auto old_spectral_coeffs = ring_buf.GetArrayView(delay);
+ auto old_cepstral_coeffs = ring_buf.GetArrayView(delay);
distances[i] = 0.f;
for (size_t k = 0; k < kNumBands; ++k) {
- const float c = new_spectral_coeffs[k] - old_spectral_coeffs[k];
+ const float c = new_cepstral_coeffs[k] - old_cepstral_coeffs[k];
distances[i] += c * c;
}
}
@@ -48,96 +47,77 @@
} // namespace
-SpectralFeaturesView::SpectralFeaturesView(
- rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs,
- rtc::ArrayView<float, kNumLowerBands> average,
- rtc::ArrayView<float, kNumLowerBands> first_derivative,
- rtc::ArrayView<float, kNumLowerBands> second_derivative,
- rtc::ArrayView<float, kNumLowerBands> cross_correlations,
- float* variability)
- : coeffs(coeffs),
- average(average),
- first_derivative(first_derivative),
- second_derivative(second_derivative),
- cross_correlations(cross_correlations),
- variability(variability) {}
-
-SpectralFeaturesView::SpectralFeaturesView(const SpectralFeaturesView&) =
- default;
-SpectralFeaturesView::~SpectralFeaturesView() = default;
-
SpectralFeaturesExtractor::SpectralFeaturesExtractor()
: fft_(),
- reference_frame_fft_(kFrameSize20ms24kHz / 2 + 1),
- lagged_frame_fft_(kFrameSize20ms24kHz / 2 + 1),
- band_boundaries_(
- ComputeBandBoundaryIndexes(kSampleRate24kHz, kFrameSize20ms24kHz)),
+ reference_frame_fft_(kFftSizeBy2Plus1),
+ lagged_frame_fft_(kFftSizeBy2Plus1),
dct_table_(ComputeDctTable()) {}
SpectralFeaturesExtractor::~SpectralFeaturesExtractor() = default;
void SpectralFeaturesExtractor::Reset() {
- spectral_coeffs_ring_buf_.Reset();
- spectral_diffs_buf_.Reset();
+ cepstral_coeffs_ring_buf_.Reset();
+ cepstral_diffs_buf_.Reset();
}
bool SpectralFeaturesExtractor::CheckSilenceComputeFeatures(
rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
- SpectralFeaturesView spectral_features) {
- // Analyze reference frame.
- fft_.ForwardFft(reference_frame, reference_frame_fft_);
- ComputeBandEnergies(reference_frame_fft_, band_boundaries_,
- reference_frame_energy_coeffs_);
+ rtc::ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum,
+ rtc::ArrayView<float, kNumLowerBands> average,
+ rtc::ArrayView<float, kNumLowerBands> first_derivative,
+ rtc::ArrayView<float, kNumLowerBands> second_derivative,
+ rtc::ArrayView<float, kNumLowerBands> bands_cross_corr,
+ float* variability) {
+ // Compute the Opus band energies for the reference frame.
+ fft_.WindowedFft(reference_frame, reference_frame_fft_);
+ spectral_correlator_.ComputeAutoCorrelation(
+ {reference_frame_fft_.data(), kFftSizeBy2Plus1},
+ reference_frame_bands_energy_);
// Check if the reference frame has silence.
const float tot_energy =
- std::accumulate(reference_frame_energy_coeffs_.begin(),
- reference_frame_energy_coeffs_.end(), 0.f);
- if (tot_energy < kSilenceThreshold)
+ std::accumulate(reference_frame_bands_energy_.begin(),
+ reference_frame_bands_energy_.end(), 0.f);
+ if (tot_energy < kSilenceThreshold) {
return true;
- // Analyze lagged frame.
- fft_.ForwardFft(lagged_frame, lagged_frame_fft_);
- ComputeBandEnergies(lagged_frame_fft_, band_boundaries_,
- lagged_frame_energy_coeffs_);
+ }
+ // Compute the Opus band energies for the lagged frame.
+ fft_.WindowedFft(lagged_frame, lagged_frame_fft_);
+ spectral_correlator_.ComputeAutoCorrelation(
+ {lagged_frame_fft_.data(), kFftSizeBy2Plus1}, lagged_frame_bands_energy_);
// Log of the band energies for the reference frame.
- std::array<float, kNumBands> log_band_energy_coeffs;
- ComputeLogBandEnergiesCoefficients(reference_frame_energy_coeffs_,
- log_band_energy_coeffs);
- // Decorrelate band-wise log energy coefficients via DCT.
- std::array<float, kNumBands> log_band_energy_coeffs_decorrelated;
- ComputeDct(log_band_energy_coeffs, dct_table_,
- log_band_energy_coeffs_decorrelated);
- // Normalize (based on training set stats).
- log_band_energy_coeffs_decorrelated[0] -= 12;
- log_band_energy_coeffs_decorrelated[1] -= 4;
- // Update the ring buffer and the spectral difference stats.
- spectral_coeffs_ring_buf_.Push(log_band_energy_coeffs_decorrelated);
- UpdateSpectralDifferenceStats(log_band_energy_coeffs_decorrelated,
- spectral_coeffs_ring_buf_,
- &spectral_diffs_buf_);
- // Write the higher bands spectral coefficients.
- auto coeffs_src = spectral_coeffs_ring_buf_.GetArrayView(0);
- RTC_DCHECK_EQ(coeffs_src.size() - kNumLowerBands,
- spectral_features.coeffs.size());
- std::copy(coeffs_src.begin() + kNumLowerBands, coeffs_src.end(),
- spectral_features.coeffs.begin());
+ std::array<float, kNumBands> log_bands_energy;
+ ComputeSmoothedLogMagnitudeSpectrum(reference_frame_bands_energy_,
+ log_bands_energy);
+ // Reference frame cepstrum.
+ std::array<float, kNumBands> cepstrum;
+ ComputeDct(log_bands_energy, dct_table_, cepstrum);
+ // Ad-hoc correction terms for the first two cepstral coefficients.
+ cepstrum[0] -= 12.f;
+ cepstrum[1] -= 4.f;
+ // Update the ring buffer and the cepstral difference stats.
+ cepstral_coeffs_ring_buf_.Push(cepstrum);
+ UpdateCepstralDifferenceStats(cepstrum, cepstral_coeffs_ring_buf_,
+ &cepstral_diffs_buf_);
+ // Write the higher bands cepstral coefficients.
+ RTC_DCHECK_EQ(cepstrum.size() - kNumLowerBands, higher_bands_cepstrum.size());
+ std::copy(cepstrum.begin() + kNumLowerBands, cepstrum.end(),
+ higher_bands_cepstrum.begin());
// Compute and write remaining features.
- ComputeAvgAndDerivatives(spectral_features.average,
- spectral_features.first_derivative,
- spectral_features.second_derivative);
- ComputeCrossCorrelation(spectral_features.cross_correlations);
- RTC_DCHECK(spectral_features.variability);
- *(spectral_features.variability) = ComputeVariability();
+ ComputeAvgAndDerivatives(average, first_derivative, second_derivative);
+ ComputeNormalizedCepstralCorrelation(bands_cross_corr);
+ RTC_DCHECK(variability);
+ *variability = ComputeVariability();
return false;
}
void SpectralFeaturesExtractor::ComputeAvgAndDerivatives(
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
- rtc::ArrayView<float, kNumLowerBands> second_derivative) {
- auto curr = spectral_coeffs_ring_buf_.GetArrayView(0);
- auto prev1 = spectral_coeffs_ring_buf_.GetArrayView(1);
- auto prev2 = spectral_coeffs_ring_buf_.GetArrayView(2);
+ rtc::ArrayView<float, kNumLowerBands> second_derivative) const {
+ auto curr = cepstral_coeffs_ring_buf_.GetArrayView(0);
+ auto prev1 = cepstral_coeffs_ring_buf_.GetArrayView(1);
+ auto prev2 = cepstral_coeffs_ring_buf_.GetArrayView(2);
RTC_DCHECK_EQ(average.size(), first_derivative.size());
RTC_DCHECK_EQ(first_derivative.size(), second_derivative.size());
RTC_DCHECK_LE(average.size(), curr.size());
@@ -151,47 +131,41 @@
}
}
-void SpectralFeaturesExtractor::ComputeCrossCorrelation(
- rtc::ArrayView<float, kNumLowerBands> cross_correlations) {
- const auto& x = reference_frame_fft_;
- const auto& y = lagged_frame_fft_;
- auto cross_corr = [x, y](const size_t freq_bin_index) -> float {
- return (x[freq_bin_index].real() * y[freq_bin_index].real() +
- x[freq_bin_index].imag() * y[freq_bin_index].imag());
- };
- std::array<float, kNumBands> cross_corr_coeffs;
- constexpr size_t kNumFftPoints = kFrameSize20ms24kHz / 2 + 1;
- ComputeBandCoefficients(cross_corr, band_boundaries_, kNumFftPoints - 1,
- cross_corr_coeffs);
+void SpectralFeaturesExtractor::ComputeNormalizedCepstralCorrelation(
+ rtc::ArrayView<float, kNumLowerBands> bands_cross_corr) {
+ spectral_correlator_.ComputeCrossCorrelation(
+ {reference_frame_fft_.data(), kFftSizeBy2Plus1},
+ {lagged_frame_fft_.data(), kFftSizeBy2Plus1}, bands_cross_corr_);
// Normalize.
- for (size_t i = 0; i < cross_corr_coeffs.size(); ++i) {
- cross_corr_coeffs[i] =
- cross_corr_coeffs[i] /
- std::sqrt(0.001f + reference_frame_energy_coeffs_[i] *
- lagged_frame_energy_coeffs_[i]);
+ for (size_t i = 0; i < bands_cross_corr_.size(); ++i) {
+ bands_cross_corr_[i] =
+ bands_cross_corr_[i] /
+ std::sqrt(0.001f + reference_frame_bands_energy_[i] *
+ lagged_frame_bands_energy_[i]);
}
- // Decorrelate.
- ComputeDct(cross_corr_coeffs, dct_table_, cross_correlations);
- // Normalize (based on training set stats).
- cross_correlations[0] -= 1.3f;
- cross_correlations[1] -= 0.9f;
+ // Cepstrum.
+ ComputeDct(bands_cross_corr_, dct_table_, bands_cross_corr);
+ // Ad-hoc correction terms for the first two cepstral coefficients.
+ bands_cross_corr[0] -= 1.3f;
+ bands_cross_corr[1] -= 0.9f;
}
-float SpectralFeaturesExtractor::ComputeVariability() {
- // Compute spectral variability score.
- float spec_variability = 0.f;
- for (size_t delay1 = 0; delay1 < kSpectralCoeffsHistorySize; ++delay1) {
+float SpectralFeaturesExtractor::ComputeVariability() const {
+ // Compute cepstral variability score.
+ float variability = 0.f;
+ for (size_t delay1 = 0; delay1 < kCepstralCoeffsHistorySize; ++delay1) {
float min_dist = std::numeric_limits<float>::max();
- for (size_t delay2 = 0; delay2 < kSpectralCoeffsHistorySize; ++delay2) {
+ for (size_t delay2 = 0; delay2 < kCepstralCoeffsHistorySize; ++delay2) {
if (delay1 == delay2) // The distance would be 0.
continue;
min_dist =
- std::min(min_dist, spectral_diffs_buf_.GetValue(delay1, delay2));
+ std::min(min_dist, cepstral_diffs_buf_.GetValue(delay1, delay2));
}
- spec_variability += min_dist;
+ variability += min_dist;
}
// Normalize (based on training set stats).
- return spec_variability / kSpectralCoeffsHistorySize - 2.1f;
+ // TODO(bugs.webrtc.org/10480): Isolate normalization from feature extraction.
+ return variability / kCepstralCoeffsHistorySize - 2.1f;
}
} // namespace rnn_vad
diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features.h b/modules/audio_processing/agc2/rnn_vad/spectral_features.h
index 5c33dcd..047af24 100644
--- a/modules/audio_processing/agc2/rnn_vad/spectral_features.h
+++ b/modules/audio_processing/agc2/rnn_vad/spectral_features.h
@@ -20,34 +20,12 @@
#include "modules/audio_processing/agc2/rnn_vad/common.h"
#include "modules/audio_processing/agc2/rnn_vad/fft_util.h"
#include "modules/audio_processing/agc2/rnn_vad/ring_buffer.h"
+#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
#include "modules/audio_processing/agc2/rnn_vad/symmetric_matrix_buffer.h"
namespace webrtc {
namespace rnn_vad {
-// View on spectral features.
-class SpectralFeaturesView {
- public:
- SpectralFeaturesView(rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs,
- rtc::ArrayView<float, kNumLowerBands> average,
- rtc::ArrayView<float, kNumLowerBands> first_derivative,
- rtc::ArrayView<float, kNumLowerBands> second_derivative,
- rtc::ArrayView<float, kNumLowerBands> cross_correlations,
- float* variability);
- SpectralFeaturesView(const SpectralFeaturesView&);
- ~SpectralFeaturesView();
- // Higher bands spectral coefficients.
- const rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs;
- // Average and first and second derivative over time for the lower bands.
- const rtc::ArrayView<float, kNumLowerBands> average;
- const rtc::ArrayView<float, kNumLowerBands> first_derivative;
- const rtc::ArrayView<float, kNumLowerBands> second_derivative;
- // Spectral cross-correlation for the lower bands.
- const rtc::ArrayView<float, kNumLowerBands> cross_correlations;
- // Spectral variability score.
- float* const variability;
-};
-
// Class to compute spectral features.
class SpectralFeaturesExtractor {
public:
@@ -64,27 +42,33 @@
bool CheckSilenceComputeFeatures(
rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
- SpectralFeaturesView spectral_features);
+ rtc::ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum,
+ rtc::ArrayView<float, kNumLowerBands> average,
+ rtc::ArrayView<float, kNumLowerBands> first_derivative,
+ rtc::ArrayView<float, kNumLowerBands> second_derivative,
+ rtc::ArrayView<float, kNumLowerBands> bands_cross_corr,
+ float* variability);
private:
void ComputeAvgAndDerivatives(
rtc::ArrayView<float, kNumLowerBands> average,
rtc::ArrayView<float, kNumLowerBands> first_derivative,
- rtc::ArrayView<float, kNumLowerBands> second_derivative);
- void ComputeCrossCorrelation(
- rtc::ArrayView<float, kNumLowerBands> cross_correlations);
- float ComputeVariability();
+ rtc::ArrayView<float, kNumLowerBands> second_derivative) const;
+ void ComputeNormalizedCepstralCorrelation(
+ rtc::ArrayView<float, kNumLowerBands> bands_cross_corr);
+ float ComputeVariability() const;
- BandAnalysisFft fft_;
+ FftUtil fft_;
std::vector<std::complex<float>> reference_frame_fft_;
std::vector<std::complex<float>> lagged_frame_fft_;
- std::array<float, kNumBands> reference_frame_energy_coeffs_{};
- std::array<float, kNumBands> lagged_frame_energy_coeffs_{};
- const std::array<size_t, kNumBands> band_boundaries_;
+ SpectralCorrelator spectral_correlator_;
+ std::array<float, kOpusBands24kHz> reference_frame_bands_energy_;
+ std::array<float, kOpusBands24kHz> lagged_frame_bands_energy_;
+ std::array<float, kOpusBands24kHz> bands_cross_corr_;
const std::array<float, kNumBands * kNumBands> dct_table_;
- RingBuffer<float, kNumBands, kSpectralCoeffsHistorySize>
- spectral_coeffs_ring_buf_;
- SymmetricMatrixBuffer<float, kSpectralCoeffsHistorySize> spectral_diffs_buf_;
+ RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>
+ cepstral_coeffs_ring_buf_;
+ SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize> cepstral_diffs_buf_;
};
} // namespace rnn_vad
diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.cc b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.cc
index 74211fe..8135e3c 100644
--- a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.cc
+++ b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.cc
@@ -20,85 +20,126 @@
namespace rnn_vad {
namespace {
-// DCT scaling factor.
-static_assert(
- kNumBands == 22,
- "kNumBands changed! Please update the value of kDctScalingFactor");
-constexpr float kDctScalingFactor = 0.301511345f; // sqrt(2 / kNumBands)
+// Weights for each FFT coefficient for each Opus band (Nyquist frequency
+// excluded). The size of each band is specified in
+// |kOpusScaleNumBins24kHz20ms|.
+constexpr std::array<float, kFrameSize20ms24kHz / 2> kOpusBandWeights24kHz20ms =
+ {{
+ 0.f, 0.25f, 0.5f, 0.75f, // Band 0
+ 0.f, 0.25f, 0.5f, 0.75f, // Band 1
+ 0.f, 0.25f, 0.5f, 0.75f, // Band 2
+ 0.f, 0.25f, 0.5f, 0.75f, // Band 3
+ 0.f, 0.25f, 0.5f, 0.75f, // Band 4
+ 0.f, 0.25f, 0.5f, 0.75f, // Band 5
+ 0.f, 0.25f, 0.5f, 0.75f, // Band 6
+ 0.f, 0.25f, 0.5f, 0.75f, // Band 7
+ 0.f, 0.125f, 0.25f, 0.375f, 0.5f,
+ 0.625f, 0.75f, 0.875f, // Band 8
+ 0.f, 0.125f, 0.25f, 0.375f, 0.5f,
+ 0.625f, 0.75f, 0.875f, // Band 9
+ 0.f, 0.125f, 0.25f, 0.375f, 0.5f,
+ 0.625f, 0.75f, 0.875f, // Band 10
+ 0.f, 0.125f, 0.25f, 0.375f, 0.5f,
+ 0.625f, 0.75f, 0.875f, // Band 11
+ 0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
+ 0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
+ 0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
+ 0.9375f, // Band 12
+ 0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
+ 0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
+ 0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
+ 0.9375f, // Band 13
+ 0.f, 0.0625f, 0.125f, 0.1875f, 0.25f,
+ 0.3125f, 0.375f, 0.4375f, 0.5f, 0.5625f,
+ 0.625f, 0.6875f, 0.75f, 0.8125f, 0.875f,
+ 0.9375f, // Band 14
+ 0.f, 0.0416667f, 0.0833333f, 0.125f, 0.166667f,
+ 0.208333f, 0.25f, 0.291667f, 0.333333f, 0.375f,
+ 0.416667f, 0.458333f, 0.5f, 0.541667f, 0.583333f,
+ 0.625f, 0.666667f, 0.708333f, 0.75f, 0.791667f,
+ 0.833333f, 0.875f, 0.916667f, 0.958333f, // Band 15
+ 0.f, 0.0416667f, 0.0833333f, 0.125f, 0.166667f,
+ 0.208333f, 0.25f, 0.291667f, 0.333333f, 0.375f,
+ 0.416667f, 0.458333f, 0.5f, 0.541667f, 0.583333f,
+ 0.625f, 0.666667f, 0.708333f, 0.75f, 0.791667f,
+ 0.833333f, 0.875f, 0.916667f, 0.958333f, // Band 16
+ 0.f, 0.03125f, 0.0625f, 0.09375f, 0.125f,
+ 0.15625f, 0.1875f, 0.21875f, 0.25f, 0.28125f,
+ 0.3125f, 0.34375f, 0.375f, 0.40625f, 0.4375f,
+ 0.46875f, 0.5f, 0.53125f, 0.5625f, 0.59375f,
+ 0.625f, 0.65625f, 0.6875f, 0.71875f, 0.75f,
+ 0.78125f, 0.8125f, 0.84375f, 0.875f, 0.90625f,
+ 0.9375f, 0.96875f, // Band 17
+ 0.f, 0.0208333f, 0.0416667f, 0.0625f, 0.0833333f,
+ 0.104167f, 0.125f, 0.145833f, 0.166667f, 0.1875f,
+ 0.208333f, 0.229167f, 0.25f, 0.270833f, 0.291667f,
+ 0.3125f, 0.333333f, 0.354167f, 0.375f, 0.395833f,
+ 0.416667f, 0.4375f, 0.458333f, 0.479167f, 0.5f,
+ 0.520833f, 0.541667f, 0.5625f, 0.583333f, 0.604167f,
+ 0.625f, 0.645833f, 0.666667f, 0.6875f, 0.708333f,
+ 0.729167f, 0.75f, 0.770833f, 0.791667f, 0.8125f,
+ 0.833333f, 0.854167f, 0.875f, 0.895833f, 0.916667f,
+ 0.9375f, 0.958333f, 0.979167f // Band 18
+ }};
} // namespace
-std::array<size_t, kNumBands> ComputeBandBoundaryIndexes(
- size_t sample_rate_hz,
- size_t frame_size_samples) {
- std::array<size_t, kNumBands> indexes;
- for (size_t i = 0; i < kNumBands; ++i) {
- indexes[i] =
- kBandFrequencyBoundaries[i] * frame_size_samples / sample_rate_hz;
- }
- return indexes;
+SpectralCorrelator::SpectralCorrelator()
+ : weights_(kOpusBandWeights24kHz20ms.begin(),
+ kOpusBandWeights24kHz20ms.end()) {}
+
+SpectralCorrelator::~SpectralCorrelator() = default;
+
+void SpectralCorrelator::ComputeAutoCorrelation(
+ rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
+ rtc::ArrayView<float, kOpusBands24kHz> auto_corr) const {
+ ComputeCrossCorrelation(x, x, auto_corr);
}
-void ComputeBandCoefficients(
- rtc::FunctionView<float(size_t)> functor,
- rtc::ArrayView<const size_t, kNumBands> band_boundaries,
- size_t max_freq_bin_index,
- rtc::ArrayView<float, kNumBands> coefficients) {
- std::fill(coefficients.begin(), coefficients.end(), 0.f);
- for (size_t i = 0; i < coefficients.size() - 1; ++i) {
- RTC_DCHECK_EQ(0.f, coefficients[i + 1]);
- RTC_DCHECK_GT(band_boundaries[i + 1], band_boundaries[i]);
- const size_t first_freq_bin = band_boundaries[i];
- const size_t last_freq_bin =
- std::min(max_freq_bin_index, first_freq_bin + band_boundaries[i + 1] -
- band_boundaries[i] - 1);
- // Depending on the sample rate, the highest bands can have no FFT
- // coefficients. Stop the iteration when coming across the first empty band.
- if (first_freq_bin >= last_freq_bin)
- break;
- const size_t band_size = last_freq_bin - first_freq_bin + 1;
- // Compute the band coefficient using a triangular band with peak response
- // at the band boundary.
- for (size_t j = first_freq_bin; j <= last_freq_bin; ++j) {
- const float w = static_cast<float>(j - first_freq_bin) / band_size;
- const float coefficient = functor(j);
- coefficients[i] += (1.f - w) * coefficient;
- coefficients[i + 1] += w * coefficient;
+void SpectralCorrelator::ComputeCrossCorrelation(
+ rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
+ rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> y,
+ rtc::ArrayView<float, kOpusBands24kHz> cross_corr) const {
+ constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms();
+ size_t k = 0; // Next Fourier coefficient index.
+ cross_corr[0] = 0.f;
+ for (size_t i = 0; i < kOpusBands24kHz - 1; ++i) {
+ cross_corr[i + 1] = 0.f;
+ for (int j = 0; j < kOpusScaleNumBins24kHz20ms[i]; ++j) { // Band size.
+ const float v = x[k].real() * y[k].real() + x[k].imag() * y[k].imag();
+ const float tmp = weights_[k] * v;
+ cross_corr[i] += v - tmp;
+ cross_corr[i + 1] += tmp;
+ k++;
}
}
- // The first and the last bands in the loop above only got half contribution.
- coefficients[0] *= 2.f;
- coefficients[coefficients.size() - 1] *= 2.f;
- // TODO(bugs.webrtc.org/9076): Replace the line above with
- // "coefficients[i] *= 2.f" (*) since we now assume that the last band is
- // always |kNumBands| - 1.
- // (*): "size_t i" must be declared before the main loop.
+ cross_corr[0] *= 2.f; // The first band only gets half contribution.
+ // The Nyquist coefficient is never used.
+ RTC_DCHECK_EQ(k, kFftSizeBy2Plus1 - 1);
}
-void ComputeBandEnergies(
- rtc::ArrayView<const std::complex<float>> fft_coeffs,
- rtc::ArrayView<const size_t, kNumBands> band_boundaries,
- rtc::ArrayView<float, kNumBands> band_energies) {
- RTC_DCHECK_EQ(band_boundaries.size(), band_energies.size());
- auto functor = [fft_coeffs](const size_t freq_bin_index) -> float {
- return std::norm(fft_coeffs[freq_bin_index]);
+void ComputeSmoothedLogMagnitudeSpectrum(
+ rtc::ArrayView<const float> bands_energy,
+ rtc::ArrayView<float, kNumBands> log_bands_energy) {
+ RTC_DCHECK_LE(bands_energy.size(), kNumBands);
+ constexpr float kOneByHundred = 1e-2f;
+ constexpr float kLogOneByHundred = -2.f;
+ // Init.
+ float log_max = kLogOneByHundred;
+ float follow = kLogOneByHundred;
+ const auto smooth = [&log_max, &follow](float x) {
+ x = std::max(log_max - 7.f, std::max(follow - 1.5f, x));
+ log_max = std::max(log_max, x);
+ follow = std::max(follow - 1.5f, x);
+ return x;
};
- ComputeBandCoefficients(functor, band_boundaries, fft_coeffs.size() - 1,
- band_energies);
-}
-
-void ComputeLogBandEnergiesCoefficients(
- rtc::ArrayView<const float, kNumBands> band_energy_coeffs,
- rtc::ArrayView<float, kNumBands> log_band_energy_coeffs) {
- float log_max = -2.f;
- float follow = -2.f;
- for (size_t i = 0; i < band_energy_coeffs.size(); ++i) {
- log_band_energy_coeffs[i] = std::log10(1e-2f + band_energy_coeffs[i]);
- // Smoothing across frequency bands.
- log_band_energy_coeffs[i] = std::max(
- log_max - 7.f, std::max(follow - 1.5f, log_band_energy_coeffs[i]));
- log_max = std::max(log_max, log_band_energy_coeffs[i]);
- follow = std::max(follow - 1.5f, log_band_energy_coeffs[i]);
+ // Smoothing over the bands for which the band energy is defined.
+ for (size_t i = 0; i < bands_energy.size(); ++i) {
+ log_bands_energy[i] = smooth(std::log10(kOneByHundred + bands_energy[i]));
+ }
+ // Smoothing over the remaining bands (zero energy).
+ for (size_t i = bands_energy.size(); i < kNumBands; ++i) {
+ log_bands_energy[i] = smooth(kLogOneByHundred);
}
}
@@ -113,17 +154,28 @@
return dct_table;
}
-void ComputeDct(rtc::ArrayView<const float, kNumBands> in,
+void ComputeDct(rtc::ArrayView<const float> in,
rtc::ArrayView<const float, kNumBands * kNumBands> dct_table,
rtc::ArrayView<float> out) {
+ // DCT scaling factor - i.e., sqrt(2 / kNumBands).
+ constexpr float kDctScalingFactor = 0.301511345f;
+ constexpr float kDctScalingFactorError =
+ kDctScalingFactor * kDctScalingFactor -
+ 2.f / static_cast<float>(kNumBands);
+ static_assert(
+ (kDctScalingFactorError >= 0.f && kDctScalingFactorError < 1e-1f) ||
+ (kDctScalingFactorError < 0.f && kDctScalingFactorError > -1e-1f),
+ "kNumBands changed and kDctScalingFactor has not been updated.");
RTC_DCHECK_NE(in.data(), out.data()) << "In-place DCT is not supported.";
+ RTC_DCHECK_LE(in.size(), kNumBands);
RTC_DCHECK_LE(1, out.size());
RTC_DCHECK_LE(out.size(), in.size());
- std::fill(out.begin(), out.end(), 0.f);
for (size_t i = 0; i < out.size(); ++i) {
+ out[i] = 0.f;
for (size_t j = 0; j < in.size(); ++j) {
- out[i] += in[j] * dct_table[j * in.size() + i];
+ out[i] += in[j] * dct_table[j * kNumBands + i];
}
+ // TODO(bugs.webrtc.org/10480): Scaling factor in the DCT table.
out[i] *= kDctScalingFactor;
}
}
diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h
index 14ff560..0ec9652 100644
--- a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h
+++ b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h
@@ -14,49 +14,75 @@
#include <stddef.h>
#include <array>
#include <complex>
+#include <vector>
#include "api/array_view.h"
-#include "api/function_view.h"
#include "modules/audio_processing/agc2/rnn_vad/common.h"
namespace webrtc {
namespace rnn_vad {
-// Computes FFT boundary indexes corresponding to sub-bands.
-std::array<size_t, kNumBands> ComputeBandBoundaryIndexes(
- size_t sample_rate_hz,
- size_t frame_size_samples);
+// At a sample rate of 24 kHz, the last 3 Opus bands are beyond the Nyquist
+// frequency. However, band #19 gets the contributions from band #18 because
+// of the symmetric triangular filter with peak response at 12 kHz.
+constexpr size_t kOpusBands24kHz = 20;
+static_assert(kOpusBands24kHz < kNumBands,
+ "The number of bands at 24 kHz must be less than those defined "
+ "in the Opus scale at 48 kHz.");
-// Iterates through frequency bands and computes coefficients via |functor| for
-// triangular bands with peak response at each band boundary. |functor| returns
-// a floating point value for the FFT coefficient having index equal to the
-// argument passed to |functor|; that argument is in the range {0, ...
-// |max_freq_bin_index| - 1}.
-void ComputeBandCoefficients(
- rtc::FunctionView<float(size_t)> functor,
- rtc::ArrayView<const size_t, kNumBands> band_boundaries,
- const size_t max_freq_bin_index,
- rtc::ArrayView<float, kNumBands> coefficients);
+// Number of FFT frequency bins covered by each band in the Opus scale at a
+// sample rate of 24 kHz for 20 ms frames.
+// Declared here for unit testing.
+constexpr std::array<int, kOpusBands24kHz - 1> GetOpusScaleNumBins24kHz20ms() {
+ return {4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 24, 24, 32, 48};
+}
-// Given an array of FFT coefficients and a vector of band boundary indexes,
-// computes band energy coefficients.
-void ComputeBandEnergies(
- rtc::ArrayView<const std::complex<float>> fft_coeffs,
- rtc::ArrayView<const size_t, kNumBands> band_boundaries,
- rtc::ArrayView<float, kNumBands> band_energies);
+// TODO(bugs.webrtc.org/10480): Move to a separate file.
+// Class to compute band-wise spectral features in the Opus perceptual scale
+// for 20 ms frames sampled at 24 kHz. The analysis methods apply triangular
+// filters with peak response at each band boundary.
+class SpectralCorrelator {
+ public:
+ // Ctor.
+ SpectralCorrelator();
+ SpectralCorrelator(const SpectralCorrelator&) = delete;
+ SpectralCorrelator& operator=(const SpectralCorrelator&) = delete;
+ ~SpectralCorrelator();
-// Computes log band energy coefficients.
-void ComputeLogBandEnergiesCoefficients(
- rtc::ArrayView<const float, kNumBands> band_energy_coeffs,
- rtc::ArrayView<float, kNumBands> log_band_energy_coeffs);
+ // Computes the band-wise spectral auto-correlations.
+ void ComputeAutoCorrelation(
+ rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
+ rtc::ArrayView<float, kOpusBands24kHz> auto_corr) const;
-// Creates a DCT table for arrays having size equal to |kNumBands|.
+ // Computes the band-wise spectral cross-correlations.
+ void ComputeCrossCorrelation(
+ rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> x,
+ rtc::ArrayView<const std::complex<float>, kFftSizeBy2Plus1> y,
+ rtc::ArrayView<float, kOpusBands24kHz> cross_corr) const;
+
+ private:
+ const std::vector<float> weights_; // Weights for each Fourier coefficient.
+};
+
+// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
+// spectral_features.cc. Given a vector of Opus-bands energy coefficients,
+// computes the log magnitude spectrum applying smoothing both over time and
+// over frequency. Declared here for unit testing.
+void ComputeSmoothedLogMagnitudeSpectrum(
+ rtc::ArrayView<const float> bands_energy,
+ rtc::ArrayView<float, kNumBands> log_bands_energy);
+
+// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
+// spectral_features.cc. Creates a DCT table for arrays having size equal to
+// |kNumBands|. Declared here for unit testing.
std::array<float, kNumBands * kNumBands> ComputeDctTable();
-// Computes DCT for |in| given a pre-computed DCT table. In-place computation is
-// not allowed and |out| can be smaller than |in| in order to only compute the
-// first DCT coefficients.
-void ComputeDct(rtc::ArrayView<const float, kNumBands> in,
+// TODO(bugs.webrtc.org/10480): Move to anonymous namespace in
+// spectral_features.cc. Computes DCT for |in| given a pre-computed DCT table.
+// In-place computation is not allowed and |out| can be smaller than |in| in
+// order to only compute the first DCT coefficients. Declared here for unit
+// testing.
+void ComputeDct(rtc::ArrayView<const float> in,
rtc::ArrayView<const float, kNumBands * kNumBands> dct_table,
rtc::ArrayView<float> out);
diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal_unittest.cc b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal_unittest.cc
index 5e769bf..4ff7118 100644
--- a/modules/audio_processing/agc2/rnn_vad/spectral_features_internal_unittest.cc
+++ b/modules/audio_processing/agc2/rnn_vad/spectral_features_internal_unittest.cc
@@ -10,6 +10,13 @@
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
+#include <algorithm>
+#include <array>
+#include <complex>
+#include <numeric>
+#include <vector>
+
+#include "api/array_view.h"
#include "modules/audio_processing/agc2/rnn_vad/test_utils.h"
// TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
// #include "test/fpe_observer.h"
@@ -20,58 +27,76 @@
namespace test {
namespace {
-constexpr size_t kSampleRate48kHz = 48000;
-constexpr size_t kFrameSize20ms48kHz = 2 * kSampleRate48kHz / 100;
-constexpr size_t kFftNumCoeffs20ms48kHz = kFrameSize20ms48kHz / 2 + 1;
+// Generates the values for the array named |kOpusBandWeights24kHz20ms| in the
+// anonymous namespace of the .cc file, which is the array of FFT coefficient
+// weights for the Opus scale triangular filters.
+std::vector<float> ComputeTriangularFiltersWeights() {
+ constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms();
+ const auto& v = kOpusScaleNumBins24kHz20ms; // Alias.
+ const size_t num_weights = std::accumulate(
+ kOpusScaleNumBins24kHz20ms.begin(), kOpusScaleNumBins24kHz20ms.end(), 0);
+ std::vector<float> weights(num_weights);
+ size_t next_fft_coeff_index = 0;
+ for (size_t band = 0; band < v.size(); ++band) {
+ const size_t band_size = v[band];
+ for (size_t j = 0; j < band_size; ++j) {
+ weights[next_fft_coeff_index + j] = static_cast<float>(j) / band_size;
+ }
+ next_fft_coeff_index += band_size;
+ }
+ return weights;
+}
} // namespace
-// TODO(bugs.webrtc.org/9076): Remove this test before closing the issue.
-// Check that when using precomputed FFT coefficients for frames at 48 kHz, the
-// output of ComputeBandEnergies() is bit exact.
-TEST(RnnVadTest, ComputeBandEnergies48kHzBitExactness) {
- // Initialize input data reader and buffers.
- auto fft_coeffs_reader = CreateFftCoeffsReader();
- const size_t num_frames = fft_coeffs_reader.second;
- ASSERT_EQ(
- kFftNumCoeffs20ms48kHz,
- rtc::CheckedDivExact(fft_coeffs_reader.first->data_length(), num_frames) /
- 2);
- std::array<float, kFftNumCoeffs20ms48kHz> fft_coeffs_real;
- std::array<float, kFftNumCoeffs20ms48kHz> fft_coeffs_imag;
- std::array<std::complex<float>, kFftNumCoeffs20ms48kHz> fft_coeffs;
- // Init expected output reader and buffer.
- auto band_energies_reader = CreateBandEnergyCoeffsReader();
- ASSERT_EQ(num_frames, band_energies_reader.second);
- std::array<float, kNumBands> expected_band_energies;
- // Init band energies coefficients computation.
- const auto band_boundary_indexes =
- ComputeBandBoundaryIndexes(kSampleRate48kHz, kFrameSize20ms48kHz);
- std::array<float, kNumBands> computed_band_energies;
-
- // Check output for every frame.
- {
- // TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
- // FloatingPointExceptionObserver fpe_observer;
- for (size_t i = 0; i < num_frames; ++i) {
- SCOPED_TRACE(i);
- // Read input.
- fft_coeffs_reader.first->ReadChunk(fft_coeffs_real);
- fft_coeffs_reader.first->ReadChunk(fft_coeffs_imag);
- for (size_t i = 0; i < kFftNumCoeffs20ms48kHz; ++i) {
- fft_coeffs[i].real(fft_coeffs_real[i]);
- fft_coeffs[i].imag(fft_coeffs_imag[i]);
- }
- band_energies_reader.first->ReadChunk(expected_band_energies);
- // Compute band energy coefficients and check output.
- ComputeBandEnergies(fft_coeffs, band_boundary_indexes,
- computed_band_energies);
- ExpectEqualFloatArray(expected_band_energies, computed_band_energies);
- }
+// Checks that the values returned by GetOpusScaleNumBins24kHz20ms() match the
+// Opus scale frequency boundaries.
+TEST(RnnVadTest, TestOpusScaleBoundaries) {
+ constexpr int kBandFrequencyBoundariesHz[kNumBands - 1] = {
+ 200, 400, 600, 800, 1000, 1200, 1400, 1600, 2000, 2400, 2800,
+ 3200, 4000, 4800, 5600, 6800, 8000, 9600, 12000, 15600, 20000};
+ constexpr auto kOpusScaleNumBins24kHz20ms = GetOpusScaleNumBins24kHz20ms();
+ int prev = 0;
+ for (size_t i = 0; i < kOpusScaleNumBins24kHz20ms.size(); ++i) {
+ int boundary =
+ kBandFrequencyBoundariesHz[i] * kFrameSize20ms24kHz / kSampleRate24kHz;
+ EXPECT_EQ(kOpusScaleNumBins24kHz20ms[i], boundary - prev);
+ prev = boundary;
}
}
-TEST(RnnVadTest, ComputeLogBandEnergiesCoefficientsBitExactness) {
+// Checks that the computed triangular filters weights for the Opus scale are
+// monotonic within each Opus band. This test should only be enabled when
+// ComputeTriangularFiltersWeights() is changed and |kOpusBandWeights24kHz20ms|
+// is updated accordingly.
+TEST(RnnVadTest, DISABLED_TestOpusScaleWeights) {
+ auto weights = ComputeTriangularFiltersWeights();
+ size_t i = 0;
+ for (size_t band_size : GetOpusScaleNumBins24kHz20ms()) {
+ SCOPED_TRACE(band_size);
+ rtc::ArrayView<float> band_weights(weights.data() + i, band_size);
+ float prev = -1.f;
+ for (float weight : band_weights) {
+ EXPECT_LT(prev, weight);
+ prev = weight;
+ }
+ i += band_size;
+ }
+}
+
+TEST(RnnVadTest, SpectralCorrelatorValidOutput) {
+ SpectralCorrelator e;
+ std::array<std::complex<float>, kFftSizeBy2Plus1> in;
+ std::array<float, kOpusBands24kHz> out;
+ in.fill({1.f, 1.f});
+ e.ComputeAutoCorrelation(in, out);
+ for (size_t i = 0; i < kOpusBands24kHz; ++i) {
+ SCOPED_TRACE(i);
+ EXPECT_GT(out[i], 0.f);
+ }
+}
+
+TEST(RnnVadTest, ComputeSmoothedLogMagnitudeSpectrumWithinTolerance) {
constexpr std::array<float, kNumBands> input = {
{86.060539245605f, 275.668334960938f, 43.406528472900f, 6.541896820068f,
17.964015960693f, 8.090919494629f, 1.261920094490f, 1.212702631950f,
@@ -90,7 +115,7 @@
{
// TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
// FloatingPointExceptionObserver fpe_observer;
- ComputeLogBandEnergiesCoefficients(input, computed_output);
+ ComputeSmoothedLogMagnitudeSpectrum(input, computed_output);
ExpectNearAbsolute(expected_output, computed_output, 1e-5f);
}
}
diff --git a/modules/audio_processing/agc2/rnn_vad/spectral_features_unittest.cc b/modules/audio_processing/agc2/rnn_vad/spectral_features_unittest.cc
index 557e41e..39b9f93 100644
--- a/modules/audio_processing/agc2/rnn_vad/spectral_features_unittest.cc
+++ b/modules/audio_processing/agc2/rnn_vad/spectral_features_unittest.cc
@@ -32,15 +32,35 @@
}
}
-SpectralFeaturesView GetSpectralFeaturesView(
+rtc::ArrayView<float, kNumBands - kNumLowerBands> GetHigherBandsSpectrum(
std::array<float, kTestFeatureVectorSize>* feature_vector) {
- return {
- {feature_vector->data() + kNumLowerBands, kNumBands - kNumLowerBands},
- {feature_vector->data(), kNumLowerBands},
- {feature_vector->data() + kNumBands, kNumLowerBands},
- {feature_vector->data() + kNumBands + kNumLowerBands, kNumLowerBands},
- {feature_vector->data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
- &(*feature_vector)[kNumBands + 3 * kNumLowerBands]};
+ return {feature_vector->data() + kNumLowerBands, kNumBands - kNumLowerBands};
+}
+
+rtc::ArrayView<float, kNumLowerBands> GetAverage(
+ std::array<float, kTestFeatureVectorSize>* feature_vector) {
+ return {feature_vector->data(), kNumLowerBands};
+}
+
+rtc::ArrayView<float, kNumLowerBands> GetFirstDerivative(
+ std::array<float, kTestFeatureVectorSize>* feature_vector) {
+ return {feature_vector->data() + kNumBands, kNumLowerBands};
+}
+
+rtc::ArrayView<float, kNumLowerBands> GetSecondDerivative(
+ std::array<float, kTestFeatureVectorSize>* feature_vector) {
+ return {feature_vector->data() + kNumBands + kNumLowerBands, kNumLowerBands};
+}
+
+rtc::ArrayView<float, kNumLowerBands> GetCepstralCrossCorrelation(
+ std::array<float, kTestFeatureVectorSize>* feature_vector) {
+ return {feature_vector->data() + kNumBands + 2 * kNumLowerBands,
+ kNumLowerBands};
+}
+
+float* GetCepstralVariability(
+ std::array<float, kTestFeatureVectorSize>* feature_vector) {
+ return feature_vector->data() + kNumBands + 3 * kNumLowerBands;
}
constexpr float kInitialFeatureVal = -9999.f;
@@ -54,7 +74,6 @@
rtc::ArrayView<float, kFrameSize20ms24kHz> samples_view(samples);
bool is_silence;
std::array<float, kTestFeatureVectorSize> feature_vector;
- auto feature_vector_view = GetSpectralFeaturesView(&feature_vector);
// Write an initial value in the feature vector to detect changes.
std::fill(feature_vector.begin(), feature_vector.end(), kInitialFeatureVal);
@@ -64,8 +83,12 @@
// With silence.
std::fill(samples.begin(), samples.end(), 0.f);
- is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
- feature_vector_view);
+ is_silence = sfe.CheckSilenceComputeFeatures(
+ samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector),
+ GetAverage(&feature_vector), GetFirstDerivative(&feature_vector),
+ GetSecondDerivative(&feature_vector),
+ GetCepstralCrossCorrelation(&feature_vector),
+ GetCepstralVariability(&feature_vector));
// Silence is expected, the output won't be overwritten.
EXPECT_TRUE(is_silence);
EXPECT_TRUE(std::all_of(feature_vector.begin(), feature_vector.end(),
@@ -73,18 +96,22 @@
// With no silence.
WriteTestData(samples);
- is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
- feature_vector_view);
+ is_silence = sfe.CheckSilenceComputeFeatures(
+ samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector),
+ GetAverage(&feature_vector), GetFirstDerivative(&feature_vector),
+ GetSecondDerivative(&feature_vector),
+ GetCepstralCrossCorrelation(&feature_vector),
+ GetCepstralVariability(&feature_vector));
// Silence is not expected, the output will be overwritten.
EXPECT_FALSE(is_silence);
EXPECT_FALSE(std::all_of(feature_vector.begin(), feature_vector.end(),
[](float x) { return x == kInitialFeatureVal; }));
}
-// When the input signal does not change, the spectral coefficients average does
-// not change and the derivatives are zero. Similarly, the spectral variability
+// When the input signal does not change, the cepstral coefficients average does
+// not change and the derivatives are zero. Similarly, the cepstral variability
// score does not change either.
-TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) {
+TEST(RnnVadTest, CepstralFeaturesConstantAverageZeroDerivative) {
// Initialize.
SpectralFeaturesExtractor sfe;
std::array<float, kFrameSize20ms24kHz> samples;
@@ -94,17 +121,24 @@
// Fill the spectral features with test data.
std::array<float, kTestFeatureVectorSize> feature_vector;
- auto feature_vector_view = GetSpectralFeaturesView(&feature_vector);
- for (size_t i = 0; i < kSpectralCoeffsHistorySize; ++i) {
- is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
- feature_vector_view);
+ for (size_t i = 0; i < kCepstralCoeffsHistorySize; ++i) {
+ is_silence = sfe.CheckSilenceComputeFeatures(
+ samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector),
+ GetAverage(&feature_vector), GetFirstDerivative(&feature_vector),
+ GetSecondDerivative(&feature_vector),
+ GetCepstralCrossCorrelation(&feature_vector),
+ GetCepstralVariability(&feature_vector));
}
// Feed the test data one last time but using a different output vector.
std::array<float, kTestFeatureVectorSize> feature_vector_last;
- auto feature_vector_last_view = GetSpectralFeaturesView(&feature_vector_last);
- is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
- feature_vector_last_view);
+ is_silence = sfe.CheckSilenceComputeFeatures(
+ samples_view, samples_view, GetHigherBandsSpectrum(&feature_vector_last),
+ GetAverage(&feature_vector_last),
+ GetFirstDerivative(&feature_vector_last),
+ GetSecondDerivative(&feature_vector_last),
+ GetCepstralCrossCorrelation(&feature_vector_last),
+ GetCepstralVariability(&feature_vector_last));
// Average is unchanged.
ExpectEqualFloatArray({feature_vector.data(), kNumLowerBands},
@@ -116,7 +150,7 @@
ExpectEqualFloatArray(
{feature_vector_last.data() + kNumBands + kNumLowerBands, kNumLowerBands},
zeros);
- // Spectral variability is unchanged.
+ // Variability is unchanged.
EXPECT_FLOAT_EQ(feature_vector[kNumBands + 3 * kNumLowerBands],
feature_vector_last[kNumBands + 3 * kNumLowerBands]);
}
diff --git a/modules/audio_processing/agc2/rnn_vad/test_utils.cc b/modules/audio_processing/agc2/rnn_vad/test_utils.cc
index 4dae8cd..14b84a4 100644
--- a/modules/audio_processing/agc2/rnn_vad/test_utils.cc
+++ b/modules/audio_processing/agc2/rnn_vad/test_utils.cc
@@ -87,14 +87,6 @@
return {std::move(ptr), rtc::CheckedDivExact(ptr->data_length(), row_size)};
}
-ReaderPairType CreateBandEnergyCoeffsReader() {
- constexpr size_t num_bands = 22;
- auto ptr = absl::make_unique<BinaryFileReader<float>>(
- test::ResourcePath("audio_processing/agc2/rnn_vad/band_energies", "dat"),
- num_bands);
- return {std::move(ptr), rtc::CheckedDivExact(ptr->data_length(), num_bands)};
-}
-
ReaderPairType CreateSilenceFlagsFeatureMatrixReader() {
constexpr size_t feature_vector_size = 42;
auto ptr = absl::make_unique<BinaryFileReader<float>>(
diff --git a/modules/audio_processing/agc2/rnn_vad/test_utils.h b/modules/audio_processing/agc2/rnn_vad/test_utils.h
index f9d7376..c11af7f 100644
--- a/modules/audio_processing/agc2/rnn_vad/test_utils.h
+++ b/modules/audio_processing/agc2/rnn_vad/test_utils.h
@@ -110,9 +110,6 @@
// Creates a reader for the FFT coefficients.
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
CreateFftCoeffsReader();
-// Instance a reader for the band energy coefficients.
-std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
-CreateBandEnergyCoeffsReader();
// Creates a reader for the silence flags and the feature matrix.
std::pair<std::unique_ptr<BinaryFileReader<float>>, const size_t>
CreateSilenceFlagsFeatureMatrixReader();