blob: 2b36034734279f510f85c1944044a76f89abca83 [file] [log] [blame]
/*
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/agc2/rnn_vad/rnn.h"
#include <algorithm>
#include <array>
#include <cmath>
#include "rtc_base/checks.h"
#include "third_party/rnnoise/src/rnn_activations.h"
#include "third_party/rnnoise/src/rnn_vad_weights.h"
namespace webrtc {
namespace rnn_vad {
using rnnoise::kWeightsScale;
using rnnoise::kInputLayerInputSize;
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
using rnnoise::kInputDenseWeights;
using rnnoise::kInputDenseBias;
using rnnoise::kInputLayerOutputSize;
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
"Increase kFullyConnectedLayersMaxUnits.");
using rnnoise::kHiddenGruRecurrentWeights;
using rnnoise::kHiddenGruWeights;
using rnnoise::kHiddenGruBias;
using rnnoise::kHiddenLayerOutputSize;
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
"Increase kRecurrentLayersMaxUnits.");
using rnnoise::kOutputDenseWeights;
using rnnoise::kOutputDenseBias;
using rnnoise::kOutputLayerOutputSize;
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
"Increase kFullyConnectedLayersMaxUnits.");
using rnnoise::RectifiedLinearUnit;
using rnnoise::SigmoidApproximated;
using rnnoise::TansigApproximated;
FullyConnectedLayer::FullyConnectedLayer(
const size_t input_size,
const size_t output_size,
const rtc::ArrayView<const int8_t> bias,
const rtc::ArrayView<const int8_t> weights,
float (*const activation_function)(float))
: input_size_(input_size),
output_size_(output_size),
bias_(bias),
weights_(weights),
activation_function_(activation_function) {
RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
<< "Static over-allocation of fully-connected layers output vectors is "
"not sufficient.";
RTC_DCHECK_EQ(output_size_, bias_.size())
<< "Mismatching output size and bias terms array size.";
RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
<< "Mismatching input-output size and weight coefficients array size.";
}
FullyConnectedLayer::~FullyConnectedLayer() = default;
rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
return rtc::ArrayView<const float>(output_.data(), output_size_);
}
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
// TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
// operations.
for (size_t o = 0; o < output_size_; ++o) {
output_[o] = bias_[o];
// TODO(bugs.chromium.org/9076): Benchmark how different layouts for
// |weights_| change the performance across different platforms.
for (size_t i = 0; i < input_size_; ++i) {
output_[o] += input[i] * weights_[i * output_size_ + o];
}
output_[o] = (*activation_function_)(kWeightsScale * output_[o]);
}
}
GatedRecurrentLayer::GatedRecurrentLayer(
const size_t input_size,
const size_t output_size,
const rtc::ArrayView<const int8_t> bias,
const rtc::ArrayView<const int8_t> weights,
const rtc::ArrayView<const int8_t> recurrent_weights,
float (*const activation_function)(float))
: input_size_(input_size),
output_size_(output_size),
bias_(bias),
weights_(weights),
recurrent_weights_(recurrent_weights),
activation_function_(activation_function) {
RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
<< "Static over-allocation of recurrent layers state vectors is not "
<< "sufficient.";
RTC_DCHECK_EQ(3 * output_size_, bias_.size())
<< "Mismatching output size and bias terms array size.";
RTC_DCHECK_EQ(3 * input_size_ * output_size_, weights_.size())
<< "Mismatching input-output size and weight coefficients array size.";
RTC_DCHECK_EQ(3 * input_size_ * output_size_, recurrent_weights_.size())
<< "Mismatching input-output size and recurrent weight coefficients array"
<< " size.";
Reset();
}
GatedRecurrentLayer::~GatedRecurrentLayer() = default;
rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
return rtc::ArrayView<const float>(state_.data(), output_size_);
}
void GatedRecurrentLayer::Reset() {
state_.fill(0.f);
}
void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
// TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
// operations.
// Stride and offset used to read parameter arrays.
const size_t stride = 3 * output_size_;
size_t offset = 0;
// Compute update gates.
std::array<float, kRecurrentLayersMaxUnits> update;
for (size_t o = 0; o < output_size_; ++o) {
update[o] = bias_[o];
// TODO(bugs.chromium.org/9076): Benchmark how different layouts for
// |weights_| and |recurrent_weights_| change the performance across
// different platforms.
for (size_t i = 0; i < input_size_; ++i) { // Add input.
update[o] += input[i] * weights_[i * stride + o];
}
for (size_t s = 0; s < output_size_; ++s) {
update[o] += state_[s] * recurrent_weights_[s * stride + o];
} // Add state.
update[o] = SigmoidApproximated(kWeightsScale * update[o]);
}
// Compute reset gates.
offset += output_size_;
std::array<float, kRecurrentLayersMaxUnits> reset;
for (size_t o = 0; o < output_size_; ++o) {
reset[o] = bias_[offset + o];
for (size_t i = 0; i < input_size_; ++i) { // Add input.
reset[o] += input[i] * weights_[offset + i * stride + o];
}
for (size_t s = 0; s < output_size_; ++s) { // Add state.
reset[o] += state_[s] * recurrent_weights_[offset + s * stride + o];
}
reset[o] = SigmoidApproximated(kWeightsScale * reset[o]);
}
// Compute output.
offset += output_size_;
std::array<float, kRecurrentLayersMaxUnits> output;
for (size_t o = 0; o < output_size_; ++o) {
output[o] = bias_[offset + o];
for (size_t i = 0; i < input_size_; ++i) { // Add input.
output[o] += input[i] * weights_[offset + i * stride + o];
}
for (size_t s = 0; s < output_size_;
++s) { // Add state through reset gates.
output[o] +=
state_[s] * recurrent_weights_[offset + s * stride + o] * reset[s];
}
output[o] = (*activation_function_)(kWeightsScale * output[o]);
// Update output through the update gates.
output[o] = update[o] * state_[o] + (1.f - update[o]) * output[o];
}
// Update the state. Not done in the previous loop since that would pollute
// the current state and lead to incorrect output values.
std::copy(output.begin(), output.end(), state_.begin());
}
RnnBasedVad::RnnBasedVad()
: input_layer_(kInputLayerInputSize,
kInputLayerOutputSize,
kInputDenseBias,
kInputDenseWeights,
TansigApproximated),
hidden_layer_(kInputLayerOutputSize,
kHiddenLayerOutputSize,
kHiddenGruBias,
kHiddenGruWeights,
kHiddenGruRecurrentWeights,
RectifiedLinearUnit),
output_layer_(kHiddenLayerOutputSize,
kOutputLayerOutputSize,
kOutputDenseBias,
kOutputDenseWeights,
SigmoidApproximated) {
// Input-output chaining size checks.
RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
<< "The input and the hidden layers sizes do not match.";
RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
<< "The hidden and the output layers sizes do not match.";
}
RnnBasedVad::~RnnBasedVad() = default;
void RnnBasedVad::Reset() {
hidden_layer_.Reset();
}
float RnnBasedVad::ComputeVadProbability(
rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
bool is_silence) {
if (is_silence) {
Reset();
return 0.f;
}
input_layer_.ComputeOutput(feature_vector);
hidden_layer_.ComputeOutput(input_layer_.GetOutput());
output_layer_.ComputeOutput(hidden_layer_.GetOutput());
const auto vad_output = output_layer_.GetOutput();
return vad_output[0];
}
} // namespace rnn_vad
} // namespace webrtc