blob: 998b88a105301be0f929f0fd3a9d6f1e33405659 [file] [log] [blame]
/*
* Copyright (c) 2024 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_coding/codecs/opus/audio_decoder_opus.h"

#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <optional>
#include <utility>
#include <vector>

#include "api/array_view.h"
#include "api/audio/audio_frame.h"
#include "api/audio_codecs/audio_decoder.h"
#include "api/audio_codecs/opus/audio_encoder_opus_config.h"
#include "api/environment/environment.h"
#include "api/environment/environment_factory.h"
#include "modules/audio_coding/codecs/opus/audio_encoder_opus.h"
#include "modules/audio_coding/test/PCMFile.h"
#include "rtc_base/buffer.h"
#include "rtc_base/checks.h"
#include "rtc_base/random.h"
#include "test/explicit_key_value_config.h"
#include "test/gmock.h"
#include "test/gtest.h"
#include "test/testsupport/file_utils.h"
namespace webrtc {
namespace {
// Shorthands for the test helpers and for the nested `AudioDecoder` result
// types used throughout this file.
using test::ExplicitKeyValueConfig;
using testing::SizeIs;
using DecodeResult = ::webrtc::AudioDecoder::EncodedAudioFrame::DecodeResult;
using ParseResult = ::webrtc::AudioDecoder::ParseResult;
// Sample rate used to configure the encoder, the decoder and the PCM reader.
constexpr int kSampleRateHz = 48000;
// Duration and per-channel length (in samples) of each chunk of audio handed
// to the encoder.
constexpr int kInputFrameDurationMs = 10;
constexpr int kInputFrameLength = kInputFrameDurationMs * kSampleRateHz / 1000;
// Frame duration the encoder is configured with and the corresponding
// per-channel length (in samples) of a decoded frame.
constexpr int kEncoderFrameDurationMs = 20;
constexpr int kEncoderFrameLength =
kEncoderFrameDurationMs * kSampleRateHz / 1000;
// Payload type passed to the encoder constructor.
constexpr int kPayloadType = 123;
// Returns the Opus encoder configuration shared by the tests in this file.
// Only `num_channels` and `dtx_enabled` vary; every other field is fixed:
// VoIP application mode, `kEncoderFrameDurationMs` frames at `kSampleRateHz`,
// 32000 bps with CBR and FEC disabled, and complexity 10.
AudioEncoderOpusConfig GetEncoderConfig(int num_channels, bool dtx_enabled) {
  AudioEncoderOpusConfig opus_config;

  // Caller-controlled settings.
  opus_config.num_channels = num_channels;
  opus_config.dtx_enabled = dtx_enabled;

  // Timing and bandwidth.
  opus_config.sample_rate_hz = kSampleRateHz;
  opus_config.max_playback_rate_hz = kSampleRateHz;
  opus_config.frame_size_ms = kEncoderFrameDurationMs;

  // Coding mode.
  opus_config.application = AudioEncoderOpusConfig::ApplicationMode::kVoip;
  opus_config.bitrate_bps = 32000;
  opus_config.cbr_enabled = false;
  opus_config.fec_enabled = false;
  opus_config.complexity = 10;

  return opus_config;
}
class WhiteNoiseGenerator {
public:
explicit WhiteNoiseGenerator(double amplitude_dbfs)
: amplitude_(
rtc::saturated_cast<int16_t>(std::pow(10, amplitude_dbfs / 20) *
std::numeric_limits<int16_t>::max())),
random_generator_(42) {}
void GenerateNextFrame(rtc::ArrayView<int16_t> frame) {
for (size_t i = 0; i < frame.size(); ++i) {
frame[i] = rtc::saturated_cast<int16_t>(
random_generator_.Rand(-amplitude_, amplitude_));
}
}
private:
const int32_t amplitude_;
Random random_generator_;
};
// Returns true if every sample in `audio` equals zero (also true when `audio`
// is empty).
bool IsZeroedFrame(rtc::ArrayView<const int16_t> audio) {
  size_t index = 0;
  // Skip the leading run of zero samples.
  while (index < audio.size() && audio[index] == 0) {
    ++index;
  }
  // The frame is zeroed only if the run covers the whole view.
  return index == audio.size();
}
// Returns true if `audio`, read as consecutive pairs of samples, has two
// identical values in every pair. `rtc::CheckedDivExact` is applied to the
// size of `audio`, which must therefore be even.
bool IsTrivialStereo(rtc::ArrayView<const int16_t> audio) {
  const int num_pairs =
      rtc::CheckedDivExact(audio.size(), static_cast<size_t>(2));
  for (int pair = 0; pair < num_pairs; ++pair) {
    const int first = 2 * pair;
    const int second = first + 1;
    if (audio[first] != audio[second]) {
      return false;
    }
  }
  return true;
}
// Reads up to `max_frames` 10 ms chunks from the "near48_mono" or
// "near48_stereo" resource file (chosen from the encoder channel count),
// encodes them with `encoder` and decodes every produced packet with
// `decoder`. Stops early at the end of the file. `rtp_timestamp` is
// incremented once per encoded chunk and `timestamp` once per parsed packet.
void EncodeDecodeSpeech(AudioEncoderOpusImpl& encoder,
                        AudioDecoderOpusImpl& decoder,
                        uint32_t& rtp_timestamp,
                        uint32_t& timestamp,
                        int max_frames) {
  const size_t num_encoder_channels = encoder.NumChannels();
  RTC_CHECK(num_encoder_channels == 1 || num_encoder_channels == 2);
  const bool stereo_encoding = num_encoder_channels == 2;

  // Output buffer sized for one encoder frame on all the decoder channels.
  std::vector<int16_t> decoded_frame(kEncoderFrameLength * decoder.Channels());

  const char* const resource_name =
      stereo_encoding ? "near48_stereo" : "near48_mono";
  PCMFile pcm_file;
  pcm_file.Open(test::ResourcePath(resource_name, "pcm"), kSampleRateHz, "rb");
  pcm_file.ReadStereo(stereo_encoding);

  AudioFrame audio_frame;
  for (int i = 0; i < max_frames && !pcm_file.EndOfFile(); ++i) {
    pcm_file.Read10MsData(audio_frame);
    rtc::Buffer payload;
    encoder.Encode(rtp_timestamp++, audio_frame.data_view().data(), &payload);
    if (payload.size() == 0) {
      // The encoder needs more audio before it can produce a packet.
      continue;
    }
    // Parse the packet and decode the single frame it is expected to hold.
    std::vector<ParseResult> parse_results =
        decoder.ParsePayload(std::move(payload), timestamp++);
    RTC_CHECK_EQ(parse_results.size(), 1);
    const ParseResult& parsed = parse_results.front();
    std::optional<DecodeResult> decode_results =
        parsed.frame->Decode(decoded_frame);
    RTC_CHECK(decode_results);
    RTC_CHECK_EQ(decode_results->num_decoded_samples, decoded_frame.size());
  }
}
// Encodes white noise at -70 dBFS with `encoder` and decodes every produced
// packet with `decoder`, returning as soon as a decoded packet reports
// `IsDtxPacket()`. Crashes via RTC_CHECK_NOTREACHED() if that does not happen
// within `kMaxNoiseFrames` input frames. `rtp_timestamp` is incremented once
// per encoded input frame and `timestamp` once per parsed packet.
void EncodeDecodeNoiseUntilDecoderInDtxMode(AudioEncoderOpusImpl& encoder,
                                            AudioDecoderOpusImpl& decoder,
                                            uint32_t& rtp_timestamp,
                                            uint32_t& timestamp) {
  // Upper bound on the number of input frames fed to the encoder.
  constexpr int kMaxNoiseFrames = 50;

  WhiteNoiseGenerator generator(/*amplitude_dbfs=*/-70.0);
  std::vector<int16_t> input_frame(kInputFrameLength * encoder.NumChannels());
  const size_t decoder_num_channels = decoder.Channels();
  std::vector<int16_t> decoded_frame(kEncoderFrameLength *
                                     decoder_num_channels);
  for (int i = 0; i < kMaxNoiseFrames; ++i) {
    generator.GenerateNextFrame(input_frame);
    rtc::Buffer payload;
    // The returned encoding info is not needed; only `payload` is inspected.
    encoder.Encode(rtp_timestamp++, input_frame, &payload);
    // Ignore empty payloads: the encoder needs more audio to produce a packet.
    if (payload.size() == 0) {
      continue;
    }
    // Decode `payload`. If it encodes a DTX packet (i.e., 1 byte payload), the
    // decoder will switch to DTX mode. Otherwise, it may update the internal
    // decoder parameters for comfort noise generation.
    std::vector<ParseResult> parse_results =
        decoder.ParsePayload(std::move(payload), timestamp++);
    RTC_CHECK_EQ(parse_results.size(), 1);
    std::optional<DecodeResult> decode_results =
        parse_results[0].frame->Decode(decoded_frame);
    RTC_CHECK(decode_results);
    RTC_CHECK_EQ(decode_results->num_decoded_samples, decoded_frame.size());
    if (parse_results[0].frame->IsDtxPacket()) {
      return;
    }
  }
  // No DTX packet was observed within the allowed number of frames.
  RTC_CHECK_NOTREACHED();
}
// Encodes speech read from the "near48_mono" or "near48_stereo" resource file
// (chosen from the encoder channel count) and decodes every produced packet.
// Packets reporting `IsDtxPacket()` are decoded and skipped; the decoded
// samples of the first other packet are returned. Crashes via
// RTC_CHECK_NOTREACHED() if the file ends before such a packet is found.
std::vector<int16_t> EncodeDecodeSpeechUntilOneFrameIsDecoded(
    AudioEncoderOpusImpl& encoder,
    AudioDecoderOpusImpl& decoder,
    uint32_t& rtp_timestamp,
    uint32_t& timestamp) {
  const size_t num_encoder_channels = encoder.NumChannels();
  RTC_CHECK(num_encoder_channels == 1 || num_encoder_channels == 2);
  const bool stereo_encoding = num_encoder_channels == 2;

  // Output buffer sized for one encoder frame on all the decoder channels.
  std::vector<int16_t> decoded_frame(kEncoderFrameLength * decoder.Channels());

  const char* const resource_name =
      stereo_encoding ? "near48_stereo" : "near48_mono";
  PCMFile pcm_file;
  pcm_file.Open(test::ResourcePath(resource_name, "pcm"), kSampleRateHz, "rb");
  pcm_file.ReadStereo(stereo_encoding);

  AudioFrame audio_frame;
  while (!pcm_file.EndOfFile()) {
    pcm_file.Read10MsData(audio_frame);
    rtc::Buffer payload;
    encoder.Encode(rtp_timestamp++, audio_frame.data_view().data(), &payload);
    if (payload.size() == 0) {
      // The encoder needs more audio before it can produce a packet.
      continue;
    }
    // Parse the packet and decode the single frame it is expected to hold.
    std::vector<ParseResult> parse_results =
        decoder.ParsePayload(std::move(payload), timestamp++);
    RTC_CHECK_EQ(parse_results.size(), 1);
    const ParseResult& parsed = parse_results.front();
    std::optional<DecodeResult> decode_results =
        parsed.frame->Decode(decoded_frame);
    RTC_CHECK(decode_results);
    if (!parsed.frame->IsDtxPacket()) {
      RTC_CHECK_EQ(decode_results->num_decoded_samples, decoded_frame.size());
      return decoded_frame;
    }
  }
  RTC_CHECK_NOTREACHED();
}
} // namespace
// Encodes low-level white noise with a mono encoder and checks that every
// frame produced by a stereo decoder has identical samples in each pair.
TEST(AudioDecoderOpusTest, MonoEncoderStereoDecoderOutputsTrivialStereo) {
  constexpr size_t kDecoderNumChannels = 2;
  constexpr int kNumInputFrames = 30;

  const Environment env = EnvironmentFactory().Create();
  WhiteNoiseGenerator noise(/*amplitude_dbfs=*/-70.0);
  std::array<int16_t, kInputFrameLength> noise_frame;

  // Mono encoder, DTX disabled.
  const AudioEncoderOpusConfig encoder_config =
      GetEncoderConfig(/*num_channels=*/1, /*dtx_enabled=*/false);
  AudioEncoderOpusImpl encoder(env, encoder_config, kPayloadType);

  // Stereo decoder and its output buffer.
  AudioDecoderOpusImpl decoder(env.field_trials(), kDecoderNumChannels,
                               kSampleRateHz);
  std::array<int16_t, kEncoderFrameLength * kDecoderNumChannels> output;

  uint32_t rtp_timestamp = 0xFFFu;
  uint32_t timestamp = 0;
  for (int frame = 0; frame < kNumInputFrames; ++frame) {
    noise.GenerateNextFrame(noise_frame);
    rtc::Buffer payload;
    encoder.Encode(rtp_timestamp++, noise_frame, &payload);
    if (payload.size() == 0) {
      // No packet yet: the encoder needs more audio.
      continue;
    }
    // Decode the packet and inspect the channels.
    std::vector<ParseResult> parse_results =
        decoder.ParsePayload(std::move(payload), timestamp++);
    RTC_CHECK_EQ(parse_results.size(), 1);
    const ParseResult& parsed = parse_results.front();
    std::optional<DecodeResult> decode_results = parsed.frame->Decode(output);
    RTC_CHECK(decode_results);
    RTC_CHECK_EQ(decode_results->num_decoded_samples, output.size());
    EXPECT_TRUE(IsTrivialStereo(output));
  }
}
// With a DTX-enabled mono encoder and a stereo decoder, checks that the
// comfort noise generated by the decoder, and the first frame decoded after
// it, are non-zero and have identical samples in each pair.
TEST(AudioDecoderOpusTest,
     MonoEncoderStereoDecoderOutputsTrivialStereoComfortNoise) {
  const Environment env = EnvironmentFactory().Create();
  // Create a mono encoder.
  const AudioEncoderOpusConfig encoder_config =
      GetEncoderConfig(/*num_channels=*/1, /*dtx_enabled=*/true);
  AudioEncoderOpusImpl encoder(env, encoder_config, kPayloadType);
  // Create a stereo decoder.
  constexpr size_t kDecoderNumChannels = 2;
  AudioDecoderOpusImpl decoder(env.field_trials(), kDecoderNumChannels,
                               kSampleRateHz);
  uint32_t rtp_timestamp = 0xFFFu;
  uint32_t timestamp = 0;
  // Feed the encoder with speech, otherwise DTX will never kick in.
  EncodeDecodeSpeech(encoder, decoder, rtp_timestamp, timestamp,
                     /*max_frames=*/100);
  // Feed the encoder with noise until the decoder is in DTX mode.
  EncodeDecodeNoiseUntilDecoderInDtxMode(encoder, decoder, rtp_timestamp,
                                         timestamp);
  // Decode an empty packet so that Opus generates comfort noise.
  std::vector<int16_t> decoded_frame(kEncoderFrameLength * kDecoderNumChannels);
  // Initialized so that the value is well defined even if `Decode()` fails
  // before writing to it.
  AudioDecoder::SpeechType speech_type = AudioDecoder::SpeechType::kSpeech;
  // The output capacity is passed in bytes, not in samples.
  const int num_decoded_samples = decoder.Decode(
      /*encoded=*/nullptr, /*encoded_len=*/0, kSampleRateHz,
      /*max_decoded_bytes=*/decoded_frame.size() * sizeof(int16_t),
      decoded_frame.data(), &speech_type);
  // Validate the return value before looking at any other output.
  RTC_CHECK_GT(num_decoded_samples, 0);
  RTC_CHECK_LE(num_decoded_samples, decoded_frame.size());
  ASSERT_EQ(speech_type, AudioDecoder::SpeechType::kComfortNoise);
  rtc::ArrayView<const int16_t> decoded_view(decoded_frame.data(),
                                             num_decoded_samples);
  // Make sure that comfort noise is not a muted frame.
  ASSERT_FALSE(IsZeroedFrame(decoded_view));
  EXPECT_TRUE(IsTrivialStereo(decoded_view));
  // Also check the first decoded audio frame after comfort noise.
  decoded_frame = EncodeDecodeSpeechUntilOneFrameIsDecoded(
      encoder, decoder, rtp_timestamp, timestamp);
  ASSERT_THAT(decoded_frame, SizeIs(kDecoderNumChannels * kEncoderFrameLength));
  ASSERT_FALSE(IsZeroedFrame(decoded_frame));
  EXPECT_TRUE(IsTrivialStereo(decoded_frame));
}
// With the "WebRTC-Audio-OpusGeneratePlc" field trial enabled, a mono encoder
// and a stereo decoder, checks that the audio returned by `GeneratePlc()`,
// and the first frame decoded after it, are non-zero and have identical
// samples in each pair.
TEST(AudioDecoderOpusTest, MonoEncoderStereoDecoderOutputsTrivialStereoPlc) {
  constexpr size_t kDecoderNumChannels = 2;
  constexpr int kIgnored = 123;

  // Environment with the PLC field trial switched on.
  const ExplicitKeyValueConfig field_trials(
      "WebRTC-Audio-OpusGeneratePlc/Enabled/");
  EnvironmentFactory factory;
  factory.Set(&field_trials);
  const Environment env = factory.Create();

  // Mono encoder (DTX disabled) paired with a stereo decoder.
  const AudioEncoderOpusConfig encoder_config =
      GetEncoderConfig(/*num_channels=*/1, /*dtx_enabled=*/false);
  AudioEncoderOpusImpl encoder(env, encoder_config, kPayloadType);
  AudioDecoderOpusImpl decoder(env.field_trials(), kDecoderNumChannels,
                               kSampleRateHz);

  // Run speech through the encoder and the decoder first.
  uint32_t rtp_timestamp = 0xFFFu;
  uint32_t timestamp = 0;
  EncodeDecodeSpeech(encoder, decoder, rtp_timestamp, timestamp,
                     /*max_frames=*/100);

  // Ask the decoder for packet loss concealment audio.
  rtc::BufferT<int16_t> plc_audio;
  decoder.GeneratePlc(/*requested_samples_per_channel=*/kIgnored, &plc_audio);
  RTC_CHECK_GT(plc_audio.size(), 0);
  rtc::ArrayView<const int16_t> plc_view(plc_audio.data(), plc_audio.size());
  // The concealment audio must not be a muted frame.
  ASSERT_FALSE(IsZeroedFrame(plc_view));
  EXPECT_TRUE(IsTrivialStereo(plc_view));

  // The first frame decoded after concealment must satisfy the same checks.
  std::vector<int16_t> next_frame = EncodeDecodeSpeechUntilOneFrameIsDecoded(
      encoder, decoder, rtp_timestamp, timestamp);
  ASSERT_THAT(next_frame, SizeIs(kDecoderNumChannels * kEncoderFrameLength));
  ASSERT_FALSE(IsZeroedFrame(next_frame));
  EXPECT_TRUE(IsTrivialStereo(next_frame));
}
// With a DTX-enabled stereo encoder and a stereo decoder, checks that the
// comfort noise generated by the decoder is non-zero and that its two
// channels differ.
TEST(AudioDecoderOpusTest,
     StereoEncoderStereoDecoderOutputsNonTrivialStereoComfortNoise) {
  const Environment env = EnvironmentFactory().Create();
  // Create a stereo encoder.
  const AudioEncoderOpusConfig encoder_config =
      GetEncoderConfig(/*num_channels=*/2, /*dtx_enabled=*/true);
  AudioEncoderOpusImpl encoder(env, encoder_config, kPayloadType);
  // Create a stereo decoder.
  constexpr size_t kDecoderNumChannels = 2;
  AudioDecoderOpusImpl decoder(env.field_trials(), kDecoderNumChannels,
                               kSampleRateHz);
  uint32_t rtp_timestamp = 0xFFFu;
  uint32_t timestamp = 0;
  // Feed the encoder with speech, otherwise DTX will never kick in.
  EncodeDecodeSpeech(encoder, decoder, rtp_timestamp, timestamp,
                     /*max_frames=*/100);
  // Feed the encoder with noise and decode until the decoder is in DTX mode.
  EncodeDecodeNoiseUntilDecoderInDtxMode(encoder, decoder, rtp_timestamp,
                                         timestamp);
  // Decode an empty packet so that Opus generates comfort noise.
  std::array<int16_t, kEncoderFrameLength * kDecoderNumChannels> decoded_frame;
  // Initialized so that the value is well defined even if `Decode()` fails
  // before writing to it.
  AudioDecoder::SpeechType speech_type = AudioDecoder::SpeechType::kSpeech;
  // The output capacity is passed in bytes, not in samples.
  const int num_decoded_samples = decoder.Decode(
      /*encoded=*/nullptr, /*encoded_len=*/0, kSampleRateHz,
      /*max_decoded_bytes=*/decoded_frame.size() * sizeof(int16_t),
      decoded_frame.data(), &speech_type);
  // Validate the return value before looking at any other output.
  RTC_CHECK_GT(num_decoded_samples, 0);
  RTC_CHECK_LE(num_decoded_samples, decoded_frame.size());
  ASSERT_EQ(speech_type, AudioDecoder::SpeechType::kComfortNoise);
  rtc::ArrayView<const int16_t> decoded_view(decoded_frame.data(),
                                             num_decoded_samples);
  // Make sure that comfort noise is not a muted frame.
  ASSERT_FALSE(IsZeroedFrame(decoded_view));
  EXPECT_FALSE(IsTrivialStereo(decoded_view));
}
// With the "WebRTC-Audio-OpusGeneratePlc" field trial enabled, a stereo
// encoder and a stereo decoder, checks that the audio returned by
// `GeneratePlc()` is non-zero and that its two channels differ.
TEST(AudioDecoderOpusTest,
     StereoEncoderStereoDecoderOutputsNonTrivialStereoPlc) {
  constexpr size_t kDecoderNumChannels = 2;
  constexpr int kIgnored = 123;

  // Environment with the PLC field trial switched on.
  const ExplicitKeyValueConfig field_trials(
      "WebRTC-Audio-OpusGeneratePlc/Enabled/");
  EnvironmentFactory factory;
  factory.Set(&field_trials);
  const Environment env = factory.Create();

  // Stereo encoder (DTX disabled) paired with a stereo decoder.
  const AudioEncoderOpusConfig encoder_config =
      GetEncoderConfig(/*num_channels=*/2, /*dtx_enabled=*/false);
  AudioEncoderOpusImpl encoder(env, encoder_config, kPayloadType);
  AudioDecoderOpusImpl decoder(env.field_trials(), kDecoderNumChannels,
                               kSampleRateHz);

  // Run speech through the encoder and the decoder first.
  uint32_t rtp_timestamp = 0xFFFu;
  uint32_t timestamp = 0;
  EncodeDecodeSpeech(encoder, decoder, rtp_timestamp, timestamp,
                     /*max_frames=*/100);

  // Ask the decoder for packet loss concealment audio.
  rtc::BufferT<int16_t> plc_audio;
  decoder.GeneratePlc(/*requested_samples_per_channel=*/kIgnored, &plc_audio);
  RTC_CHECK_GT(plc_audio.size(), 0);
  rtc::ArrayView<const int16_t> plc_view(plc_audio.data(), plc_audio.size());
  // The concealment audio must not be a muted frame.
  ASSERT_FALSE(IsZeroedFrame(plc_view));
  EXPECT_FALSE(IsTrivialStereo(plc_view));
}
} // namespace webrtc