Add unit tests for `AudioDecoderOpusImpl` for stereo
- With mono encoding and stereo decoding, check that the decoded
signal is trivially stereo
- DTX tests
- With mono encoding and stereo decoding check that the comfort
noise generated by Opus is NOT(*) trivially stereo
- With stereo encoding and stereo decoding check that the comfort
noise generated by Opus is not trivially stereo
*: the test documents the current behavior described in [1], which is
a bug that needs to be fixed.
[1] https://issues.webrtc.org/376493209
Bug: webrtc:376493209
Change-Id: I34aacd4bd7c79be9df05c242e912c9981896a73d
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/367206
Reviewed-by: Jakob Ivarsson‎ <jakobi@webrtc.org>
Reviewed-by: Henrik Andreassson <henrika@webrtc.org>
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#43363}
diff --git a/modules/BUILD.gn b/modules/BUILD.gn
index 21f866d..52bd86f 100644
--- a/modules/BUILD.gn
+++ b/modules/BUILD.gn
@@ -160,6 +160,7 @@
"../resources/near22_stereo.pcm",
"../resources/near32_stereo.pcm",
"../resources/near44_stereo.pcm",
+ "../resources/near48_mono.pcm",
"../resources/near48_stereo.pcm",
"../resources/near88_stereo.pcm",
"../resources/near8_stereo.pcm",
diff --git a/modules/audio_coding/BUILD.gn b/modules/audio_coding/BUILD.gn
index c42da41..a2a1ca8 100644
--- a/modules/audio_coding/BUILD.gn
+++ b/modules/audio_coding/BUILD.gn
@@ -1373,6 +1373,7 @@
"codecs/cng/cng_unittest.cc",
"codecs/legacy_encoded_audio_frame_unittest.cc",
"codecs/opus/audio_decoder_multi_channel_opus_unittest.cc",
+ "codecs/opus/audio_decoder_opus_unittest.cc",
"codecs/opus/audio_encoder_multi_channel_opus_unittest.cc",
"codecs/opus/audio_encoder_opus_unittest.cc",
"codecs/opus/opus_bandwidth_unittest.cc",
@@ -1459,6 +1460,7 @@
"../../api/audio_codecs/opus:audio_decoder_opus",
"../../api/audio_codecs/opus:audio_encoder_multiopus",
"../../api/audio_codecs/opus:audio_encoder_opus",
+ "../../api/audio_codecs/opus:audio_encoder_opus_config",
"../../api/environment",
"../../api/environment:environment_factory",
"../../api/neteq:default_neteq_controller_factory",
@@ -1475,10 +1477,12 @@
"../../logging:mocks",
"../../logging:rtc_event_audio",
"../../modules/rtp_rtcp:rtp_rtcp_format",
+ "../../rtc_base:buffer",
"../../rtc_base:checks",
"../../rtc_base:digest",
"../../rtc_base:macromagic",
"../../rtc_base:platform_thread",
+ "../../rtc_base:random",
"../../rtc_base:refcount",
"../../rtc_base:rtc_base_tests_utils",
"../../rtc_base:rtc_event",
diff --git a/modules/audio_coding/codecs/opus/audio_decoder_opus_unittest.cc b/modules/audio_coding/codecs/opus/audio_decoder_opus_unittest.cc
new file mode 100644
index 0000000..4610a78
--- /dev/null
+++ b/modules/audio_coding/codecs/opus/audio_decoder_opus_unittest.cc
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2024 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "modules/audio_coding/codecs/opus/audio_decoder_opus.h"
+
+#include <cmath>
+#include <limits>
+#include <optional>
+#include <utility>
+#include <vector>
+
+#include "api/array_view.h"
+#include "api/audio/audio_frame.h"
+#include "api/audio_codecs/audio_decoder.h"
+#include "api/audio_codecs/opus/audio_encoder_opus_config.h"
+#include "api/environment/environment.h"
+#include "api/environment/environment_factory.h"
+#include "modules/audio_coding/codecs/opus/audio_encoder_opus.h"
+#include "modules/audio_coding/test/PCMFile.h"
+#include "rtc_base/buffer.h"
+#include "rtc_base/checks.h"
+#include "rtc_base/random.h"
+#include "test/gtest.h"
+#include "test/testsupport/file_utils.h"
+
+namespace webrtc {
+namespace {
+
+using DecodeResult = AudioDecoder::EncodedAudioFrame::DecodeResult;
+using ParseResult = AudioDecoder::ParseResult;
+
+// Sample rate used for the input audio, the encoder and the decoder.
+constexpr int kSampleRateHz = 48000;
+constexpr int kSamplesPerMs = kSampleRateHz / 1000;
+// Per-channel size of the chunks in which audio is handed to the encoder.
+constexpr int kInputFrameDurationMs = 10;
+constexpr int kInputFrameLength = kInputFrameDurationMs * kSamplesPerMs;
+// Per-channel size of the frames the encoder is configured to produce.
+constexpr int kEncoderFrameDurationMs = 20;
+constexpr int kEncoderFrameLength = kEncoderFrameDurationMs * kSamplesPerMs;
+constexpr int kPayloadType = 123;  // Payload type given to the encoder.
+
+// Builds the Opus encoder configuration shared by the tests: 48 kHz VoIP mode,
+// 20 ms frames, 32 kbps with CBR and FEC disabled, complexity 10.
+AudioEncoderOpusConfig GetEncoderConfig(int num_channels, bool dtx_enabled) {
+  AudioEncoderOpusConfig opus_config;
+  opus_config.application = AudioEncoderOpusConfig::ApplicationMode::kVoip;
+  opus_config.sample_rate_hz = kSampleRateHz;
+  opus_config.max_playback_rate_hz = kSampleRateHz;
+  opus_config.frame_size_ms = kEncoderFrameDurationMs;
+  opus_config.num_channels = num_channels;
+  opus_config.dtx_enabled = dtx_enabled;
+  opus_config.bitrate_bps = 32000;
+  opus_config.cbr_enabled = false;
+  opus_config.fec_enabled = false;
+  opus_config.complexity = 10;
+  return opus_config;
+}
+
+// Produces pseudo-random int16_t samples bounded by a fixed peak amplitude.
+class WhiteNoiseGenerator {
+ public:
+  explicit WhiteNoiseGenerator(double amplitude_dbfs)
+      : amplitude_(
+            rtc::saturated_cast<int16_t>(std::pow(10, amplitude_dbfs / 20) *
+                                         std::numeric_limits<int16_t>::max())),
+        random_generator_(42) {}
+  // Overwrites each element of `frame` with a new random sample.
+  void GenerateNextFrame(rtc::ArrayView<int16_t> frame) {
+    for (int16_t& sample : frame) {
+      const auto value = random_generator_.Rand(-amplitude_, amplitude_);
+      sample = rtc::saturated_cast<int16_t>(value);
+    }
+  }
+ private:
+  const int32_t amplitude_;  // Peak magnitude derived from the dBFS value.
+  Random random_generator_;  // Seeded with a constant in the constructor.
+};
+
+// Returns true if every sample in `audio` is zero (also when it is empty).
+bool IsZeroedFrame(rtc::ArrayView<const int16_t> audio) {
+  for (size_t i = 0; i < audio.size(); ++i) {
+    if (audio[i] != 0) return false;
+  }
+  return true;
+}
+
+// Returns true if the two samples of every interleaved pair in `audio` are
+// equal. `audio.size()` is divided by two via `rtc::CheckedDivExact()`.
+bool IsTrivialStereo(rtc::ArrayView<const int16_t> audio) {
+  const int num_pairs =
+      rtc::CheckedDivExact(audio.size(), static_cast<size_t>(2));
+  for (int pair = 0; pair < num_pairs; ++pair) {
+    if (audio[2 * pair] != audio[2 * pair + 1]) return false;
+  }
+  return true;
+}
+
+// Encodes at most `max_frames` 10 ms frames read from the "near48_mono" or
+// "near48_stereo" resource file (picked from the encoder channel count) and
+// decodes every packet that the encoder produces. Stops early at end of file.
+// `rtp_timestamp` and `timestamp` are post-incremented on each use.
+void EncodeDecodeSpeech(AudioEncoderOpusImpl& encoder,
+                        AudioDecoderOpusImpl& decoder,
+                        uint32_t& rtp_timestamp,
+                        uint32_t& timestamp,
+                        int max_frames) {
+  const size_t num_encoder_channels = encoder.NumChannels();
+  RTC_CHECK(num_encoder_channels == 1 || num_encoder_channels == 2);
+  const bool stereo = num_encoder_channels == 2;
+
+  PCMFile pcm_file;
+  pcm_file.Open(
+      test::ResourcePath(stereo ? "near48_stereo" : "near48_mono", "pcm"),
+      kSampleRateHz, "rb");
+  pcm_file.ReadStereo(stereo);
+
+  // Each packet is expected to decode to exactly one encoder frame.
+  std::vector<int16_t> decoded_frame(kEncoderFrameLength * decoder.Channels());
+  AudioFrame audio_frame;
+  for (int i = 0; i < max_frames && !pcm_file.EndOfFile(); ++i) {
+    pcm_file.Read10MsData(audio_frame);
+    rtc::Buffer payload;
+    encoder.Encode(rtp_timestamp++, audio_frame.data_view().data(), &payload);
+    if (payload.size() == 0) {
+      // The encoder needs more audio to produce a packet.
+      continue;
+    }
+
+    // Parse the packet and decode the single frame it must contain.
+    std::vector<ParseResult> parsed_packets =
+        decoder.ParsePayload(std::move(payload), timestamp++);
+    RTC_CHECK_EQ(parsed_packets.size(), 1);
+    std::optional<DecodeResult> decoded =
+        parsed_packets[0].frame->Decode(decoded_frame);
+    RTC_CHECK(decoded);
+    RTC_CHECK_EQ(decoded->num_decoded_samples, decoded_frame.size());
+  }
+}
+
+// Feeds -70 dBFS white noise to `encoder` in 10 ms frames and decodes every
+// packet produced until a DTX packet has been decoded, i.e., until the decoder
+// is in DTX mode. Crashes if that does not happen within `kMaxInputFrames`
+// input frames. `rtp_timestamp` is incremented for each encoded input frame
+// and `timestamp` for each decoded packet.
+void EncodeDecodeNoiseUntilDecoderInDtxMode(AudioEncoderOpusImpl& encoder,
+                                            AudioDecoderOpusImpl& decoder,
+                                            uint32_t& rtp_timestamp,
+                                            uint32_t& timestamp) {
+  // Upper bound on the number of noise frames fed to the encoder.
+  constexpr int kMaxInputFrames = 50;
+
+  WhiteNoiseGenerator generator(/*amplitude_dbfs=*/-70.0);
+  std::vector<int16_t> input_frame(kInputFrameLength * encoder.NumChannels());
+  std::vector<int16_t> decoded_frame(kEncoderFrameLength * decoder.Channels());
+
+  bool dtx_packet_found = false;
+  for (int i = 0; i < kMaxInputFrames && !dtx_packet_found; ++i) {
+    generator.GenerateNextFrame(input_frame);
+    rtc::Buffer payload;
+    encoder.Encode(rtp_timestamp++, input_frame, &payload);
+    if (payload.size() == 0) {
+      continue;  // The encoder needs more audio to produce a packet.
+    }
+
+    // Decode `payload`. If not a DTX packet, decoding it may update the
+    // internal decoder parameters for comfort noise generation.
+    std::vector<ParseResult> parse_results =
+        decoder.ParsePayload(std::move(payload), timestamp++);
+    RTC_CHECK_EQ(parse_results.size(), 1);
+    std::optional<DecodeResult> decode_results =
+        parse_results[0].frame->Decode(decoded_frame);
+    RTC_CHECK(decode_results);
+    RTC_CHECK_EQ(decode_results->num_decoded_samples, decoded_frame.size());
+
+    // Once a DTX packet has been decoded, the decoder is in DTX mode.
+    dtx_packet_found = parse_results[0].frame->IsDtxPacket();
+  }
+  RTC_CHECK(dtx_packet_found);
+}
+
+} // namespace
+
+// Mono-encoded white noise decoded by a stereo decoder is expected to yield
+// frames whose two channels are identical.
+TEST(AudioDecoderOpusTest, MonoEncoderStereoDecoderOutputsTrivialStereo) {
+  constexpr size_t kDecoderNumChannels = 2;
+  const Environment env = EnvironmentFactory().Create();
+  // Mono encoder with DTX disabled.
+  const AudioEncoderOpusConfig encoder_config =
+      GetEncoderConfig(/*num_channels=*/1, /*dtx_enabled=*/false);
+  AudioEncoderOpusImpl encoder(env, encoder_config, kPayloadType);
+  // Stereo decoder.
+  AudioDecoderOpusImpl decoder(env.field_trials(), kDecoderNumChannels,
+                               kSampleRateHz);
+
+  WhiteNoiseGenerator noise(/*amplitude_dbfs=*/-70.0);
+  std::array<int16_t, kInputFrameLength> input_frame;
+  std::array<int16_t, kEncoderFrameLength * kDecoderNumChannels> decoded_frame;
+  uint32_t rtp_timestamp = 0xFFFu;
+  uint32_t timestamp = 0;
+  for (int i = 0; i < 30; ++i) {
+    noise.GenerateNextFrame(input_frame);
+    rtc::Buffer payload;
+    encoder.Encode(rtp_timestamp++, input_frame, &payload);
+    if (payload.size() == 0) {
+      continue;  // The encoder needs more audio to produce a packet.
+    }
+
+    std::vector<ParseResult> parsed_packets =
+        decoder.ParsePayload(std::move(payload), timestamp++);
+    RTC_CHECK_EQ(parsed_packets.size(), 1);
+    std::optional<DecodeResult> decoded =
+        parsed_packets[0].frame->Decode(decoded_frame);
+    RTC_CHECK(decoded);
+    RTC_CHECK_EQ(decoded->num_decoded_samples, decoded_frame.size());
+    EXPECT_TRUE(IsTrivialStereo(decoded_frame));
+  }
+}
+
+// Mono encoder, stereo decoder: once the decoder is in DTX mode, the comfort
+// noise it generates is currently not trivially stereo (see the TODO below).
+TEST(AudioDecoderOpusTest, MonoEncoderStereoDecoderOutputsNonTrivialStereoDtx) {
+  const Environment env = EnvironmentFactory().Create();
+  // Create a mono encoder with DTX enabled and a stereo decoder.
+  const AudioEncoderOpusConfig encoder_config =
+      GetEncoderConfig(/*num_channels=*/1, /*dtx_enabled=*/true);
+  AudioEncoderOpusImpl encoder(env, encoder_config, kPayloadType);
+  constexpr size_t kDecoderNumChannels = 2;
+  AudioDecoderOpusImpl decoder(env.field_trials(), kDecoderNumChannels,
+                               kSampleRateHz);
+  uint32_t rtp_timestamp = 0xFFFu;
+  uint32_t timestamp = 0;
+  // Feed the encoder with speech, otherwise DTX will never kick in.
+  EncodeDecodeSpeech(encoder, decoder, rtp_timestamp, timestamp,
+                     /*max_frames=*/100);
+  // Feed the encoder with noise until the decoder is in DTX mode.
+  EncodeDecodeNoiseUntilDecoderInDtxMode(encoder, decoder, rtp_timestamp,
+                                         timestamp);
+
+  // Decode an empty packet so that Opus generates comfort noise.
+  std::array<int16_t, kEncoderFrameLength * kDecoderNumChannels> decoded_frame;
+  AudioDecoder::SpeechType speech_type = AudioDecoder::SpeechType::kSpeech;
+  const int num_decoded_samples = decoder.Decode(
+      /*encoded=*/nullptr, /*encoded_len=*/0, kSampleRateHz,
+      /*max_decoded_bytes=*/decoded_frame.size() * sizeof(int16_t),
+      decoded_frame.data(), &speech_type);
+  RTC_CHECK_GT(num_decoded_samples, 0);  // Decoding must have succeeded.
+  RTC_CHECK_LE(num_decoded_samples, decoded_frame.size());
+  ASSERT_EQ(speech_type, AudioDecoder::SpeechType::kComfortNoise);
+  rtc::ArrayView<const int16_t> decoded_view(decoded_frame.data(),
+                                             num_decoded_samples);
+  // Make sure that comfort noise is not a muted frame.
+  ASSERT_FALSE(IsZeroedFrame(decoded_view));
+  // TODO: https://issues.webrtc.org/376493209 - When fixed, expect true below.
+  EXPECT_FALSE(IsTrivialStereo(decoded_view));
+}
+
+// Stereo encoder, stereo decoder: once the decoder is in DTX mode, the comfort
+// noise it generates for an empty packet must not be trivially stereo.
+TEST(AudioDecoderOpusTest,
+     StereoEncoderStereoDecoderOutputsNonTrivialStereoDtx) {
+  const Environment env = EnvironmentFactory().Create();
+  // Create a stereo encoder with DTX enabled and a stereo decoder.
+  const AudioEncoderOpusConfig encoder_config =
+      GetEncoderConfig(/*num_channels=*/2, /*dtx_enabled=*/true);
+  AudioEncoderOpusImpl encoder(env, encoder_config, kPayloadType);
+  constexpr size_t kDecoderNumChannels = 2;
+  AudioDecoderOpusImpl decoder(env.field_trials(), kDecoderNumChannels,
+                               kSampleRateHz);
+  uint32_t rtp_timestamp = 0xFFFu;
+  uint32_t timestamp = 0;
+  // Feed the encoder with speech, otherwise DTX will never kick in.
+  EncodeDecodeSpeech(encoder, decoder, rtp_timestamp, timestamp,
+                     /*max_frames=*/100);
+  // Feed the encoder with noise and decode until the decoder is in DTX mode.
+  EncodeDecodeNoiseUntilDecoderInDtxMode(encoder, decoder, rtp_timestamp,
+                                         timestamp);
+
+  // Decode an empty packet so that Opus generates comfort noise.
+  std::array<int16_t, kEncoderFrameLength * kDecoderNumChannels> decoded_frame;
+  AudioDecoder::SpeechType speech_type = AudioDecoder::SpeechType::kSpeech;
+  const int num_decoded_samples = decoder.Decode(
+      /*encoded=*/nullptr, /*encoded_len=*/0, kSampleRateHz,
+      /*max_decoded_bytes=*/decoded_frame.size() * sizeof(int16_t),
+      decoded_frame.data(), &speech_type);
+  RTC_CHECK_GT(num_decoded_samples, 0);  // Decoding must have succeeded.
+  RTC_CHECK_LE(num_decoded_samples, decoded_frame.size());
+  ASSERT_EQ(speech_type, AudioDecoder::SpeechType::kComfortNoise);
+  rtc::ArrayView<const int16_t> decoded_view(decoded_frame.data(),
+                                             num_decoded_samples);
+  // Make sure that comfort noise is not a muted frame.
+  ASSERT_FALSE(IsZeroedFrame(decoded_view));
+  EXPECT_FALSE(IsTrivialStereo(decoded_view));
+}
+
+} // namespace webrtc
diff --git a/resources/near48_mono.pcm.sha1 b/resources/near48_mono.pcm.sha1
new file mode 100644
index 0000000..f9254c7
--- /dev/null
+++ b/resources/near48_mono.pcm.sha1
@@ -0,0 +1 @@
+2b752cdcb86095a0c405724aa1ce4ef910e06d10
\ No newline at end of file