| /* |
| * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. |
| * |
| * Use of this source code is governed by a BSD-style license |
| * that can be found in the LICENSE file in the root of the source |
| * tree. An additional intellectual property rights grant can be found |
| * in the file PATENTS. All contributing project authors may |
| * be found in the AUTHORS file in the root of the source tree. |
| */ |
| |
| #include <memory> |
| #include <vector> |
| |
| #include "common_audio/vad/mock/mock_vad.h" |
| #include "modules/audio_coding/codecs/cng/audio_encoder_cng.h" |
| #include "rtc_base/constructor_magic.h" |
| #include "rtc_base/numerics/safe_conversions.h" |
| #include "test/gtest.h" |
| #include "test/mock_audio_encoder.h" |
| |
| using ::testing::Return; |
| using ::testing::_; |
| using ::testing::SetArgPointee; |
| using ::testing::InSequence; |
| using ::testing::Invoke; |
| |
| namespace webrtc { |
| |
| namespace { |
| static const size_t kMaxNumSamples = 48 * 10 * 2; // 10 ms @ 48 kHz stereo. |
| static const size_t kMockReturnEncodedBytes = 17; |
| static const int kCngPayloadType = 18; |
| } // namespace |
| |
| class AudioEncoderCngTest : public ::testing::Test { |
| protected: |
| AudioEncoderCngTest() |
| : mock_encoder_owner_(new MockAudioEncoder), |
| mock_encoder_(mock_encoder_owner_.get()), |
| mock_vad_(new MockVad), |
| timestamp_(4711), |
| num_audio_samples_10ms_(0), |
| sample_rate_hz_(8000) { |
| memset(audio_, 0, kMaxNumSamples * 2); |
| EXPECT_CALL(*mock_encoder_, NumChannels()).WillRepeatedly(Return(1)); |
| } |
| |
| void TearDown() override { |
| EXPECT_CALL(*mock_vad_, Die()).Times(1); |
| cng_.reset(); |
| } |
| |
| AudioEncoderCngConfig MakeCngConfig() { |
| AudioEncoderCngConfig config; |
| config.speech_encoder = std::move(mock_encoder_owner_); |
| EXPECT_TRUE(config.speech_encoder); |
| |
| // Let the AudioEncoderCng object use a MockVad instead of its internally |
| // created Vad object. |
| config.vad = mock_vad_; |
| config.payload_type = kCngPayloadType; |
| |
| return config; |
| } |
| |
| void CreateCng(AudioEncoderCngConfig&& config) { |
| num_audio_samples_10ms_ = static_cast<size_t>(10 * sample_rate_hz_ / 1000); |
| ASSERT_LE(num_audio_samples_10ms_, kMaxNumSamples); |
| if (config.speech_encoder) { |
| EXPECT_CALL(*mock_encoder_, SampleRateHz()) |
| .WillRepeatedly(Return(sample_rate_hz_)); |
| // Max10MsFramesInAPacket() is just used to verify that the SID frame |
| // period is not too small. The return value does not matter that much, |
| // as long as it is smaller than 10. |
| EXPECT_CALL(*mock_encoder_, Max10MsFramesInAPacket()) |
| .WillOnce(Return(1u)); |
| } |
| cng_ = CreateComfortNoiseEncoder(std::move(config)); |
| } |
| |
| void Encode() { |
| ASSERT_TRUE(cng_) << "Must call CreateCng() first."; |
| encoded_info_ = cng_->Encode( |
| timestamp_, |
| rtc::ArrayView<const int16_t>(audio_, num_audio_samples_10ms_), |
| &encoded_); |
| timestamp_ += static_cast<uint32_t>(num_audio_samples_10ms_); |
| } |
| |
| // Expect |num_calls| calls to the encoder, all successful. The last call |
| // claims to have encoded |kMockReturnEncodedBytes| bytes, and all the |
| // preceding ones 0 bytes. |
| void ExpectEncodeCalls(size_t num_calls) { |
| InSequence s; |
| AudioEncoder::EncodedInfo info; |
| for (size_t j = 0; j < num_calls - 1; ++j) { |
| EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)).WillOnce(Return(info)); |
| } |
| info.encoded_bytes = kMockReturnEncodedBytes; |
| EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)) |
| .WillOnce( |
| Invoke(MockAudioEncoder::FakeEncoding(kMockReturnEncodedBytes))); |
| } |
| |
| // Verifies that the cng_ object waits until it has collected |
| // |blocks_per_frame| blocks of audio, and then dispatches all of them to |
| // the underlying codec (speech or cng). |
| void CheckBlockGrouping(size_t blocks_per_frame, bool active_speech) { |
| EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) |
| .WillRepeatedly(Return(blocks_per_frame)); |
| auto config = MakeCngConfig(); |
| const int num_cng_coefficients = config.num_cng_coefficients; |
| CreateCng(std::move(config)); |
| EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) |
| .WillRepeatedly(Return(active_speech ? Vad::kActive : Vad::kPassive)); |
| |
| // Don't expect any calls to the encoder yet. |
| EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)).Times(0); |
| for (size_t i = 0; i < blocks_per_frame - 1; ++i) { |
| Encode(); |
| EXPECT_EQ(0u, encoded_info_.encoded_bytes); |
| } |
| if (active_speech) |
| ExpectEncodeCalls(blocks_per_frame); |
| Encode(); |
| if (active_speech) { |
| EXPECT_EQ(kMockReturnEncodedBytes, encoded_info_.encoded_bytes); |
| } else { |
| EXPECT_EQ(static_cast<size_t>(num_cng_coefficients + 1), |
| encoded_info_.encoded_bytes); |
| } |
| } |
| |
| // Verifies that the audio is partitioned into larger blocks before calling |
| // the VAD. |
| void CheckVadInputSize(int input_frame_size_ms, |
| int expected_first_block_size_ms, |
| int expected_second_block_size_ms) { |
| const size_t blocks_per_frame = |
| static_cast<size_t>(input_frame_size_ms / 10); |
| |
| EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) |
| .WillRepeatedly(Return(blocks_per_frame)); |
| |
| // Expect nothing to happen before the last block is sent to cng_. |
| EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)).Times(0); |
| for (size_t i = 0; i < blocks_per_frame - 1; ++i) { |
| Encode(); |
| } |
| |
| // Let the VAD decision be passive, since an active decision may lead to |
| // early termination of the decision loop. |
| InSequence s; |
| EXPECT_CALL( |
| *mock_vad_, |
| VoiceActivity(_, expected_first_block_size_ms * sample_rate_hz_ / 1000, |
| sample_rate_hz_)) |
| .WillOnce(Return(Vad::kPassive)); |
| if (expected_second_block_size_ms > 0) { |
| EXPECT_CALL(*mock_vad_, |
| VoiceActivity( |
| _, expected_second_block_size_ms * sample_rate_hz_ / 1000, |
| sample_rate_hz_)) |
| .WillOnce(Return(Vad::kPassive)); |
| } |
| |
| // With this call to Encode(), |mock_vad_| should be called according to the |
| // above expectations. |
| Encode(); |
| } |
| |
| // Tests a frame with both active and passive speech. Returns true if the |
| // decision was active speech, false if it was passive. |
| bool CheckMixedActivePassive(Vad::Activity first_type, |
| Vad::Activity second_type) { |
| // Set the speech encoder frame size to 60 ms, to ensure that the VAD will |
| // be called twice. |
| const size_t blocks_per_frame = 6; |
| EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) |
| .WillRepeatedly(Return(blocks_per_frame)); |
| InSequence s; |
| EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) |
| .WillOnce(Return(first_type)); |
| if (first_type == Vad::kPassive) { |
| // Expect a second call to the VAD only if the first frame was passive. |
| EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) |
| .WillOnce(Return(second_type)); |
| } |
| encoded_info_.payload_type = 0; |
| for (size_t i = 0; i < blocks_per_frame; ++i) { |
| Encode(); |
| } |
| return encoded_info_.payload_type != kCngPayloadType; |
| } |
| |
| std::unique_ptr<AudioEncoder> cng_; |
| std::unique_ptr<MockAudioEncoder> mock_encoder_owner_; |
| MockAudioEncoder* mock_encoder_; |
| MockVad* mock_vad_; // Ownership is transferred to |cng_|. |
| uint32_t timestamp_; |
| int16_t audio_[kMaxNumSamples]; |
| size_t num_audio_samples_10ms_; |
| rtc::Buffer encoded_; |
| AudioEncoder::EncodedInfo encoded_info_; |
| int sample_rate_hz_; |
| |
| RTC_DISALLOW_COPY_AND_ASSIGN(AudioEncoderCngTest); |
| }; |
| |
| TEST_F(AudioEncoderCngTest, CreateAndDestroy) { |
| CreateCng(MakeCngConfig()); |
| } |
| |
| TEST_F(AudioEncoderCngTest, CheckFrameSizePropagation) { |
| CreateCng(MakeCngConfig()); |
| EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) |
| .WillOnce(Return(17U)); |
| EXPECT_EQ(17U, cng_->Num10MsFramesInNextPacket()); |
| } |
| |
| TEST_F(AudioEncoderCngTest, CheckTargetAudioBitratePropagation) { |
| CreateCng(MakeCngConfig()); |
| EXPECT_CALL(*mock_encoder_, |
| OnReceivedUplinkBandwidth(4711, absl::optional<int64_t>())); |
| cng_->OnReceivedUplinkBandwidth(4711, absl::nullopt); |
| } |
| |
| TEST_F(AudioEncoderCngTest, CheckPacketLossFractionPropagation) { |
| CreateCng(MakeCngConfig()); |
| EXPECT_CALL(*mock_encoder_, OnReceivedUplinkPacketLossFraction(0.5)); |
| cng_->OnReceivedUplinkPacketLossFraction(0.5); |
| } |
| |
| TEST_F(AudioEncoderCngTest, EncodeCallsVad) { |
| EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) |
| .WillRepeatedly(Return(1U)); |
| CreateCng(MakeCngConfig()); |
| EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) |
| .WillOnce(Return(Vad::kPassive)); |
| Encode(); |
| } |
| |
| TEST_F(AudioEncoderCngTest, EncodeCollects1BlockPassiveSpeech) { |
| CheckBlockGrouping(1, false); |
| } |
| |
| TEST_F(AudioEncoderCngTest, EncodeCollects2BlocksPassiveSpeech) { |
| CheckBlockGrouping(2, false); |
| } |
| |
| TEST_F(AudioEncoderCngTest, EncodeCollects3BlocksPassiveSpeech) { |
| CheckBlockGrouping(3, false); |
| } |
| |
| TEST_F(AudioEncoderCngTest, EncodeCollects1BlockActiveSpeech) { |
| CheckBlockGrouping(1, true); |
| } |
| |
| TEST_F(AudioEncoderCngTest, EncodeCollects2BlocksActiveSpeech) { |
| CheckBlockGrouping(2, true); |
| } |
| |
| TEST_F(AudioEncoderCngTest, EncodeCollects3BlocksActiveSpeech) { |
| CheckBlockGrouping(3, true); |
| } |
| |
| TEST_F(AudioEncoderCngTest, EncodePassive) { |
| const size_t kBlocksPerFrame = 3; |
| EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) |
| .WillRepeatedly(Return(kBlocksPerFrame)); |
| auto config = MakeCngConfig(); |
| const auto sid_frame_interval_ms = config.sid_frame_interval_ms; |
| const auto num_cng_coefficients = config.num_cng_coefficients; |
| CreateCng(std::move(config)); |
| EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) |
| .WillRepeatedly(Return(Vad::kPassive)); |
| // Expect no calls at all to the speech encoder mock. |
| EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)).Times(0); |
| uint32_t expected_timestamp = timestamp_; |
| for (size_t i = 0; i < 100; ++i) { |
| Encode(); |
| // Check if it was time to call the cng encoder. This is done once every |
| // |kBlocksPerFrame| calls. |
| if ((i + 1) % kBlocksPerFrame == 0) { |
| // Now check if a SID interval has elapsed. |
| if ((i % (sid_frame_interval_ms / 10)) < kBlocksPerFrame) { |
| // If so, verify that we got a CNG encoding. |
| EXPECT_EQ(kCngPayloadType, encoded_info_.payload_type); |
| EXPECT_FALSE(encoded_info_.speech); |
| EXPECT_EQ(static_cast<size_t>(num_cng_coefficients) + 1, |
| encoded_info_.encoded_bytes); |
| EXPECT_EQ(expected_timestamp, encoded_info_.encoded_timestamp); |
| } |
| expected_timestamp += rtc::checked_cast<uint32_t>( |
| kBlocksPerFrame * num_audio_samples_10ms_); |
| } else { |
| // Otherwise, expect no output. |
| EXPECT_EQ(0u, encoded_info_.encoded_bytes); |
| } |
| } |
| } |
| |
| // Verifies that the correct action is taken for frames with both active and |
| // passive speech. |
| TEST_F(AudioEncoderCngTest, MixedActivePassive) { |
| CreateCng(MakeCngConfig()); |
| |
| // All of the frame is active speech. |
| ExpectEncodeCalls(6); |
| EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kActive)); |
| EXPECT_TRUE(encoded_info_.speech); |
| |
| // First half of the frame is active speech. |
| ExpectEncodeCalls(6); |
| EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kPassive)); |
| EXPECT_TRUE(encoded_info_.speech); |
| |
| // Second half of the frame is active speech. |
| ExpectEncodeCalls(6); |
| EXPECT_TRUE(CheckMixedActivePassive(Vad::kPassive, Vad::kActive)); |
| EXPECT_TRUE(encoded_info_.speech); |
| |
| // All of the frame is passive speech. Expect no calls to |mock_encoder_|. |
| EXPECT_FALSE(CheckMixedActivePassive(Vad::kPassive, Vad::kPassive)); |
| EXPECT_FALSE(encoded_info_.speech); |
| } |
| |
| // These tests verify that the audio is partitioned into larger blocks before |
| // calling the VAD. |
| // The parameters for CheckVadInputSize are: |
| // CheckVadInputSize(frame_size, expected_first_block_size, |
| // expected_second_block_size); |
| TEST_F(AudioEncoderCngTest, VadInputSize10Ms) { |
| CreateCng(MakeCngConfig()); |
| CheckVadInputSize(10, 10, 0); |
| } |
| TEST_F(AudioEncoderCngTest, VadInputSize20Ms) { |
| CreateCng(MakeCngConfig()); |
| CheckVadInputSize(20, 20, 0); |
| } |
| TEST_F(AudioEncoderCngTest, VadInputSize30Ms) { |
| CreateCng(MakeCngConfig()); |
| CheckVadInputSize(30, 30, 0); |
| } |
| TEST_F(AudioEncoderCngTest, VadInputSize40Ms) { |
| CreateCng(MakeCngConfig()); |
| CheckVadInputSize(40, 20, 20); |
| } |
| TEST_F(AudioEncoderCngTest, VadInputSize50Ms) { |
| CreateCng(MakeCngConfig()); |
| CheckVadInputSize(50, 30, 20); |
| } |
| TEST_F(AudioEncoderCngTest, VadInputSize60Ms) { |
| CreateCng(MakeCngConfig()); |
| CheckVadInputSize(60, 30, 30); |
| } |
| |
| // Verifies that the correct payload type is set when CNG is encoded. |
| TEST_F(AudioEncoderCngTest, VerifyCngPayloadType) { |
| CreateCng(MakeCngConfig()); |
| EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)).Times(0); |
| EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()).WillOnce(Return(1U)); |
| EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) |
| .WillOnce(Return(Vad::kPassive)); |
| encoded_info_.payload_type = 0; |
| Encode(); |
| EXPECT_EQ(kCngPayloadType, encoded_info_.payload_type); |
| } |
| |
| // Verifies that a SID frame is encoded immediately as the signal changes from |
| // active speech to passive. |
| TEST_F(AudioEncoderCngTest, VerifySidFrameAfterSpeech) { |
| auto config = MakeCngConfig(); |
| const auto num_cng_coefficients = config.num_cng_coefficients; |
| CreateCng(std::move(config)); |
| EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) |
| .WillRepeatedly(Return(1U)); |
| // Start with encoding noise. |
| EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) |
| .Times(2) |
| .WillRepeatedly(Return(Vad::kPassive)); |
| Encode(); |
| EXPECT_EQ(kCngPayloadType, encoded_info_.payload_type); |
| EXPECT_EQ(static_cast<size_t>(num_cng_coefficients) + 1, |
| encoded_info_.encoded_bytes); |
| // Encode again, and make sure we got no frame at all (since the SID frame |
| // period is 100 ms by default). |
| Encode(); |
| EXPECT_EQ(0u, encoded_info_.encoded_bytes); |
| |
| // Now encode active speech. |
| encoded_info_.payload_type = 0; |
| EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) |
| .WillOnce(Return(Vad::kActive)); |
| EXPECT_CALL(*mock_encoder_, EncodeImpl(_, _, _)) |
| .WillOnce( |
| Invoke(MockAudioEncoder::FakeEncoding(kMockReturnEncodedBytes))); |
| Encode(); |
| EXPECT_EQ(kMockReturnEncodedBytes, encoded_info_.encoded_bytes); |
| |
| // Go back to noise again, and verify that a SID frame is emitted. |
| EXPECT_CALL(*mock_vad_, VoiceActivity(_, _, _)) |
| .WillOnce(Return(Vad::kPassive)); |
| Encode(); |
| EXPECT_EQ(kCngPayloadType, encoded_info_.payload_type); |
| EXPECT_EQ(static_cast<size_t>(num_cng_coefficients) + 1, |
| encoded_info_.encoded_bytes); |
| } |
| |
| // Resetting the CNG should reset both the VAD and the encoder. |
| TEST_F(AudioEncoderCngTest, Reset) { |
| CreateCng(MakeCngConfig()); |
| EXPECT_CALL(*mock_encoder_, Reset()).Times(1); |
| EXPECT_CALL(*mock_vad_, Reset()).Times(1); |
| cng_->Reset(); |
| } |
| |
| #if GTEST_HAS_DEATH_TEST && !defined(WEBRTC_ANDROID) |
| |
| // This test fixture tests various error conditions that makes the |
| // AudioEncoderCng die via CHECKs. |
| class AudioEncoderCngDeathTest : public AudioEncoderCngTest { |
| protected: |
| AudioEncoderCngDeathTest() : AudioEncoderCngTest() { |
| EXPECT_CALL(*mock_vad_, Die()).Times(1); |
| delete mock_vad_; |
| mock_vad_ = nullptr; |
| } |
| |
| // Override AudioEncoderCngTest::TearDown, since that one expects a call to |
| // the destructor of |mock_vad_|. In this case, that object is already |
| // deleted. |
| void TearDown() override { cng_.reset(); } |
| |
| AudioEncoderCngConfig MakeCngConfig() { |
| // Don't provide a Vad mock object, since it would leak when the test dies. |
| auto config = AudioEncoderCngTest::MakeCngConfig(); |
| config.vad = nullptr; |
| return config; |
| } |
| |
| void TryWrongNumCoefficients(int num) { |
| EXPECT_DEATH( |
| [&] { |
| auto config = MakeCngConfig(); |
| config.num_cng_coefficients = num; |
| CreateCng(std::move(config)); |
| }(), |
| "Invalid configuration"); |
| } |
| }; |
| |
| TEST_F(AudioEncoderCngDeathTest, WrongFrameSize) { |
| CreateCng(MakeCngConfig()); |
| num_audio_samples_10ms_ *= 2; // 20 ms frame. |
| EXPECT_DEATH(Encode(), ""); |
| num_audio_samples_10ms_ = 0; // Zero samples. |
| EXPECT_DEATH(Encode(), ""); |
| } |
| |
| TEST_F(AudioEncoderCngDeathTest, WrongNumCoefficientsA) { |
| TryWrongNumCoefficients(-1); |
| } |
| |
| TEST_F(AudioEncoderCngDeathTest, WrongNumCoefficientsB) { |
| TryWrongNumCoefficients(0); |
| } |
| |
| TEST_F(AudioEncoderCngDeathTest, WrongNumCoefficientsC) { |
| TryWrongNumCoefficients(13); |
| } |
| |
| TEST_F(AudioEncoderCngDeathTest, NullSpeechEncoder) { |
| auto config = MakeCngConfig(); |
| config.speech_encoder = nullptr; |
| EXPECT_DEATH(CreateCng(std::move(config)), ""); |
| } |
| |
| TEST_F(AudioEncoderCngDeathTest, StereoEncoder) { |
| EXPECT_CALL(*mock_encoder_, NumChannels()).WillRepeatedly(Return(2)); |
| EXPECT_DEATH(CreateCng(MakeCngConfig()), "Invalid configuration"); |
| } |
| |
| TEST_F(AudioEncoderCngDeathTest, StereoConfig) { |
| EXPECT_DEATH( |
| [&] { |
| auto config = MakeCngConfig(); |
| config.num_channels = 2; |
| CreateCng(std::move(config)); |
| }(), |
| "Invalid configuration"); |
| } |
| |
| TEST_F(AudioEncoderCngDeathTest, EncoderFrameSizeTooLarge) { |
| CreateCng(MakeCngConfig()); |
| EXPECT_CALL(*mock_encoder_, Num10MsFramesInNextPacket()) |
| .WillRepeatedly(Return(7U)); |
| for (int i = 0; i < 6; ++i) |
| Encode(); |
| EXPECT_DEATH(Encode(), |
| "Frame size cannot be larger than 60 ms when using VAD/CNG."); |
| } |
| |
| #endif // GTEST_HAS_DEATH_TEST |
| |
| } // namespace webrtc |