/*
 *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

// This file consists of unit tests for webrtc::test::conversational_speech
// members. Some of them focus on accepting or rejecting different
// conversational speech setups. A setup is defined by a set of audio tracks
// and timing information.
// The docstring at the beginning of each TEST(ConversationalSpeechTest,
// MultiEndCallSetup*) function looks like the drawing below and indicates
// which setup is tested.
//
//    Accept:
//    A 0****.....
//    B .....1****
//
// The drawing indicates the following:
// - the illustrated setup should be accepted,
// - there are two speakers (namely, A and B),
// - A speaks first and B speaks second,
// - each character after the speaker's letter indicates a time unit (e.g., 100
//   ms),
// - "*" indicates speaking, "." listening,
// - numbers indicate the turn index in std::vector<Turn>.
//
// Note that the same speaker can appear in multiple lines in order to depict
// cases in which there are wrong offsets leading to self cross-talk (which is
// rejected).
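//
// In the timing vectors used by the tests, each Turn is written as
// {speaker name, audio track name, offset, gain}; the gain field is 0 in
// every test in this file. The offset is expressed in milliseconds relative
// to the end of the previous turn: a positive value inserts a pause before
// the turn starts, while a negative value makes it overlap with the previous
// turn (cross-talk). The expected durations asserted below follow from these
// offsets.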

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <map>
#include <memory>
#include <numbers>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/string_view.h"
#include "common_audio/wav_file.h"
#include "modules/audio_processing/test/conversational_speech/config.h"
#include "modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
#include "modules/audio_processing/test/conversational_speech/multiend_call.h"
#include "modules/audio_processing/test/conversational_speech/simulator.h"
#include "modules/audio_processing/test/conversational_speech/timing.h"
#include "modules/audio_processing/test/conversational_speech/wavreader_factory.h"
#include "rtc_base/logging.h"
#include "rtc_base/numerics/safe_conversions.h"
#include "test/gmock.h"
#include "test/gtest.h"
#include "test/testsupport/file_utils.h"

namespace webrtc {
namespace test {
namespace {

using conversational_speech::LoadTiming;
using conversational_speech::MockWavReaderFactory;
using conversational_speech::MultiEndCall;
using conversational_speech::SaveTiming;
using conversational_speech::Turn;
using conversational_speech::WavReaderFactory;

const char* const audiotracks_path = "/path/to/audiotracks";
const char* const timing_filepath = "/path/to/timing_file.txt";
const char* const output_path = "/path/to/output_dir";

const std::vector<Turn> expected_timing = {
    {"A", "a1", 0, 0},    {"B", "b1", 0, 0}, {"A", "a2", 100, 0},
    {"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0},
};
const std::size_t kNumberOfTurns = expected_timing.size();

// Default arguments for MockWavReaderFactory ctor.
// Fake audio track parameters.
constexpr int kDefaultSampleRate = 48000;
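// Each entry below maps an audio track name to {sample rate (Hz), number of
// channels, number of samples}; e.g., 24000 samples at 48 kHz is 0.5 seconds.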
const std::map<std::string, const MockWavReaderFactory::Params>
    kDefaultMockWavReaderFactoryParamsMap = {
        {"t300", {kDefaultSampleRate, 1u, 14400u}},   // Mono, 0.3 seconds.
        {"t500", {kDefaultSampleRate, 1u, 24000u}},   // Mono, 0.5 seconds.
        {"t1000", {kDefaultSampleRate, 1u, 48000u}},  // Mono, 1.0 seconds.
        {"sr8000", {8000, 1u, 8000u}},     // 8kHz sample rate, mono, 1 second.
        {"sr16000", {16000, 1u, 16000u}},  // 16kHz sample rate, mono, 1 second.
        {"sr16000_stereo", {16000, 2u, 16000u}},  // Like sr16000, but stereo.
};
const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
    kDefaultMockWavReaderFactoryParamsMap.at("t500");

std::unique_ptr<MockWavReaderFactory> CreateMockWavReaderFactory() {
  return std::make_unique<MockWavReaderFactory>(
      kDefaultMockWavReaderFactoryParams,
      kDefaultMockWavReaderFactoryParamsMap);
}

void CreateSineWavFile(absl::string_view filepath,
                       const MockWavReaderFactory::Params& params,
                       float frequency_hz = 440.0f) {
  const double phase_step =
      2 * std::numbers::pi * frequency_hz / params.sample_rate;
  double phase = 0.0;
  std::vector<int16_t> samples(params.num_samples);
  for (size_t i = 0; i < params.num_samples; ++i) {
    samples[i] = saturated_cast<int16_t>(32767.0f * std::sin(phase));
    phase += phase_step;
  }

  WavWriter wav_writer(filepath, params.sample_rate, params.num_channels);
  wav_writer.WriteSamples(samples.data(), params.num_samples);
}

// Parameters to generate audio tracks with CreateSineWavFile.
struct SineAudioTrackParams {
  MockWavReaderFactory::Params params;
  float frequency;
};

// Creates a temporary directory in which sine audio tracks are written.
std::string CreateTemporarySineAudioTracks(
    const std::map<std::string, SineAudioTrackParams>& sine_tracks_params) {
  // Create temporary directory.
  std::string temp_directory =
      OutputPath() + "TempConversationalSpeechAudioTracks";
  CreateDir(temp_directory);

  // Create sine tracks.
  for (const auto& it : sine_tracks_params) {
    const std::string temp_filepath = JoinFilename(temp_directory, it.first);
    CreateSineWavFile(temp_filepath, it.second.params, it.second.frequency);
  }

  return temp_directory;
}

void CheckAudioTrackParams(
    const WavReaderFactory& wav_reader_factory,
    absl::string_view filepath,
    const MockWavReaderFactory::Params& expected_params) {
  auto wav_reader = wav_reader_factory.Create(filepath);
  EXPECT_EQ(expected_params.sample_rate, wav_reader->SampleRate());
  EXPECT_EQ(expected_params.num_channels, wav_reader->NumChannels());
  EXPECT_EQ(expected_params.num_samples, wav_reader->NumSamples());
}

void DeleteFolderAndContents(absl::string_view dir) {
  if (!DirExists(dir)) {
    return;
  }
  std::optional<std::vector<std::string>> dir_content = ReadDirectory(dir);
  EXPECT_TRUE(dir_content);
  for (const auto& path : *dir_content) {
    if (DirExists(path)) {
      DeleteFolderAndContents(path);
    } else if (FileExists(path)) {
      // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
      RemoveFile(path);
    } else {
      FAIL();
    }
  }
  // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
  RemoveDir(dir);
}

}  // namespace

using ::testing::_;

TEST(ConversationalSpeechTest, Settings) {
  const conversational_speech::Config config(audiotracks_path, timing_filepath,
                                             output_path);

  // Test getters.
  EXPECT_EQ(audiotracks_path, config.audiotracks_path());
  EXPECT_EQ(timing_filepath, config.timing_filepath());
  EXPECT_EQ(output_path, config.output_path());
}

TEST(ConversationalSpeechTest, TimingSaveLoad) {
  // Save test timing.
  const std::string temporary_filepath =
      TempFilename(OutputPath(), "TempTimingTestFile");
  SaveTiming(temporary_filepath, expected_timing);

  // Create a std::vector<Turn> instance by loading from file.
  std::vector<Turn> actual_timing = LoadTiming(temporary_filepath);
  RemoveFile(temporary_filepath);

  // Check size.
  EXPECT_EQ(expected_timing.size(), actual_timing.size());

  // Check Turn instances.
  for (size_t index = 0; index < expected_timing.size(); ++index) {
    EXPECT_EQ(expected_timing[index], actual_timing[index])
        << "turn #" << index << " not matching";
  }
}

TEST(ConversationalSpeechTest, MultiEndCallCreate) {
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are 5 unique audio tracks to read (a1, b1, a2, b2 and a3; a3 is
  // used in two turns).
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(5);

  // Inject the mock wav reader factory.
  MultiEndCall multiend_call(expected_timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(2u, multiend_call.speaker_names().size());
  EXPECT_EQ(5u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(6u, multiend_call.speaking_turns().size());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
  const std::vector<Turn> timing = {
      {"A", "sr8000", 0, 0},
      {"B", "sr16000", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
  const std::vector<Turn> timing = {
      {"A", "sr16000_stereo", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There is one unique audio track to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}

TEST(ConversationalSpeechTest,
     MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
  const std::vector<Turn> timing = {
      {"A", "sr8000", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
  const std::vector<Turn> timing = {
      {"A", "t500", -100, 0},
      {"B", "t500", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There is one unique audio track to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
  // Accept:
  // A 0****.....
  // B .....1****
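  // Expected duration: two back-to-back 0.5 s turns ("t500") make 1.0 s of
  // audio, i.e. kDefaultSampleRate samples at 48 kHz.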
  constexpr std::size_t expected_duration = kDefaultSampleRate;
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},
      {"B", "t500", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There is one unique audio track to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(2u, multiend_call.speaker_names().size());
  EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(2u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
  // Accept:
  // A 0****.......
  // B .......1****
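  // Expected duration: 0.5 s + 0.2 s pause + 0.5 s = 1.2 s.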
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},
      {"B", "t500", 200, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There is one unique audio track to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(2u, multiend_call.speaker_names().size());
  EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(2u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
  // Accept:
  // A 0****....
  // B ....1****
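  // Expected duration: 0.5 s + 0.5 s with a 0.1 s overlap = 0.9 s.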
  constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9;
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},
      {"B", "t500", -100, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There is one unique audio track to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(2u, multiend_call.speaker_names().size());
  EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(2u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
  // Reject:
  // A ..0****
  // B .1****.  The n-th turn cannot start before the (n-1)-th one.
  const std::vector<Turn> timing = {
      {"A", "t500", 200, 0},
      {"B", "t500", -600, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There is one unique audio track to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
  // Accept:
  // A 0****2****...
  // B ...1*********
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3;
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},
      {"B", "t1000", -200, 0},
      {"A", "t500", -800, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(2u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(3u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
  // Reject:
  // A 0****......
  // A ...1****...
  // B ......2****
  //      ^  Turn #1 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},
      {"A", "t500", -200, 0},
      {"B", "t500", -200, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There is one unique audio track to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
  // Reject:
  // A 0*********
  // B 1**.......
  // C ...2**....
  // A ......3**.
  //         ^  Turn #3 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> timing = {
      {"A", "t1000", 0, 0},
      {"B", "t300", -1000, 0},
      {"C", "t300", 0, 0},
      {"A", "t300", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
  // Accept:
  // A 0*********..
  // B ..1****.....
  // C .......2****
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  const std::vector<Turn> timing = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(3u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
  // Reject:
  // A 0*********
  // B ..1****...
  // C ....2****.
  //       ^  Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
  //          not permitted).
  const std::vector<Turn> timing = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", -300, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
  // Accept:
  // A 0*********..
  // B .1****......
  // C .......2****
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  const std::vector<Turn> timing = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -900, 0},
      {"C", "t500", 100, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(3u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
  // Accept:
  // A 0****
  // B 1****
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},
      {"B", "t500", -500, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There is one unique audio track to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(2u, multiend_call.speaker_names().size());
  EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(2u, multiend_call.speaking_turns().size());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
  // Accept:
  // A 0****....3****.5**.
  // B .....1****...4**...
  // C ......2**.......6**
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
      {"A", "t500", 0, 0},    {"B", "t300", -100, 0}, {"A", "t300", -100, 0},
      {"C", "t300", -200, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(7u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}

TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
  // Reject:
  // A 0****....3****5**.
  // B .....1****...4**..
  // C ......2**......6**
  //                  ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
  //                    speakers not permitted).
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
      {"A", "t500", 0, 0},    {"B", "t300", -100, 0}, {"A", "t300", -200, 0},
      {"C", "t300", -200, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}

TEST(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
  // Parameters with which wav files are created.
  constexpr int duration_seconds = 5;
  const int sample_rates[] = {8000, 11025, 16000, 22050, 32000, 44100, 48000};

  for (int sample_rate : sample_rates) {
    const std::string temp_filename = OutputPath() + "TempSineWavFile_" +
                                      std::to_string(sample_rate) + ".wav";

    // Write wav file.
    const std::size_t num_samples = duration_seconds * sample_rate;
    MockWavReaderFactory::Params params = {sample_rate, 1u, num_samples};
    CreateSineWavFile(temp_filename, params);

    // Load wav file and check if params match.
    WavReaderFactory wav_reader_factory;
    MockWavReaderFactory::Params expected_params = {sample_rate, 1u,
                                                    num_samples};
    CheckAudioTrackParams(wav_reader_factory, temp_filename, expected_params);

    // Clean up.
    RemoveFile(temp_filename);
  }
}

TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
  // Simulated call (one character corresponding to 500 ms):
  // A 0*********...........2*********.....
  // B ...........1*********.....3*********
  const std::vector<Turn> expected_timing_multiend = {
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", 500, 0},
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", -2500, 0},
  };
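  // Expected duration: 4 x 5 s of speech + 0.5 s pause - 2.5 s overlap = 18 s.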
  const std::size_t expected_duration_seconds = 18;

  // Create temporary audio track files.
  const int sample_rate = 16000;
  const std::map<std::string, SineAudioTrackParams> sine_tracks_params = {
      {"t5000_440.wav", {{sample_rate, 1u, sample_rate * 5}, 440.0}},
      {"t5000_880.wav", {{sample_rate, 1u, sample_rate * 5}, 880.0}},
  };
  const std::string audiotracks_path_multiend =
      CreateTemporarySineAudioTracks(sine_tracks_params);

  // Set up the multi-end call.
  auto wavreader_factory = std::make_unique<WavReaderFactory>();
  MultiEndCall multiend_call(expected_timing_multiend,
                             audiotracks_path_multiend,
                             std::move(wavreader_factory));

  // Simulate the call.
  std::string output_path_multiend =
      JoinFilename(audiotracks_path_multiend, "output");
  CreateDir(output_path_multiend);
  RTC_LOG(LS_VERBOSE) << "simulator output path: " << output_path_multiend;
  auto generated_audiotrack_pairs =
      conversational_speech::Simulate(multiend_call, output_path_multiend);
  EXPECT_EQ(2u, generated_audiotrack_pairs->size());

  // Check the output.
  WavReaderFactory wav_reader_factory;
  const MockWavReaderFactory::Params expected_params = {
      sample_rate, 1u, sample_rate * expected_duration_seconds};
  for (const auto& it : *generated_audiotrack_pairs) {
    RTC_LOG(LS_VERBOSE) << "checking far/near-end for <" << it.first << ">";
    CheckAudioTrackParams(wav_reader_factory, it.second.near_end,
                          expected_params);
    CheckAudioTrackParams(wav_reader_factory, it.second.far_end,
                          expected_params);
  }

  // Clean up.
  EXPECT_NO_FATAL_FAILURE(DeleteFolderAndContents(audiotracks_path_multiend));
}

}  // namespace test
}  // namespace webrtc