From 19e087fc91453b761b6b56ba84e7c9c5fafd9e8e Mon Sep 17 00:00:00 2001 From: alessiob Date: Thu, 15 Jun 2017 03:49:57 -0700 Subject: [PATCH] This CL finalizes the Conversational Speech tool. The following changes have been made: - command line args wired, - user output added, - final polishing. BUG=webrtc:7218 Review-Url: https://codereview.webrtc.org/2808053002 Cr-Commit-Position: refs/heads/master@{#18609} --- .../test/conversational_speech/BUILD.gn | 3 + .../test/conversational_speech/README.md | 4 +- .../test/conversational_speech/generator.cc | 35 +++++++++--- .../generator_unittest.cc | 55 ++++++++----------- .../conversational_speech/mock_wavreader.h | 2 +- .../test/conversational_speech/timing.cc | 2 +- 6 files changed, 57 insertions(+), 44 deletions(-) diff --git a/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn b/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn index df12fc1527..af24f8ab5e 100644 --- a/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn +++ b/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn @@ -69,5 +69,8 @@ rtc_source_set("unittest") { "../../../../../webrtc/test:test_support", "//testing/gmock", "//testing/gtest", + "//webrtc:webrtc_common", + "//webrtc/base:rtc_base_approved", + "//webrtc/test:test_support", ] } diff --git a/webrtc/modules/audio_processing/test/conversational_speech/README.md b/webrtc/modules/audio_processing/test/conversational_speech/README.md index 415c65b027..bbb4112fc0 100644 --- a/webrtc/modules/audio_processing/test/conversational_speech/README.md +++ b/webrtc/modules/audio_processing/test/conversational_speech/README.md @@ -17,9 +17,7 @@ For instance, echo cancellation in the APM module can be evaluated using two-end audio tracks as input and reverse input. By indicating negative and positive time offsets, one can reproduce cross-talk -and silence in the conversation. 
- -IMPORTANT: **the whole code has not been landed yet.** +(aka double-talk) and silence in the conversation. ### Example diff --git a/webrtc/modules/audio_processing/test/conversational_speech/generator.cc b/webrtc/modules/audio_processing/test/conversational_speech/generator.cc index 923736ffef..57996c14a4 100644 --- a/webrtc/modules/audio_processing/test/conversational_speech/generator.cc +++ b/webrtc/modules/audio_processing/test/conversational_speech/generator.cc @@ -11,8 +11,12 @@ #include <iostream> #include "gflags/gflags.h" -#include "webrtc/base/logging.h" +#include "webrtc/base/ptr_util.h" #include "webrtc/modules/audio_processing/test/conversational_speech/config.h" +#include "webrtc/modules/audio_processing/test/conversational_speech/timing.h" +#include "webrtc/modules/audio_processing/test/conversational_speech/wavreader_factory.h" +#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h" +#include "webrtc/modules/audio_processing/test/conversational_speech/simulator.h" #include "webrtc/test/testsupport/fileutils.h" namespace webrtc { @@ -48,14 +52,31 @@ DEFINE_validator(o, dir_exists); int main(int argc, char* argv[]) { google::SetUsageMessage(kUsageDescription); google::ParseCommandLineFlags(&argc, &argv, true); - conversational_speech::Config config(FLAGS_i, FLAGS_t, FLAGS_o); - // TODO(alessiob): remove line below once debugged. - rtc::LogMessage::LogToDebug(rtc::LS_VERBOSE); - LOG(LS_VERBOSE) << "i = " << config.audiotracks_path(); - LOG(LS_VERBOSE) << "t = " << config.timing_filepath(); - LOG(LS_VERBOSE) << "o = " << config.output_path(); + // Load timing. + std::vector<conversational_speech::Turn> timing = + conversational_speech::LoadTiming(config.timing_filepath()); + + // Parse timing and audio tracks. + auto wavreader_factory = rtc::MakeUnique< + conversational_speech::WavReaderFactory>(); + conversational_speech::MultiEndCall multiend_call( + timing, config.audiotracks_path(), std::move(wavreader_factory)); + + // Generate output audio tracks. 
+ auto generated_audiotrack_pairs = conversational_speech::Simulate( + multiend_call, config.output_path()); + + // Show paths to created audio tracks. + std::cout << "Output files:" << std::endl; + for (const auto& output_paths_entry : *generated_audiotrack_pairs) { + std::cout << " speaker: " << output_paths_entry.first << std::endl; + std::cout << " near end: " << output_paths_entry.second.near_end + << std::endl; + std::cout << " far end: " << output_paths_entry.second.far_end + << std::endl; + } return 0; } diff --git a/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc b/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc index b112e8ff25..c38fc435c5 100644 --- a/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc +++ b/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc @@ -12,7 +12,7 @@ // members. Part of them focus on accepting or rejecting different // conversational speech setups. A setup is defined by a set of audio tracks and // timing information). -// The docstring at the beginning of each TEST_F(ConversationalSpeechTest, +// The docstring at the beginning of each TEST(ConversationalSpeechTest, // MultiEndCallSetup*) function looks like the drawing below and indicates which // setup is tested. // @@ -174,16 +174,7 @@ void DeleteFolderAndContents(const std::string& dir) { using testing::_; -// TODO(alessiob): Remove fixture once conversational_speech fully implemented -// and replace TEST_F with TEST. 
-class ConversationalSpeechTest : public testing::Test { - public: - ConversationalSpeechTest() { - rtc::LogMessage::LogToDebug(rtc::LS_VERBOSE); - } -}; - -TEST_F(ConversationalSpeechTest, Settings) { +TEST(ConversationalSpeechTest, Settings) { const conversational_speech::Config config( audiotracks_path, timing_filepath, output_path); @@ -193,7 +184,7 @@ TEST_F(ConversationalSpeechTest, Settings) { EXPECT_EQ(output_path, config.output_path()); } -TEST_F(ConversationalSpeechTest, TimingSaveLoad) { +TEST(ConversationalSpeechTest, TimingSaveLoad) { // Save test timing. const std::string temporary_filepath = TempFilename( OutputPath(), "TempTimingTestFile"); @@ -213,7 +204,7 @@ TEST_F(ConversationalSpeechTest, TimingSaveLoad) { } } -TEST_F(ConversationalSpeechTest, MultiEndCallCreate) { +TEST(ConversationalSpeechTest, MultiEndCallCreate) { auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There are 5 unique audio tracks to read. @@ -230,7 +221,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallCreate) { EXPECT_EQ(6u, multiend_call.speaking_turns().size()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) { +TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) { const std::vector timing = { {"A", "sr8000", 0}, {"B", "sr16000", 0}, @@ -245,7 +236,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) { EXPECT_FALSE(multiend_call.valid()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) { +TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) { const std::vector timing = { {"A", "sr16000_stereo", 0}, {"B", "sr16000_stereo", 0}, @@ -260,7 +251,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) { EXPECT_FALSE(multiend_call.valid()); } -TEST_F(ConversationalSpeechTest, +TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) { const std::vector timing = { {"A", "sr8000", 0}, @@ -276,7 +267,7 @@ 
TEST_F(ConversationalSpeechTest, EXPECT_FALSE(multiend_call.valid()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) { +TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) { const std::vector timing = { {"A", "t500", -100}, {"B", "t500", 0}, @@ -291,7 +282,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) { EXPECT_FALSE(multiend_call.valid()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupSimple) { +TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) { // Accept: // A 0****..... // B .....1**** @@ -316,7 +307,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupSimple) { EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupPause) { +TEST(ConversationalSpeechTest, MultiEndCallSetupPause) { // Accept: // A 0****....... // B .......1**** @@ -341,7 +332,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupPause) { EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) { +TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) { // Accept: // A 0****.... // B ....1**** @@ -366,7 +357,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) { EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) { +TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) { // Reject: // A ..0**** // B .1****. The n-th turn cannot start before the (n-1)-th one. @@ -384,7 +375,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) { EXPECT_FALSE(multiend_call.valid()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) { +TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) { // Accept: // A 0****2****... 
// B ...1********* @@ -410,7 +401,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) { EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) { +TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) { // Reject: // A 0****...... // A ...1****... @@ -431,7 +422,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) { EXPECT_FALSE(multiend_call.valid()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) { +TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) { // Reject: // A 0********* // B 1**....... @@ -454,7 +445,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) { EXPECT_FALSE(multiend_call.valid()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) { +TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) { // Accept: // A 0*********.. // B ..1****..... @@ -481,7 +472,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) { EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) { +TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) { // Reject: // A 0********* // B ..1****... @@ -503,7 +494,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) { EXPECT_FALSE(multiend_call.valid()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) { +TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) { // Accept: // A 0*********.. // B .2****...... 
@@ -530,7 +521,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) { EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) { +TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) { // Accept: // A 0**** // B 1**** @@ -553,7 +544,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) { EXPECT_EQ(2u, multiend_call.speaking_turns().size()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequence) { +TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) { // Accept: // A 0****....3****.5**. // B .....1****...4**... @@ -586,7 +577,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequence) { EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } -TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) { +TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) { // Reject: // A 0****....3****.6** // B .....1****...4**.. @@ -614,7 +605,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) { EXPECT_FALSE(multiend_call.valid()); } -TEST_F(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) { +TEST(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) { // Parameters with which wav files are created. constexpr int duration_seconds = 5; const int sample_rates[] = {8000, 11025, 16000, 22050, 32000, 44100, 48000}; @@ -641,7 +632,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) { } } -TEST_F(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) { +TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) { // Simulated call (one character corresponding to 500 ms): // A 0*********...........2*********..... 
// B ...........1*********.....3********* diff --git a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h index 030e163e80..98dfaa27e8 100644 --- a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h +++ b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h @@ -28,7 +28,7 @@ class MockWavReader : public WavReaderInterface { MockWavReader(int sample_rate, size_t num_channels, size_t num_samples); ~MockWavReader(); - // TODO(alessiob): use ON_CALL to return random samples. + // TODO(alessiob): use ON_CALL to return random samples if needed. MOCK_METHOD1(ReadFloatSamples, size_t(rtc::ArrayView<float>)); MOCK_METHOD1(ReadInt16Samples, size_t(rtc::ArrayView<int16_t>)); diff --git a/webrtc/modules/audio_processing/test/conversational_speech/timing.cc b/webrtc/modules/audio_processing/test/conversational_speech/timing.cc index 0aa44fa42c..53076f1cac 100644 --- a/webrtc/modules/audio_processing/test/conversational_speech/timing.cc +++ b/webrtc/modules/audio_processing/test/conversational_speech/timing.cc @@ -53,7 +53,7 @@ std::vector<Turn> LoadTiming(const std::string& timing_filepath) { void SaveTiming(const std::string& timing_filepath, rtc::ArrayView<const Turn> timing) { std::ofstream outfile(timing_filepath); - // TODO(alessio): check if file open for writing. + RTC_CHECK(outfile.is_open()); for (const Turn& turn : timing) { outfile << turn.speaker_name << " " << turn.audiotrack_file_name << " " << turn.offset << std::endl;