This CL finalizes the Conversational Speech tool.

The following changes have been made:
- command line args wired,
- user output added,
- final polishing.

BUG=webrtc:7218
Review-Url: https://codereview.webrtc.org/2808053002
Cr-Commit-Position: refs/heads/master@{#18609}
2017-06-15 03:49:57 -07:00 · 2017-06-15 03:49:57 -07:00 · 19e087fc91
commit 19e087fc91
parent b2152b7b61
6 changed files with 57 additions and 44 deletions
--- a/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn
+++ b/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn
@ -69,5 +69,8 @@ rtc_source_set("unittest") {
    "../../../../../webrtc/test:test_support",
    "//testing/gmock",
    "//testing/gtest",
+    "//webrtc:webrtc_common",
+    "//webrtc/base:rtc_base_approved",
+    "//webrtc/test:test_support",
  ]
 }
--- a/webrtc/modules/audio_processing/test/conversational_speech/README.md
+++ b/webrtc/modules/audio_processing/test/conversational_speech/README.md
@ -17,9 +17,7 @@ For instance, echo cancellation in the APM module can be evaluated using two-end
 audio tracks as input and reverse input.

 By indicating negative and positive time offsets, one can reproduce cross-talk
-and silence in the conversation.
-
-IMPORTANT: **the whole code has not been landed yet.**
+(aka double-talk) and silence in the conversation.

 ### Example

--- a/webrtc/modules/audio_processing/test/conversational_speech/generator.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/generator.cc
@ -11,8 +11,12 @@
 #include <iostream>

 #include "gflags/gflags.h"
-#include "webrtc/base/logging.h"
+#include "webrtc/base/ptr_util.h"
 #include "webrtc/modules/audio_processing/test/conversational_speech/config.h"
+#include "webrtc/modules/audio_processing/test/conversational_speech/timing.h"
+#include "webrtc/modules/audio_processing/test/conversational_speech/wavreader_factory.h"
+#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
+#include "webrtc/modules/audio_processing/test/conversational_speech/simulator.h"
 #include "webrtc/test/testsupport/fileutils.h"

 namespace webrtc {
@ -48,14 +52,31 @@ DEFINE_validator(o, dir_exists);
 int main(int argc, char* argv[]) {
  google::SetUsageMessage(kUsageDescription);
  google::ParseCommandLineFlags(&argc, &argv, true);
-
  conversational_speech::Config config(FLAGS_i, FLAGS_t, FLAGS_o);

-  // TODO(alessiob): remove line below once debugged.
-  rtc::LogMessage::LogToDebug(rtc::LS_VERBOSE);
-  LOG(LS_VERBOSE) << "i = " << config.audiotracks_path();
-  LOG(LS_VERBOSE) << "t = " << config.timing_filepath();
-  LOG(LS_VERBOSE) << "o = " << config.output_path();
+  // Load timing.
+  std::vector<conversational_speech::Turn> timing =
+      conversational_speech::LoadTiming(config.timing_filepath());
+
+  // Parse timing and audio tracks.
+  auto wavreader_factory = rtc::MakeUnique<
+      conversational_speech::WavReaderFactory>();
+  conversational_speech::MultiEndCall multiend_call(
+      timing, config.audiotracks_path(), std::move(wavreader_factory));
+
+  // Generate output audio tracks.
+  auto generated_audiotrack_pairs = conversational_speech::Simulate(
+      multiend_call, config.output_path());
+
+  // Show paths to created audio tracks.
+  std::cout << "Output files:" << std::endl;
+  for (const auto& output_paths_entry : *generated_audiotrack_pairs) {
+    std::cout << "  speaker: " << output_paths_entry.first << std::endl;
+    std::cout << "    near end: " << output_paths_entry.second.near_end
+        << std::endl;
+    std::cout << "    far end: " << output_paths_entry.second.far_end
+        << std::endl;
+  }

  return 0;
 }
--- a/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
@ -12,7 +12,7 @@
 // members. Part of them focus on accepting or rejecting different
 // conversational speech setups. A setup is defined by a set of audio tracks and
 // timing information).
-// The docstring at the beginning of each TEST_F(ConversationalSpeechTest,
+// The docstring at the beginning of each TEST(ConversationalSpeechTest,
 // MultiEndCallSetup*) function looks like the drawing below and indicates which
 // setup is tested.
 //
@ -174,16 +174,7 @@ void DeleteFolderAndContents(const std::string& dir) {

 using testing::_;

-// TODO(alessiob): Remove fixture once conversational_speech fully implemented
-// and replace TEST_F with TEST.
-class ConversationalSpeechTest : public testing::Test {
- public:
-  ConversationalSpeechTest() {
-    rtc::LogMessage::LogToDebug(rtc::LS_VERBOSE);
-  }
-};
-
-TEST_F(ConversationalSpeechTest, Settings) {
+TEST(ConversationalSpeechTest, Settings) {
  const conversational_speech::Config config(
      audiotracks_path, timing_filepath, output_path);

@ -193,7 +184,7 @@ TEST_F(ConversationalSpeechTest, Settings) {
  EXPECT_EQ(output_path, config.output_path());
 }

-TEST_F(ConversationalSpeechTest, TimingSaveLoad) {
+TEST(ConversationalSpeechTest, TimingSaveLoad) {
  // Save test timing.
  const std::string temporary_filepath = TempFilename(
      OutputPath(), "TempTimingTestFile");
@ -213,7 +204,7 @@ TEST_F(ConversationalSpeechTest, TimingSaveLoad) {
  }
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallCreate) {
+TEST(ConversationalSpeechTest, MultiEndCallCreate) {
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are 5 unique audio tracks to read.
@ -230,7 +221,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallCreate) {
  EXPECT_EQ(6u, multiend_call.speaking_turns().size());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
  const std::vector<Turn> timing = {
      {"A", "sr8000", 0},
      {"B", "sr16000", 0},
@ -245,7 +236,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
  EXPECT_FALSE(multiend_call.valid());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
  const std::vector<Turn> timing = {
      {"A", "sr16000_stereo", 0},
      {"B", "sr16000_stereo", 0},
@ -260,7 +251,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
  EXPECT_FALSE(multiend_call.valid());
 }

-TEST_F(ConversationalSpeechTest,
+TEST(ConversationalSpeechTest,
       MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
  const std::vector<Turn> timing = {
      {"A", "sr8000", 0},
@ -276,7 +267,7 @@ TEST_F(ConversationalSpeechTest,
  EXPECT_FALSE(multiend_call.valid());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
  const std::vector<Turn> timing = {
      {"A", "t500", -100},
      {"B", "t500", 0},
@ -291,7 +282,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
  EXPECT_FALSE(multiend_call.valid());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupSimple) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
  // Accept:
  // A 0****.....
  // B .....1****
@ -316,7 +307,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupSimple) {
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupPause) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
  // Accept:
  // A 0****.......
  // B .......1****
@ -341,7 +332,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupPause) {
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
  // Accept:
  // A 0****....
  // B ....1****
@ -366,7 +357,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
  // Reject:
  // A ..0****
  // B .1****.  The n-th turn cannot start before the (n-1)-th one.
@ -384,7 +375,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
  EXPECT_FALSE(multiend_call.valid());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
  // Accept:
  // A 0****2****...
  // B ...1*********
@ -410,7 +401,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
  // Reject:
  // A 0****......
  // A ...1****...
@ -431,7 +422,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
  EXPECT_FALSE(multiend_call.valid());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
  // Reject:
  // A 0*********
  // B 1**.......
@ -454,7 +445,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
  EXPECT_FALSE(multiend_call.valid());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
  // Accept:
  // A 0*********..
  // B ..1****.....
@ -481,7 +472,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
  // Reject:
  // A 0*********
  // B ..1****...
@ -503,7 +494,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
  EXPECT_FALSE(multiend_call.valid());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
  // Accept:
  // A 0*********..
  // B .2****......
@ -530,7 +521,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
  // Accept:
  // A 0****
  // B 1****
@ -553,7 +544,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
  EXPECT_EQ(2u, multiend_call.speaking_turns().size());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
  // Accept:
  // A 0****....3****.5**.
  // B .....1****...4**...
@ -586,7 +577,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
+TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
  // Reject:
  // A 0****....3****.6**
  // B .....1****...4**..
@ -614,7 +605,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
  EXPECT_FALSE(multiend_call.valid());
 }

-TEST_F(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
+TEST(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
  // Parameters with which wav files are created.
  constexpr int duration_seconds = 5;
  const int sample_rates[] = {8000, 11025, 16000, 22050, 32000, 44100, 48000};
@ -641,7 +632,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
  }
 }

-TEST_F(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
+TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
  // Simulated call (one character corresponding to 500 ms):
  // A 0*********...........2*********.....
  // B ...........1*********.....3*********
--- a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h
+++ b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h
@ -28,7 +28,7 @@ class MockWavReader : public WavReaderInterface {
  MockWavReader(int sample_rate, size_t num_channels, size_t num_samples);
  ~MockWavReader();

-  // TODO(alessiob): use ON_CALL to return random samples.
+  // TODO(alessiob): use ON_CALL to return random samples if needed.
  MOCK_METHOD1(ReadFloatSamples, size_t(rtc::ArrayView<float>));
  MOCK_METHOD1(ReadInt16Samples, size_t(rtc::ArrayView<int16_t>));

--- a/webrtc/modules/audio_processing/test/conversational_speech/timing.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/timing.cc
@ -53,7 +53,7 @@ std::vector<Turn> LoadTiming(const std::string& timing_filepath) {
 void SaveTiming(const std::string& timing_filepath,
                rtc::ArrayView<const Turn> timing) {
  std::ofstream outfile(timing_filepath);
-  // TODO(alessio): check if file open for writing.
+  RTC_CHECK(outfile.is_open());
  for (const Turn& turn : timing) {
    outfile << turn.speaker_name << " " << turn.audiotrack_file_name
        << " " << turn.offset << std::endl;