The simulator puts into action the schedule of speech turns encoded in a MultiEndCall instance. The output is a set of audio track pairs. There is one set for each speaker and each set contains one near-end and one far-end audio track. The tracks are directly written into wav files instead of creating them in memory. To speed up the creation of the output wav files, *all* the source audio tracks (i.e., the atomic speech turns) are pre-loaded.

The ConversationalSpeechTest.MultiEndCallSimulator unit test defines a conversational speech sequence and creates two wav files (with pure tones at 440 and 880 Hz) that are used as atomic speech turn tracks.

This CL also patches MultiEndCall so that it only accepts input audio tracks that share the same sample rate and have a single channel.

BUG=webrtc:7218

Review-Url: https://codereview.webrtc.org/2790933002
Cr-Commit-Position: refs/heads/master@{#18480}
This commit is contained in:
alessiob 2017-06-07 11:04:35 -07:00 committed by Commit Bot
parent bb28b35922
commit 6b648c4697
6 changed files with 467 additions and 44 deletions

View File

@ -35,6 +35,8 @@ rtc_static_library("lib") {
"config.h",
"multiend_call.cc",
"multiend_call.h",
"simulator.cc",
"simulator.h",
"timing.cc",
"timing.h",
"wavreader_abstract_factory.h",
@ -67,5 +69,8 @@ rtc_source_set("unittest") {
"../../../../../webrtc/test:test_support",
"//testing/gmock",
"//testing/gtest",
"//webrtc:webrtc_common",
"//webrtc/base:rtc_base_approved",
"//webrtc/test:test_support",
]
}

View File

@ -40,13 +40,16 @@
#include <cmath>
#include <map>
#include <memory>
#include <vector>
#include "webrtc/base/logging.h"
#include "webrtc/base/optional.h"
#include "webrtc/base/pathutils.h"
#include "webrtc/common_audio/wav_file.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/config.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/simulator.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/timing.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/wavreader_factory.h"
#include "webrtc/test/gmock.h"
@ -83,9 +86,12 @@ const std::size_t kNumberOfTurns = expected_timing.size();
constexpr int kDefaultSampleRate = 48000;
const std::map<std::string, const MockWavReaderFactory::Params>
kDefaultMockWavReaderFactoryParamsMap = {
{"t300", {kDefaultSampleRate, 1u, 14400u}}, // 0.3 seconds.
{"t500", {kDefaultSampleRate, 1u, 24000u}}, // 0.5 seconds.
{"t1000", {kDefaultSampleRate, 1u, 48000u}}, // 1.0 seconds.
{"t300", {kDefaultSampleRate, 1u, 14400u}}, // Mono, 0.3 seconds.
{"t500", {kDefaultSampleRate, 1u, 24000u}}, // Mono, 0.5 seconds.
{"t1000", {kDefaultSampleRate, 1u, 48000u}}, // Mono, 1.0 seconds.
{"sr8000", {8000, 1u, 8000u}}, // 8kHz sample rate, mono, 1 second.
{"sr16000", {16000, 1u, 16000u}}, // 16kHz sample rate, mono, 1 second.
{"sr16000_stereo", {16000, 2u, 16000u}}, // Like sr16000, but stereo.
};
const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
kDefaultMockWavReaderFactoryParamsMap.at("t500");
@ -113,6 +119,57 @@ void CreateSineWavFile(const std::string& filepath,
wav_writer.WriteSamples(samples.data(), params.num_samples);
}
// Parameters to generate audio tracks with CreateSineWavFile.
struct SineAudioTrackParams {
// Sample rate, number of channels and number of samples of the track.
MockWavReaderFactory::Params params;
// Frequency of the sine tone in Hz.
float frequency;
};
// Creates a temporary directory in which sine audio tracks are written.
std::string CreateTemporarySineAudioTracks(
const std::map<std::string, SineAudioTrackParams>& sine_tracks_params) {
// Create temporary directory.
rtc::Pathname temp_directory(OutputPath());
temp_directory.AppendFolder("TempConversationalSpeechAudioTracks");
CreateDir(temp_directory.pathname());
// Create sine tracks.
for (const auto& it : sine_tracks_params) {
const rtc::Pathname temp_filepath(temp_directory.pathname(), it.first);
CreateSineWavFile(
temp_filepath.pathname(), it.second.params, it.second.frequency);
}
return temp_directory.pathname();
}
void CheckAudioTrackParams(const WavReaderFactory& wav_reader_factory,
const std::string& filepath,
const MockWavReaderFactory::Params& expeted_params) {
auto wav_reader = wav_reader_factory.Create(filepath);
EXPECT_EQ(expeted_params.sample_rate, wav_reader->SampleRate());
EXPECT_EQ(expeted_params.num_channels, wav_reader->NumChannels());
EXPECT_EQ(expeted_params.num_samples, wav_reader->NumSamples());
}
void DeleteFolderAndContents(const std::string& dir) {
if (!DirExists(dir)) { return; }
rtc::Optional<std::vector<std::string>> dir_content = ReadDirectory(dir);
EXPECT_TRUE(dir_content);
for (const auto& path : *dir_content) {
if (DirExists(path)) {
DeleteFolderAndContents(path);
} else if (FileExists(path)) {
// TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
RemoveFile(path);
} else {
FAIL();
}
}
// TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
RemoveDir(dir);
}
} // namespace
using testing::_;
@ -138,8 +195,8 @@ TEST_F(ConversationalSpeechTest, Settings) {
TEST_F(ConversationalSpeechTest, TimingSaveLoad) {
// Save test timing.
const std::string temporary_filepath = webrtc::test::TempFilename(
webrtc::test::OutputPath(), "TempTimingTestFile");
const std::string temporary_filepath = TempFilename(
OutputPath(), "TempTimingTestFile");
SaveTiming(temporary_filepath, expected_timing);
// Create a std::vector<Turn> instance by loading from file.
@ -173,6 +230,52 @@ TEST_F(ConversationalSpeechTest, MultiEndCallCreate) {
EXPECT_EQ(6u, multiend_call.speaking_turns().size());
}
// Checks that a call mixing 8 kHz and 16 kHz audio tracks is rejected.
TEST_F(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
// Two mono tracks with different sample rates.
const std::vector<Turn> timing = {
{"A", "sr8000", 0},
{"B", "sr16000", 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
// There are two unique audio tracks to read.
EXPECT_CALL(*mock_wavreader_factory, Create(testing::_)).Times(2);
MultiEndCall multiend_call(
timing, audiotracks_path, std::move(mock_wavreader_factory));
// Mismatching sample rates invalidate the call.
EXPECT_FALSE(multiend_call.valid());
}
// Checks that a call using stereo audio tracks is rejected (mono only).
TEST_F(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
// Both speakers use the same stereo track.
const std::vector<Turn> timing = {
{"A", "sr16000_stereo", 0},
{"B", "sr16000_stereo", 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
// There is one unique audio track to read.
EXPECT_CALL(*mock_wavreader_factory, Create(testing::_)).Times(1);
MultiEndCall multiend_call(
timing, audiotracks_path, std::move(mock_wavreader_factory));
// Non-mono audio tracks invalidate the call.
EXPECT_FALSE(multiend_call.valid());
}
// Checks that a call combining both invalid properties (different sample
// rates and a stereo track) is rejected.
TEST_F(ConversationalSpeechTest,
MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
const std::vector<Turn> timing = {
{"A", "sr8000", 0},
{"B", "sr16000_stereo", 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
// There are two unique audio tracks to read.
EXPECT_CALL(*mock_wavreader_factory, Create(testing::_)).Times(2);
MultiEndCall multiend_call(
timing, audiotracks_path, std::move(mock_wavreader_factory));
// Either check (sample rate, channel count) invalidates the call.
EXPECT_FALSE(multiend_call.valid());
}
TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
const std::vector<Turn> timing = {
{"A", "t500", -100},
@ -525,20 +628,70 @@ TEST_F(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
const std::size_t num_samples = duration_seconds * sample_rate;
MockWavReaderFactory::Params params = {sample_rate, 1u, num_samples};
CreateSineWavFile(temp_filename.pathname(), params);
LOG(LS_VERBOSE) << "wav file @" << sample_rate << " Hz created ("
<< num_samples << " samples)";
// Load wav file and check if params match.
WavReaderFactory wav_reader_factory;
auto wav_reader = wav_reader_factory.Create(temp_filename.pathname());
EXPECT_EQ(sample_rate, wav_reader->SampleRate());
EXPECT_EQ(1u, wav_reader->NumChannels());
EXPECT_EQ(num_samples, wav_reader->NumSamples());
MockWavReaderFactory::Params expeted_params = {
sample_rate, 1u, num_samples};
CheckAudioTrackParams(
wav_reader_factory, temp_filename.pathname(), expeted_params);
// Clean up.
remove(temp_filename.pathname().c_str());
}
}
// End-to-end test: generates sine input tracks, runs the simulator and checks
// that each speaker gets one near-end and one far-end output track with the
// expected sample rate, channel count and duration.
// Fix: misspelled locals "generated_audiotrak_pairs" and "expeted_params".
TEST_F(ConversationalSpeechTest, MultiEndCallSimulator) {
  // Simulated call (one character corresponding to 500 ms):
  // A 0*********...........2*********.....
  // B ...........1*********.....3*********
  const std::vector<Turn> expected_timing = {
      {"A", "t5000_440.wav", 0},
      {"B", "t5000_880.wav", 500},
      {"A", "t5000_440.wav", 0},
      {"B", "t5000_880.wav", -2500},
  };
  const std::size_t expected_duration_seconds = 18;

  // Create temporary audio track files (5 second sine tones).
  const int sample_rate = 16000;
  const std::map<std::string, SineAudioTrackParams> sine_tracks_params = {
      {"t5000_440.wav", {{sample_rate, 1u, sample_rate * 5}, 440.0}},
      {"t5000_880.wav", {{sample_rate, 1u, sample_rate * 5}, 880.0}},
  };
  const std::string audiotracks_path = CreateTemporarySineAudioTracks(
      sine_tracks_params);

  // Set up the multi-end call.
  auto wavreader_factory = std::unique_ptr<WavReaderFactory>(
      new WavReaderFactory());
  MultiEndCall multiend_call(
      expected_timing, audiotracks_path, std::move(wavreader_factory));

  // Simulate the call.
  rtc::Pathname output_path(audiotracks_path);
  output_path.AppendFolder("output");
  CreateDir(output_path.pathname());
  LOG(LS_VERBOSE) << "simulator output path: " << output_path.pathname();
  auto generated_audiotrack_pairs = conversational_speech::Simulate(
      multiend_call, output_path.pathname());
  // One near-end/far-end pair per speaker.
  EXPECT_EQ(2u, generated_audiotrack_pairs->size());

  // Check the output tracks.
  WavReaderFactory wav_reader_factory;
  const MockWavReaderFactory::Params expected_params = {
      sample_rate, 1u, sample_rate * expected_duration_seconds};
  for (const auto& it : *generated_audiotrack_pairs) {
    LOG(LS_VERBOSE) << "checking far/near-end for <" << it.first << ">";
    CheckAudioTrackParams(
        wav_reader_factory, it.second.near_end, expected_params);
    CheckAudioTrackParams(
        wav_reader_factory, it.second.far_end, expected_params);
  }

  // Clean.
  EXPECT_NO_FATAL_FAILURE(DeleteFolderAndContents(audiotracks_path));
}
} // namespace test
} // namespace webrtc

View File

@ -24,36 +24,15 @@ MultiEndCall::MultiEndCall(
rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path,
std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory)
: timing_(timing), audiotracks_path_(audiotracks_path),
wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {
wavreader_abstract_factory_(std::move(wavreader_abstract_factory)),
valid_(false) {
FindSpeakerNames();
CreateAudioTrackReaders();
valid_ = CheckTiming();
if (CreateAudioTrackReaders())
valid_ = CheckTiming();
}
MultiEndCall::~MultiEndCall() = default;
const std::set<std::string>& MultiEndCall::speaker_names() const {
return speaker_names_;
}
const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
MultiEndCall::audiotrack_readers() const {
return audiotrack_readers_;
}
bool MultiEndCall::valid() const {
return valid_;
}
size_t MultiEndCall::total_duration_samples() const {
return total_duration_samples_;
}
const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns()
const {
return speaking_turns_;
}
void MultiEndCall::FindSpeakerNames() {
RTC_DCHECK(speaker_names_.empty());
for (const Turn& turn : timing_) {
@ -61,8 +40,9 @@ void MultiEndCall::FindSpeakerNames() {
}
}
void MultiEndCall::CreateAudioTrackReaders() {
bool MultiEndCall::CreateAudioTrackReaders() {
RTC_DCHECK(audiotrack_readers_.empty());
sample_rate_hz_ = 0; // Sample rate will be set when reading the first track.
for (const Turn& turn : timing_) {
auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
if (it != audiotrack_readers_.end())
@ -75,9 +55,24 @@ void MultiEndCall::CreateAudioTrackReaders() {
// Map the audiotrack file name to a new instance of WavReaderInterface.
std::unique_ptr<WavReaderInterface> wavreader =
wavreader_abstract_factory_->Create(audiotrack_file_path.pathname());
if (sample_rate_hz_ == 0) {
sample_rate_hz_ = wavreader->SampleRate();
} else if (sample_rate_hz_ != wavreader->SampleRate()) {
LOG(LS_ERROR) << "All the audio tracks should have the same sample rate.";
return false;
}
if (wavreader->NumChannels() != 1) {
LOG(LS_ERROR) << "Only mono audio tracks supported.";
return false;
}
audiotrack_readers_.emplace(
turn.audiotrack_file_name, std::move(wavreader));
}
return true;
}
bool MultiEndCall::CheckTiming() {

View File

@ -50,19 +50,23 @@ class MultiEndCall {
std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory);
~MultiEndCall();
const std::set<std::string>& speaker_names() const;
const std::set<std::string>& speaker_names() const { return speaker_names_; }
const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
audiotrack_readers() const;
bool valid() const;
size_t total_duration_samples() const;
const std::vector<SpeakingTurn>& speaking_turns() const;
audiotrack_readers() const { return audiotrack_readers_; }
bool valid() const { return valid_; }
int sample_rate() const { return sample_rate_hz_; }
size_t total_duration_samples() const { return total_duration_samples_; }
const std::vector<SpeakingTurn>& speaking_turns() const {
return speaking_turns_; }
private:
// Finds unique speaker names.
void FindSpeakerNames();
// Creates one WavReader instance for each unique audiotrack.
void CreateAudioTrackReaders();
// Creates one WavReader instance for each unique audiotrack. It returns false
// if the audio tracks do not have the same sample rate or if they are not
// mono.
bool CreateAudioTrackReaders();
// Validates the speaking turns timing information. Accepts cross-talk, but
// only up to 2 speakers. Rejects unordered turns and self cross-talk.
@ -75,6 +79,7 @@ class MultiEndCall {
std::map<std::string, std::unique_ptr<WavReaderInterface>>
audiotrack_readers_;
bool valid_;
int sample_rate_hz_;
size_t total_duration_samples_;
std::vector<SpeakingTurn> speaking_turns_;

View File

@ -0,0 +1,221 @@
/*
* Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/test/conversational_speech/simulator.h"
#include <set>
#include <utility>
#include <vector>
#include "webrtc/base/array_view.h"
#include "webrtc/base/constructormagic.h"
#include "webrtc/base/logging.h"
#include "webrtc/base/pathutils.h"
#include "webrtc/base/ptr_util.h"
#include "webrtc/common_audio/wav_file.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/wavreader_interface.h"
namespace webrtc {
namespace test {
namespace {
using conversational_speech::MultiEndCall;
using conversational_speech::SpeakerOutputFilePaths;
using conversational_speech::WavReaderInterface;
// Combines output path and speaker names to define the output file paths for
// the near-end and far-end audio tracks.
std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>>
InitSpeakerOutputFilePaths(const std::set<std::string>& speaker_names,
const std::string& output_path) {
// Create map.
auto speaker_output_file_paths_map = rtc::MakeUnique<
std::map<std::string, SpeakerOutputFilePaths>>();
// Add near-end and far-end output paths into the map.
for (const auto& speaker_name : speaker_names) {
const rtc::Pathname near_end_path(
output_path, "s_" + speaker_name + "-near_end.wav");
LOG(LS_VERBOSE) << "The near-end audio track will be created in "
<< near_end_path.pathname() << ".";
const rtc::Pathname far_end_path(
output_path, "s_" + speaker_name + "-far_end.wav");
LOG(LS_VERBOSE) << "The far-end audio track will be created in "
<< far_end_path.pathname() << ".";
// Add to map.
speaker_output_file_paths_map->emplace(
std::piecewise_construct,
std::forward_as_tuple(speaker_name),
std::forward_as_tuple(near_end_path.pathname(),
far_end_path.pathname()));
}
return speaker_output_file_paths_map;
}
// Class that provides one WavWriter for the near-end and one for the far-end
// output track of a speaker.
class SpeakerWavWriters {
public:
// Opens the two mono output wav files with the given sample rate.
SpeakerWavWriters(
const SpeakerOutputFilePaths& output_file_paths, int sample_rate)
: near_end_wav_writer_(output_file_paths.near_end, sample_rate, 1u),
far_end_wav_writer_(output_file_paths.far_end, sample_rate, 1u) {}
// Non-owning accessors; the writers live as long as this instance.
WavWriter* near_end_wav_writer() {
return &near_end_wav_writer_;
}
WavWriter* far_end_wav_writer() {
return &far_end_wav_writer_;
}
private:
WavWriter near_end_wav_writer_;
WavWriter far_end_wav_writer_;
};
// Initializes one WavWriter instance for each speaker and both the near-end and
// far-end output tracks.
std::unique_ptr<std::map<std::string, SpeakerWavWriters>>
InitSpeakersWavWriters(const std::map<std::string, SpeakerOutputFilePaths>&
speaker_output_file_paths, int sample_rate) {
// Create map.
auto speaker_wav_writers_map = rtc::MakeUnique<
std::map<std::string, SpeakerWavWriters>>();
// Add SpeakerWavWriters instance into the map.
for (auto it = speaker_output_file_paths.begin();
it != speaker_output_file_paths.end(); ++it) {
speaker_wav_writers_map->emplace(
std::piecewise_construct,
std::forward_as_tuple(it->first),
std::forward_as_tuple(it->second, sample_rate));
}
return speaker_wav_writers_map;
}
// Reads all the samples for each audio track.
std::unique_ptr<std::map<std::string, std::vector<int16_t>>> PreloadAudioTracks(
const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
audiotrack_readers) {
// Create map.
auto audiotracks_map = rtc::MakeUnique<
std::map<std::string, std::vector<int16_t>>>();
// Add audio track vectors.
for (auto it = audiotrack_readers.begin(); it != audiotrack_readers.end();
++it) {
// Add map entry.
audiotracks_map->emplace(
std::piecewise_construct,
std::forward_as_tuple(it->first),
std::forward_as_tuple(it->second->NumSamples()));
// Read samples.
it->second->ReadInt16Samples(audiotracks_map->at(it->first));
}
return audiotracks_map;
}
// Writes all the values in |source_samples| via |wav_writer|. If the number of
// previously written samples in |wav_writer| is less than |interval_begin|, it
// adds zeros as left padding. The padding corresponds to intervals during
// which a speaker is not active.
void PadLeftWriteChunk(rtc::ArrayView<const int16_t> source_samples,
                       size_t interval_begin, WavWriter* wav_writer) {
  RTC_CHECK(wav_writer);
  RTC_CHECK_GE(interval_begin, wav_writer->num_samples());
  // Zero-fill the gap between the last written sample and |interval_begin|.
  const size_t num_padding_samples =
      interval_begin - wav_writer->num_samples();
  if (num_padding_samples > 0) {
    const std::vector<int16_t> silence(num_padding_samples, 0);
    wav_writer->WriteSamples(silence.data(), num_padding_samples);
  }
  // Then write the active speech samples.
  wav_writer->WriteSamples(source_samples.data(), source_samples.size());
}
// Appends zeros via |wav_writer|. The number of zeros is always non-negative
// and equal to the difference between the previously written samples and
// |pad_samples|.
void PadRightWrite(WavWriter* wav_writer, size_t pad_samples) {
  RTC_CHECK(wav_writer);
  RTC_CHECK_GE(pad_samples, wav_writer->num_samples());
  const size_t num_padding_samples = pad_samples - wav_writer->num_samples();
  if (num_padding_samples > 0) {
    const std::vector<int16_t> silence(num_padding_samples, 0);
    wav_writer->WriteSamples(silence.data(), num_padding_samples);
  }
}
} // namespace
namespace conversational_speech {
// Runs the simulation: for each speaking turn, the active speaker's track is
// written to their near-end file and to every other speaker's far-end file,
// left-padded with silence up to the turn's begin sample. All tracks are then
// right-padded to the total call duration. Returns the output file paths per
// speaker.
// Fix: |source_audiotrack| copied the whole preloaded track on every turn;
// bind a const reference instead. Also fixed the "signinificant" comment typo.
std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>> Simulate(
    const MultiEndCall& multiend_call, const std::string& output_path) {
  // Set output file paths and initialize wav writers.
  const auto& speaker_names = multiend_call.speaker_names();
  auto speaker_output_file_paths = InitSpeakerOutputFilePaths(
      speaker_names, output_path);
  auto speakers_wav_writers = InitSpeakersWavWriters(
      *speaker_output_file_paths, multiend_call.sample_rate());

  // Preload all the input audio tracks.
  const auto& audiotrack_readers = multiend_call.audiotrack_readers();
  auto audiotracks = PreloadAudioTracks(audiotrack_readers);

  // TODO(alessiob): When speaker_names.size() == 2, near-end and far-end
  // across the 2 speakers are symmetric; hence, the code below could be
  // replaced by only creating the near-end or the far-end. However, this would
  // require to split the unit tests and document the behavior in README.md.
  // In practice, it should not be an issue since the files are not expected to
  // be significant in size.

  // Write near-end and far-end output tracks.
  for (const auto& speaking_turn : multiend_call.speaking_turns()) {
    const std::string& active_speaker_name = speaking_turn.speaker_name;
    // Borrow the preloaded samples; copying the whole track per turn would be
    // wasteful.
    const auto& source_audiotrack = audiotracks->at(
        speaking_turn.audiotrack_file_name);
    // Write active speaker's chunk to active speaker's near-end.
    PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
                      speakers_wav_writers->at(
                          active_speaker_name).near_end_wav_writer());
    // Write active speaker's chunk to other participants' far-ends.
    for (const std::string& speaker_name : speaker_names) {
      if (speaker_name == active_speaker_name)
        continue;
      PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
                        speakers_wav_writers->at(
                            speaker_name).far_end_wav_writer());
    }
  }

  // Finalize all the output tracks with right padding.
  // This is required to make all the output tracks duration equal.
  const size_t duration_samples = multiend_call.total_duration_samples();
  for (const std::string& speaker_name : speaker_names) {
    PadRightWrite(speakers_wav_writers->at(speaker_name).near_end_wav_writer(),
                  duration_samples);
    PadRightWrite(speakers_wav_writers->at(speaker_name).far_end_wav_writer(),
                  duration_samples);
  }
  return speaker_output_file_paths;
}
} // namespace conversational_speech
} // namespace test
} // namespace webrtc

View File

@ -0,0 +1,44 @@
/*
* Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_SIMULATOR_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_SIMULATOR_H_
#include <map>
#include <memory>
#include <string>
#include <utility>
#include "webrtc/base/constructormagic.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
namespace webrtc {
namespace test {
namespace conversational_speech {
// Holds the pair of output wav file paths (near-end and far-end) generated
// for one speaker.
struct SpeakerOutputFilePaths {
SpeakerOutputFilePaths(const std::string& new_near_end,
const std::string& new_far_end)
: near_end(new_near_end),
far_end(new_far_end) {}
// Paths to the near-end and far-end audio track files.
const std::string near_end;
const std::string far_end;
};
// Generates the near-end and far-end audio track pairs for each speaker.
std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>>
Simulate(const MultiEndCall& multiend_call, const std::string& output_path);
} // namespace conversational_speech
} // namespace test
} // namespace webrtc
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_SIMULATOR_H_