Revert of Conversational speech tool, simulator + unit tests (patchset #12 id:220001 of https://codereview.webrtc.org/2790933002/ )
Reason for revert:
Compile Error.
Original issue's description:
> The simulator puts into action the schedule of speech turns encoded in a MultiEndCall instance. The output is a set of audio track pairs. There is one set for each speaker and each set contains one near-end and one far-end audio track. The tracks are directly written into wav files instead of creating them in memory. To speed up the creation of the output wav files, *all* the source audio tracks (i.e., the atomic speech turns) are pre-loaded.
>
> The ConversationalSpeechTest.MultiEndCallSimulator unit test defines a conversational speech sequence and creates two wav files (with pure tones at 440 and 880 Hz) that are used as atomic speech turn tracks.
>
> This CL also patches MultiEndCall in order to allow input audio tracks with the same sample rate and a single channel only.
>
> BUG=webrtc:7218
>
> Review-Url: https://codereview.webrtc.org/2790933002
> Cr-Commit-Position: refs/heads/master@{#18480}
> Committed: 6b648c4697
TBR=minyue@webrtc.org,alessiob@webrtc.org
# Skipping CQ checks because original CL landed less than 1 day ago.
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=webrtc:7218
Review-Url: https://codereview.webrtc.org/2925123003
Cr-Commit-Position: refs/heads/master@{#18481}
This commit is contained in:
parent
6b648c4697
commit
4c72cf43df
@ -35,8 +35,6 @@ rtc_static_library("lib") {
|
||||
"config.h",
|
||||
"multiend_call.cc",
|
||||
"multiend_call.h",
|
||||
"simulator.cc",
|
||||
"simulator.h",
|
||||
"timing.cc",
|
||||
"timing.h",
|
||||
"wavreader_abstract_factory.h",
|
||||
@ -69,8 +67,5 @@ rtc_source_set("unittest") {
|
||||
"../../../../../webrtc/test:test_support",
|
||||
"//testing/gmock",
|
||||
"//testing/gtest",
|
||||
"//webrtc:webrtc_common",
|
||||
"//webrtc/base:rtc_base_approved",
|
||||
"//webrtc/test:test_support",
|
||||
]
|
||||
}
|
||||
|
||||
@ -40,16 +40,13 @@
|
||||
#include <cmath>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "webrtc/base/logging.h"
|
||||
#include "webrtc/base/optional.h"
|
||||
#include "webrtc/base/pathutils.h"
|
||||
#include "webrtc/common_audio/wav_file.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/config.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/simulator.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/timing.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/wavreader_factory.h"
|
||||
#include "webrtc/test/gmock.h"
|
||||
@ -86,12 +83,9 @@ const std::size_t kNumberOfTurns = expected_timing.size();
|
||||
constexpr int kDefaultSampleRate = 48000;
|
||||
const std::map<std::string, const MockWavReaderFactory::Params>
|
||||
kDefaultMockWavReaderFactoryParamsMap = {
|
||||
{"t300", {kDefaultSampleRate, 1u, 14400u}}, // Mono, 0.3 seconds.
|
||||
{"t500", {kDefaultSampleRate, 1u, 24000u}}, // Mono, 0.5 seconds.
|
||||
{"t1000", {kDefaultSampleRate, 1u, 48000u}}, // Mono, 1.0 seconds.
|
||||
{"sr8000", {8000, 1u, 8000u}}, // 8kHz sample rate, mono, 1 second.
|
||||
{"sr16000", {16000, 1u, 16000u}}, // 16kHz sample rate, mono, 1 second.
|
||||
{"sr16000_stereo", {16000, 2u, 16000u}}, // Like sr16000, but stereo.
|
||||
{"t300", {kDefaultSampleRate, 1u, 14400u}}, // 0.3 seconds.
|
||||
{"t500", {kDefaultSampleRate, 1u, 24000u}}, // 0.5 seconds.
|
||||
{"t1000", {kDefaultSampleRate, 1u, 48000u}}, // 1.0 seconds.
|
||||
};
|
||||
const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
|
||||
kDefaultMockWavReaderFactoryParamsMap.at("t500");
|
||||
@ -119,57 +113,6 @@ void CreateSineWavFile(const std::string& filepath,
|
||||
wav_writer.WriteSamples(samples.data(), params.num_samples);
|
||||
}
|
||||
|
||||
// Parameters to generate audio tracks with CreateSineWavFile.
|
||||
struct SineAudioTrackParams {
|
||||
MockWavReaderFactory::Params params;
|
||||
float frequency;
|
||||
};
|
||||
|
||||
// Creates a temporary directory in which sine audio tracks are written.
|
||||
std::string CreateTemporarySineAudioTracks(
|
||||
const std::map<std::string, SineAudioTrackParams>& sine_tracks_params) {
|
||||
// Create temporary directory.
|
||||
rtc::Pathname temp_directory(OutputPath());
|
||||
temp_directory.AppendFolder("TempConversationalSpeechAudioTracks");
|
||||
CreateDir(temp_directory.pathname());
|
||||
|
||||
// Create sine tracks.
|
||||
for (const auto& it : sine_tracks_params) {
|
||||
const rtc::Pathname temp_filepath(temp_directory.pathname(), it.first);
|
||||
CreateSineWavFile(
|
||||
temp_filepath.pathname(), it.second.params, it.second.frequency);
|
||||
}
|
||||
|
||||
return temp_directory.pathname();
|
||||
}
|
||||
|
||||
void CheckAudioTrackParams(const WavReaderFactory& wav_reader_factory,
|
||||
const std::string& filepath,
|
||||
const MockWavReaderFactory::Params& expeted_params) {
|
||||
auto wav_reader = wav_reader_factory.Create(filepath);
|
||||
EXPECT_EQ(expeted_params.sample_rate, wav_reader->SampleRate());
|
||||
EXPECT_EQ(expeted_params.num_channels, wav_reader->NumChannels());
|
||||
EXPECT_EQ(expeted_params.num_samples, wav_reader->NumSamples());
|
||||
}
|
||||
|
||||
void DeleteFolderAndContents(const std::string& dir) {
|
||||
if (!DirExists(dir)) { return; }
|
||||
rtc::Optional<std::vector<std::string>> dir_content = ReadDirectory(dir);
|
||||
EXPECT_TRUE(dir_content);
|
||||
for (const auto& path : *dir_content) {
|
||||
if (DirExists(path)) {
|
||||
DeleteFolderAndContents(path);
|
||||
} else if (FileExists(path)) {
|
||||
// TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
|
||||
RemoveFile(path);
|
||||
} else {
|
||||
FAIL();
|
||||
}
|
||||
}
|
||||
// TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
|
||||
RemoveDir(dir);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
using testing::_;
|
||||
@ -195,8 +138,8 @@ TEST_F(ConversationalSpeechTest, Settings) {
|
||||
|
||||
TEST_F(ConversationalSpeechTest, TimingSaveLoad) {
|
||||
// Save test timing.
|
||||
const std::string temporary_filepath = TempFilename(
|
||||
OutputPath(), "TempTimingTestFile");
|
||||
const std::string temporary_filepath = webrtc::test::TempFilename(
|
||||
webrtc::test::OutputPath(), "TempTimingTestFile");
|
||||
SaveTiming(temporary_filepath, expected_timing);
|
||||
|
||||
// Create a std::vector<Turn> instance by loading from file.
|
||||
@ -230,52 +173,6 @@ TEST_F(ConversationalSpeechTest, MultiEndCallCreate) {
|
||||
EXPECT_EQ(6u, multiend_call.speaking_turns().size());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "sr8000", 0},
|
||||
{"B", "sr16000", 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There are two unique audio tracks to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(testing::_)).Times(2);
|
||||
|
||||
MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "sr16000_stereo", 0},
|
||||
{"B", "sr16000_stereo", 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There is one unique audio track to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(testing::_)).Times(1);
|
||||
|
||||
MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest,
|
||||
MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "sr8000", 0},
|
||||
{"B", "sr16000_stereo", 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There are two unique audio tracks to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(testing::_)).Times(2);
|
||||
|
||||
MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", -100},
|
||||
@ -628,70 +525,20 @@ TEST_F(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
|
||||
const std::size_t num_samples = duration_seconds * sample_rate;
|
||||
MockWavReaderFactory::Params params = {sample_rate, 1u, num_samples};
|
||||
CreateSineWavFile(temp_filename.pathname(), params);
|
||||
LOG(LS_VERBOSE) << "wav file @" << sample_rate << " Hz created ("
|
||||
<< num_samples << " samples)";
|
||||
|
||||
// Load wav file and check if params match.
|
||||
WavReaderFactory wav_reader_factory;
|
||||
MockWavReaderFactory::Params expeted_params = {
|
||||
sample_rate, 1u, num_samples};
|
||||
CheckAudioTrackParams(
|
||||
wav_reader_factory, temp_filename.pathname(), expeted_params);
|
||||
auto wav_reader = wav_reader_factory.Create(temp_filename.pathname());
|
||||
EXPECT_EQ(sample_rate, wav_reader->SampleRate());
|
||||
EXPECT_EQ(1u, wav_reader->NumChannels());
|
||||
EXPECT_EQ(num_samples, wav_reader->NumSamples());
|
||||
|
||||
// Clean up.
|
||||
remove(temp_filename.pathname().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSimulator) {
|
||||
// Simulated call (one character corresponding to 500 ms):
|
||||
// A 0*********...........2*********.....
|
||||
// B ...........1*********.....3*********
|
||||
const std::vector<Turn> expected_timing = {
|
||||
{"A", "t5000_440.wav", 0},
|
||||
{"B", "t5000_880.wav", 500},
|
||||
{"A", "t5000_440.wav", 0},
|
||||
{"B", "t5000_880.wav", -2500},
|
||||
};
|
||||
const std::size_t expected_duration_seconds = 18;
|
||||
|
||||
// Create temporary audio track files.
|
||||
const int sample_rate = 16000;
|
||||
const std::map<std::string, SineAudioTrackParams> sine_tracks_params = {
|
||||
{"t5000_440.wav", {{sample_rate, 1u, sample_rate * 5}, 440.0}},
|
||||
{"t5000_880.wav", {{sample_rate, 1u, sample_rate * 5}, 880.0}},
|
||||
};
|
||||
const std::string audiotracks_path = CreateTemporarySineAudioTracks(
|
||||
sine_tracks_params);
|
||||
|
||||
// Set up the multi-end call.
|
||||
auto wavreader_factory = std::unique_ptr<WavReaderFactory>(
|
||||
new WavReaderFactory());
|
||||
MultiEndCall multiend_call(
|
||||
expected_timing, audiotracks_path, std::move(wavreader_factory));
|
||||
|
||||
// Simulate the call.
|
||||
rtc::Pathname output_path(audiotracks_path);
|
||||
output_path.AppendFolder("output");
|
||||
CreateDir(output_path.pathname());
|
||||
LOG(LS_VERBOSE) << "simulator output path: " << output_path.pathname();
|
||||
auto generated_audiotrak_pairs = conversational_speech::Simulate(
|
||||
multiend_call, output_path.pathname());
|
||||
EXPECT_EQ(2u, generated_audiotrak_pairs->size());
|
||||
|
||||
// Check the output.
|
||||
WavReaderFactory wav_reader_factory;
|
||||
const MockWavReaderFactory::Params expeted_params = {
|
||||
sample_rate, 1u, sample_rate * expected_duration_seconds};
|
||||
for (const auto& it : *generated_audiotrak_pairs) {
|
||||
LOG(LS_VERBOSE) << "checking far/near-end for <" << it.first << ">";
|
||||
CheckAudioTrackParams(
|
||||
wav_reader_factory, it.second.near_end, expeted_params);
|
||||
CheckAudioTrackParams(
|
||||
wav_reader_factory, it.second.far_end, expeted_params);
|
||||
}
|
||||
|
||||
// Clean.
|
||||
EXPECT_NO_FATAL_FAILURE(DeleteFolderAndContents(audiotracks_path));
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace webrtc
|
||||
|
||||
@ -24,15 +24,36 @@ MultiEndCall::MultiEndCall(
|
||||
rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path,
|
||||
std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory)
|
||||
: timing_(timing), audiotracks_path_(audiotracks_path),
|
||||
wavreader_abstract_factory_(std::move(wavreader_abstract_factory)),
|
||||
valid_(false) {
|
||||
wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {
|
||||
FindSpeakerNames();
|
||||
if (CreateAudioTrackReaders())
|
||||
valid_ = CheckTiming();
|
||||
CreateAudioTrackReaders();
|
||||
valid_ = CheckTiming();
|
||||
}
|
||||
|
||||
MultiEndCall::~MultiEndCall() = default;
|
||||
|
||||
const std::set<std::string>& MultiEndCall::speaker_names() const {
|
||||
return speaker_names_;
|
||||
}
|
||||
|
||||
const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
|
||||
MultiEndCall::audiotrack_readers() const {
|
||||
return audiotrack_readers_;
|
||||
}
|
||||
|
||||
bool MultiEndCall::valid() const {
|
||||
return valid_;
|
||||
}
|
||||
|
||||
size_t MultiEndCall::total_duration_samples() const {
|
||||
return total_duration_samples_;
|
||||
}
|
||||
|
||||
const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns()
|
||||
const {
|
||||
return speaking_turns_;
|
||||
}
|
||||
|
||||
void MultiEndCall::FindSpeakerNames() {
|
||||
RTC_DCHECK(speaker_names_.empty());
|
||||
for (const Turn& turn : timing_) {
|
||||
@ -40,9 +61,8 @@ void MultiEndCall::FindSpeakerNames() {
|
||||
}
|
||||
}
|
||||
|
||||
bool MultiEndCall::CreateAudioTrackReaders() {
|
||||
void MultiEndCall::CreateAudioTrackReaders() {
|
||||
RTC_DCHECK(audiotrack_readers_.empty());
|
||||
sample_rate_hz_ = 0; // Sample rate will be set when reading the first track.
|
||||
for (const Turn& turn : timing_) {
|
||||
auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
|
||||
if (it != audiotrack_readers_.end())
|
||||
@ -55,24 +75,9 @@ bool MultiEndCall::CreateAudioTrackReaders() {
|
||||
// Map the audiotrack file name to a new instance of WavReaderInterface.
|
||||
std::unique_ptr<WavReaderInterface> wavreader =
|
||||
wavreader_abstract_factory_->Create(audiotrack_file_path.pathname());
|
||||
|
||||
if (sample_rate_hz_ == 0) {
|
||||
sample_rate_hz_ = wavreader->SampleRate();
|
||||
} else if (sample_rate_hz_ != wavreader->SampleRate()) {
|
||||
LOG(LS_ERROR) << "All the audio tracks should have the same sample rate.";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (wavreader->NumChannels() != 1) {
|
||||
LOG(LS_ERROR) << "Only mono audio tracks supported.";
|
||||
return false;
|
||||
}
|
||||
|
||||
audiotrack_readers_.emplace(
|
||||
turn.audiotrack_file_name, std::move(wavreader));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool MultiEndCall::CheckTiming() {
|
||||
|
||||
@ -50,23 +50,19 @@ class MultiEndCall {
|
||||
std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory);
|
||||
~MultiEndCall();
|
||||
|
||||
const std::set<std::string>& speaker_names() const { return speaker_names_; }
|
||||
const std::set<std::string>& speaker_names() const;
|
||||
const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
|
||||
audiotrack_readers() const { return audiotrack_readers_; }
|
||||
bool valid() const { return valid_; }
|
||||
int sample_rate() const { return sample_rate_hz_; }
|
||||
size_t total_duration_samples() const { return total_duration_samples_; }
|
||||
const std::vector<SpeakingTurn>& speaking_turns() const {
|
||||
return speaking_turns_; }
|
||||
audiotrack_readers() const;
|
||||
bool valid() const;
|
||||
size_t total_duration_samples() const;
|
||||
const std::vector<SpeakingTurn>& speaking_turns() const;
|
||||
|
||||
private:
|
||||
// Finds unique speaker names.
|
||||
void FindSpeakerNames();
|
||||
|
||||
// Creates one WavReader instance for each unique audiotrack. It returns false
|
||||
// if the audio tracks do not have the same sample rate or if they are not
|
||||
// mono.
|
||||
bool CreateAudioTrackReaders();
|
||||
// Creates one WavReader instance for each unique audiotrack.
|
||||
void CreateAudioTrackReaders();
|
||||
|
||||
// Validates the speaking turns timing information. Accepts cross-talk, but
|
||||
// only up to 2 speakers. Rejects unordered turns and self cross-talk.
|
||||
@ -79,7 +75,6 @@ class MultiEndCall {
|
||||
std::map<std::string, std::unique_ptr<WavReaderInterface>>
|
||||
audiotrack_readers_;
|
||||
bool valid_;
|
||||
int sample_rate_hz_;
|
||||
size_t total_duration_samples_;
|
||||
std::vector<SpeakingTurn> speaking_turns_;
|
||||
|
||||
|
||||
@ -1,221 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/simulator.h"
|
||||
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "webrtc/base/array_view.h"
|
||||
#include "webrtc/base/constructormagic.h"
|
||||
#include "webrtc/base/logging.h"
|
||||
#include "webrtc/base/pathutils.h"
|
||||
#include "webrtc/base/ptr_util.h"
|
||||
#include "webrtc/common_audio/wav_file.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/wavreader_interface.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace test {
|
||||
namespace {
|
||||
|
||||
using conversational_speech::MultiEndCall;
|
||||
using conversational_speech::SpeakerOutputFilePaths;
|
||||
using conversational_speech::WavReaderInterface;
|
||||
|
||||
// Combines output path and speaker names to define the output file paths for
|
||||
// the near-end and far-end audio tracks.
|
||||
std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>>
|
||||
InitSpeakerOutputFilePaths(const std::set<std::string>& speaker_names,
|
||||
const std::string& output_path) {
|
||||
// Create map.
|
||||
auto speaker_output_file_paths_map = rtc::MakeUnique<
|
||||
std::map<std::string, SpeakerOutputFilePaths>>();
|
||||
|
||||
// Add near-end and far-end output paths into the map.
|
||||
for (const auto& speaker_name : speaker_names) {
|
||||
const rtc::Pathname near_end_path(
|
||||
output_path, "s_" + speaker_name + "-near_end.wav");
|
||||
LOG(LS_VERBOSE) << "The near-end audio track will be created in "
|
||||
<< near_end_path.pathname() << ".";
|
||||
|
||||
const rtc::Pathname far_end_path(
|
||||
output_path, "s_" + speaker_name + "-far_end.wav");
|
||||
LOG(LS_VERBOSE) << "The far-end audio track will be created in "
|
||||
<< far_end_path.pathname() << ".";
|
||||
|
||||
// Add to map.
|
||||
speaker_output_file_paths_map->emplace(
|
||||
std::piecewise_construct,
|
||||
std::forward_as_tuple(speaker_name),
|
||||
std::forward_as_tuple(near_end_path.pathname(),
|
||||
far_end_path.pathname()));
|
||||
}
|
||||
|
||||
return speaker_output_file_paths_map;
|
||||
}
|
||||
|
||||
// Class that provides one WavWriter for the near-end and one for the far-end
|
||||
// output track of a speaker.
|
||||
class SpeakerWavWriters {
|
||||
public:
|
||||
SpeakerWavWriters(
|
||||
const SpeakerOutputFilePaths& output_file_paths, int sample_rate)
|
||||
: near_end_wav_writer_(output_file_paths.near_end, sample_rate, 1u),
|
||||
far_end_wav_writer_(output_file_paths.far_end, sample_rate, 1u) {}
|
||||
WavWriter* near_end_wav_writer() {
|
||||
return &near_end_wav_writer_;
|
||||
}
|
||||
WavWriter* far_end_wav_writer() {
|
||||
return &far_end_wav_writer_;
|
||||
}
|
||||
private:
|
||||
WavWriter near_end_wav_writer_;
|
||||
WavWriter far_end_wav_writer_;
|
||||
};
|
||||
|
||||
// Initializes one WavWriter instance for each speaker and both the near-end and
|
||||
// far-end output tracks.
|
||||
std::unique_ptr<std::map<std::string, SpeakerWavWriters>>
|
||||
InitSpeakersWavWriters(const std::map<std::string, SpeakerOutputFilePaths>&
|
||||
speaker_output_file_paths, int sample_rate) {
|
||||
// Create map.
|
||||
auto speaker_wav_writers_map = rtc::MakeUnique<
|
||||
std::map<std::string, SpeakerWavWriters>>();
|
||||
|
||||
// Add SpeakerWavWriters instance into the map.
|
||||
for (auto it = speaker_output_file_paths.begin();
|
||||
it != speaker_output_file_paths.end(); ++it) {
|
||||
speaker_wav_writers_map->emplace(
|
||||
std::piecewise_construct,
|
||||
std::forward_as_tuple(it->first),
|
||||
std::forward_as_tuple(it->second, sample_rate));
|
||||
}
|
||||
|
||||
return speaker_wav_writers_map;
|
||||
}
|
||||
|
||||
// Reads all the samples for each audio track.
|
||||
std::unique_ptr<std::map<std::string, std::vector<int16_t>>> PreloadAudioTracks(
|
||||
const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
|
||||
audiotrack_readers) {
|
||||
// Create map.
|
||||
auto audiotracks_map = rtc::MakeUnique<
|
||||
std::map<std::string, std::vector<int16_t>>>();
|
||||
|
||||
// Add audio track vectors.
|
||||
for (auto it = audiotrack_readers.begin(); it != audiotrack_readers.end();
|
||||
++it) {
|
||||
// Add map entry.
|
||||
audiotracks_map->emplace(
|
||||
std::piecewise_construct,
|
||||
std::forward_as_tuple(it->first),
|
||||
std::forward_as_tuple(it->second->NumSamples()));
|
||||
|
||||
// Read samples.
|
||||
it->second->ReadInt16Samples(audiotracks_map->at(it->first));
|
||||
}
|
||||
|
||||
return audiotracks_map;
|
||||
}
|
||||
|
||||
// Writes all the values in |source_samples| via |wav_writer|. If the number of
|
||||
// previously written samples in |wav_writer| is less than |interval_begin|, it
|
||||
// adds zeros as left padding. The padding corresponds to intervals during which
|
||||
// a speaker is not active.
|
||||
void PadLeftWriteChunk(rtc::ArrayView<const int16_t> source_samples,
|
||||
size_t interval_begin, WavWriter* wav_writer) {
|
||||
// Add left padding.
|
||||
RTC_CHECK(wav_writer);
|
||||
RTC_CHECK_GE(interval_begin, wav_writer->num_samples());
|
||||
size_t padding_size = interval_begin - wav_writer->num_samples();
|
||||
if (padding_size != 0) {
|
||||
const std::vector<int16_t> padding(padding_size, 0);
|
||||
wav_writer->WriteSamples(padding.data(), padding_size);
|
||||
}
|
||||
|
||||
// Write source samples.
|
||||
wav_writer->WriteSamples(source_samples.data(), source_samples.size());
|
||||
}
|
||||
|
||||
// Appends zeros via |wav_writer|. The number of zeros is always non-negative
|
||||
// and equal to the difference between the previously written samples and
|
||||
// |pad_samples|.
|
||||
void PadRightWrite(WavWriter* wav_writer, size_t pad_samples) {
|
||||
RTC_CHECK(wav_writer);
|
||||
RTC_CHECK_GE(pad_samples, wav_writer->num_samples());
|
||||
size_t padding_size = pad_samples - wav_writer->num_samples();
|
||||
if (padding_size != 0) {
|
||||
const std::vector<int16_t> padding(padding_size, 0);
|
||||
wav_writer->WriteSamples(padding.data(), padding_size);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace conversational_speech {
|
||||
|
||||
std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>> Simulate(
|
||||
const MultiEndCall& multiend_call, const std::string& output_path) {
|
||||
// Set output file paths and initialize wav writers.
|
||||
const auto& speaker_names = multiend_call.speaker_names();
|
||||
auto speaker_output_file_paths = InitSpeakerOutputFilePaths(
|
||||
speaker_names, output_path);
|
||||
auto speakers_wav_writers = InitSpeakersWavWriters(
|
||||
*speaker_output_file_paths, multiend_call.sample_rate());
|
||||
|
||||
// Preload all the input audio tracks.
|
||||
const auto& audiotrack_readers = multiend_call.audiotrack_readers();
|
||||
auto audiotracks = PreloadAudioTracks(audiotrack_readers);
|
||||
|
||||
// TODO(alessiob): When speaker_names.size() == 2, near-end and far-end
|
||||
// across the 2 speakers are symmetric; hence, the code below could be
|
||||
// replaced by only creating the near-end or the far-end. However, this would
|
||||
// require to split the unit tests and document the behavior in README.md.
|
||||
// In practice, it should not be an issue since the files are not expected to
|
||||
// be significant.
|
||||
|
||||
// Write near-end and far-end output tracks.
|
||||
for (const auto& speaking_turn : multiend_call.speaking_turns()) {
|
||||
const std::string& active_speaker_name = speaking_turn.speaker_name;
|
||||
auto source_audiotrack = audiotracks->at(
|
||||
speaking_turn.audiotrack_file_name);
|
||||
|
||||
// Write active speaker's chunk to active speaker's near-end.
|
||||
PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
|
||||
speakers_wav_writers->at(
|
||||
active_speaker_name).near_end_wav_writer());
|
||||
|
||||
// Write active speaker's chunk to other participants' far-ends.
|
||||
for (const std::string& speaker_name : speaker_names) {
|
||||
if (speaker_name == active_speaker_name)
|
||||
continue;
|
||||
PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
|
||||
speakers_wav_writers->at(
|
||||
speaker_name).far_end_wav_writer());
|
||||
}
|
||||
}
|
||||
|
||||
// Finalize all the output tracks with right padding.
|
||||
// This is required to make all the output tracks duration equal.
|
||||
size_t duration_samples = multiend_call.total_duration_samples();
|
||||
for (const std::string& speaker_name : speaker_names) {
|
||||
PadRightWrite(speakers_wav_writers->at(speaker_name).near_end_wav_writer(),
|
||||
duration_samples);
|
||||
PadRightWrite(speakers_wav_writers->at(speaker_name).far_end_wav_writer(),
|
||||
duration_samples);
|
||||
}
|
||||
|
||||
return speaker_output_file_paths;
|
||||
}
|
||||
|
||||
} // namespace conversational_speech
|
||||
} // namespace test
|
||||
} // namespace webrtc
|
||||
@ -1,44 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_SIMULATOR_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_SIMULATOR_H_
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "webrtc/base/constructormagic.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace test {
|
||||
namespace conversational_speech {
|
||||
|
||||
struct SpeakerOutputFilePaths {
|
||||
SpeakerOutputFilePaths(const std::string& new_near_end,
|
||||
const std::string& new_far_end)
|
||||
: near_end(new_near_end),
|
||||
far_end(new_far_end) {}
|
||||
// Paths to the near-end and far-end audio track files.
|
||||
const std::string near_end;
|
||||
const std::string far_end;
|
||||
};
|
||||
|
||||
// Generates the near-end and far-end audio track pairs for each speaker.
|
||||
std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>>
|
||||
Simulate(const MultiEndCall& multiend_call, const std::string& output_path);
|
||||
|
||||
} // namespace conversational_speech
|
||||
} // namespace test
|
||||
} // namespace webrtc
|
||||
|
||||
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_SIMULATOR_H_
|
||||
Loading…
x
Reference in New Issue
Block a user