This CL finalizes the Conversational Speech tool.
The following changes have been made:
- command line args wired,
- user output added,
- final polishing.

BUG=webrtc:7218
Review-Url: https://codereview.webrtc.org/2808053002
Cr-Commit-Position: refs/heads/master@{#18609}
This commit is contained in:
parent
b2152b7b61
commit
19e087fc91
@ -69,5 +69,8 @@ rtc_source_set("unittest") {
|
||||
"../../../../../webrtc/test:test_support",
|
||||
"//testing/gmock",
|
||||
"//testing/gtest",
|
||||
"//webrtc:webrtc_common",
|
||||
"//webrtc/base:rtc_base_approved",
|
||||
"//webrtc/test:test_support",
|
||||
]
|
||||
}
|
||||
|
||||
@ -17,9 +17,7 @@ For instance, echo cancellation in the APM module can be evaluated using two-end
|
||||
audio tracks as input and reverse input.
|
||||
|
||||
By indicating negative and positive time offsets, one can reproduce cross-talk
|
||||
and silence in the conversation.
|
||||
|
||||
IMPORTANT: **the whole code has not been landed yet.**
|
||||
(aka double-talk) and silence in the conversation.
|
||||
|
||||
### Example
|
||||
|
||||
|
||||
@ -11,8 +11,12 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "gflags/gflags.h"
|
||||
#include "webrtc/base/logging.h"
|
||||
#include "webrtc/base/ptr_util.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/config.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/timing.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/wavreader_factory.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/simulator.h"
|
||||
#include "webrtc/test/testsupport/fileutils.h"
|
||||
|
||||
namespace webrtc {
|
||||
@ -48,14 +52,31 @@ DEFINE_validator(o, dir_exists);
|
||||
int main(int argc, char* argv[]) {
|
||||
google::SetUsageMessage(kUsageDescription);
|
||||
google::ParseCommandLineFlags(&argc, &argv, true);
|
||||
|
||||
conversational_speech::Config config(FLAGS_i, FLAGS_t, FLAGS_o);
|
||||
|
||||
// TODO(alessiob): remove line below once debugged.
|
||||
rtc::LogMessage::LogToDebug(rtc::LS_VERBOSE);
|
||||
LOG(LS_VERBOSE) << "i = " << config.audiotracks_path();
|
||||
LOG(LS_VERBOSE) << "t = " << config.timing_filepath();
|
||||
LOG(LS_VERBOSE) << "o = " << config.output_path();
|
||||
// Load timing.
|
||||
std::vector<conversational_speech::Turn> timing =
|
||||
conversational_speech::LoadTiming(config.timing_filepath());
|
||||
|
||||
// Parse timing and audio tracks.
|
||||
auto wavreader_factory = rtc::MakeUnique<
|
||||
conversational_speech::WavReaderFactory>();
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, config.audiotracks_path(), std::move(wavreader_factory));
|
||||
|
||||
// Generate output audio tracks.
|
||||
auto generated_audiotrack_pairs = conversational_speech::Simulate(
|
||||
multiend_call, config.output_path());
|
||||
|
||||
// Show paths to created audio tracks.
|
||||
std::cout << "Output files:" << std::endl;
|
||||
for (const auto& output_paths_entry : *generated_audiotrack_pairs) {
|
||||
std::cout << " speaker: " << output_paths_entry.first << std::endl;
|
||||
std::cout << " near end: " << output_paths_entry.second.near_end
|
||||
<< std::endl;
|
||||
std::cout << " far end: " << output_paths_entry.second.far_end
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -12,7 +12,7 @@
|
||||
// members. Part of them focus on accepting or rejecting different
|
||||
// conversational speech setups. A setup is defined by a set of audio tracks and
|
||||
// timing information.
|
||||
// The docstring at the beginning of each TEST_F(ConversationalSpeechTest,
|
||||
// The docstring at the beginning of each TEST(ConversationalSpeechTest,
|
||||
// MultiEndCallSetup*) function looks like the drawing below and indicates which
|
||||
// setup is tested.
|
||||
//
|
||||
@ -174,16 +174,7 @@ void DeleteFolderAndContents(const std::string& dir) {
|
||||
|
||||
using testing::_;
|
||||
|
||||
// TODO(alessiob): Remove fixture once conversational_speech fully implemented
|
||||
// and replace TEST_F with TEST.
|
||||
class ConversationalSpeechTest : public testing::Test {
|
||||
public:
|
||||
ConversationalSpeechTest() {
|
||||
rtc::LogMessage::LogToDebug(rtc::LS_VERBOSE);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(ConversationalSpeechTest, Settings) {
|
||||
TEST(ConversationalSpeechTest, Settings) {
|
||||
const conversational_speech::Config config(
|
||||
audiotracks_path, timing_filepath, output_path);
|
||||
|
||||
@ -193,7 +184,7 @@ TEST_F(ConversationalSpeechTest, Settings) {
|
||||
EXPECT_EQ(output_path, config.output_path());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, TimingSaveLoad) {
|
||||
TEST(ConversationalSpeechTest, TimingSaveLoad) {
|
||||
// Save test timing.
|
||||
const std::string temporary_filepath = TempFilename(
|
||||
OutputPath(), "TempTimingTestFile");
|
||||
@ -213,7 +204,7 @@ TEST_F(ConversationalSpeechTest, TimingSaveLoad) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallCreate) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallCreate) {
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There are 5 unique audio tracks to read.
|
||||
@ -230,7 +221,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallCreate) {
|
||||
EXPECT_EQ(6u, multiend_call.speaking_turns().size());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "sr8000", 0},
|
||||
{"B", "sr16000", 0},
|
||||
@ -245,7 +236,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "sr16000_stereo", 0},
|
||||
{"B", "sr16000_stereo", 0},
|
||||
@ -260,7 +251,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest,
|
||||
TEST(ConversationalSpeechTest,
|
||||
MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "sr8000", 0},
|
||||
@ -276,7 +267,7 @@ TEST_F(ConversationalSpeechTest,
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", -100},
|
||||
{"B", "t500", 0},
|
||||
@ -291,7 +282,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupSimple) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
|
||||
// Accept:
|
||||
// A 0****.....
|
||||
// B .....1****
|
||||
@ -316,7 +307,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupSimple) {
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupPause) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
|
||||
// Accept:
|
||||
// A 0****.......
|
||||
// B .......1****
|
||||
@ -341,7 +332,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupPause) {
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
|
||||
// Accept:
|
||||
// A 0****....
|
||||
// B ....1****
|
||||
@ -366,7 +357,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
|
||||
// Reject:
|
||||
// A ..0****
|
||||
// B .1****. The n-th turn cannot start before the (n-1)-th one.
|
||||
@ -384,7 +375,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
|
||||
// Accept:
|
||||
// A 0****2****...
|
||||
// B ...1*********
|
||||
@ -410,7 +401,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
|
||||
// Reject:
|
||||
// A 0****......
|
||||
// A ...1****...
|
||||
@ -431,7 +422,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
|
||||
// Reject:
|
||||
// A 0*********
|
||||
// B 1**.......
|
||||
@ -454,7 +445,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
|
||||
// Accept:
|
||||
// A 0*********..
|
||||
// B ..1****.....
|
||||
@ -481,7 +472,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
|
||||
// Reject:
|
||||
// A 0*********
|
||||
// B ..1****...
|
||||
@ -503,7 +494,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
|
||||
// Accept:
|
||||
// A 0*********..
|
||||
// B .2****......
|
||||
@ -530,7 +521,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
|
||||
// Accept:
|
||||
// A 0****
|
||||
// B 1****
|
||||
@ -553,7 +544,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
|
||||
EXPECT_EQ(2u, multiend_call.speaking_turns().size());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
|
||||
// Accept:
|
||||
// A 0****....3****.5**.
|
||||
// B .....1****...4**...
|
||||
@ -586,7 +577,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
|
||||
// Reject:
|
||||
// A 0****....3****.6**
|
||||
// B .....1****...4**..
|
||||
@ -614,7 +605,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
|
||||
TEST(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
|
||||
// Parameters with which wav files are created.
|
||||
constexpr int duration_seconds = 5;
|
||||
const int sample_rates[] = {8000, 11025, 16000, 22050, 32000, 44100, 48000};
|
||||
@ -641,7 +632,7 @@ TEST_F(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
|
||||
TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
|
||||
// Simulated call (one character corresponding to 500 ms):
|
||||
// A 0*********...........2*********.....
|
||||
// B ...........1*********.....3*********
|
||||
|
||||
@ -28,7 +28,7 @@ class MockWavReader : public WavReaderInterface {
|
||||
MockWavReader(int sample_rate, size_t num_channels, size_t num_samples);
|
||||
~MockWavReader();
|
||||
|
||||
// TODO(alessiob): use ON_CALL to return random samples.
|
||||
// TODO(alessiob): use ON_CALL to return random samples if needed.
|
||||
MOCK_METHOD1(ReadFloatSamples, size_t(rtc::ArrayView<float>));
|
||||
MOCK_METHOD1(ReadInt16Samples, size_t(rtc::ArrayView<int16_t>));
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ std::vector<Turn> LoadTiming(const std::string& timing_filepath) {
|
||||
void SaveTiming(const std::string& timing_filepath,
|
||||
rtc::ArrayView<const Turn> timing) {
|
||||
std::ofstream outfile(timing_filepath);
|
||||
// TODO(alessio): check if file open for writing.
|
||||
RTC_CHECK(outfile.is_open());
|
||||
for (const Turn& turn : timing) {
|
||||
outfile << turn.speaker_name << " " << turn.audiotrack_file_name
|
||||
<< " " << turn.offset << std::endl;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user