From 8d23c050f2d25e625f16b98e5cb6ada42529b23e Mon Sep 17 00:00:00 2001
From: alessiob
Date: Fri, 7 Apr 2017 12:05:08 -0700
Subject: [PATCH] MultiEndCall::CheckTiming() verifies that a set of audio tracks and timing information is valid to simulate conversational speech.
Unordered turns are rejected. Self cross-talk and cross-talk with 3 or more speakers are not permitted since they would require mixing at the simulation step.
This CL includes extensive tests that check the accept/reject decision for several different timing setups. The setups are simulated using mocks (far more lightweight than using actual timing and audio track files). The client code, the unit tests in this case, passes information about the fake audio tracks to MockWavReaderFactory. MockWavReader instances are then created using the parameters defined in the client code.
To improve the readability of the tests, generator_unittest.cc includes a docstring explaining how each MultiEndCallSetup* test is documented.
Run tests as follows:
$ out/Default/modules_unittests --gtest_filter=ConversationalSpeechTest.*
BUG=webrtc:7218
Review-Url: https://codereview.webrtc.org/2781573002
Cr-Commit-Position: refs/heads/master@{#17592}
---
.../test/conversational_speech/BUILD.gn | 18 +-
.../generator_unittest.cc | 407 +++++++++++++++++-
.../conversational_speech/mock_wavreader.cc | 32 ++
.../conversational_speech/mock_wavreader.h | 10 +-
.../mock_wavreader_factory.cc | 48 ++-
.../mock_wavreader_factory.h | 22 +-
.../conversational_speech/multiend_call.cc | 136 +++++-
.../conversational_speech/multiend_call.h | 32 +-
8 files changed, 667 insertions(+), 38 deletions(-)
create mode 100644 webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.cc
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn b/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn
index ca5977fae8..10601fed3c 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn
+++ b/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn
@@ -6,7 +6,7 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-import("//webrtc/webrtc.gni")
+import("../../../../../webrtc/webrtc.gni")
group("conversational_speech") {
testonly = true
@@ -22,9 +22,9 @@ rtc_executable("conversational_speech_generator") {
]
deps = [
":lib",
+ "../../../../../webrtc/base:rtc_base_approved",
+ "../../../../../webrtc/test:test_support",
"//third_party/gflags",
- "//webrtc/base:rtc_base_approved",
- "//webrtc/test:test_support",
]
}
@@ -45,9 +45,9 @@ rtc_static_library("lib") {
"wavreader_interface.h",
]
deps = [
- "//webrtc:webrtc_common",
- "//webrtc/base:rtc_base_approved",
- "//webrtc/common_audio",
+ "../../../../../webrtc:webrtc_common",
+ "../../../../../webrtc/base:rtc_base_approved",
+ "../../../../../webrtc/common_audio",
]
visibility = [ ":*" ] # Only targets in this file can depend on this.
}
@@ -56,15 +56,17 @@ rtc_source_set("unittest") {
testonly = true
sources = [
"generator_unittest.cc",
+ "mock_wavreader.cc",
"mock_wavreader.h",
"mock_wavreader_factory.cc",
"mock_wavreader_factory.h",
]
deps = [
":lib",
+ "../../../../../webrtc:webrtc_common",
+ "../../../../../webrtc/base:rtc_base_approved",
+ "../../../../../webrtc/test:test_support",
"//testing/gmock",
"//testing/gtest",
- "//webrtc:webrtc_common",
- "//webrtc/test:test_support",
]
}
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc b/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
index 59454d9d47..406d95cf21 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
@@ -8,9 +8,36 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+// This file consists of unit tests for webrtc::test::conversational_speech
+// members. Some of them focus on accepting or rejecting different
+// conversational speech setups. A setup is defined by a set of audio tracks and
+// timing information.
+// The docstring at the beginning of each TEST_F(ConversationalSpeechTest,
+// MultiEndCallSetup*) function looks like the drawing below and indicates which
+// setup is tested.
+//
+// Accept:
+// A 0****.....
+// B .....1****
+//
+// The drawing indicates the following:
+// - the illustrated setup should be accepted,
+// - there are two speakers (namely, A and B),
+// - A speaks first, B speaks second,
+// - each character after the speaker's letter indicates a time unit (e.g., 100
+// ms),
+// - "*" indicates speaking, "." listening,
+// - numbers indicate the turn index in std::vector<Turn>.
+//
+// Note that the same speaker can appear in multiple lines in order to depict
+// cases in which there are wrong offsets leading to self cross-talk (which is
+// rejected).
+
#include
+#include
#include
+#include "webrtc/base/logging.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/config.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
@@ -44,9 +71,38 @@ const std::vector<Turn> expected_timing = {
};
const std::size_t kNumberOfTurns = expected_timing.size();
+// Default arguments for MockWavReaderFactory ctor.
+// Fake audio track parameters.
+constexpr int kDefaultSampleRate = 48000;
+const std::map<std::string, const MockWavReaderFactory::Params>
+ kDefaultMockWavReaderFactoryParamsMap = {
+ {"t300", {kDefaultSampleRate, 1u, 14400u}}, // 0.3 seconds.
+ {"t500", {kDefaultSampleRate, 1u, 24000u}}, // 0.5 seconds.
+ {"t1000", {kDefaultSampleRate, 1u, 48000u}}, // 1.0 seconds.
+};
+const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
+ kDefaultMockWavReaderFactoryParamsMap.at("t500");
+
+std::unique_ptr<MockWavReaderFactory> CreateMockWavReaderFactory() {
+ return std::unique_ptr<MockWavReaderFactory>(
+ new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
+ kDefaultMockWavReaderFactoryParamsMap));
+}
+
} // namespace
-TEST(ConversationalSpeechTest, Settings) {
+using testing::_;
+
+// TODO(alessiob): Remove fixture once conversational_speech fully implemented
+// and replace TEST_F with TEST.
+class ConversationalSpeechTest : public testing::Test {
+ public:
+ ConversationalSpeechTest() {
+ rtc::LogMessage::LogToDebug(rtc::LS_VERBOSE);
+ }
+};
+
+TEST_F(ConversationalSpeechTest, Settings) {
const conversational_speech::Config config(
audiotracks_path, timing_filepath, output_path);
@@ -56,7 +112,7 @@ TEST(ConversationalSpeechTest, Settings) {
EXPECT_EQ(output_path, config.output_path());
}
-TEST(ConversationalSpeechTest, TimingSaveLoad) {
+TEST_F(ConversationalSpeechTest, TimingSaveLoad) {
// Save test timing.
const std::string temporary_filepath = webrtc::test::TempFilename(
webrtc::test::OutputPath(), "TempTimingTestFile");
@@ -76,20 +132,359 @@ TEST(ConversationalSpeechTest, TimingSaveLoad) {
}
}
-TEST(ConversationalSpeechTest, MultiEndCallCreate) {
- auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
- new MockWavReaderFactory());
+TEST_F(ConversationalSpeechTest, MultiEndCallCreate) {
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
// There are 5 unique audio tracks to read.
- EXPECT_CALL(*mock_wavreader_factory, Create(testing::_)).Times(5);
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(5);
// Inject the mock wav reader factory.
conversational_speech::MultiEndCall multiend_call(
expected_timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_TRUE(multiend_call.valid());
// Test.
EXPECT_EQ(2u, multiend_call.speaker_names().size());
EXPECT_EQ(5u, multiend_call.audiotrack_readers().size());
+ EXPECT_EQ(6u, multiend_call.speaking_turns().size());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
+ const std::vector<Turn> timing = {
+ {"A", "t500", -100},
+ {"B", "t500", 0},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There is one unique audio track to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_FALSE(multiend_call.valid());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupSimple) {
+ // Accept:
+ // A 0****.....
+ // B .....1****
+ constexpr std::size_t expected_duration = kDefaultSampleRate;
+ const std::vector<Turn> timing = {
+ {"A", "t500", 0},
+ {"B", "t500", 0},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There is one unique audio track to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_TRUE(multiend_call.valid());
+
+ // Test.
+ EXPECT_EQ(2u, multiend_call.speaker_names().size());
+ EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
+ EXPECT_EQ(2u, multiend_call.speaking_turns().size());
+ EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupPause) {
+ // Accept:
+ // A 0****.......
+ // B .......1****
+ constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
+ const std::vector<Turn> timing = {
+ {"A", "t500", 0},
+ {"B", "t500", 200},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There is one unique audio track to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_TRUE(multiend_call.valid());
+
+ // Test.
+ EXPECT_EQ(2u, multiend_call.speaker_names().size());
+ EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
+ EXPECT_EQ(2u, multiend_call.speaking_turns().size());
+ EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
+ // Accept:
+ // A 0****....
+ // B ....1****
+ constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9;
+ const std::vector<Turn> timing = {
+ {"A", "t500", 0},
+ {"B", "t500", -100},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There is one unique audio track to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_TRUE(multiend_call.valid());
+
+ // Test.
+ EXPECT_EQ(2u, multiend_call.speaker_names().size());
+ EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
+ EXPECT_EQ(2u, multiend_call.speaking_turns().size());
+ EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
+ // Reject:
+ // A ..0****
+ // B .1****. The n-th turn cannot start before the (n-1)-th one.
+ const std::vector<Turn> timing = {
+ {"A", "t500", 200},
+ {"B", "t500", -600},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There is one unique audio track to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_FALSE(multiend_call.valid());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
+ // Accept:
+ // A 0****2****...
+ // B ...1*********
+ constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3;
+ const std::vector<Turn> timing = {
+ {"A", "t500", 0},
+ {"B", "t1000", -200},
+ {"A", "t500", -800},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There are two unique audio tracks to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_TRUE(multiend_call.valid());
+
+ // Test.
+ EXPECT_EQ(2u, multiend_call.speaker_names().size());
+ EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
+ EXPECT_EQ(3u, multiend_call.speaking_turns().size());
+ EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
+ // Reject:
+ // A 0****......
+ // A ...1****...
+ // B ......2****
+ // ^ Turn #1 overlaps with #0 which is from the same speaker.
+ const std::vector<Turn> timing = {
+ {"A", "t500", 0},
+ {"A", "t500", -200},
+ {"B", "t500", -200},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There is one unique audio track to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_FALSE(multiend_call.valid());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
+ // Reject:
+ // A 0*********
+ // B 1**.......
+ // C ...2**....
+ // A ......3**.
+ // ^ Turn #3 overlaps with #0 which is from the same speaker.
+ const std::vector<Turn> timing = {
+ {"A", "t1000", 0},
+ {"B", "t300", -1000},
+ {"C", "t300", 0},
+ {"A", "t300", 0},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There are two unique audio tracks to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_FALSE(multiend_call.valid());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
+ // Accept:
+ // A 0*********..
+ // B ..1****.....
+ // C .......2****
+ constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
+ const std::vector<Turn> timing = {
+ {"A", "t1000", 0},
+ {"B", "t500", -800},
+ {"C", "t500", 0},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There are two unique audio tracks to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_TRUE(multiend_call.valid());
+
+ // Test.
+ EXPECT_EQ(3u, multiend_call.speaker_names().size());
+ EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
+ EXPECT_EQ(3u, multiend_call.speaking_turns().size());
+ EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
+ // Reject:
+ // A 0*********
+ // B ..1****...
+ // C ....2****.
+ // ^ Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
+ // not permitted).
+ const std::vector<Turn> timing = {
+ {"A", "t1000", 0},
+ {"B", "t500", -800},
+ {"C", "t500", -300},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There are two unique audio tracks to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_FALSE(multiend_call.valid());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
+ // Accept:
+ // A 0*********..
+ // B .2****......
+ // C .......3****
+ constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
+ const std::vector<Turn> timing = {
+ {"A", "t1000", 0},
+ {"B", "t500", -900},
+ {"C", "t500", 100},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There are two unique audio tracks to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_TRUE(multiend_call.valid());
+
+ // Test.
+ EXPECT_EQ(3u, multiend_call.speaker_names().size());
+ EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
+ EXPECT_EQ(3u, multiend_call.speaking_turns().size());
+ EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
+ // Accept:
+ // A 0****
+ // B 1****
+ const std::vector<Turn> timing = {
+ {"A", "t500", 0},
+ {"B", "t500", -500},
+ };
+ auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+ // There is one unique audio track to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_TRUE(multiend_call.valid());
+
+ // Test.
+ EXPECT_EQ(2u, multiend_call.speaker_names().size());
+ EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
+ EXPECT_EQ(2u, multiend_call.speaking_turns().size());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
+ // Accept:
+ // A 0****....3****.5**.
+ // B .....1****...4**...
+ // C ......2**.......6**..
+ constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
+ const std::vector<Turn> timing = {
+ {"A", "t500", 0},
+ {"B", "t500", 0},
+ {"C", "t300", -400},
+ {"A", "t500", 0},
+ {"B", "t300", -100},
+ {"A", "t300", -100},
+ {"C", "t300", -200},
+ };
+ auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
+ new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
+ kDefaultMockWavReaderFactoryParamsMap));
+
+ // There are two unique audio tracks to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_TRUE(multiend_call.valid());
+
+ // Test.
+ EXPECT_EQ(3u, multiend_call.speaker_names().size());
+ EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
+ EXPECT_EQ(7u, multiend_call.speaking_turns().size());
+ EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
+ // Reject:
+ // A 0****....3****.6**
+ // B .....1****...4**..
+ // C ......2**.....5**..
+ // ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
+ // speakers not permitted).
+ const std::vector<Turn> timing = {
+ {"A", "t500", 0},
+ {"B", "t500", 0},
+ {"C", "t300", -400},
+ {"A", "t500", 0},
+ {"B", "t300", -100},
+ {"A", "t300", -200},
+ {"C", "t300", -200},
+ };
+ auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
+ new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
+ kDefaultMockWavReaderFactoryParamsMap));
+
+ // There are two unique audio tracks to read.
+ EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+ conversational_speech::MultiEndCall multiend_call(
+ timing, audiotracks_path, std::move(mock_wavreader_factory));
+ EXPECT_FALSE(multiend_call.valid());
}
} // namespace test
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.cc b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.cc
new file mode 100644
index 0000000000..7d2f2b663e
--- /dev/null
+++ b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.cc
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h"
+
+namespace webrtc {
+namespace test {
+namespace conversational_speech {
+
+using testing::Return;
+
+MockWavReader::MockWavReader(
+ int sample_rate, size_t num_channels, size_t num_samples)
+ : sample_rate_(sample_rate), num_channels_(num_channels),
+ num_samples_(num_samples) {
+ ON_CALL(*this, sample_rate()).WillByDefault(Return(sample_rate_));
+ ON_CALL(*this, num_channels()).WillByDefault(Return(num_channels_));
+ ON_CALL(*this, num_samples()).WillByDefault(Return(num_samples_));
+}
+
+MockWavReader::~MockWavReader() = default;
+
+} // namespace conversational_speech
+} // namespace test
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h
index 83aa9382e5..d71e6f896b 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h
+++ b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h
@@ -24,17 +24,13 @@ namespace conversational_speech {
class MockWavReader : public WavReaderInterface {
public:
- MockWavReader(
- int sample_rate, size_t num_channels, size_t num_samples)
- : sample_rate_(sample_rate), num_channels_(num_channels),
- num_samples_(num_samples) {}
- ~MockWavReader() = default;
+ MockWavReader(int sample_rate, size_t num_channels, size_t num_samples);
+ ~MockWavReader();
- // TOOD(alessiob): use ON_CALL to return random samples.
+ // TODO(alessiob): use ON_CALL to return random samples.
MOCK_METHOD2(ReadFloatSamples, size_t(size_t, float*));
MOCK_METHOD2(ReadInt16Samples, size_t(size_t, int16_t*));
- // TOOD(alessiob): use ON_CALL to return properties.
MOCK_CONST_METHOD0(sample_rate, int());
MOCK_CONST_METHOD0(num_channels, size_t());
MOCK_CONST_METHOD0(num_samples, size_t());
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.cc b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.cc
index 1097639527..2dd21dadf1 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.cc
@@ -10,14 +10,60 @@
#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
+#include "webrtc/base/logging.h"
+#include "webrtc/base/pathutils.h"
+#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h"
+#include "webrtc/test/gmock.h"
+
namespace webrtc {
namespace test {
namespace conversational_speech {
-MockWavReaderFactory::MockWavReaderFactory() = default;
+using testing::_;
+using testing::Invoke;
+
+MockWavReaderFactory::MockWavReaderFactory(
+ const Params& default_params,
+ const std::map<std::string, const Params>& params)
+ : default_params_(default_params),
+ audiotrack_names_params_(params) {
+ ON_CALL(*this, Create(_)).WillByDefault(Invoke(
+ this, &MockWavReaderFactory::CreateMock));
+}
+
+MockWavReaderFactory::MockWavReaderFactory(const Params& default_params)
+ : MockWavReaderFactory(default_params,
+ std::map<std::string, const Params>{}) {}
MockWavReaderFactory::~MockWavReaderFactory() = default;
+std::unique_ptr<WavReaderInterface> MockWavReaderFactory::CreateMock(
+ const std::string& filepath) {
+ // Search the parameters corresponding to filepath.
+ const rtc::Pathname audiotrack_file_path(filepath);
+ const auto it = audiotrack_names_params_.find(
+ audiotrack_file_path.filename());
+
+ // If not found, use default parameters.
+ if (it == audiotrack_names_params_.end()) {
+ LOG(LS_VERBOSE) << "using default parameters for " << filepath;
+ return std::unique_ptr<WavReaderInterface>(
+ new MockWavReader(default_params_.sample_rate,
+ default_params_.num_channels,
+ default_params_.num_samples));
+ }
+
+ // Found, use the audiotrack-specific parameters.
+ LOG(LS_VERBOSE) << "using ad-hoc parameters for " << filepath;
+ LOG(LS_VERBOSE) << "sample_rate " << it->second.sample_rate;
+ LOG(LS_VERBOSE) << "num_channels " << it->second.num_channels;
+ LOG(LS_VERBOSE) << "num_samples " << it->second.num_samples;
+ return std::unique_ptr<WavReaderInterface>(
+ new MockWavReader(it->second.sample_rate,
+ it->second.num_channels,
+ it->second.num_samples));
+}
+
} // namespace conversational_speech
} // namespace test
} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h
index 3686d12dd7..d22856e80c 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h
+++ b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h
@@ -11,6 +11,7 @@
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MOCK_WAVREADER_FACTORY_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MOCK_WAVREADER_FACTORY_H_
+#include
#include
#include
@@ -24,15 +25,28 @@ namespace conversational_speech {
class MockWavReaderFactory : public WavReaderAbstractFactory {
public:
- MockWavReaderFactory();
- // TODO(alessiob): add ctor that gets map string->(sr, #samples, #channels).
+ struct Params {
+ int sample_rate;
+ size_t num_channels;
+ size_t num_samples;
+ };
+
+ MockWavReaderFactory(const Params& default_params,
+ const std::map<std::string, const Params>& params);
+ explicit MockWavReaderFactory(const Params& default_params);
~MockWavReaderFactory();
- // TODO(alessiob): use ON_CALL to return MockWavReader with desired params.
MOCK_CONST_METHOD1(Create, std::unique_ptr<WavReaderInterface>(
const std::string&));
- // TODO(alessiob): add const ref to map (see ctor to add).
+ private:
+ // Creates a MockWavReader instance using the parameters in
+ // audiotrack_names_params_ if the entry corresponding to filepath exists,
+ // otherwise creates a MockWavReader instance using the default parameters.
+ std::unique_ptr<WavReaderInterface> CreateMock(const std::string& filepath);
+
+ const Params& default_params_;
+ std::map<std::string, const Params> audiotrack_names_params_;
};
} // namespace conversational_speech
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
index f16aa753fa..ad1d9a0c87 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
@@ -10,8 +10,10 @@
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
-#include
+#include
+#include
+#include "webrtc/base/logging.h"
#include "webrtc/base/pathutils.h"
namespace webrtc {
@@ -25,7 +27,7 @@ MultiEndCall::MultiEndCall(
wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {
FindSpeakerNames();
CreateAudioTrackReaders();
- CheckTiming();
+ valid_ = CheckTiming();
}
MultiEndCall::~MultiEndCall() = default;
@@ -39,10 +41,23 @@ const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
return audiotrack_readers_;
}
+bool MultiEndCall::valid() const {
+ return valid_;
+}
+
+size_t MultiEndCall::total_duration_samples() const {
+ return total_duration_samples_;
+}
+
+const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns()
+ const {
+ return speaking_turns_;
+}
+
void MultiEndCall::FindSpeakerNames() {
RTC_DCHECK(speaker_names_.empty());
for (const Turn& turn : timing_) {
- speaker_names_.insert(turn.speaker_name);
+ speaker_names_.emplace(turn.speaker_name);
}
}
@@ -60,14 +75,119 @@ void MultiEndCall::CreateAudioTrackReaders() {
// Map the audiotrack file name to a new instance of WavReaderInterface.
std::unique_ptr<WavReaderInterface> wavreader =
wavreader_abstract_factory_->Create(audiotrack_file_path.pathname());
- audiotrack_readers_.insert(std::make_pair(
- turn.audiotrack_file_name, std::move(wavreader)));
+ audiotrack_readers_.emplace(
+ turn.audiotrack_file_name, std::move(wavreader));
}
}
-void MultiEndCall::CheckTiming() {
- // TODO(alessiob): use audiotrack lengths and offset to check whether the
- // timing is valid.
+bool MultiEndCall::CheckTiming() {
+ struct Interval {
+ size_t begin;
+ size_t end;
+ };
+ size_t number_of_turns = timing_.size();
+ auto millisecond_to_samples = [](int ms, int sr) -> int {
+ // Truncation may happen if the sampling rate is not an integer multiple
+ // of 1000 (e.g., 44100).
+ return ms * sr / 1000;
+ };
+ auto in_interval = [](size_t value, const Interval& interval) {
+ return interval.begin <= value && value < interval.end;
+ };
+ total_duration_samples_ = 0;
+ speaking_turns_.clear();
+
+ // Begin and end timestamps for the last two turns (unit: number of samples).
+ Interval second_last_turn = {0, 0};
+ Interval last_turn = {0, 0};
+
+ // Initialize map to store speaking turn indices of each speaker (used to
+ // detect self cross-talk).
+ std::map<std::string, std::vector<size_t>> speaking_turn_indices;
+ for (const std::string& speaker_name : speaker_names_) {
+ speaking_turn_indices.emplace(
+ std::piecewise_construct,
+ std::forward_as_tuple(speaker_name),
+ std::forward_as_tuple());
+ }
+
+ // Parse turns.
+ for (size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {
+ const Turn& turn = timing_[turn_index];
+ auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
+ RTC_CHECK(it != audiotrack_readers_.end())
+ << "Audio track reader not created";
+
+ // Begin and end timestamps for the current turn.
+ int offset_samples = millisecond_to_samples(
+ turn.offset, it->second->sample_rate());
+ size_t begin_timestamp = last_turn.end + offset_samples;
+ size_t end_timestamp = begin_timestamp + it->second->num_samples();
+ LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp
+ << "-" << end_timestamp << " samples";
+
+ // The order is invalid if the offset is negative and its absolute value is
+ // larger than the duration of the previous turn.
+ if (offset_samples < 0 && -offset_samples > static_cast<int>(
+ last_turn.end - last_turn.begin)) {
+ LOG(LS_ERROR) << "invalid order";
+ return false;
+ }
+
+ // Cross-talk with 3 or more speakers occurs when the beginning of the
+ // current interval falls in the last two turns.
+ if (turn_index > 1 && in_interval(begin_timestamp, last_turn)
+ && in_interval(begin_timestamp, second_last_turn)) {
+ LOG(LS_ERROR) << "cross-talk with 3+ speakers";
+ return false;
+ }
+
+ // Append turn.
+ speaking_turns_.emplace_back(
+ turn.speaker_name, turn.audiotrack_file_name,
+ begin_timestamp, end_timestamp);
+
+ // Save speaking turn index for self cross-talk detection.
+ RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
+ speaking_turn_indices[turn.speaker_name].push_back(turn_index);
+
+ // Update total duration of the conversational speech.
+ if (total_duration_samples_ < end_timestamp)
+ total_duration_samples_ = end_timestamp;
+
+ // Update and continue with next turn.
+ second_last_turn = last_turn;
+ last_turn.begin = begin_timestamp;
+ last_turn.end = end_timestamp;
+ }
+
+ // Detect self cross-talk.
+ for (const std::string& speaker_name : speaker_names_) {
+ LOG(LS_INFO) << "checking self cross-talk for <"
+ << speaker_name << ">";
+
+ // Copy all turns for this speaker to new vector.
+ std::vector<SpeakingTurn> speaking_turns_for_name;
+ std::copy_if(speaking_turns_.begin(), speaking_turns_.end(),
+ std::back_inserter(speaking_turns_for_name),
+ [&speaker_name](const SpeakingTurn& st){
+ return st.speaker_name == speaker_name; });
+
+ // Check for overlap between adjacent elements.
+ // This is a sufficient condition for self cross-talk since the intervals
+ // are sorted by begin timestamp.
+ auto overlap = std::adjacent_find(
+ speaking_turns_for_name.begin(), speaking_turns_for_name.end(),
+ [](const SpeakingTurn& a, const SpeakingTurn& b) {
+ return a.end > b.begin; });
+
+ if (overlap != speaking_turns_for_name.end()) {
+ LOG(LS_ERROR) << "Self cross-talk detected";
+ return false;
+ }
+ }
+
+ return true;
}
} // namespace conversational_speech
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h
index 234cb2799e..dd03a07e7a 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h
+++ b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h
@@ -11,10 +11,13 @@
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MULTIEND_CALL_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MULTIEND_CALL_H_
+#include
#include
#include
#include
#include
+#include
+#include
#include "webrtc/base/array_view.h"
#include "webrtc/base/constructormagic.h"
@@ -28,6 +31,20 @@ namespace conversational_speech {
class MultiEndCall {
public:
+ struct SpeakingTurn {
+ // Constructor required in order to use std::vector::emplace_back().
+ SpeakingTurn(std::string new_speaker_name,
+ std::string new_audiotrack_file_name,
+ size_t new_begin, size_t new_end)
+ : speaker_name(std::move(new_speaker_name)),
+ audiotrack_file_name(std::move(new_audiotrack_file_name)),
+ begin(new_begin), end(new_end) {}
+ std::string speaker_name;
+ std::string audiotrack_file_name;
+ size_t begin;
+ size_t end;
+ };
+
MultiEndCall(
rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path,
std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory);
@@ -36,16 +53,20 @@ class MultiEndCall {
const std::set<std::string>& speaker_names() const;
const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
audiotrack_readers() const;
+ bool valid() const;
+ size_t total_duration_samples() const;
+ const std::vector<SpeakingTurn>& speaking_turns() const;
private:
- // Find unique speaker names.
+ // Finds unique speaker names.
void FindSpeakerNames();
- // Create one WavReader instance for each unique audiotrack.
+ // Creates one WavReader instance for each unique audiotrack.
void CreateAudioTrackReaders();
- // Check the speaking turns timing.
- void CheckTiming();
+ // Validates the speaking turns timing information. Accepts cross-talk, but
+ // only up to 2 speakers. Rejects unordered turns and self cross-talk.
+ bool CheckTiming();
rtc::ArrayView<const Turn> timing_;
const std::string& audiotracks_path_;
@@ -53,6 +74,9 @@ class MultiEndCall {
std::set<std::string> speaker_names_;
std::map<std::string, std::unique_ptr<WavReaderInterface>> audiotrack_readers_;
+ bool valid_;
+ size_t total_duration_samples_;
+ std::vector<SpeakingTurn> speaking_turns_;
RTC_DISALLOW_COPY_AND_ASSIGN(MultiEndCall);
};
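
Reviewer note (not part of the patch): the accept/reject rules that MultiEndCall::CheckTiming() enforces can be illustrated with a small standalone sketch. FakeTurn, CheckTimingSketch() and the millisecond units below are hypothetical stand-ins for the real Turn struct and the sample-based bookkeeping in the CL, and the self cross-talk check is done inline instead of with std::adjacent_find, so this is only an approximation of the logic, not the actual implementation.

#include <cstddef>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for conversational_speech::Turn.
struct FakeTurn {
  std::string speaker;
  int duration_ms;  // Would come from the audio track length.
  int offset_ms;    // Offset w.r.t. the end of the previous turn.
};

// Mirrors the three rules: ordered turns, no cross-talk with 3+ speakers,
// no self cross-talk.
bool CheckTimingSketch(const std::vector<FakeTurn>& timing) {
  struct Interval { int begin; int end; };
  auto in = [](int t, const Interval& i) { return i.begin <= t && t < i.end; };
  Interval second_last = {0, 0};
  Interval last = {0, 0};
  std::map<std::string, std::vector<Interval>> per_speaker;
  for (std::size_t k = 0; k < timing.size(); ++k) {
    const FakeTurn& turn = timing[k];
    // Unordered turns: a negative offset may not rewind past the previous turn.
    if (turn.offset_ms < 0 && -turn.offset_ms > last.end - last.begin)
      return false;
    const Interval current = {last.end + turn.offset_ms,
                              last.end + turn.offset_ms + turn.duration_ms};
    // Cross-talk with 3+ speakers: the turn starts inside both previous turns.
    if (k > 1 && in(current.begin, last) && in(current.begin, second_last))
      return false;
    // Self cross-talk: overlap with any earlier turn of the same speaker.
    for (const Interval& i : per_speaker[turn.speaker]) {
      if (current.begin < i.end && i.begin < current.end)
        return false;
    }
    per_speaker[turn.speaker].push_back(current);
    second_last = last;
    last = current;
  }
  return true;
}

int main() {
  // MultiEndCallSetupCrossTalk: B starts 100 ms before A ends -> accepted (1).
  std::cout << CheckTimingSketch({{"A", 500, 0}, {"B", 500, -100}}) << "\n";
  // MultiEndCallSetupInvalidOrder: B would start before A -> rejected (0).
  std::cout << CheckTimingSketch({{"A", 500, 200}, {"B", 500, -600}}) << "\n";
  // MultiEndCallSetupCrossTalkMiddleInvalid: three-way overlap -> rejected (0).
  std::cout << CheckTimingSketch(
      {{"A", 1000, 0}, {"B", 500, -800}, {"C", 500, -300}}) << "\n";
  return 0;
}

Feeding it the timings from the unit tests above reproduces the same valid()/invalid decisions as the corresponding EXPECT_TRUE/EXPECT_FALSE assertions.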