diff --git a/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn b/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn
index ca5977fae8..10601fed3c 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn
+++ b/webrtc/modules/audio_processing/test/conversational_speech/BUILD.gn
@@ -6,7 +6,7 @@
 # in the file PATENTS. All contributing project authors may
 # be found in the AUTHORS file in the root of the source tree.
 
-import("//webrtc/webrtc.gni")
+import("../../../../../webrtc/webrtc.gni")
 
 group("conversational_speech") {
   testonly = true
@@ -22,9 +22,9 @@ rtc_executable("conversational_speech_generator") {
   ]
   deps = [
     ":lib",
+    "../../../../../webrtc/base:rtc_base_approved",
+    "../../../../../webrtc/test:test_support",
     "//third_party/gflags",
-    "//webrtc/base:rtc_base_approved",
-    "//webrtc/test:test_support",
   ]
 }
 
@@ -45,9 +45,9 @@ rtc_static_library("lib") {
     "wavreader_interface.h",
   ]
   deps = [
-    "//webrtc:webrtc_common",
-    "//webrtc/base:rtc_base_approved",
-    "//webrtc/common_audio",
+    "../../../../../webrtc:webrtc_common",
+    "../../../../../webrtc/base:rtc_base_approved",
+    "../../../../../webrtc/common_audio",
   ]
   visibility = [ ":*" ]  # Only targets in this file can depend on this.
 }
@@ -56,15 +56,17 @@ rtc_source_set("unittest") {
   testonly = true
   sources = [
     "generator_unittest.cc",
+    "mock_wavreader.cc",
     "mock_wavreader.h",
     "mock_wavreader_factory.cc",
     "mock_wavreader_factory.h",
   ]
   deps = [
     ":lib",
+    "../../../../../webrtc:webrtc_common",
+    "../../../../../webrtc/base:rtc_base_approved",
+    "../../../../../webrtc/test:test_support",
     "//testing/gmock",
     "//testing/gtest",
-    "//webrtc:webrtc_common",
-    "//webrtc/test:test_support",
   ]
 }
 
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc b/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
index 59454d9d47..406d95cf21 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/generator_unittest.cc
@@ -8,9 +8,36 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+// This file consists of unit tests for webrtc::test::conversational_speech
+// members. Some of them focus on accepting or rejecting different
+// conversational speech setups. A setup is defined by a set of audio tracks
+// and timing information.
+// The docstring at the beginning of each TEST_F(ConversationalSpeechTest,
+// MultiEndCallSetup*) function looks like the drawing below and indicates
+// which setup is tested.
+//
+// Accept:
+// A 0****.....
+// B .....1****
+//
+// The drawing indicates the following:
+// - the illustrated setup should be accepted,
+// - there are two speakers (namely, A and B),
+// - A is the first speaker, B is the second one,
+// - each character after the speaker's letter indicates a time unit (e.g., 100
+//   ms),
+// - "*" indicates speaking, "." listening,
+// - numbers indicate the turn index in the std::vector<Turn> timing.
+//
+// Note that the same speaker can appear in multiple lines in order to depict
+// cases in which there are wrong offsets leading to self cross-talk (which is
+// rejected).
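+//
+// As a purely illustrative example (the file names below are made up and do
+// not refer to any real audio track), the "Accept" drawing above would
+// correspond to a timing vector such as
+//   {{"A", "a.wav", 0}, {"B", "b.wav", 0}}
+// where each Turn lists the speaker name, the audio track file name and the
+// offset in milliseconds of the turn start relative to the end of the
+// previous turn.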
+
 #include <cstddef>
+#include <map>
 #include <memory>
 
+#include "webrtc/base/logging.h"
 #include "webrtc/modules/audio_processing/test/conversational_speech/config.h"
 #include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
 #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
@@ -44,9 +71,38 @@ const std::vector<Turn> expected_timing = {
 };
 const std::size_t kNumberOfTurns = expected_timing.size();
 
+// Default arguments for MockWavReaderFactory ctor.
+// Fake audio track parameters.
+constexpr int kDefaultSampleRate = 48000;
+const std::map<std::string, const MockWavReaderFactory::Params>
+    kDefaultMockWavReaderFactoryParamsMap = {
+  {"t300", {kDefaultSampleRate, 1u, 14400u}},  // 0.3 seconds.
+  {"t500", {kDefaultSampleRate, 1u, 24000u}},  // 0.5 seconds.
+  {"t1000", {kDefaultSampleRate, 1u, 48000u}},  // 1.0 seconds.
+};
+const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
+    kDefaultMockWavReaderFactoryParamsMap.at("t500");
+
+std::unique_ptr<MockWavReaderFactory> CreateMockWavReaderFactory() {
+  return std::unique_ptr<MockWavReaderFactory>(
+      new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
+                               kDefaultMockWavReaderFactoryParamsMap));
+}
+
 }  // namespace
 
-TEST(ConversationalSpeechTest, Settings) {
+using testing::_;
+
+// TODO(alessiob): Remove the fixture once conversational_speech is fully
+// implemented and replace TEST_F with TEST.
+class ConversationalSpeechTest : public testing::Test {
+ public:
+  ConversationalSpeechTest() {
+    rtc::LogMessage::LogToDebug(rtc::LS_VERBOSE);
+  }
+};
+
+TEST_F(ConversationalSpeechTest, Settings) {
   const conversational_speech::Config config(
       audiotracks_path, timing_filepath, output_path);
 
@@ -56,7 +112,7 @@ TEST(ConversationalSpeechTest, Settings) {
   EXPECT_EQ(output_path, config.output_path());
 }
 
-TEST(ConversationalSpeechTest, TimingSaveLoad) {
+TEST_F(ConversationalSpeechTest, TimingSaveLoad) {
   // Save test timing.
   const std::string temporary_filepath = webrtc::test::TempFilename(
       webrtc::test::OutputPath(), "TempTimingTestFile");
@@ -76,20 +132,359 @@ TEST(ConversationalSpeechTest, TimingSaveLoad) {
   }
 }
 
-TEST(ConversationalSpeechTest, MultiEndCallCreate) {
-  auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
-      new MockWavReaderFactory());
+TEST_F(ConversationalSpeechTest, MultiEndCallCreate) {
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
   // There are 5 unique audio tracks to read.
-  EXPECT_CALL(*mock_wavreader_factory, Create(testing::_)).Times(5);
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(5);
 
   // Inject the mock wav reader factory.
   conversational_speech::MultiEndCall multiend_call(
       expected_timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_TRUE(multiend_call.valid());
 
   // Test.
   EXPECT_EQ(2u, multiend_call.speaker_names().size());
   EXPECT_EQ(5u, multiend_call.audiotrack_readers().size());
+  EXPECT_EQ(6u, multiend_call.speaking_turns().size());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
+  const std::vector<Turn> timing = {
+      {"A", "t500", -100},
+      {"B", "t500", 0},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There is one unique audio track to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_FALSE(multiend_call.valid());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupSimple) {
+  // Accept:
+  // A 0****.....
+  // B .....1****
+  constexpr std::size_t expected_duration = kDefaultSampleRate;
+  const std::vector<Turn> timing = {
+      {"A", "t500", 0},
+      {"B", "t500", 0},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There is one unique audio track to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_TRUE(multiend_call.valid());
+
+  // Test.
+  EXPECT_EQ(2u, multiend_call.speaker_names().size());
+  EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
+  EXPECT_EQ(2u, multiend_call.speaking_turns().size());
+  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupPause) {
+  // Accept:
+  // A 0****.......
+  // B .......1****
+  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
+  const std::vector<Turn> timing = {
+      {"A", "t500", 0},
+      {"B", "t500", 200},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There is one unique audio track to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_TRUE(multiend_call.valid());
+
+  // Test.
+  EXPECT_EQ(2u, multiend_call.speaker_names().size());
+  EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
+  EXPECT_EQ(2u, multiend_call.speaking_turns().size());
+  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
+  // Accept:
+  // A 0****....
+  // B ....1****
+  constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9;
+  const std::vector<Turn> timing = {
+      {"A", "t500", 0},
+      {"B", "t500", -100},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There is one unique audio track to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_TRUE(multiend_call.valid());
+
+  // Test.
+  EXPECT_EQ(2u, multiend_call.speaker_names().size());
+  EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
+  EXPECT_EQ(2u, multiend_call.speaking_turns().size());
+  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
+  // Reject:
+  // A ..0****
+  // B .1****. The n-th turn cannot start before the (n-1)-th one.
+  const std::vector<Turn> timing = {
+      {"A", "t500", 200},
+      {"B", "t500", -600},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There is one unique audio track to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_FALSE(multiend_call.valid());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
+  // Accept:
+  // A 0****2****...
+  // B ...1*********
+  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3;
+  const std::vector<Turn> timing = {
+      {"A", "t500", 0},
+      {"B", "t1000", -200},
+      {"A", "t500", -800},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There are two unique audio tracks to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_TRUE(multiend_call.valid());
+
+  // Test.
+  EXPECT_EQ(2u, multiend_call.speaker_names().size());
+  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
+  EXPECT_EQ(3u, multiend_call.speaking_turns().size());
+  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
+  // Reject:
+  // A 0****......
+  // A ...1****...
+  // B ......2****
+  //      ^ Turn #1 overlaps with #0 which is from the same speaker.
+  const std::vector<Turn> timing = {
+      {"A", "t500", 0},
+      {"A", "t500", -200},
+      {"B", "t500", -200},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There is one unique audio track to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_FALSE(multiend_call.valid());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
+  // Reject:
+  // A 0*********
+  // B 1**.......
+  // C ...2**....
+  // A ......3**.
+  //         ^ Turn #3 overlaps with #0 which is from the same speaker.
+  const std::vector<Turn> timing = {
+      {"A", "t1000", 0},
+      {"B", "t300", -1000},
+      {"C", "t300", 0},
+      {"A", "t300", 0},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There are two unique audio tracks to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_FALSE(multiend_call.valid());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
+  // Accept:
+  // A 0*********..
+  // B ..1****.....
+  // C .......2****
+  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
+  const std::vector<Turn> timing = {
+      {"A", "t1000", 0},
+      {"B", "t500", -800},
+      {"C", "t500", 0},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There are two unique audio tracks to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_TRUE(multiend_call.valid());
+
+  // Test.
+  EXPECT_EQ(3u, multiend_call.speaker_names().size());
+  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
+  EXPECT_EQ(3u, multiend_call.speaking_turns().size());
+  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
+  // Reject:
+  // A 0*********
+  // B ..1****...
+  // C ....2****.
+  //       ^ Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
+  //         not permitted).
+  const std::vector<Turn> timing = {
+      {"A", "t1000", 0},
+      {"B", "t500", -800},
+      {"C", "t500", -300},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There are two unique audio tracks to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_FALSE(multiend_call.valid());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
+  // Accept:
+  // A 0*********..
+  // B .1****......
+  // C .......2****
+  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
+  const std::vector<Turn> timing = {
+      {"A", "t1000", 0},
+      {"B", "t500", -900},
+      {"C", "t500", 100},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There are two unique audio tracks to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_TRUE(multiend_call.valid());
+
+  // Test.
+  EXPECT_EQ(3u, multiend_call.speaker_names().size());
+  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
+  EXPECT_EQ(3u, multiend_call.speaking_turns().size());
+  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
+  // Accept:
+  // A 0****
+  // B 1****
+  const std::vector<Turn> timing = {
+      {"A", "t500", 0},
+      {"B", "t500", -500},
+  };
+  auto mock_wavreader_factory = CreateMockWavReaderFactory();
+
+  // There is one unique audio track to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_TRUE(multiend_call.valid());
+
+  // Test.
+  EXPECT_EQ(2u, multiend_call.speaker_names().size());
+  EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
+  EXPECT_EQ(2u, multiend_call.speaking_turns().size());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
+  // Accept:
+  // A 0****....3****.5**.
+  // B .....1****...4**...
+  // C ......2**.......6**..
+  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
+  const std::vector<Turn> timing = {
+      {"A", "t500", 0},
+      {"B", "t500", 0},
+      {"C", "t300", -400},
+      {"A", "t500", 0},
+      {"B", "t300", -100},
+      {"A", "t300", -100},
+      {"C", "t300", -200},
+  };
+  auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
+      new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
+                               kDefaultMockWavReaderFactoryParamsMap));
+
+  // There are two unique audio tracks to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_TRUE(multiend_call.valid());
+
+  // Test.
+  EXPECT_EQ(3u, multiend_call.speaker_names().size());
+  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
+  EXPECT_EQ(7u, multiend_call.speaking_turns().size());
+  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
+}
+
+TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
+  // Reject:
+  // A 0****....3****.6**
+  // B .....1****...4**..
+  // C ......2**.....5**..
+  //                  ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
+  //                    speakers not permitted).
+  const std::vector<Turn> timing = {
+      {"A", "t500", 0},
+      {"B", "t500", 0},
+      {"C", "t300", -400},
+      {"A", "t500", 0},
+      {"B", "t300", -100},
+      {"A", "t300", -200},
+      {"C", "t300", -200},
+  };
+  auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
+      new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
+                               kDefaultMockWavReaderFactoryParamsMap));
+
+  // There are two unique audio tracks to read.
+  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
+
+  conversational_speech::MultiEndCall multiend_call(
+      timing, audiotracks_path, std::move(mock_wavreader_factory));
+  EXPECT_FALSE(multiend_call.valid());
 }
 
 }  // namespace test
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.cc b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.cc
new file mode 100644
index 0000000000..7d2f2b663e
--- /dev/null
+++ b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.cc
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h"
+
+namespace webrtc {
+namespace test {
+namespace conversational_speech {
+
+using testing::Return;
+
+MockWavReader::MockWavReader(
+    int sample_rate, size_t num_channels, size_t num_samples)
+    : sample_rate_(sample_rate), num_channels_(num_channels),
+      num_samples_(num_samples) {
+  ON_CALL(*this, sample_rate()).WillByDefault(Return(sample_rate_));
+  ON_CALL(*this, num_channels()).WillByDefault(Return(num_channels_));
+  ON_CALL(*this, num_samples()).WillByDefault(Return(num_samples_));
+}
+
+MockWavReader::~MockWavReader() = default;
+
+}  // namespace conversational_speech
+}  // namespace test
+}  // namespace webrtc
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h
index 83aa9382e5..d71e6f896b 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h
+++ b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h
@@ -24,17 +24,13 @@ namespace conversational_speech {
 
 class MockWavReader : public WavReaderInterface {
  public:
-  MockWavReader(
-      int sample_rate, size_t num_channels, size_t num_samples)
-      : sample_rate_(sample_rate), num_channels_(num_channels),
-        num_samples_(num_samples) {}
-  ~MockWavReader() = default;
+  MockWavReader(int sample_rate, size_t num_channels, size_t num_samples);
+  ~MockWavReader();
 
-  // TOOD(alessiob): use ON_CALL to return random samples.
+  // TODO(alessiob): use ON_CALL to return random samples.
   MOCK_METHOD2(ReadFloatSamples, size_t(size_t, float*));
   MOCK_METHOD2(ReadInt16Samples, size_t(size_t, int16_t*));
 
-  // TOOD(alessiob): use ON_CALL to return properties.
   MOCK_CONST_METHOD0(sample_rate, int());
   MOCK_CONST_METHOD0(num_channels, size_t());
   MOCK_CONST_METHOD0(num_samples, size_t());
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.cc b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.cc
index 1097639527..2dd21dadf1 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.cc
@@ -10,14 +10,60 @@
 
 #include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
 
+#include "webrtc/base/logging.h"
+#include "webrtc/base/pathutils.h"
+#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h"
+#include "webrtc/test/gmock.h"
+
 namespace webrtc {
 namespace test {
 namespace conversational_speech {
 
-MockWavReaderFactory::MockWavReaderFactory() = default;
+using testing::_;
+using testing::Invoke;
+
+MockWavReaderFactory::MockWavReaderFactory(
+    const Params& default_params,
+    const std::map<std::string, const Params>& params)
+    : default_params_(default_params),
+      audiotrack_names_params_(params) {
+  ON_CALL(*this, Create(_)).WillByDefault(Invoke(
+      this, &MockWavReaderFactory::CreateMock));
+}
+
+MockWavReaderFactory::MockWavReaderFactory(const Params& default_params)
+    : MockWavReaderFactory(default_params,
+                           std::map<std::string, const Params>{}) {}
 
 MockWavReaderFactory::~MockWavReaderFactory() = default;
 
+std::unique_ptr<WavReaderInterface> MockWavReaderFactory::CreateMock(
+    const std::string& filepath) {
+  // Search the parameters corresponding to filepath.
+  const rtc::Pathname audiotrack_file_path(filepath);
+  const auto it = audiotrack_names_params_.find(
+      audiotrack_file_path.filename());
+
+  // If not found, use default parameters.
+  if (it == audiotrack_names_params_.end()) {
+    LOG(LS_VERBOSE) << "using default parameters for " << filepath;
+    return std::unique_ptr<WavReaderInterface>(
+        new MockWavReader(default_params_.sample_rate,
+                          default_params_.num_channels,
+                          default_params_.num_samples));
+  }
+
+  // Found, use the audiotrack-specific parameters.
+  LOG(LS_VERBOSE) << "using ad-hoc parameters for " << filepath;
+  LOG(LS_VERBOSE) << "sample_rate " << it->second.sample_rate;
+  LOG(LS_VERBOSE) << "num_channels " << it->second.num_channels;
+  LOG(LS_VERBOSE) << "num_samples " << it->second.num_samples;
+  return std::unique_ptr<WavReaderInterface>(
+      new MockWavReader(it->second.sample_rate,
+                        it->second.num_channels,
+                        it->second.num_samples));
+}
+
 }  // namespace conversational_speech
 }  // namespace test
 }  // namespace webrtc
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h
index 3686d12dd7..d22856e80c 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h
+++ b/webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h
@@ -11,6 +11,7 @@
 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MOCK_WAVREADER_FACTORY_H_
 #define WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MOCK_WAVREADER_FACTORY_H_
 
+#include <map>
 #include <memory>
 #include <string>
 
@@ -24,15 +25,28 @@ namespace conversational_speech {
 
 class MockWavReaderFactory : public WavReaderAbstractFactory {
  public:
-  MockWavReaderFactory();
-  // TODO(alessiob): add ctor that gets map string->(sr, #samples, #channels).
+  struct Params {
+    int sample_rate;
+    size_t num_channels;
+    size_t num_samples;
+  };
+
+  MockWavReaderFactory(const Params& default_params,
+                       const std::map<std::string, const Params>& params);
+  explicit MockWavReaderFactory(const Params& default_params);
   ~MockWavReaderFactory();
 
-  // TODO(alessiob): use ON_CALL to return MockWavReader with desired params.
   MOCK_CONST_METHOD1(Create, std::unique_ptr<WavReaderInterface>(
       const std::string&));
 
-  // TODO(alessiob): add const ref to map (see ctor to add).
+ private:
+  // Creates a MockWavReader instance using the parameters in
+  // audiotrack_names_params_ if the entry corresponding to filepath exists,
+  // otherwise creates a MockWavReader instance using the default parameters.
+  std::unique_ptr<WavReaderInterface> CreateMock(const std::string& filepath);
+
+  const Params& default_params_;
+  std::map<std::string, const Params> audiotrack_names_params_;
 };
 
 }  // namespace conversational_speech
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
index f16aa753fa..ad1d9a0c87 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
+++ b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.cc
@@ -10,8 +10,10 @@
 
 #include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
 
-#include <utility>
+#include <algorithm>
+#include <iterator>
 
+#include "webrtc/base/logging.h"
 #include "webrtc/base/pathutils.h"
 
 namespace webrtc {
@@ -25,7 +27,7 @@ MultiEndCall::MultiEndCall(
       wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {
   FindSpeakerNames();
   CreateAudioTrackReaders();
-  CheckTiming();
+  valid_ = CheckTiming();
 }
 
 MultiEndCall::~MultiEndCall() = default;
@@ -39,10 +41,23 @@ const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
   return audiotrack_readers_;
 }
 
+bool MultiEndCall::valid() const {
+  return valid_;
+}
+
+size_t MultiEndCall::total_duration_samples() const {
+  return total_duration_samples_;
+}
+
+const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns()
+    const {
+  return speaking_turns_;
+}
+
 void MultiEndCall::FindSpeakerNames() {
   RTC_DCHECK(speaker_names_.empty());
   for (const Turn& turn : timing_) {
-    speaker_names_.insert(turn.speaker_name);
+    speaker_names_.emplace(turn.speaker_name);
   }
 }
 
@@ -60,14 +75,119 @@ void MultiEndCall::CreateAudioTrackReaders() {
     // Map the audiotrack file name to a new instance of WavReaderInterface.
     std::unique_ptr<WavReaderInterface> wavreader =
         wavreader_abstract_factory_->Create(audiotrack_file_path.pathname());
-    audiotrack_readers_.insert(std::make_pair(
-        turn.audiotrack_file_name, std::move(wavreader)));
+    audiotrack_readers_.emplace(
+        turn.audiotrack_file_name, std::move(wavreader));
   }
 }
 
-void MultiEndCall::CheckTiming() {
-  // TODO(alessiob): use audiotrack lengths and offset to check whether the
-  // timing is valid.
+bool MultiEndCall::CheckTiming() {
+  struct Interval {
+    size_t begin;
+    size_t end;
+  };
+  size_t number_of_turns = timing_.size();
+  auto millisecond_to_samples = [](int ms, int sr) -> int {
+    // Truncation may happen if the sampling rate is not an integer multiple
+    // of 1000 (e.g., 44100).
+    return ms * sr / 1000;
+  };
+  auto in_interval = [](size_t value, const Interval& interval) {
+    return interval.begin <= value && value < interval.end;
+  };
+  total_duration_samples_ = 0;
+  speaking_turns_.clear();
+
+  // Begin and end timestamps for the last two turns (unit: number of samples).
+  Interval second_last_turn = {0, 0};
+  Interval last_turn = {0, 0};
+
+  // Initialize map to store speaking turn indices of each speaker (used to
+  // detect self cross-talk).
+  std::map<std::string, std::vector<size_t>> speaking_turn_indices;
+  for (const std::string& speaker_name : speaker_names_) {
+    speaking_turn_indices.emplace(
+        std::piecewise_construct,
+        std::forward_as_tuple(speaker_name),
+        std::forward_as_tuple());
+  }
+
+  // Parse turns.
+  for (size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {
+    const Turn& turn = timing_[turn_index];
+    auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
+    RTC_CHECK(it != audiotrack_readers_.end())
+        << "Audio track reader not created";
+
+    // Begin and end timestamps for the current turn.
+    int offset_samples = millisecond_to_samples(
+        turn.offset, it->second->sample_rate());
+    size_t begin_timestamp = last_turn.end + offset_samples;
+    size_t end_timestamp = begin_timestamp + it->second->num_samples();
+    LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp
+        << "-" << end_timestamp << " samples";
+
+    // The order is invalid if the offset is negative and its absolute value is
+    // larger than the duration of the previous turn.
+    if (offset_samples < 0 && -offset_samples > static_cast<int>(
+        last_turn.end - last_turn.begin)) {
+      LOG(LS_ERROR) << "invalid order";
+      return false;
+    }
+
+    // Cross-talk with 3 or more speakers occurs when the beginning of the
+    // current interval falls in the last two turns.
+    if (turn_index > 1 && in_interval(begin_timestamp, last_turn)
+        && in_interval(begin_timestamp, second_last_turn)) {
+      LOG(LS_ERROR) << "cross-talk with 3+ speakers";
+      return false;
+    }
+
+    // Append turn.
+    speaking_turns_.emplace_back(
+        turn.speaker_name, turn.audiotrack_file_name,
+        begin_timestamp, end_timestamp);
+
+    // Save speaking turn index for self cross-talk detection.
+    RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
+    speaking_turn_indices[turn.speaker_name].push_back(turn_index);
+
+    // Update total duration of the conversational speech.
+    if (total_duration_samples_ < end_timestamp)
+      total_duration_samples_ = end_timestamp;
+
+    // Update and continue with next turn.
+    second_last_turn = last_turn;
+    last_turn.begin = begin_timestamp;
+    last_turn.end = end_timestamp;
+  }
+
+  // Detect self cross-talk.
+  for (const std::string& speaker_name : speaker_names_) {
+    LOG(LS_INFO) << "checking self cross-talk for <"
+        << speaker_name << ">";
+
+    // Copy all turns for this speaker to new vector.
+    std::vector<SpeakingTurn> speaking_turns_for_name;
+    std::copy_if(speaking_turns_.begin(), speaking_turns_.end(),
+                 std::back_inserter(speaking_turns_for_name),
+                 [&speaker_name](const SpeakingTurn& st){
+                     return st.speaker_name == speaker_name; });
+
+    // Check for overlap between adjacent elements.
+    // This is a sufficient condition for self cross-talk since the intervals
+    // are sorted by begin timestamp.
+    auto overlap = std::adjacent_find(
+        speaking_turns_for_name.begin(), speaking_turns_for_name.end(),
+        [](const SpeakingTurn& a, const SpeakingTurn& b) {
+            return a.end > b.begin; });
+
+    if (overlap != speaking_turns_for_name.end()) {
+      LOG(LS_ERROR) << "Self cross-talk detected";
+      return false;
+    }
+  }
+
+  return true;
 }
 
 }  // namespace conversational_speech
diff --git a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h
index 234cb2799e..dd03a07e7a 100644
--- a/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h
+++ b/webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h
@@ -11,10 +11,13 @@
 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MULTIEND_CALL_H_
 #define WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MULTIEND_CALL_H_
 
+#include <cstddef>
 #include <map>
 #include <memory>
 #include <set>
 #include <string>
+#include <utility>
+#include <vector>
 
 #include "webrtc/base/array_view.h"
 #include "webrtc/base/constructormagic.h"
@@ -28,6 +31,20 @@ namespace conversational_speech {
 
 class MultiEndCall {
  public:
+  struct SpeakingTurn {
+    // Constructor required in order to use std::vector::emplace_back().
+    SpeakingTurn(std::string new_speaker_name,
+                 std::string new_audiotrack_file_name,
+                 size_t new_begin, size_t new_end)
+        : speaker_name(std::move(new_speaker_name)),
+          audiotrack_file_name(std::move(new_audiotrack_file_name)),
+          begin(new_begin), end(new_end) {}
+    std::string speaker_name;
+    std::string audiotrack_file_name;
+    size_t begin;
+    size_t end;
+  };
+
   MultiEndCall(
       rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path,
       std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory);
@@ -36,16 +53,20 @@ class MultiEndCall {
   const std::set<std::string>& speaker_names() const;
   const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
       audiotrack_readers() const;
+  bool valid() const;
+  size_t total_duration_samples() const;
+  const std::vector<SpeakingTurn>& speaking_turns() const;
 
  private:
-  // Find unique speaker names.
+  // Finds unique speaker names.
   void FindSpeakerNames();
 
-  // Create one WavReader instance for each unique audiotrack.
+  // Creates one WavReader instance for each unique audiotrack.
   void CreateAudioTrackReaders();
 
-  // Check the speaking turns timing.
-  void CheckTiming();
+  // Validates the speaking turns timing information. Accepts cross-talk, but
+  // only up to 2 speakers. Rejects unordered turns and self cross-talk.
+  bool CheckTiming();
 
   rtc::ArrayView<const Turn> timing_;
   const std::string& audiotracks_path_;
@@ -53,6 +74,9 @@
   std::set<std::string> speaker_names_;
   std::map<std::string, std::unique_ptr<WavReaderInterface>>
      audiotrack_readers_;
+  bool valid_;
+  size_t total_duration_samples_;
+  std::vector<SpeakingTurn> speaking_turns_;
 
   RTC_DISALLOW_COPY_AND_ASSIGN(MultiEndCall);
 };
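A minimal sketch, not part of the patch, of the sample-count arithmetic behind the expected_duration values in the tests above; it assumes the 48 kHz default rate and the mock track lengths from kDefaultMockWavReaderFactoryParamsMap, and the constant names are illustrative only:

    #include <cstddef>

    // "Pause" setup: A speaks for 0.5 s ("t500"), then B starts 200 ms after
    // A ends. Offsets are converted exactly as in MultiEndCall::CheckTiming():
    // samples = ms * sample_rate / 1000.
    constexpr int kSampleRate = 48000;                        // kDefaultSampleRate.
    constexpr std::size_t kTurnA = 24000;                     // "t500" -> 0.5 s.
    constexpr std::size_t kPause = 200 * kSampleRate / 1000;  // 9600 samples.
    constexpr std::size_t kTurnB = 24000;                     // "t500" -> 0.5 s.

    // 24000 + 9600 + 24000 = 57600 = 48000 * 1.2, i.e. the test's
    // expected_duration of kDefaultSampleRate * 1.2.
    static_assert(kTurnA + kPause + kTurnB == 57600u, "duration mismatch");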