MultiEndCall::CheckTiming() verifies that a set of audio tracks and timing information is valid to simulate conversational speech. Unordered turns are rejected. Self cross-talk and cross-talk with 3 or more speakers are not permitted since it would require mixing at the simulation step.
This CL includes extensive tests to match accept or reject decisions on several different timing setups. The setups are simulated using mocks (by far more light-weight than using actual timing and audio track files). The client code, the unit tests in this case, passes information about the fake audio tracks to MockWavReaderFactory. MockWavReader instances are then created using the parameters defined in the client code. To improve the readability of the tests, generator_unittest.cc includes a docstring explaining how each MultiEndCallSetup* test is documented. Run tests as follows: $ out/Default/modules_unittests --gtest_filter=ConversationalSpeechTest.* BUG=webrtc:7218 Review-Url: https://codereview.webrtc.org/2781573002 Cr-Commit-Position: refs/heads/master@{#17592}
This commit is contained in:
parent
292084c376
commit
8d23c050f2
@ -6,7 +6,7 @@
|
||||
# in the file PATENTS. All contributing project authors may
|
||||
# be found in the AUTHORS file in the root of the source tree.
|
||||
|
||||
import("//webrtc/webrtc.gni")
|
||||
import("../../../../../webrtc/webrtc.gni")
|
||||
|
||||
group("conversational_speech") {
|
||||
testonly = true
|
||||
@ -22,9 +22,9 @@ rtc_executable("conversational_speech_generator") {
|
||||
]
|
||||
deps = [
|
||||
":lib",
|
||||
"../../../../../webrtc/base:rtc_base_approved",
|
||||
"../../../../../webrtc/test:test_support",
|
||||
"//third_party/gflags",
|
||||
"//webrtc/base:rtc_base_approved",
|
||||
"//webrtc/test:test_support",
|
||||
]
|
||||
}
|
||||
|
||||
@ -45,9 +45,9 @@ rtc_static_library("lib") {
|
||||
"wavreader_interface.h",
|
||||
]
|
||||
deps = [
|
||||
"//webrtc:webrtc_common",
|
||||
"//webrtc/base:rtc_base_approved",
|
||||
"//webrtc/common_audio",
|
||||
"../../../../../webrtc:webrtc_common",
|
||||
"../../../../../webrtc/base:rtc_base_approved",
|
||||
"../../../../../webrtc/common_audio",
|
||||
]
|
||||
visibility = [ ":*" ] # Only targets in this file can depend on this.
|
||||
}
|
||||
@ -56,15 +56,17 @@ rtc_source_set("unittest") {
|
||||
testonly = true
|
||||
sources = [
|
||||
"generator_unittest.cc",
|
||||
"mock_wavreader.cc",
|
||||
"mock_wavreader.h",
|
||||
"mock_wavreader_factory.cc",
|
||||
"mock_wavreader_factory.h",
|
||||
]
|
||||
deps = [
|
||||
":lib",
|
||||
"../../../../../webrtc:webrtc_common",
|
||||
"../../../../../webrtc/base:rtc_base_approved",
|
||||
"../../../../../webrtc/test:test_support",
|
||||
"//testing/gmock",
|
||||
"//testing/gtest",
|
||||
"//webrtc:webrtc_common",
|
||||
"//webrtc/test:test_support",
|
||||
]
|
||||
}
|
||||
|
||||
@ -8,9 +8,36 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
// This file consists of unit tests for webrtc::test::conversational_speech
|
||||
// members. Part of them focus on accepting or rejecting different
|
||||
// conversational speech setups. A setup is defined by a set of audio tracks and
|
||||
// timing information).
|
||||
// The docstring at the beginning of each TEST_F(ConversationalSpeechTest,
|
||||
// MultiEndCallSetup*) function looks like the drawing below and indicates which
|
||||
// setup is tested.
|
||||
//
|
||||
// Accept:
|
||||
// A 0****.....
|
||||
// B .....1****
|
||||
//
|
||||
// The drawing indicates the following:
|
||||
// - the illustrated setup should be accepted,
|
||||
// - there are two speakers (namely, A and B),
|
||||
// - A is the first speaking, B is the second one,
|
||||
// - each character after the speaker's letter indicates a time unit (e.g., 100
|
||||
// ms),
|
||||
// - "*" indicates speaking, "." listening,
|
||||
// - numbers indicate the turn index in std::vector<Turn>.
|
||||
//
|
||||
// Note that the same speaker can appear in multiple lines in order to depict
|
||||
// cases in which there are wrong offsets leading to self cross-talk (which is
|
||||
// rejected).
|
||||
|
||||
#include <stdio.h>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
|
||||
#include "webrtc/base/logging.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/config.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
|
||||
@ -44,9 +71,38 @@ const std::vector<Turn> expected_timing = {
|
||||
};
|
||||
const std::size_t kNumberOfTurns = expected_timing.size();
|
||||
|
||||
// Default arguments for MockWavReaderFactory ctor.
|
||||
// Fake audio track parameters.
|
||||
constexpr int kDefaultSampleRate = 48000;
|
||||
const std::map<std::string, const MockWavReaderFactory::Params>
|
||||
kDefaultMockWavReaderFactoryParamsMap = {
|
||||
{"t300", {kDefaultSampleRate, 1u, 14400u}}, // 0.3 seconds.
|
||||
{"t500", {kDefaultSampleRate, 1u, 24000u}}, // 0.5 seconds.
|
||||
{"t1000", {kDefaultSampleRate, 1u, 48000u}}, // 1.0 seconds.
|
||||
};
|
||||
const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
|
||||
kDefaultMockWavReaderFactoryParamsMap.at("t500");
|
||||
|
||||
std::unique_ptr<MockWavReaderFactory> CreateMockWavReaderFactory() {
|
||||
return std::unique_ptr<MockWavReaderFactory>(
|
||||
new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
|
||||
kDefaultMockWavReaderFactoryParamsMap));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST(ConversationalSpeechTest, Settings) {
|
||||
using testing::_;
|
||||
|
||||
// TODO(alessiob): Remove fixture once conversational_speech fully implemented
|
||||
// and replace TEST_F with TEST.
|
||||
class ConversationalSpeechTest : public testing::Test {
|
||||
public:
|
||||
ConversationalSpeechTest() {
|
||||
rtc::LogMessage::LogToDebug(rtc::LS_VERBOSE);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(ConversationalSpeechTest, Settings) {
|
||||
const conversational_speech::Config config(
|
||||
audiotracks_path, timing_filepath, output_path);
|
||||
|
||||
@ -56,7 +112,7 @@ TEST(ConversationalSpeechTest, Settings) {
|
||||
EXPECT_EQ(output_path, config.output_path());
|
||||
}
|
||||
|
||||
TEST(ConversationalSpeechTest, TimingSaveLoad) {
|
||||
TEST_F(ConversationalSpeechTest, TimingSaveLoad) {
|
||||
// Save test timing.
|
||||
const std::string temporary_filepath = webrtc::test::TempFilename(
|
||||
webrtc::test::OutputPath(), "TempTimingTestFile");
|
||||
@ -76,20 +132,359 @@ TEST(ConversationalSpeechTest, TimingSaveLoad) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(ConversationalSpeechTest, MultiEndCallCreate) {
|
||||
auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
|
||||
new MockWavReaderFactory());
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallCreate) {
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There are 5 unique audio tracks to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(testing::_)).Times(5);
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(5);
|
||||
|
||||
// Inject the mock wav reader factory.
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
expected_timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_TRUE(multiend_call.valid());
|
||||
|
||||
// Test.
|
||||
EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
||||
EXPECT_EQ(5u, multiend_call.audiotrack_readers().size());
|
||||
EXPECT_EQ(6u, multiend_call.speaking_turns().size());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", -100},
|
||||
{"B", "t500", 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There is one unique audio track to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupSimple) {
|
||||
// Accept:
|
||||
// A 0****.....
|
||||
// B .....1****
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There is one unique audio track to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_TRUE(multiend_call.valid());
|
||||
|
||||
// Test.
|
||||
EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
||||
EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
|
||||
EXPECT_EQ(2u, multiend_call.speaking_turns().size());
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupPause) {
|
||||
// Accept:
|
||||
// A 0****.......
|
||||
// B .......1****
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", 200},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There is one unique audio track to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_TRUE(multiend_call.valid());
|
||||
|
||||
// Test.
|
||||
EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
||||
EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
|
||||
EXPECT_EQ(2u, multiend_call.speaking_turns().size());
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
|
||||
// Accept:
|
||||
// A 0****....
|
||||
// B ....1****
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", -100},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There is one unique audio track to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_TRUE(multiend_call.valid());
|
||||
|
||||
// Test.
|
||||
EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
||||
EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
|
||||
EXPECT_EQ(2u, multiend_call.speaking_turns().size());
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
|
||||
// Reject:
|
||||
// A ..0****
|
||||
// B .1****. The n-th turn cannot start before the (n-1)-th one.
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 200},
|
||||
{"B", "t500", -600},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There is one unique audio track to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
|
||||
// Accept:
|
||||
// A 0****2****...
|
||||
// B ...1*********
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t1000", -200},
|
||||
{"A", "t500", -800},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There are two unique audio tracks to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_TRUE(multiend_call.valid());
|
||||
|
||||
// Test.
|
||||
EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
||||
EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
|
||||
EXPECT_EQ(3u, multiend_call.speaking_turns().size());
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
|
||||
// Reject:
|
||||
// A 0****......
|
||||
// A ...1****...
|
||||
// B ......2****
|
||||
// ^ Turn #1 overlaps with #0 which is from the same speaker.
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"A", "t500", -200},
|
||||
{"B", "t500", -200},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There is one unique audio track to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
|
||||
// Reject:
|
||||
// A 0*********
|
||||
// B 1**.......
|
||||
// C ...2**....
|
||||
// A ......3**.
|
||||
// ^ Turn #3 overlaps with #0 which is from the same speaker.
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t1000", 0},
|
||||
{"B", "t300", -1000},
|
||||
{"C", "t300", 0},
|
||||
{"A", "t300", 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There are two unique audio tracks to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
|
||||
// Accept:
|
||||
// A 0*********..
|
||||
// B ..1****.....
|
||||
// C .......2****
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t1000", 0},
|
||||
{"B", "t500", -800},
|
||||
{"C", "t500", 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There are two unique audio tracks to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_TRUE(multiend_call.valid());
|
||||
|
||||
// Test.
|
||||
EXPECT_EQ(3u, multiend_call.speaker_names().size());
|
||||
EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
|
||||
EXPECT_EQ(3u, multiend_call.speaking_turns().size());
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
|
||||
// Reject:
|
||||
// A 0*********
|
||||
// B ..1****...
|
||||
// C ....2****.
|
||||
// ^ Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
|
||||
// not permitted).
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t1000", 0},
|
||||
{"B", "t500", -800},
|
||||
{"C", "t500", -300},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There are two unique audio tracks to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
|
||||
// Accept:
|
||||
// A 0*********..
|
||||
// B .2****......
|
||||
// C .......3****
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t1000", 0},
|
||||
{"B", "t500", -900},
|
||||
{"C", "t500", 100},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There are two unique audio tracks to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_TRUE(multiend_call.valid());
|
||||
|
||||
// Test.
|
||||
EXPECT_EQ(3u, multiend_call.speaker_names().size());
|
||||
EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
|
||||
EXPECT_EQ(3u, multiend_call.speaking_turns().size());
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
|
||||
// Accept:
|
||||
// A 0****
|
||||
// B 1****
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", -500},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
// There is one unique audio track to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_TRUE(multiend_call.valid());
|
||||
|
||||
// Test.
|
||||
EXPECT_EQ(2u, multiend_call.speaker_names().size());
|
||||
EXPECT_EQ(1u, multiend_call.audiotrack_readers().size());
|
||||
EXPECT_EQ(2u, multiend_call.speaking_turns().size());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
|
||||
// Accept:
|
||||
// A 0****....3****.5**.
|
||||
// B .....1****...4**...
|
||||
// C ......2**.......6**..
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", 0},
|
||||
{"C", "t300", -400},
|
||||
{"A", "t500", 0},
|
||||
{"B", "t300", -100},
|
||||
{"A", "t300", -100},
|
||||
{"C", "t300", -200},
|
||||
};
|
||||
auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
|
||||
new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
|
||||
kDefaultMockWavReaderFactoryParamsMap));
|
||||
|
||||
// There are two unique audio tracks to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_TRUE(multiend_call.valid());
|
||||
|
||||
// Test.
|
||||
EXPECT_EQ(3u, multiend_call.speaker_names().size());
|
||||
EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
|
||||
EXPECT_EQ(7u, multiend_call.speaking_turns().size());
|
||||
EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
|
||||
}
|
||||
|
||||
TEST_F(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
|
||||
// Reject:
|
||||
// A 0****....3****.6**
|
||||
// B .....1****...4**..
|
||||
// C ......2**.....5**..
|
||||
// ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
|
||||
// speakers not permitted).
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", 0},
|
||||
{"C", "t300", -400},
|
||||
{"A", "t500", 0},
|
||||
{"B", "t300", -100},
|
||||
{"A", "t300", -200},
|
||||
{"C", "t300", -200},
|
||||
};
|
||||
auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
|
||||
new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
|
||||
kDefaultMockWavReaderFactoryParamsMap));
|
||||
|
||||
// There are two unique audio tracks to read.
|
||||
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);
|
||||
|
||||
conversational_speech::MultiEndCall multiend_call(
|
||||
timing, audiotracks_path, std::move(mock_wavreader_factory));
|
||||
EXPECT_FALSE(multiend_call.valid());
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
|
||||
@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace test {
|
||||
namespace conversational_speech {
|
||||
|
||||
using testing::Return;
|
||||
|
||||
MockWavReader::MockWavReader(
|
||||
int sample_rate, size_t num_channels, size_t num_samples)
|
||||
: sample_rate_(sample_rate), num_channels_(num_channels),
|
||||
num_samples_(num_samples) {
|
||||
ON_CALL(*this, sample_rate()).WillByDefault(Return(sample_rate_));
|
||||
ON_CALL(*this, num_channels()).WillByDefault(Return(num_channels_));
|
||||
ON_CALL(*this, num_samples()).WillByDefault(Return(num_samples_));
|
||||
}
|
||||
|
||||
MockWavReader::~MockWavReader() = default;
|
||||
|
||||
} // namespace conversational_speech
|
||||
} // namespace test
|
||||
} // namespace webrtc
|
||||
@ -24,17 +24,13 @@ namespace conversational_speech {
|
||||
|
||||
class MockWavReader : public WavReaderInterface {
|
||||
public:
|
||||
MockWavReader(
|
||||
int sample_rate, size_t num_channels, size_t num_samples)
|
||||
: sample_rate_(sample_rate), num_channels_(num_channels),
|
||||
num_samples_(num_samples) {}
|
||||
~MockWavReader() = default;
|
||||
MockWavReader(int sample_rate, size_t num_channels, size_t num_samples);
|
||||
~MockWavReader();
|
||||
|
||||
// TOOD(alessiob): use ON_CALL to return random samples.
|
||||
// TODO(alessiob): use ON_CALL to return random samples.
|
||||
MOCK_METHOD2(ReadFloatSamples, size_t(size_t, float*));
|
||||
MOCK_METHOD2(ReadInt16Samples, size_t(size_t, int16_t*));
|
||||
|
||||
// TOOD(alessiob): use ON_CALL to return properties.
|
||||
MOCK_CONST_METHOD0(sample_rate, int());
|
||||
MOCK_CONST_METHOD0(num_channels, size_t());
|
||||
MOCK_CONST_METHOD0(num_samples, size_t());
|
||||
|
||||
@ -10,14 +10,60 @@
|
||||
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
|
||||
|
||||
#include "webrtc/base/logging.h"
|
||||
#include "webrtc/base/pathutils.h"
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/mock_wavreader.h"
|
||||
#include "webrtc/test/gmock.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace test {
|
||||
namespace conversational_speech {
|
||||
|
||||
MockWavReaderFactory::MockWavReaderFactory() = default;
|
||||
using testing::_;
|
||||
using testing::Invoke;
|
||||
|
||||
MockWavReaderFactory::MockWavReaderFactory(
|
||||
const Params& default_params,
|
||||
const std::map<std::string, const Params>& params)
|
||||
: default_params_(default_params),
|
||||
audiotrack_names_params_(params) {
|
||||
ON_CALL(*this, Create(_)).WillByDefault(Invoke(
|
||||
this, &MockWavReaderFactory::CreateMock));
|
||||
}
|
||||
|
||||
MockWavReaderFactory::MockWavReaderFactory(const Params& default_params)
|
||||
: MockWavReaderFactory(default_params,
|
||||
std::map<std::string, const Params>{}) {}
|
||||
|
||||
MockWavReaderFactory::~MockWavReaderFactory() = default;
|
||||
|
||||
std::unique_ptr<WavReaderInterface> MockWavReaderFactory::CreateMock(
|
||||
const std::string& filepath) {
|
||||
// Search the parameters corresponding to filepath.
|
||||
const rtc::Pathname audiotrack_file_path(filepath);
|
||||
const auto it = audiotrack_names_params_.find(
|
||||
audiotrack_file_path.filename());
|
||||
|
||||
// If not found, use default parameters.
|
||||
if (it == audiotrack_names_params_.end()) {
|
||||
LOG(LS_VERBOSE) << "using default parameters for " << filepath;
|
||||
return std::unique_ptr<WavReaderInterface>(
|
||||
new MockWavReader(default_params_.sample_rate,
|
||||
default_params_.num_channels,
|
||||
default_params_.num_samples));
|
||||
}
|
||||
|
||||
// Found, use the audiotrack-specific parameters.
|
||||
LOG(LS_VERBOSE) << "using ad-hoc parameters for " << filepath;
|
||||
LOG(LS_VERBOSE) << "sample_rate " << it->second.sample_rate;
|
||||
LOG(LS_VERBOSE) << "num_channels " << it->second.num_channels;
|
||||
LOG(LS_VERBOSE) << "num_samples " << it->second.num_samples;
|
||||
return std::unique_ptr<WavReaderInterface>(
|
||||
new MockWavReader(it->second.sample_rate,
|
||||
it->second.num_channels,
|
||||
it->second.num_samples));
|
||||
}
|
||||
|
||||
} // namespace conversational_speech
|
||||
} // namespace test
|
||||
} // namespace webrtc
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MOCK_WAVREADER_FACTORY_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MOCK_WAVREADER_FACTORY_H_
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
@ -24,15 +25,28 @@ namespace conversational_speech {
|
||||
|
||||
class MockWavReaderFactory : public WavReaderAbstractFactory {
|
||||
public:
|
||||
MockWavReaderFactory();
|
||||
// TODO(alessiob): add ctor that gets map string->(sr, #samples, #channels).
|
||||
struct Params{
|
||||
int sample_rate;
|
||||
size_t num_channels;
|
||||
size_t num_samples;
|
||||
};
|
||||
|
||||
MockWavReaderFactory(const Params& default_params,
|
||||
const std::map<std::string, const Params>& params);
|
||||
explicit MockWavReaderFactory(const Params& default_params);
|
||||
~MockWavReaderFactory();
|
||||
|
||||
// TODO(alessiob): use ON_CALL to return MockWavReader with desired params.
|
||||
MOCK_CONST_METHOD1(Create, std::unique_ptr<WavReaderInterface>(
|
||||
const std::string&));
|
||||
|
||||
// TODO(alessiob): add const ref to map (see ctor to add).
|
||||
private:
|
||||
// Creates a MockWavReader instance using the parameters in
|
||||
// audiotrack_names_params_ if the entry corresponding to filepath exists,
|
||||
// otherwise creates a MockWavReader instance using the default parameters.
|
||||
std::unique_ptr<WavReaderInterface> CreateMock(const std::string& filepath);
|
||||
|
||||
const Params& default_params_;
|
||||
std::map<std::string, const Params> audiotrack_names_params_;
|
||||
};
|
||||
|
||||
} // namespace conversational_speech
|
||||
|
||||
@ -10,8 +10,10 @@
|
||||
|
||||
#include "webrtc/modules/audio_processing/test/conversational_speech/multiend_call.h"
|
||||
|
||||
#include <utility>
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
|
||||
#include "webrtc/base/logging.h"
|
||||
#include "webrtc/base/pathutils.h"
|
||||
|
||||
namespace webrtc {
|
||||
@ -25,7 +27,7 @@ MultiEndCall::MultiEndCall(
|
||||
wavreader_abstract_factory_(std::move(wavreader_abstract_factory)) {
|
||||
FindSpeakerNames();
|
||||
CreateAudioTrackReaders();
|
||||
CheckTiming();
|
||||
valid_ = CheckTiming();
|
||||
}
|
||||
|
||||
MultiEndCall::~MultiEndCall() = default;
|
||||
@ -39,10 +41,23 @@ const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
|
||||
return audiotrack_readers_;
|
||||
}
|
||||
|
||||
bool MultiEndCall::valid() const {
|
||||
return valid_;
|
||||
}
|
||||
|
||||
size_t MultiEndCall::total_duration_samples() const {
|
||||
return total_duration_samples_;
|
||||
}
|
||||
|
||||
const std::vector<MultiEndCall::SpeakingTurn>& MultiEndCall::speaking_turns()
|
||||
const {
|
||||
return speaking_turns_;
|
||||
}
|
||||
|
||||
void MultiEndCall::FindSpeakerNames() {
|
||||
RTC_DCHECK(speaker_names_.empty());
|
||||
for (const Turn& turn : timing_) {
|
||||
speaker_names_.insert(turn.speaker_name);
|
||||
speaker_names_.emplace(turn.speaker_name);
|
||||
}
|
||||
}
|
||||
|
||||
@ -60,14 +75,119 @@ void MultiEndCall::CreateAudioTrackReaders() {
|
||||
// Map the audiotrack file name to a new instance of WavReaderInterface.
|
||||
std::unique_ptr<WavReaderInterface> wavreader =
|
||||
wavreader_abstract_factory_->Create(audiotrack_file_path.pathname());
|
||||
audiotrack_readers_.insert(std::make_pair(
|
||||
turn.audiotrack_file_name, std::move(wavreader)));
|
||||
audiotrack_readers_.emplace(
|
||||
turn.audiotrack_file_name, std::move(wavreader));
|
||||
}
|
||||
}
|
||||
|
||||
void MultiEndCall::CheckTiming() {
|
||||
// TODO(alessiob): use audiotrack lengths and offset to check whether the
|
||||
// timing is valid.
|
||||
bool MultiEndCall::CheckTiming() {
|
||||
struct Interval {
|
||||
size_t begin;
|
||||
size_t end;
|
||||
};
|
||||
size_t number_of_turns = timing_.size();
|
||||
auto millisecond_to_samples = [](int ms, int sr) -> int {
|
||||
// Truncation may happen if the sampling rate is not an integer multiple
|
||||
// of 1000 (e.g., 44100).
|
||||
return ms * sr / 1000;
|
||||
};
|
||||
auto in_interval = [](size_t value, const Interval& interval) {
|
||||
return interval.begin <= value && value < interval.end;
|
||||
};
|
||||
total_duration_samples_ = 0;
|
||||
speaking_turns_.clear();
|
||||
|
||||
// Begin and end timestamps for the last two turns (unit: number of samples).
|
||||
Interval second_last_turn = {0, 0};
|
||||
Interval last_turn = {0, 0};
|
||||
|
||||
// Initialize map to store speaking turn indices of each speaker (used to
|
||||
// detect self cross-talk).
|
||||
std::map<std::string, std::vector<size_t>> speaking_turn_indices;
|
||||
for (const std::string& speaker_name : speaker_names_) {
|
||||
speaking_turn_indices.emplace(
|
||||
std::piecewise_construct,
|
||||
std::forward_as_tuple(speaker_name),
|
||||
std::forward_as_tuple());
|
||||
}
|
||||
|
||||
// Parse turns.
|
||||
for (size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {
|
||||
const Turn& turn = timing_[turn_index];
|
||||
auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
|
||||
RTC_CHECK(it != audiotrack_readers_.end())
|
||||
<< "Audio track reader not created";
|
||||
|
||||
// Begin and end timestamps for the current turn.
|
||||
int offset_samples = millisecond_to_samples(
|
||||
turn.offset, it->second->sample_rate());
|
||||
size_t begin_timestamp = last_turn.end + offset_samples;
|
||||
size_t end_timestamp = begin_timestamp + it->second->num_samples();
|
||||
LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp
|
||||
<< "-" << end_timestamp << " ms";
|
||||
|
||||
// The order is invalid if the offset is negative and its absolute value is
|
||||
// larger then the duration of the previous turn.
|
||||
if (offset_samples < 0 && -offset_samples > static_cast<int>(
|
||||
last_turn.end - last_turn.begin)) {
|
||||
LOG(LS_ERROR) << "invalid order";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Cross-talk with 3 or more speakers occurs when the beginning of the
|
||||
// current interval falls in the last two turns.
|
||||
if (turn_index > 1 && in_interval(begin_timestamp, last_turn)
|
||||
&& in_interval(begin_timestamp, second_last_turn)) {
|
||||
LOG(LS_ERROR) << "cross-talk with 3+ speakers";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Append turn.
|
||||
speaking_turns_.emplace_back(
|
||||
turn.speaker_name, turn.audiotrack_file_name,
|
||||
begin_timestamp, end_timestamp);
|
||||
|
||||
// Save speaking turn index for self cross-talk detection.
|
||||
RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
|
||||
speaking_turn_indices[turn.speaker_name].push_back(turn_index);
|
||||
|
||||
// Update the total duration of the conversational speech.
|
||||
if (total_duration_samples_ < end_timestamp)
|
||||
total_duration_samples_ = end_timestamp;
|
||||
|
||||
// Update and continue with next turn.
|
||||
second_last_turn = last_turn;
|
||||
last_turn.begin = begin_timestamp;
|
||||
last_turn.end = end_timestamp;
|
||||
}
|
||||
|
||||
// Detect self cross-talk.
|
||||
for (const std::string& speaker_name : speaker_names_) {
|
||||
LOG(LS_INFO) << "checking self cross-talk for <"
|
||||
<< speaker_name << ">";
|
||||
|
||||
// Copy all turns for this speaker to new vector.
|
||||
std::vector<SpeakingTurn> speaking_turns_for_name;
|
||||
std::copy_if(speaking_turns_.begin(), speaking_turns_.end(),
|
||||
std::back_inserter(speaking_turns_for_name),
|
||||
[&speaker_name](const SpeakingTurn& st){
|
||||
return st.speaker_name == speaker_name; });
|
||||
|
||||
// Check for overlap between adjacent elements.
|
||||
// This is a sufficient condition for self cross-talk since the intervals
|
||||
// are sorted by begin timestamp.
|
||||
auto overlap = std::adjacent_find(
|
||||
speaking_turns_for_name.begin(), speaking_turns_for_name.end(),
|
||||
[](const SpeakingTurn& a, const SpeakingTurn& b) {
|
||||
return a.end > b.begin; });
|
||||
|
||||
if (overlap != speaking_turns_for_name.end()) {
|
||||
LOG(LS_ERROR) << "Self cross-talk detected";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace conversational_speech
|
||||
|
||||
@ -11,10 +11,13 @@
|
||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MULTIEND_CALL_H_
|
||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_TEST_CONVERSATIONAL_SPEECH_MULTIEND_CALL_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "webrtc/base/array_view.h"
|
||||
#include "webrtc/base/constructormagic.h"
|
||||
@ -28,6 +31,20 @@ namespace conversational_speech {
|
||||
|
||||
class MultiEndCall {
|
||||
public:
|
||||
// Plain data holder for one resolved speaking turn: who speaks, which
// audio track is played, and the turn interval expressed as sample
// timestamps. The explicit constructor is needed so that
// std::vector::emplace_back() can construct instances in place.
struct SpeakingTurn {
  SpeakingTurn(std::string name,
               std::string track_file_name,
               size_t begin_sample,
               size_t end_sample)
      : speaker_name(std::move(name)),
        audiotrack_file_name(std::move(track_file_name)),
        begin(begin_sample),
        end(end_sample) {}
  std::string speaker_name;          // Speaker for this turn.
  std::string audiotrack_file_name;  // Audio track played during the turn.
  size_t begin;                      // Turn begin timestamp (in samples).
  size_t end;                        // Turn end timestamp (in samples).
};
|
||||
|
||||
MultiEndCall(
|
||||
rtc::ArrayView<const Turn> timing, const std::string& audiotracks_path,
|
||||
std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory);
|
||||
@ -36,16 +53,20 @@ class MultiEndCall {
|
||||
const std::set<std::string>& speaker_names() const;
|
||||
const std::map<std::string, std::unique_ptr<WavReaderInterface>>&
|
||||
audiotrack_readers() const;
|
||||
bool valid() const;
|
||||
size_t total_duration_samples() const;
|
||||
const std::vector<SpeakingTurn>& speaking_turns() const;
|
||||
|
||||
private:
|
||||
// Find unique speaker names.
|
||||
// Finds unique speaker names.
|
||||
void FindSpeakerNames();
|
||||
|
||||
// Create one WavReader instance for each unique audiotrack.
|
||||
// Creates one WavReader instance for each unique audiotrack.
|
||||
void CreateAudioTrackReaders();
|
||||
|
||||
// Check the speaking turns timing.
|
||||
void CheckTiming();
|
||||
// Validates the speaking turns timing information. Accepts cross-talk, but
|
||||
// only up to 2 speakers. Rejects unordered turns and self cross-talk.
|
||||
bool CheckTiming();
|
||||
|
||||
rtc::ArrayView<const Turn> timing_;
|
||||
const std::string& audiotracks_path_;
|
||||
@ -53,6 +74,9 @@ class MultiEndCall {
|
||||
std::set<std::string> speaker_names_;
|
||||
std::map<std::string, std::unique_ptr<WavReaderInterface>>
|
||||
audiotrack_readers_;
|
||||
bool valid_;
|
||||
size_t total_duration_samples_;
|
||||
std::vector<SpeakingTurn> speaking_turns_;
|
||||
|
||||
RTC_DISALLOW_COPY_AND_ASSIGN(MultiEndCall);
|
||||
};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user