Change levels of different speech signals in tool.
The conversational_speech_generator tool now adjusts the level of different speech segments. Implementation: The Turn and MultiEndCall::SpeakingTurn structs have an extra 'gain' member. The gain is read and parsed in timing.cc and stored in a Turn struct. It is then copied into a SpeakingTurn struct in multiend_call.cc, and read and applied to the signal in simulator.cc. Bug: webrtc:7494 Change-Id: I9b82a896eb616c8b5ef14d41dfdfd085ef1d3fbb Reviewed-on: https://webrtc-review.googlesource.com/26280 Commit-Queue: Alex Loiko <aleloi@webrtc.org> Reviewed-by: Alessio Bazzica <alessiob@webrtc.org> Cr-Commit-Position: refs/heads/master@{#21714}
This commit is contained in:
parent
736d2f7d12
commit
f475e3aa0e
@ -36,7 +36,9 @@ A a4 0
|
||||
|
||||
The first column indicates the speaker name, the second contains the audio track
|
||||
file names, and the third the offsets (in milliseconds) used to concatenate the
|
||||
chunks.
|
||||
chunks. An optional fourth column contains positive or negative integral gains
|
||||
in dB that will be applied to the tracks. It's possible to specify the gain for
|
||||
some turns but not for others. If the gain is left out, no gain is applied.
|
||||
|
||||
Assume that all the audio tracks in the example above are 1000 ms long.
|
||||
The tool will then generate two tracks (A and B) that look like this:
|
||||
|
||||
@ -72,12 +72,8 @@ const char* const timing_filepath = "/path/to/timing_file.txt";
|
||||
const char* const output_path = "/path/to/output_dir";
|
||||
|
||||
const std::vector<Turn> expected_timing = {
|
||||
{"A", "a1", 0},
|
||||
{"B", "b1", 0},
|
||||
{"A", "a2", 100},
|
||||
{"B", "b2", -200},
|
||||
{"A", "a3", 0},
|
||||
{"A", "a3", 0},
|
||||
{"A", "a1", 0, 0}, {"B", "b1", 0, 0}, {"A", "a2", 100, 0},
|
||||
{"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0},
|
||||
};
|
||||
const std::size_t kNumberOfTurns = expected_timing.size();
|
||||
|
||||
@ -223,8 +219,7 @@ TEST(ConversationalSpeechTest, MultiEndCallCreate) {
|
||||
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "sr8000", 0},
|
||||
{"B", "sr16000", 0},
|
||||
{"A", "sr8000", 0, 0}, {"B", "sr16000", 0, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -238,8 +233,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
|
||||
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "sr16000_stereo", 0},
|
||||
{"B", "sr16000_stereo", 0},
|
||||
{"A", "sr16000_stereo", 0, 0}, {"B", "sr16000_stereo", 0, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -254,8 +248,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
|
||||
TEST(ConversationalSpeechTest,
|
||||
MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "sr8000", 0},
|
||||
{"B", "sr16000_stereo", 0},
|
||||
{"A", "sr8000", 0, 0}, {"B", "sr16000_stereo", 0, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -269,8 +262,7 @@ TEST(ConversationalSpeechTest,
|
||||
|
||||
TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", -100},
|
||||
{"B", "t500", 0},
|
||||
{"A", "t500", -100, 0}, {"B", "t500", 0, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -288,8 +280,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
|
||||
// B .....1****
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", 0},
|
||||
{"A", "t500", 0, 0}, {"B", "t500", 0, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -313,8 +304,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
|
||||
// B .......1****
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", 200},
|
||||
{"A", "t500", 0, 0}, {"B", "t500", 200, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -338,8 +328,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
|
||||
// B ....1****
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", -100},
|
||||
{"A", "t500", 0, 0}, {"B", "t500", -100, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -362,8 +351,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
|
||||
// A ..0****
|
||||
// B .1****. The n-th turn cannot start before the (n-1)-th one.
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 200},
|
||||
{"B", "t500", -600},
|
||||
{"A", "t500", 200, 0}, {"B", "t500", -600, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -381,9 +369,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
|
||||
// B ...1*********
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t1000", -200},
|
||||
{"A", "t500", -800},
|
||||
{"A", "t500", 0, 0}, {"B", "t1000", -200, 0}, {"A", "t500", -800, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -408,9 +394,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
|
||||
// B ......2****
|
||||
// ^ Turn #1 overlaps with #0 which is from the same speaker.
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"A", "t500", -200},
|
||||
{"B", "t500", -200},
|
||||
{"A", "t500", 0, 0}, {"A", "t500", -200, 0}, {"B", "t500", -200, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -430,10 +414,10 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
|
||||
// A ......3**.
|
||||
// ^ Turn #3 overlaps with #0 which is from the same speaker.
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t1000", 0},
|
||||
{"B", "t300", -1000},
|
||||
{"C", "t300", 0},
|
||||
{"A", "t300", 0},
|
||||
{"A", "t1000", 0, 0},
|
||||
{"B", "t300", -1000, 0},
|
||||
{"C", "t300", 0, 0},
|
||||
{"A", "t300", 0, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -452,9 +436,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
|
||||
// C .......2****
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t1000", 0},
|
||||
{"B", "t500", -800},
|
||||
{"C", "t500", 0},
|
||||
{"A", "t1000", 0, 0}, {"B", "t500", -800, 0}, {"C", "t500", 0, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -480,9 +462,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
|
||||
// ^ Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
|
||||
// not permitted).
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t1000", 0},
|
||||
{"B", "t500", -800},
|
||||
{"C", "t500", -300},
|
||||
{"A", "t1000", 0, 0}, {"B", "t500", -800, 0}, {"C", "t500", -300, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -501,9 +481,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
|
||||
// C .......3****
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t1000", 0},
|
||||
{"B", "t500", -900},
|
||||
{"C", "t500", 100},
|
||||
{"A", "t1000", 0, 0}, {"B", "t500", -900, 0}, {"C", "t500", 100, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -526,8 +504,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
|
||||
// A 0****
|
||||
// B 1****
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", -500},
|
||||
{"A", "t500", 0, 0}, {"B", "t500", -500, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = CreateMockWavReaderFactory();
|
||||
|
||||
@ -551,13 +528,9 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
|
||||
// C ......2**.......6**..
|
||||
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", 0},
|
||||
{"C", "t300", -400},
|
||||
{"A", "t500", 0},
|
||||
{"B", "t300", -100},
|
||||
{"A", "t300", -100},
|
||||
{"C", "t300", -200},
|
||||
{"A", "t500", 0, 0}, {"B", "t500", 0, 0}, {"C", "t300", -400, 0},
|
||||
{"A", "t500", 0, 0}, {"B", "t300", -100, 0}, {"A", "t300", -100, 0},
|
||||
{"C", "t300", -200, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
|
||||
new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
|
||||
@ -585,13 +558,9 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
|
||||
// ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
|
||||
// speakers not permitted).
|
||||
const std::vector<Turn> timing = {
|
||||
{"A", "t500", 0},
|
||||
{"B", "t500", 0},
|
||||
{"C", "t300", -400},
|
||||
{"A", "t500", 0},
|
||||
{"B", "t300", -100},
|
||||
{"A", "t300", -200},
|
||||
{"C", "t300", -200},
|
||||
{"A", "t500", 0, 0}, {"B", "t500", 0, 0}, {"C", "t300", -400, 0},
|
||||
{"A", "t500", 0, 0}, {"B", "t300", -100, 0}, {"A", "t300", -200, 0},
|
||||
{"C", "t300", -200, 0},
|
||||
};
|
||||
auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
|
||||
new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
|
||||
@ -637,10 +606,10 @@ TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
|
||||
// A 0*********...........2*********.....
|
||||
// B ...........1*********.....3*********
|
||||
const std::vector<Turn> expected_timing = {
|
||||
{"A", "t5000_440.wav", 0},
|
||||
{"B", "t5000_880.wav", 500},
|
||||
{"A", "t5000_440.wav", 0},
|
||||
{"B", "t5000_880.wav", -2500},
|
||||
{"A", "t5000_440.wav", 0, 0},
|
||||
{"B", "t5000_880.wav", 500, 0},
|
||||
{"A", "t5000_440.wav", 0, 0},
|
||||
{"B", "t5000_880.wav", -2500, 0},
|
||||
};
|
||||
const std::size_t expected_duration_seconds = 18;
|
||||
|
||||
|
||||
@ -139,9 +139,8 @@ bool MultiEndCall::CheckTiming() {
|
||||
}
|
||||
|
||||
// Append turn.
|
||||
speaking_turns_.emplace_back(
|
||||
turn.speaker_name, turn.audiotrack_file_name,
|
||||
begin_timestamp, end_timestamp);
|
||||
speaking_turns_.emplace_back(turn.speaker_name, turn.audiotrack_file_name,
|
||||
begin_timestamp, end_timestamp, turn.gain);
|
||||
|
||||
// Save speaking turn index for self cross-talk detection.
|
||||
RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
|
||||
|
||||
@ -35,14 +35,19 @@ class MultiEndCall {
|
||||
// Constructor required in order to use std::vector::emplace_back().
|
||||
SpeakingTurn(std::string new_speaker_name,
|
||||
std::string new_audiotrack_file_name,
|
||||
size_t new_begin, size_t new_end)
|
||||
size_t new_begin,
|
||||
size_t new_end,
|
||||
int gain)
|
||||
: speaker_name(std::move(new_speaker_name)),
|
||||
audiotrack_file_name(std::move(new_audiotrack_file_name)),
|
||||
begin(new_begin), end(new_end) {}
|
||||
begin(new_begin),
|
||||
end(new_end),
|
||||
gain(gain) {}
|
||||
std::string speaker_name;
|
||||
std::string audiotrack_file_name;
|
||||
size_t begin;
|
||||
size_t end;
|
||||
int gain;
|
||||
};
|
||||
|
||||
MultiEndCall(
|
||||
|
||||
@ -10,6 +10,9 @@
|
||||
|
||||
#include "modules/audio_processing/test/conversational_speech/simulator.h"
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
@ -19,6 +22,7 @@
|
||||
#include "modules/audio_processing/test/conversational_speech/wavreader_interface.h"
|
||||
#include "rtc_base/constructormagic.h"
|
||||
#include "rtc_base/logging.h"
|
||||
#include "rtc_base/numerics/safe_conversions.h"
|
||||
#include "rtc_base/pathutils.h"
|
||||
#include "rtc_base/ptr_util.h"
|
||||
|
||||
@ -158,6 +162,17 @@ void PadRightWrite(WavWriter* wav_writer, size_t pad_samples) {
|
||||
}
|
||||
}
|
||||
|
||||
// Writes a scaled copy of |source_samples| into |output_samples|.
// |gain| is converted to a linear factor as 10^(gain / 20), i.e. it is
// treated as a level change in dB (0 yields a factor of 1).
// |source_samples| and |output_samples| must have the same size; this is
// only enforced by a DCHECK.
void ScaleSignal(rtc::ArrayView<const int16_t> source_samples,
|
||||
int gain,
|
||||
rtc::ArrayView<int16_t> output_samples) {
|
||||
// dB -> linear amplitude factor; the division is done in floating point.
const float gain_linear = pow(10.0, gain / 20.0);
|
||||
RTC_DCHECK_EQ(source_samples.size(), output_samples.size());
|
||||
std::transform(source_samples.begin(), source_samples.end(),
|
||||
output_samples.begin(), [gain_linear](int16_t x) -> int16_t {
|
||||
// NOTE(review): rtc::saturated_cast presumably clamps the scaled value to
// the int16_t range instead of wrapping - confirm in safe_conversions.h.
return rtc::saturated_cast<int16_t>(x * gain_linear);
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace conversational_speech {
|
||||
@ -185,21 +200,23 @@ std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>> Simulate(
|
||||
// Write near-end and far-end output tracks.
|
||||
for (const auto& speaking_turn : multiend_call.speaking_turns()) {
|
||||
const std::string& active_speaker_name = speaking_turn.speaker_name;
|
||||
auto source_audiotrack = audiotracks->at(
|
||||
speaking_turn.audiotrack_file_name);
|
||||
const auto source_audiotrack =
|
||||
audiotracks->at(speaking_turn.audiotrack_file_name);
|
||||
std::vector<int16_t> scaled_audiotrack(source_audiotrack.size());
|
||||
ScaleSignal(source_audiotrack, speaking_turn.gain, scaled_audiotrack);
|
||||
|
||||
// Write active speaker's chunk to active speaker's near-end.
|
||||
PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
|
||||
speakers_wav_writers->at(
|
||||
active_speaker_name).near_end_wav_writer());
|
||||
PadLeftWriteChunk(
|
||||
scaled_audiotrack, speaking_turn.begin,
|
||||
speakers_wav_writers->at(active_speaker_name).near_end_wav_writer());
|
||||
|
||||
// Write active speaker's chunk to other participants' far-ends.
|
||||
for (const std::string& speaker_name : speaker_names) {
|
||||
if (speaker_name == active_speaker_name)
|
||||
continue;
|
||||
PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
|
||||
speakers_wav_writers->at(
|
||||
speaker_name).far_end_wav_writer());
|
||||
PadLeftWriteChunk(
|
||||
scaled_audiotrack, speaking_turn.begin,
|
||||
speakers_wav_writers->at(speaker_name).far_end_wav_writer());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -21,8 +21,8 @@ namespace conversational_speech {
|
||||
|
||||
bool Turn::operator==(const Turn &b) const {
|
||||
return b.speaker_name == speaker_name &&
|
||||
b.audiotrack_file_name == audiotrack_file_name &&
|
||||
b.offset == offset;
|
||||
b.audiotrack_file_name == audiotrack_file_name && b.offset == offset &&
|
||||
b.gain == gain;
|
||||
}
|
||||
|
||||
std::vector<Turn> LoadTiming(const std::string& timing_filepath) {
|
||||
@ -30,8 +30,13 @@ std::vector<Turn> LoadTiming(const std::string& timing_filepath) {
|
||||
auto parse_line = [](const std::string& line) {
|
||||
std::vector<std::string> fields;
|
||||
rtc::split(line, ' ', &fields);
|
||||
RTC_CHECK_EQ(fields.size(), 3);
|
||||
return Turn(fields[0], fields[1], std::atol(fields[2].c_str()));
|
||||
RTC_CHECK_GE(fields.size(), 3);
|
||||
RTC_CHECK_LE(fields.size(), 4);
|
||||
int gain = 0;
|
||||
if (fields.size() == 4) {
|
||||
gain = std::atof(fields[3].c_str());
|
||||
}
|
||||
return Turn(fields[0], fields[1], std::atol(fields[2].c_str()), gain);
|
||||
};
|
||||
|
||||
// Init.
|
||||
@ -55,8 +60,8 @@ void SaveTiming(const std::string& timing_filepath,
|
||||
std::ofstream outfile(timing_filepath);
|
||||
RTC_CHECK(outfile.is_open());
|
||||
for (const Turn& turn : timing) {
|
||||
outfile << turn.speaker_name << " " << turn.audiotrack_file_name
|
||||
<< " " << turn.offset << std::endl;
|
||||
outfile << turn.speaker_name << " " << turn.audiotrack_file_name << " "
|
||||
<< turn.offset << " " << turn.gain << std::endl;
|
||||
}
|
||||
outfile.close();
|
||||
}
|
||||
|
||||
@ -21,15 +21,19 @@ namespace test {
|
||||
namespace conversational_speech {
|
||||
|
||||
struct Turn{
|
||||
Turn(std::string new_speaker_name, std::string new_audiotrack_file_name,
|
||||
int new_offset)
|
||||
Turn(std::string new_speaker_name,
|
||||
std::string new_audiotrack_file_name,
|
||||
int new_offset,
|
||||
int gain)
|
||||
: speaker_name(new_speaker_name),
|
||||
audiotrack_file_name(new_audiotrack_file_name),
|
||||
offset(new_offset) {}
|
||||
offset(new_offset),
|
||||
gain(gain) {}
|
||||
bool operator==(const Turn &b) const;
|
||||
std::string speaker_name;
|
||||
std::string audiotrack_file_name;
|
||||
int offset;
|
||||
int gain;
|
||||
};
|
||||
|
||||
// Loads a list of turns from a file.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user