Change levels of different speech signals in tool.

The conversational_speech_generator tool now adjusts the level of
different speech segments.

Implementation:
The Turn and MultiEndCall::SpeakingTurn structs have an extra 'gain'
member. The gain is read and parsed in timing.cc and stored in a Turn
struct. It is then copied into a SpeakingTurn struct in multiend_call.cc,
and finally read and applied to the signal in simulator.cc.

Bug: webrtc:7494
Change-Id: I9b82a896eb616c8b5ef14d41dfdfd085ef1d3fbb
Reviewed-on: https://webrtc-review.googlesource.com/26280
Commit-Queue: Alex Loiko <aleloi@webrtc.org>
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#21714}
This commit is contained in:
Alex Loiko 2018-01-22 14:18:28 +01:00 committed by Commit Bot
parent 736d2f7d12
commit f475e3aa0e
7 changed files with 85 additions and 84 deletions

View File

@ -36,7 +36,9 @@ A a4 0
The first column indicates the speaker name, the second contains the audio track
file names, and the third the offsets (in milliseconds) used to concatenate the
chunks.
chunks. An optional fourth column contains positive or negative integral gains
in dB that will be applied to the tracks. It's possible to specify the gain for
some turns but not for others. If the gain is left out, no gain is applied.
Assume that all the audio tracks in the example above are 1000 ms long.
The tool will then generate two tracks (A and B) that look like this:

View File

@ -72,12 +72,8 @@ const char* const timing_filepath = "/path/to/timing_file.txt";
const char* const output_path = "/path/to/output_dir";
const std::vector<Turn> expected_timing = {
{"A", "a1", 0},
{"B", "b1", 0},
{"A", "a2", 100},
{"B", "b2", -200},
{"A", "a3", 0},
{"A", "a3", 0},
{"A", "a1", 0, 0}, {"B", "b1", 0, 0}, {"A", "a2", 100, 0},
{"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0},
};
const std::size_t kNumberOfTurns = expected_timing.size();
@ -223,8 +219,7 @@ TEST(ConversationalSpeechTest, MultiEndCallCreate) {
TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
const std::vector<Turn> timing = {
{"A", "sr8000", 0},
{"B", "sr16000", 0},
{"A", "sr8000", 0, 0}, {"B", "sr16000", 0, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -238,8 +233,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
const std::vector<Turn> timing = {
{"A", "sr16000_stereo", 0},
{"B", "sr16000_stereo", 0},
{"A", "sr16000_stereo", 0, 0}, {"B", "sr16000_stereo", 0, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -254,8 +248,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
TEST(ConversationalSpeechTest,
MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
const std::vector<Turn> timing = {
{"A", "sr8000", 0},
{"B", "sr16000_stereo", 0},
{"A", "sr8000", 0, 0}, {"B", "sr16000_stereo", 0, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -269,8 +262,7 @@ TEST(ConversationalSpeechTest,
TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
const std::vector<Turn> timing = {
{"A", "t500", -100},
{"B", "t500", 0},
{"A", "t500", -100, 0}, {"B", "t500", 0, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -288,8 +280,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
// B .....1****
constexpr std::size_t expected_duration = kDefaultSampleRate;
const std::vector<Turn> timing = {
{"A", "t500", 0},
{"B", "t500", 0},
{"A", "t500", 0, 0}, {"B", "t500", 0, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -313,8 +304,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
// B .......1****
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
const std::vector<Turn> timing = {
{"A", "t500", 0},
{"B", "t500", 200},
{"A", "t500", 0, 0}, {"B", "t500", 200, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -338,8 +328,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
// B ....1****
constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9;
const std::vector<Turn> timing = {
{"A", "t500", 0},
{"B", "t500", -100},
{"A", "t500", 0, 0}, {"B", "t500", -100, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -362,8 +351,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
// A ..0****
// B .1****. The n-th turn cannot start before the (n-1)-th one.
const std::vector<Turn> timing = {
{"A", "t500", 200},
{"B", "t500", -600},
{"A", "t500", 200, 0}, {"B", "t500", -600, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -381,9 +369,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
// B ...1*********
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3;
const std::vector<Turn> timing = {
{"A", "t500", 0},
{"B", "t1000", -200},
{"A", "t500", -800},
{"A", "t500", 0, 0}, {"B", "t1000", -200, 0}, {"A", "t500", -800, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -408,9 +394,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
// B ......2****
// ^ Turn #1 overlaps with #0 which is from the same speaker.
const std::vector<Turn> timing = {
{"A", "t500", 0},
{"A", "t500", -200},
{"B", "t500", -200},
{"A", "t500", 0, 0}, {"A", "t500", -200, 0}, {"B", "t500", -200, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -430,10 +414,10 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
// A ......3**.
// ^ Turn #3 overlaps with #0 which is from the same speaker.
const std::vector<Turn> timing = {
{"A", "t1000", 0},
{"B", "t300", -1000},
{"C", "t300", 0},
{"A", "t300", 0},
{"A", "t1000", 0, 0},
{"B", "t300", -1000, 0},
{"C", "t300", 0, 0},
{"A", "t300", 0, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -452,9 +436,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
// C .......2****
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
const std::vector<Turn> timing = {
{"A", "t1000", 0},
{"B", "t500", -800},
{"C", "t500", 0},
{"A", "t1000", 0, 0}, {"B", "t500", -800, 0}, {"C", "t500", 0, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -480,9 +462,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
// ^ Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
// not permitted).
const std::vector<Turn> timing = {
{"A", "t1000", 0},
{"B", "t500", -800},
{"C", "t500", -300},
{"A", "t1000", 0, 0}, {"B", "t500", -800, 0}, {"C", "t500", -300, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -501,9 +481,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
// C .......3****
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
const std::vector<Turn> timing = {
{"A", "t1000", 0},
{"B", "t500", -900},
{"C", "t500", 100},
{"A", "t1000", 0, 0}, {"B", "t500", -900, 0}, {"C", "t500", 100, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -526,8 +504,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
// A 0****
// B 1****
const std::vector<Turn> timing = {
{"A", "t500", 0},
{"B", "t500", -500},
{"A", "t500", 0, 0}, {"B", "t500", -500, 0},
};
auto mock_wavreader_factory = CreateMockWavReaderFactory();
@ -551,13 +528,9 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
// C ......2**.......6**..
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
const std::vector<Turn> timing = {
{"A", "t500", 0},
{"B", "t500", 0},
{"C", "t300", -400},
{"A", "t500", 0},
{"B", "t300", -100},
{"A", "t300", -100},
{"C", "t300", -200},
{"A", "t500", 0, 0}, {"B", "t500", 0, 0}, {"C", "t300", -400, 0},
{"A", "t500", 0, 0}, {"B", "t300", -100, 0}, {"A", "t300", -100, 0},
{"C", "t300", -200, 0},
};
auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
@ -585,13 +558,9 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
// ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
// speakers not permitted).
const std::vector<Turn> timing = {
{"A", "t500", 0},
{"B", "t500", 0},
{"C", "t300", -400},
{"A", "t500", 0},
{"B", "t300", -100},
{"A", "t300", -200},
{"C", "t300", -200},
{"A", "t500", 0, 0}, {"B", "t500", 0, 0}, {"C", "t300", -400, 0},
{"A", "t500", 0, 0}, {"B", "t300", -100, 0}, {"A", "t300", -200, 0},
{"C", "t300", -200, 0},
};
auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
@ -637,10 +606,10 @@ TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
// A 0*********...........2*********.....
// B ...........1*********.....3*********
const std::vector<Turn> expected_timing = {
{"A", "t5000_440.wav", 0},
{"B", "t5000_880.wav", 500},
{"A", "t5000_440.wav", 0},
{"B", "t5000_880.wav", -2500},
{"A", "t5000_440.wav", 0, 0},
{"B", "t5000_880.wav", 500, 0},
{"A", "t5000_440.wav", 0, 0},
{"B", "t5000_880.wav", -2500, 0},
};
const std::size_t expected_duration_seconds = 18;

View File

@ -139,9 +139,8 @@ bool MultiEndCall::CheckTiming() {
}
// Append turn.
speaking_turns_.emplace_back(
turn.speaker_name, turn.audiotrack_file_name,
begin_timestamp, end_timestamp);
speaking_turns_.emplace_back(turn.speaker_name, turn.audiotrack_file_name,
begin_timestamp, end_timestamp, turn.gain);
// Save speaking turn index for self cross-talk detection.
RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);

View File

@ -35,14 +35,19 @@ class MultiEndCall {
// Constructor required in order to use std::vector::emplace_back().
SpeakingTurn(std::string new_speaker_name,
std::string new_audiotrack_file_name,
size_t new_begin, size_t new_end)
size_t new_begin,
size_t new_end,
int gain)
: speaker_name(std::move(new_speaker_name)),
audiotrack_file_name(std::move(new_audiotrack_file_name)),
begin(new_begin), end(new_end) {}
begin(new_begin),
end(new_end),
gain(gain) {}
std::string speaker_name;
std::string audiotrack_file_name;
size_t begin;
size_t end;
int gain;
};
MultiEndCall(

View File

@ -10,6 +10,9 @@
#include "modules/audio_processing/test/conversational_speech/simulator.h"
#include <math.h>
#include <algorithm>
#include <set>
#include <utility>
#include <vector>
@ -19,6 +22,7 @@
#include "modules/audio_processing/test/conversational_speech/wavreader_interface.h"
#include "rtc_base/constructormagic.h"
#include "rtc_base/logging.h"
#include "rtc_base/numerics/safe_conversions.h"
#include "rtc_base/pathutils.h"
#include "rtc_base/ptr_util.h"
@ -158,6 +162,17 @@ void PadRightWrite(WavWriter* wav_writer, size_t pad_samples) {
}
}
// Scales |source_samples| by |gain| (expressed in dB) and writes the result
// to |output_samples|. Both views must have the same number of samples.
// Each scaled sample is converted back to int16_t via rtc::saturated_cast.
void ScaleSignal(rtc::ArrayView<const int16_t> source_samples,
                 int gain,
                 rtc::ArrayView<int16_t> output_samples) {
  RTC_DCHECK_EQ(source_samples.size(), output_samples.size());
  // dB -> linear amplitude factor: 10^(gain / 20).
  const float linear_factor = pow(10.0, gain / 20.0);
  const auto scale_sample = [linear_factor](int16_t sample) -> int16_t {
    return rtc::saturated_cast<int16_t>(sample * linear_factor);
  };
  std::transform(source_samples.begin(), source_samples.end(),
                 output_samples.begin(), scale_sample);
}
} // namespace
namespace conversational_speech {
@ -185,21 +200,23 @@ std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>> Simulate(
// Write near-end and far-end output tracks.
for (const auto& speaking_turn : multiend_call.speaking_turns()) {
const std::string& active_speaker_name = speaking_turn.speaker_name;
auto source_audiotrack = audiotracks->at(
speaking_turn.audiotrack_file_name);
const auto source_audiotrack =
audiotracks->at(speaking_turn.audiotrack_file_name);
std::vector<int16_t> scaled_audiotrack(source_audiotrack.size());
ScaleSignal(source_audiotrack, speaking_turn.gain, scaled_audiotrack);
// Write active speaker's chunk to active speaker's near-end.
PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
speakers_wav_writers->at(
active_speaker_name).near_end_wav_writer());
PadLeftWriteChunk(
scaled_audiotrack, speaking_turn.begin,
speakers_wav_writers->at(active_speaker_name).near_end_wav_writer());
// Write active speaker's chunk to other participants' far-ends.
for (const std::string& speaker_name : speaker_names) {
if (speaker_name == active_speaker_name)
continue;
PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
speakers_wav_writers->at(
speaker_name).far_end_wav_writer());
PadLeftWriteChunk(
scaled_audiotrack, speaking_turn.begin,
speakers_wav_writers->at(speaker_name).far_end_wav_writer());
}
}

View File

@ -21,8 +21,8 @@ namespace conversational_speech {
bool Turn::operator==(const Turn &b) const {
return b.speaker_name == speaker_name &&
b.audiotrack_file_name == audiotrack_file_name &&
b.offset == offset;
b.audiotrack_file_name == audiotrack_file_name && b.offset == offset &&
b.gain == gain;
}
std::vector<Turn> LoadTiming(const std::string& timing_filepath) {
@ -30,8 +30,13 @@ std::vector<Turn> LoadTiming(const std::string& timing_filepath) {
auto parse_line = [](const std::string& line) {
std::vector<std::string> fields;
rtc::split(line, ' ', &fields);
RTC_CHECK_EQ(fields.size(), 3);
return Turn(fields[0], fields[1], std::atol(fields[2].c_str()));
RTC_CHECK_GE(fields.size(), 3);
RTC_CHECK_LE(fields.size(), 4);
int gain = 0;
if (fields.size() == 4) {
gain = std::atof(fields[3].c_str());
}
return Turn(fields[0], fields[1], std::atol(fields[2].c_str()), gain);
};
// Init.
@ -55,8 +60,8 @@ void SaveTiming(const std::string& timing_filepath,
std::ofstream outfile(timing_filepath);
RTC_CHECK(outfile.is_open());
for (const Turn& turn : timing) {
outfile << turn.speaker_name << " " << turn.audiotrack_file_name
<< " " << turn.offset << std::endl;
outfile << turn.speaker_name << " " << turn.audiotrack_file_name << " "
<< turn.offset << " " << turn.gain << std::endl;
}
outfile.close();
}

View File

@ -21,15 +21,19 @@ namespace test {
namespace conversational_speech {
struct Turn{
Turn(std::string new_speaker_name, std::string new_audiotrack_file_name,
int new_offset)
Turn(std::string new_speaker_name,
std::string new_audiotrack_file_name,
int new_offset,
int gain)
: speaker_name(new_speaker_name),
audiotrack_file_name(new_audiotrack_file_name),
offset(new_offset) {}
offset(new_offset),
gain(gain) {}
bool operator==(const Turn &b) const;
std::string speaker_name;
std::string audiotrack_file_name;
int offset;
int gain;
};
// Loads a list of turns from a file.