From f475e3aa0e7df2aae5b973d864789b769c295ca6 Mon Sep 17 00:00:00 2001
From: Alex Loiko <aleloi@webrtc.org>
Date: Mon, 22 Jan 2018 14:18:28 +0100
Subject: [PATCH] Change levels of different speech signals in tool.

The conversational_speech_generator tool now adjusts the level of
different speech segments.

Implementation:
The Turn and MultiEndCall::SpeakingTurn structs have an extra 'gain'
member. The gain is read and parsed in timing.cc and stored in a Turn
struct. It is then copied into a SpeakingTurn struct in
multiend_call.cc, and finally read and applied to the signal in
simulator.cc.

Bug: webrtc:7494
Change-Id: I9b82a896eb616c8b5ef14d41dfdfd085ef1d3fbb
Reviewed-on: https://webrtc-review.googlesource.com/26280
Commit-Queue: Alex Loiko <aleloi@webrtc.org>
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#21714}
---
 .../test/conversational_speech/README.md      |  4 +-
 .../generator_unittest.cc                     | 91 ++++++-------------
 .../conversational_speech/multiend_call.cc    |  5 +-
 .../conversational_speech/multiend_call.h     |  9 +-
 .../test/conversational_speech/simulator.cc   | 33 +++++--
 .../test/conversational_speech/timing.cc      | 17 ++--
 .../test/conversational_speech/timing.h       | 10 +-
 7 files changed, 85 insertions(+), 84 deletions(-)
diff --git a/modules/audio_processing/test/conversational_speech/README.md b/modules/audio_processing/test/conversational_speech/README.md
index bbb4112fc0..0fa66669e6 100644
--- a/modules/audio_processing/test/conversational_speech/README.md
+++ b/modules/audio_processing/test/conversational_speech/README.md
@@ -36,7 +36,9 @@ A a4 0
 
 The first column indicates the speaker name, the second contains the audio track
 file names, and the third the offsets (in milliseconds) used to concatenate the
-chunks.
+chunks. An optional fourth column contains positive or negative integral gains
+in dB that will be applied to the tracks. It's possible to specify the gain for
+some turns but not for others. If the gain is left out, no gain is applied.
 
 Assume that all the audio tracks in the example above are 1000 ms long.
 The tool will then generate two tracks (A and B) that look like this:
diff --git a/modules/audio_processing/test/conversational_speech/generator_unittest.cc b/modules/audio_processing/test/conversational_speech/generator_unittest.cc
index 064e8c82d7..7cd1bf7e29 100644
--- a/modules/audio_processing/test/conversational_speech/generator_unittest.cc
+++ b/modules/audio_processing/test/conversational_speech/generator_unittest.cc
@@ -72,12 +72,8 @@ const char* const timing_filepath = "/path/to/timing_file.txt";
 const char* const output_path = "/path/to/output_dir";
 
 const std::vector<Turn> expected_timing = {
-    {"A", "a1", 0},
-    {"B", "b1", 0},
-    {"A", "a2", 100},
-    {"B", "b2", -200},
-    {"A", "a3", 0},
-    {"A", "a3", 0},
+    {"A", "a1", 0, 0},    {"B", "b1", 0, 0}, {"A", "a2", 100, 0},
+    {"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0},
 };
 const std::size_t kNumberOfTurns = expected_timing.size();
 
@@ -223,8 +219,7 @@ TEST(ConversationalSpeechTest, MultiEndCallCreate) {
 
 TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
   const std::vector<Turn> timing = {
-      {"A", "sr8000", 0},
-      {"B", "sr16000", 0},
+      {"A", "sr8000", 0, 0}, {"B", "sr16000", 0, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -238,8 +233,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
 
 TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
   const std::vector<Turn> timing = {
-      {"A", "sr16000_stereo", 0},
-      {"B", "sr16000_stereo", 0},
+      {"A", "sr16000_stereo", 0, 0}, {"B", "sr16000_stereo", 0, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -254,8 +248,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
 TEST(ConversationalSpeechTest,
        MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
   const std::vector<Turn> timing = {
-      {"A", "sr8000", 0},
-      {"B", "sr16000_stereo", 0},
+      {"A", "sr8000", 0, 0}, {"B", "sr16000_stereo", 0, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -269,8 +262,7 @@ TEST(ConversationalSpeechTest,
 
 TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
   const std::vector<Turn> timing = {
-      {"A", "t500", -100},
-      {"B", "t500", 0},
+      {"A", "t500", -100, 0}, {"B", "t500", 0, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -288,8 +280,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
   // B .....1****
   constexpr std::size_t expected_duration = kDefaultSampleRate;
   const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", 0},
+      {"A", "t500", 0, 0}, {"B", "t500", 0, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -313,8 +304,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
   // B .......1****
   constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
   const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", 200},
+      {"A", "t500", 0, 0}, {"B", "t500", 200, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -338,8 +328,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
   // B ....1****
   constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9;
   const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", -100},
+      {"A", "t500", 0, 0}, {"B", "t500", -100, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -362,8 +351,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
   // A ..0****
   // B .1****.  The n-th turn cannot start before the (n-1)-th one.
   const std::vector<Turn> timing = {
-      {"A", "t500", 200},
-      {"B", "t500", -600},
+      {"A", "t500", 200, 0}, {"B", "t500", -600, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -381,9 +369,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
   // B ...1*********
   constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3;
   const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t1000", -200},
-      {"A", "t500", -800},
+      {"A", "t500", 0, 0}, {"B", "t1000", -200, 0}, {"A", "t500", -800, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -408,9 +394,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
   // B ......2****
   //      ^  Turn #1 overlaps with #0 which is from the same speaker.
   const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"A", "t500", -200},
-      {"B", "t500", -200},
+      {"A", "t500", 0, 0}, {"A", "t500", -200, 0}, {"B", "t500", -200, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -430,10 +414,10 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
   // A ......3**.
   //         ^  Turn #3 overlaps with #0 which is from the same speaker.
   const std::vector<Turn> timing = {
-      {"A", "t1000", 0},
-      {"B", "t300", -1000},
-      {"C", "t300", 0},
-      {"A", "t300", 0},
+      {"A", "t1000", 0, 0},
+      {"B", "t300", -1000, 0},
+      {"C", "t300", 0, 0},
+      {"A", "t300", 0, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -452,9 +436,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
   // C .......2****
   constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
   const std::vector<Turn> timing = {
-      {"A", "t1000", 0},
-      {"B", "t500", -800},
-      {"C", "t500", 0},
+      {"A", "t1000", 0, 0}, {"B", "t500", -800, 0}, {"C", "t500", 0, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -480,9 +462,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
   //       ^  Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
   //          not permitted).
   const std::vector<Turn> timing = {
-      {"A", "t1000", 0},
-      {"B", "t500", -800},
-      {"C", "t500", -300},
+      {"A", "t1000", 0, 0}, {"B", "t500", -800, 0}, {"C", "t500", -300, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -501,9 +481,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
   // C .......3****
   constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
   const std::vector<Turn> timing = {
-      {"A", "t1000", 0},
-      {"B", "t500", -900},
-      {"C", "t500", 100},
+      {"A", "t1000", 0, 0}, {"B", "t500", -900, 0}, {"C", "t500", 100, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -526,8 +504,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
   // A 0****
   // B 1****
   const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", -500},
+      {"A", "t500", 0, 0}, {"B", "t500", -500, 0},
   };
   auto mock_wavreader_factory = CreateMockWavReaderFactory();
 
@@ -551,13 +528,9 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
   // C ......2**.......6**..
   constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
   const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", 0},
-      {"C", "t300", -400},
-      {"A", "t500", 0},
-      {"B", "t300", -100},
-      {"A", "t300", -100},
-      {"C", "t300", -200},
+      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
+      {"A", "t500", 0, 0},    {"B", "t300", -100, 0}, {"A", "t300", -100, 0},
+      {"C", "t300", -200, 0},
   };
   auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
       new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
@@ -585,13 +558,9 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
   //                 ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
   //                   speakers not permitted).
   const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", 0},
-      {"C", "t300", -400},
-      {"A", "t500", 0},
-      {"B", "t300", -100},
-      {"A", "t300", -200},
-      {"C", "t300", -200},
+      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
+      {"A", "t500", 0, 0},    {"B", "t300", -100, 0}, {"A", "t300", -200, 0},
+      {"C", "t300", -200, 0},
   };
   auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
       new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
@@ -637,10 +606,10 @@ TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
   // A 0*********...........2*********.....
   // B ...........1*********.....3*********
   const std::vector<Turn> expected_timing = {
-      {"A", "t5000_440.wav", 0},
-      {"B", "t5000_880.wav", 500},
-      {"A", "t5000_440.wav", 0},
-      {"B", "t5000_880.wav", -2500},
+      {"A", "t5000_440.wav", 0, 0},
+      {"B", "t5000_880.wav", 500, 0},
+      {"A", "t5000_440.wav", 0, 0},
+      {"B", "t5000_880.wav", -2500, 0},
   };
   const std::size_t expected_duration_seconds = 18;
 
diff --git a/modules/audio_processing/test/conversational_speech/multiend_call.cc b/modules/audio_processing/test/conversational_speech/multiend_call.cc
index 76cf774afc..f5411bd97f 100644
--- a/modules/audio_processing/test/conversational_speech/multiend_call.cc
+++ b/modules/audio_processing/test/conversational_speech/multiend_call.cc
@@ -139,9 +139,8 @@ bool MultiEndCall::CheckTiming() {
     }
 
     // Append turn.
-    speaking_turns_.emplace_back(
-        turn.speaker_name, turn.audiotrack_file_name,
-        begin_timestamp, end_timestamp);
+    speaking_turns_.emplace_back(turn.speaker_name, turn.audiotrack_file_name,
+                                 begin_timestamp, end_timestamp, turn.gain);
 
     // Save speaking turn index for self cross-talk detection.
     RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
diff --git a/modules/audio_processing/test/conversational_speech/multiend_call.h b/modules/audio_processing/test/conversational_speech/multiend_call.h
index e4cee7ac96..d91058f654 100644
--- a/modules/audio_processing/test/conversational_speech/multiend_call.h
+++ b/modules/audio_processing/test/conversational_speech/multiend_call.h
@@ -35,14 +35,19 @@ class MultiEndCall {
     // Constructor required in order to use std::vector::emplace_back().
     SpeakingTurn(std::string new_speaker_name,
                  std::string new_audiotrack_file_name,
-                 size_t new_begin, size_t new_end)
+                 size_t new_begin,
+                 size_t new_end,
+                 int gain)
         : speaker_name(std::move(new_speaker_name)),
           audiotrack_file_name(std::move(new_audiotrack_file_name)),
-          begin(new_begin), end(new_end) {}
+          begin(new_begin),
+          end(new_end),
+          gain(gain) {}
     std::string speaker_name;
     std::string audiotrack_file_name;
     size_t begin;
     size_t end;
+    int gain;
   };
 
   MultiEndCall(
diff --git a/modules/audio_processing/test/conversational_speech/simulator.cc b/modules/audio_processing/test/conversational_speech/simulator.cc
index 84a9ef55c8..2c02ea6234 100644
--- a/modules/audio_processing/test/conversational_speech/simulator.cc
+++ b/modules/audio_processing/test/conversational_speech/simulator.cc
@@ -10,6 +10,9 @@
 
 #include "modules/audio_processing/test/conversational_speech/simulator.h"
 
+#include <math.h>
+
+#include <algorithm>
 #include <set>
 #include <utility>
 #include <vector>
@@ -19,6 +22,7 @@
 #include "modules/audio_processing/test/conversational_speech/wavreader_interface.h"
 #include "rtc_base/constructormagic.h"
 #include "rtc_base/logging.h"
+#include "rtc_base/numerics/safe_conversions.h"
 #include "rtc_base/pathutils.h"
 #include "rtc_base/ptr_util.h"
 
@@ -158,6 +162,17 @@ void PadRightWrite(WavWriter* wav_writer, size_t pad_samples) {
   }
 }
 
+void ScaleSignal(rtc::ArrayView<const int16_t> source_samples,
+                 int gain,
+                 rtc::ArrayView<int16_t> output_samples) {
+  const float gain_linear = pow(10.0, gain / 20.0);
+  RTC_DCHECK_EQ(source_samples.size(), output_samples.size());
+  std::transform(source_samples.begin(), source_samples.end(),
+                 output_samples.begin(), [gain_linear](int16_t x) -> int16_t {
+                   return rtc::saturated_cast<int16_t>(x * gain_linear);
+                 });
+}
+
 }  // namespace
 
 namespace conversational_speech {
@@ -185,21 +200,23 @@ std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>> Simulate(
   // Write near-end and far-end output tracks.
   for (const auto& speaking_turn : multiend_call.speaking_turns()) {
     const std::string& active_speaker_name = speaking_turn.speaker_name;
-    auto source_audiotrack = audiotracks->at(
-        speaking_turn.audiotrack_file_name);
+    const auto source_audiotrack =
+        audiotracks->at(speaking_turn.audiotrack_file_name);
+    std::vector<int16_t> scaled_audiotrack(source_audiotrack.size());
+    ScaleSignal(source_audiotrack, speaking_turn.gain, scaled_audiotrack);
 
     // Write active speaker's chunk to active speaker's near-end.
-    PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
-                      speakers_wav_writers->at(
-                          active_speaker_name).near_end_wav_writer());
+    PadLeftWriteChunk(
+        scaled_audiotrack, speaking_turn.begin,
+        speakers_wav_writers->at(active_speaker_name).near_end_wav_writer());
 
     // Write active speaker's chunk to other participants' far-ends.
     for (const std::string& speaker_name : speaker_names) {
       if (speaker_name == active_speaker_name)
         continue;
-      PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
-                        speakers_wav_writers->at(
-                            speaker_name).far_end_wav_writer());
+      PadLeftWriteChunk(
+          scaled_audiotrack, speaking_turn.begin,
+          speakers_wav_writers->at(speaker_name).far_end_wav_writer());
     }
   }
 
diff --git a/modules/audio_processing/test/conversational_speech/timing.cc b/modules/audio_processing/test/conversational_speech/timing.cc
index 773a42ebd7..2e96d2b783 100644
--- a/modules/audio_processing/test/conversational_speech/timing.cc
+++ b/modules/audio_processing/test/conversational_speech/timing.cc
@@ -21,8 +21,8 @@ namespace conversational_speech {
 
 bool Turn::operator==(const Turn &b) const {
   return b.speaker_name == speaker_name &&
-         b.audiotrack_file_name == audiotrack_file_name &&
-         b.offset == offset;
+         b.audiotrack_file_name == audiotrack_file_name && b.offset == offset &&
+         b.gain == gain;
 }
 
 std::vector<Turn> LoadTiming(const std::string& timing_filepath) {
@@ -30,8 +30,13 @@ std::vector<Turn> LoadTiming(const std::string& timing_filepath) {
   auto parse_line = [](const std::string& line) {
     std::vector<std::string> fields;
     rtc::split(line, ' ', &fields);
-    RTC_CHECK_EQ(fields.size(), 3);
-    return Turn(fields[0], fields[1], std::atol(fields[2].c_str()));
+    RTC_CHECK_GE(fields.size(), 3);
+    RTC_CHECK_LE(fields.size(), 4);
+    int gain = 0;
+    if (fields.size() == 4) {
+      gain = std::atof(fields[3].c_str());
+    }
+    return Turn(fields[0], fields[1], std::atol(fields[2].c_str()), gain);
   };
 
   // Init.
@@ -55,8 +60,8 @@ void SaveTiming(const std::string& timing_filepath,
   std::ofstream outfile(timing_filepath);
   RTC_CHECK(outfile.is_open());
   for (const Turn& turn : timing) {
-    outfile << turn.speaker_name << " " << turn.audiotrack_file_name
-        << " " << turn.offset << std::endl;
+    outfile << turn.speaker_name << " " << turn.audiotrack_file_name << " "
+            << turn.offset << " " << turn.gain << std::endl;
   }
   outfile.close();
 }
diff --git a/modules/audio_processing/test/conversational_speech/timing.h b/modules/audio_processing/test/conversational_speech/timing.h
index dc43361815..07a26e14c7 100644
--- a/modules/audio_processing/test/conversational_speech/timing.h
+++ b/modules/audio_processing/test/conversational_speech/timing.h
@@ -21,15 +21,19 @@ namespace test {
 namespace conversational_speech {
 
 struct Turn{
-  Turn(std::string new_speaker_name, std::string new_audiotrack_file_name,
-       int new_offset)
+  Turn(std::string new_speaker_name,
+       std::string new_audiotrack_file_name,
+       int new_offset,
+       int gain)
       : speaker_name(new_speaker_name),
         audiotrack_file_name(new_audiotrack_file_name),
-        offset(new_offset) {}
+        offset(new_offset),
+        gain(gain) {}
   bool operator==(const Turn &b) const;
   std::string speaker_name;
   std::string audiotrack_file_name;
   int offset;
+  int gain;
 };
 
 // Loads a list of turns from a file.