Change levels of different speech signals in tool.

The conversational_speech_generator tool now adjusts the levels of different speech segments. Implementation: The Turn and MultiEndCall::SpeakingTurn structs have an extra 'gain' member. The gain is read and parsed in timing.cc and stored in a Turn struct. It is then copied into a SpeakingTurn struct in multiend_call.cc, and read and applied to the signal in simulator.cc. Bug: webrtc:7494 Change-Id: I9b82a896eb616c8b5ef14d41dfdfd085ef1d3fbb Reviewed-on: https://webrtc-review.googlesource.com/26280 Commit-Queue: Alex Loiko <aleloi@webrtc.org> Reviewed-by: Alessio Bazzica <alessiob@webrtc.org> Cr-Commit-Position: refs/heads/master@{#21714}
2018-01-22 14:18:28 +01:00 · 2018-01-22 14:18:28 +01:00 · f475e3aa0e
commit f475e3aa0e
parent 736d2f7d12
7 changed files with 85 additions and 84 deletions
--- a/modules/audio_processing/test/conversational_speech/README.md
+++ b/modules/audio_processing/test/conversational_speech/README.md
@ -36,7 +36,9 @@ A a4 0

 The first column indicates the speaker name, the second contains the audio track
 file names, and the third the offsets (in milliseconds) used to concatenate the
-chunks.
+chunks. An optional fourth column contains positive or negative integral gains
+in dB that will be applied to the tracks. It's possible to specify the gain for
+some turns but not for others. If the gain is left out, no gain is applied.

 Assume that all the audio tracks in the example above are 1000 ms long.
 The tool will then generate two tracks (A and B) that look like this:
--- a/modules/audio_processing/test/conversational_speech/generator_unittest.cc
+++ b/modules/audio_processing/test/conversational_speech/generator_unittest.cc
@ -72,12 +72,8 @@ const char* const timing_filepath = "/path/to/timing_file.txt";
 const char* const output_path = "/path/to/output_dir";

 const std::vector<Turn> expected_timing = {
-    {"A", "a1", 0},
-    {"B", "b1", 0},
-    {"A", "a2", 100},
-    {"B", "b2", -200},
-    {"A", "a3", 0},
-    {"A", "a3", 0},
+    {"A", "a1", 0, 0},    {"B", "b1", 0, 0}, {"A", "a2", 100, 0},
+    {"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0},
 };
 const std::size_t kNumberOfTurns = expected_timing.size();

@ -223,8 +219,7 @@ TEST(ConversationalSpeechTest, MultiEndCallCreate) {

 TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
  const std::vector<Turn> timing = {
-      {"A", "sr8000", 0},
-      {"B", "sr16000", 0},
+      {"A", "sr8000", 0, 0}, {"B", "sr16000", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -238,8 +233,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {

 TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
  const std::vector<Turn> timing = {
-      {"A", "sr16000_stereo", 0},
-      {"B", "sr16000_stereo", 0},
+      {"A", "sr16000_stereo", 0, 0}, {"B", "sr16000_stereo", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -254,8 +248,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
 TEST(ConversationalSpeechTest,
       MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
  const std::vector<Turn> timing = {
-      {"A", "sr8000", 0},
-      {"B", "sr16000_stereo", 0},
+      {"A", "sr8000", 0, 0}, {"B", "sr16000_stereo", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -269,8 +262,7 @@ TEST(ConversationalSpeechTest,

 TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
  const std::vector<Turn> timing = {
-      {"A", "t500", -100},
-      {"B", "t500", 0},
+      {"A", "t500", -100, 0}, {"B", "t500", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -288,8 +280,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
  // B .....1****
  constexpr std::size_t expected_duration = kDefaultSampleRate;
  const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", 0},
+      {"A", "t500", 0, 0}, {"B", "t500", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -313,8 +304,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
  // B .......1****
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", 200},
+      {"A", "t500", 0, 0}, {"B", "t500", 200, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -338,8 +328,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
  // B ....1****
  constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9;
  const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", -100},
+      {"A", "t500", 0, 0}, {"B", "t500", -100, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -362,8 +351,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
  // A ..0****
  // B .1****.  The n-th turn cannot start before the (n-1)-th one.
  const std::vector<Turn> timing = {
-      {"A", "t500", 200},
-      {"B", "t500", -600},
+      {"A", "t500", 200, 0}, {"B", "t500", -600, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -381,9 +369,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
  // B ...1*********
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3;
  const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t1000", -200},
-      {"A", "t500", -800},
+      {"A", "t500", 0, 0}, {"B", "t1000", -200, 0}, {"A", "t500", -800, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -408,9 +394,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
  // B ......2****
  //      ^  Turn #1 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"A", "t500", -200},
-      {"B", "t500", -200},
+      {"A", "t500", 0, 0}, {"A", "t500", -200, 0}, {"B", "t500", -200, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -430,10 +414,10 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
  // A ......3**.
  //         ^  Turn #3 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> timing = {
-      {"A", "t1000", 0},
-      {"B", "t300", -1000},
-      {"C", "t300", 0},
-      {"A", "t300", 0},
+      {"A", "t1000", 0, 0},
+      {"B", "t300", -1000, 0},
+      {"C", "t300", 0, 0},
+      {"A", "t300", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -452,9 +436,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
  // C .......2****
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  const std::vector<Turn> timing = {
-      {"A", "t1000", 0},
-      {"B", "t500", -800},
-      {"C", "t500", 0},
+      {"A", "t1000", 0, 0}, {"B", "t500", -800, 0}, {"C", "t500", 0, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -480,9 +462,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
  //       ^  Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
  //          not permitted).
  const std::vector<Turn> timing = {
-      {"A", "t1000", 0},
-      {"B", "t500", -800},
-      {"C", "t500", -300},
+      {"A", "t1000", 0, 0}, {"B", "t500", -800, 0}, {"C", "t500", -300, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -501,9 +481,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
  // C .......3****
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  const std::vector<Turn> timing = {
-      {"A", "t1000", 0},
-      {"B", "t500", -900},
-      {"C", "t500", 100},
+      {"A", "t1000", 0, 0}, {"B", "t500", -900, 0}, {"C", "t500", 100, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -526,8 +504,7 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
  // A 0****
  // B 1****
  const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", -500},
+      {"A", "t500", 0, 0}, {"B", "t500", -500, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

@ -551,13 +528,9 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
  // C ......2**.......6**..
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
  const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", 0},
-      {"C", "t300", -400},
-      {"A", "t500", 0},
-      {"B", "t300", -100},
-      {"A", "t300", -100},
-      {"C", "t300", -200},
+      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
+      {"A", "t500", 0, 0},    {"B", "t300", -100, 0}, {"A", "t300", -100, 0},
+      {"C", "t300", -200, 0},
  };
  auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
      new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
@ -585,13 +558,9 @@ TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
  //                 ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
  //                   speakers not permitted).
  const std::vector<Turn> timing = {
-      {"A", "t500", 0},
-      {"B", "t500", 0},
-      {"C", "t300", -400},
-      {"A", "t500", 0},
-      {"B", "t300", -100},
-      {"A", "t300", -200},
-      {"C", "t300", -200},
+      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
+      {"A", "t500", 0, 0},    {"B", "t300", -100, 0}, {"A", "t300", -200, 0},
+      {"C", "t300", -200, 0},
  };
  auto mock_wavreader_factory = std::unique_ptr<MockWavReaderFactory>(
      new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
@ -637,10 +606,10 @@ TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
  // A 0*********...........2*********.....
  // B ...........1*********.....3*********
  const std::vector<Turn> expected_timing = {
-      {"A", "t5000_440.wav", 0},
-      {"B", "t5000_880.wav", 500},
-      {"A", "t5000_440.wav", 0},
-      {"B", "t5000_880.wav", -2500},
+      {"A", "t5000_440.wav", 0, 0},
+      {"B", "t5000_880.wav", 500, 0},
+      {"A", "t5000_440.wav", 0, 0},
+      {"B", "t5000_880.wav", -2500, 0},
  };
  const std::size_t expected_duration_seconds = 18;

--- a/modules/audio_processing/test/conversational_speech/multiend_call.cc
+++ b/modules/audio_processing/test/conversational_speech/multiend_call.cc
@ -139,9 +139,8 @@ bool MultiEndCall::CheckTiming() {
    }

    // Append turn.
-    speaking_turns_.emplace_back(
-        turn.speaker_name, turn.audiotrack_file_name,
-        begin_timestamp, end_timestamp);
+    speaking_turns_.emplace_back(turn.speaker_name, turn.audiotrack_file_name,
+                                 begin_timestamp, end_timestamp, turn.gain);

    // Save speaking turn index for self cross-talk detection.
    RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
--- a/modules/audio_processing/test/conversational_speech/multiend_call.h
+++ b/modules/audio_processing/test/conversational_speech/multiend_call.h
@ -35,14 +35,19 @@ class MultiEndCall {
    // Constructor required in order to use std::vector::emplace_back().
    SpeakingTurn(std::string new_speaker_name,
                 std::string new_audiotrack_file_name,
-                 size_t new_begin, size_t new_end)
+                 size_t new_begin,
+                 size_t new_end,
+                 int gain)
        : speaker_name(std::move(new_speaker_name)),
          audiotrack_file_name(std::move(new_audiotrack_file_name)),
-          begin(new_begin), end(new_end) {}
+          begin(new_begin),
+          end(new_end),
+          gain(gain) {}
    std::string speaker_name;
    std::string audiotrack_file_name;
    size_t begin;
    size_t end;
+    int gain;
  };

  MultiEndCall(
--- a/modules/audio_processing/test/conversational_speech/simulator.cc
+++ b/modules/audio_processing/test/conversational_speech/simulator.cc
@ -10,6 +10,9 @@

 #include "modules/audio_processing/test/conversational_speech/simulator.h"

+#include <math.h>
+
+#include <algorithm>
 #include <set>
 #include <utility>
 #include <vector>
@ -19,6 +22,7 @@
 #include "modules/audio_processing/test/conversational_speech/wavreader_interface.h"
 #include "rtc_base/constructormagic.h"
 #include "rtc_base/logging.h"
+#include "rtc_base/numerics/safe_conversions.h"
 #include "rtc_base/pathutils.h"
 #include "rtc_base/ptr_util.h"

@ -158,6 +162,17 @@ void PadRightWrite(WavWriter* wav_writer, size_t pad_samples) {
  }
 }

+void ScaleSignal(rtc::ArrayView<const int16_t> source_samples,
+                 int gain,
+                 rtc::ArrayView<int16_t> output_samples) {
+  const float gain_linear = pow(10.0, gain / 20.0);
+  RTC_DCHECK_EQ(source_samples.size(), output_samples.size());
+  std::transform(source_samples.begin(), source_samples.end(),
+                 output_samples.begin(), [gain_linear](int16_t x) -> int16_t {
+                   return rtc::saturated_cast<int16_t>(x * gain_linear);
+                 });
+}
+
 }  // namespace

 namespace conversational_speech {
@ -185,21 +200,23 @@ std::unique_ptr<std::map<std::string, SpeakerOutputFilePaths>> Simulate(
  // Write near-end and far-end output tracks.
  for (const auto& speaking_turn : multiend_call.speaking_turns()) {
    const std::string& active_speaker_name = speaking_turn.speaker_name;
-    auto source_audiotrack = audiotracks->at(
-        speaking_turn.audiotrack_file_name);
+    const auto source_audiotrack =
+        audiotracks->at(speaking_turn.audiotrack_file_name);
+    std::vector<int16_t> scaled_audiotrack(source_audiotrack.size());
+    ScaleSignal(source_audiotrack, speaking_turn.gain, scaled_audiotrack);

    // Write active speaker's chunk to active speaker's near-end.
-    PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
-                      speakers_wav_writers->at(
-                          active_speaker_name).near_end_wav_writer());
+    PadLeftWriteChunk(
+        scaled_audiotrack, speaking_turn.begin,
+        speakers_wav_writers->at(active_speaker_name).near_end_wav_writer());

    // Write active speaker's chunk to other participants' far-ends.
    for (const std::string& speaker_name : speaker_names) {
      if (speaker_name == active_speaker_name)
        continue;
-      PadLeftWriteChunk(source_audiotrack, speaking_turn.begin,
-                        speakers_wav_writers->at(
-                            speaker_name).far_end_wav_writer());
+      PadLeftWriteChunk(
+          scaled_audiotrack, speaking_turn.begin,
+          speakers_wav_writers->at(speaker_name).far_end_wav_writer());
    }
  }

--- a/modules/audio_processing/test/conversational_speech/timing.cc
+++ b/modules/audio_processing/test/conversational_speech/timing.cc
@ -21,8 +21,8 @@ namespace conversational_speech {

 bool Turn::operator==(const Turn &b) const {
  return b.speaker_name == speaker_name &&
-         b.audiotrack_file_name == audiotrack_file_name &&
-         b.offset == offset;
+         b.audiotrack_file_name == audiotrack_file_name && b.offset == offset &&
+         b.gain == gain;
 }

 std::vector<Turn> LoadTiming(const std::string& timing_filepath) {
@ -30,8 +30,13 @@ std::vector<Turn> LoadTiming(const std::string& timing_filepath) {
  auto parse_line = [](const std::string& line) {
    std::vector<std::string> fields;
    rtc::split(line, ' ', &fields);
-    RTC_CHECK_EQ(fields.size(), 3);
-    return Turn(fields[0], fields[1], std::atol(fields[2].c_str()));
+    RTC_CHECK_GE(fields.size(), 3);
+    RTC_CHECK_LE(fields.size(), 4);
+    int gain = 0;
+    if (fields.size() == 4) {
+      gain = std::atof(fields[3].c_str());
+    }
+    return Turn(fields[0], fields[1], std::atol(fields[2].c_str()), gain);
  };

  // Init.
@ -55,8 +60,8 @@ void SaveTiming(const std::string& timing_filepath,
  std::ofstream outfile(timing_filepath);
  RTC_CHECK(outfile.is_open());
  for (const Turn& turn : timing) {
-    outfile << turn.speaker_name << " " << turn.audiotrack_file_name
-        << " " << turn.offset << std::endl;
+    outfile << turn.speaker_name << " " << turn.audiotrack_file_name << " "
+            << turn.offset << " " << turn.gain << std::endl;
  }
  outfile.close();
 }
--- a/modules/audio_processing/test/conversational_speech/timing.h
+++ b/modules/audio_processing/test/conversational_speech/timing.h
@ -21,15 +21,19 @@ namespace test {
 namespace conversational_speech {

 struct Turn{
-  Turn(std::string new_speaker_name, std::string new_audiotrack_file_name,
-       int new_offset)
+  Turn(std::string new_speaker_name,
+       std::string new_audiotrack_file_name,
+       int new_offset,
+       int gain)
      : speaker_name(new_speaker_name),
        audiotrack_file_name(new_audiotrack_file_name),
-        offset(new_offset) {}
+        offset(new_offset),
+        gain(gain) {}
  bool operator==(const Turn &b) const;
  std::string speaker_name;
  std::string audiotrack_file_name;
  int offset;
+  int gain;
 };

 // Loads a list of turns from a file.