From ff2bf4b195c96f708325ead8db5dad9e246f6c19 Mon Sep 17 00:00:00 2001 From: Tommi Date: Wed, 5 Jun 2024 18:25:11 +0200 Subject: [PATCH] Update FrameCombiner to use audio view methods for interleaved buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Along the way slightly simplify the class interface since views carry audio properties. Also, now allocating FrameCombiner allocates the mixing buffer in the same allocation. Bug: chromium:335805780 Change-Id: Id7a76b040c11064e1e4daf01a371328769162554 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/352502 Commit-Queue: Tomas Gunnarsson Reviewed-by: Per Åhgren Cr-Commit-Position: refs/heads/main@{#42465} --- modules/audio_mixer/frame_combiner.cc | 96 +++++++++---------- modules/audio_mixer/frame_combiner.h | 6 +- .../audio_mixer/frame_combiner_unittest.cc | 9 +- modules/audio_processing/agc2/limiter.cc | 11 ++- 4 files changed, 60 insertions(+), 62 deletions(-) diff --git a/modules/audio_mixer/frame_combiner.cc b/modules/audio_mixer/frame_combiner.cc index 96d1d86f67..b8d897e34e 100644 --- a/modules/audio_mixer/frame_combiner.cc +++ b/modules/audio_mixer/frame_combiner.cc @@ -36,17 +36,13 @@ namespace webrtc { namespace { -using MixingBuffer = - std::array, - FrameCombiner::kMaximumNumberOfChannels>; - void SetAudioFrameFields(rtc::ArrayView mix_list, size_t number_of_channels, int sample_rate, size_t number_of_streams, AudioFrame* audio_frame_for_mixing) { - const size_t samples_per_channel = static_cast( - (sample_rate * webrtc::AudioMixerImpl::kFrameDurationInMs) / 1000); + const size_t samples_per_channel = + SampleRateToDefaultChannelSize(sample_rate); // TODO(minyue): Issue bugs.webrtc.org/3390. // Audio frame timestamp. 
The 'timestamp_' field is set to dummy @@ -85,32 +81,25 @@ void MixFewFramesWithNoLimiter(rtc::ArrayView mix_list, return; } RTC_DCHECK_LE(mix_list.size(), 1); - std::copy(mix_list[0]->data(), - mix_list[0]->data() + - mix_list[0]->num_channels_ * mix_list[0]->samples_per_channel_, - audio_frame_for_mixing->mutable_data()); + InterleavedView dst = audio_frame_for_mixing->mutable_data( + mix_list[0]->samples_per_channel_, mix_list[0]->num_channels_); + CopySamples(dst, mix_list[0]->data_view()); } void MixToFloatFrame(rtc::ArrayView mix_list, - size_t samples_per_channel, - size_t number_of_channels, - MixingBuffer* mixing_buffer) { - RTC_DCHECK_LE(samples_per_channel, FrameCombiner::kMaximumChannelSize); - RTC_DCHECK_LE(number_of_channels, FrameCombiner::kMaximumNumberOfChannels); + DeinterleavedView& mixing_buffer) { + const size_t number_of_channels = NumChannels(mixing_buffer); // Clear the mixing buffer. - *mixing_buffer = {}; + rtc::ArrayView raw_data = mixing_buffer.data(); + ClearSamples(raw_data); // Convert to FloatS16 and mix. for (size_t i = 0; i < mix_list.size(); ++i) { - const AudioFrame* const frame = mix_list[i]; - const int16_t* const frame_data = frame->data(); - for (size_t j = 0; j < std::min(number_of_channels, - FrameCombiner::kMaximumNumberOfChannels); - ++j) { - for (size_t k = 0; k < std::min(samples_per_channel, - FrameCombiner::kMaximumChannelSize); - ++k) { - (*mixing_buffer)[j][k] += frame_data[number_of_channels * k + j]; + InterleavedView frame_data = mix_list[i]->data_view(); + for (size_t j = 0; j < NumChannels(mixing_buffer); ++j) { + MonoView channel = mixing_buffer[j]; + for (size_t k = 0; k < SamplesPerChannel(channel); ++k) { + channel[k] += frame_data[number_of_channels * k + j]; } } } @@ -127,13 +116,13 @@ void RunLimiter(AudioFrameView mixing_buffer_view, Limiter* limiter) { // Both interleaves and rounds. 
void InterleaveToAudioFrame(AudioFrameView mixing_buffer_view, AudioFrame* audio_frame_for_mixing) { - const size_t number_of_channels = mixing_buffer_view.num_channels(); - const size_t samples_per_channel = mixing_buffer_view.samples_per_channel(); - int16_t* const mixing_data = audio_frame_for_mixing->mutable_data(); + InterleavedView mixing_data = audio_frame_for_mixing->mutable_data( + mixing_buffer_view.samples_per_channel(), + mixing_buffer_view.num_channels()); // Put data in the result frame. - for (size_t i = 0; i < number_of_channels; ++i) { - for (size_t j = 0; j < samples_per_channel; ++j) { - mixing_data[number_of_channels * j + i] = + for (size_t i = 0; i < mixing_data.num_channels(); ++i) { + for (size_t j = 0; j < mixing_data.samples_per_channel(); ++j) { + mixing_data[mixing_data.num_channels() * j + i] = FloatS16ToS16(mixing_buffer_view.channel(i)[j]); } } @@ -145,9 +134,6 @@ constexpr size_t FrameCombiner::kMaximumChannelSize; FrameCombiner::FrameCombiner(bool use_limiter) : data_dumper_(new ApmDataDumper(0)), - mixing_buffer_( - std::make_unique, - kMaximumNumberOfChannels>>()), limiter_(static_cast(48000), data_dumper_.get(), "AudioMixer"), use_limiter_(use_limiter) { static_assert(kMaximumChannelSize * kMaximumNumberOfChannels <= @@ -163,17 +149,26 @@ void FrameCombiner::Combine(rtc::ArrayView mix_list, size_t number_of_streams, AudioFrame* audio_frame_for_mixing) { RTC_DCHECK(audio_frame_for_mixing); + RTC_DCHECK_GT(sample_rate, 0); + + // Note: `mix_list` is allowed to be empty. + // See FrameCombiner.CombiningZeroFramesShouldProduceSilence. + + // Make sure to cap `number_of_channels` to the kMaximumNumberOfChannels + // limits since processing from hereon out will be bound by them. 
+ number_of_channels = std::min(number_of_channels, kMaximumNumberOfChannels); SetAudioFrameFields(mix_list, number_of_channels, sample_rate, number_of_streams, audio_frame_for_mixing); - const size_t samples_per_channel = static_cast( - (sample_rate * webrtc::AudioMixerImpl::kFrameDurationInMs) / 1000); + size_t samples_per_channel = SampleRateToDefaultChannelSize(sample_rate); +#if RTC_DCHECK_IS_ON for (const auto* frame : mix_list) { RTC_DCHECK_EQ(samples_per_channel, frame->samples_per_channel_); RTC_DCHECK_EQ(sample_rate, frame->sample_rate_hz_); } +#endif // The 'num_channels_' field of frames in 'mix_list' could be // different from 'number_of_channels'. @@ -186,22 +181,27 @@ void FrameCombiner::Combine(rtc::ArrayView mix_list, return; } - MixToFloatFrame(mix_list, samples_per_channel, number_of_channels, - mixing_buffer_.get()); - - const size_t output_number_of_channels = - std::min(number_of_channels, kMaximumNumberOfChannels); - const size_t output_samples_per_channel = - std::min(samples_per_channel, kMaximumChannelSize); + // Make sure that the size of the view based on the desired + // `samples_per_channel` and `number_of_channels` doesn't exceed the size of + // the `mixing_buffer_` buffer. + RTC_DCHECK_LE(samples_per_channel, kMaximumChannelSize); + // Since the above check is a DCHECK only, clamp down on `samples_per_channel` + // to make sure we don't exceed the buffer size in non-dcheck builds. + // See also FrameCombinerDeathTest.DebugBuildCrashesWithHighRate. + samples_per_channel = std::min(samples_per_channel, kMaximumChannelSize); + DeinterleavedView deinterleaved( + mixing_buffer_.data(), samples_per_channel, number_of_channels); + MixToFloatFrame(mix_list, deinterleaved); // Put float data in an AudioFrameView. + // TODO(tommi): We should be able to just use `deinterleaved` without an + // additional array of pointers. 
std::array channel_pointers{}; - for (size_t i = 0; i < output_number_of_channels; ++i) { - channel_pointers[i] = &(*mixing_buffer_.get())[i][0]; + for (size_t i = 0; i < number_of_channels; ++i) { + channel_pointers[i] = deinterleaved[i].data(); } - AudioFrameView mixing_buffer_view(&channel_pointers[0], - output_number_of_channels, - output_samples_per_channel); + AudioFrameView mixing_buffer_view( + channel_pointers.data(), number_of_channels, samples_per_channel); if (use_limiter_) { RunLimiter(mixing_buffer_view, &limiter_); diff --git a/modules/audio_mixer/frame_combiner.h b/modules/audio_mixer/frame_combiner.h index 6185b29f8a..fc35d89543 100644 --- a/modules/audio_mixer/frame_combiner.h +++ b/modules/audio_mixer/frame_combiner.h @@ -42,14 +42,12 @@ class FrameCombiner { static constexpr size_t kMaximumNumberOfChannels = 8; static constexpr size_t kMaximumChannelSize = 48 * 10; - using MixingBuffer = std::array, - kMaximumNumberOfChannels>; - private: std::unique_ptr data_dumper_; - std::unique_ptr mixing_buffer_; Limiter limiter_; const bool use_limiter_; + std::array + mixing_buffer_; }; } // namespace webrtc diff --git a/modules/audio_mixer/frame_combiner_unittest.cc b/modules/audio_mixer/frame_combiner_unittest.cc index 486f551f78..80c2b99575 100644 --- a/modules/audio_mixer/frame_combiner_unittest.cc +++ b/modules/audio_mixer/frame_combiner_unittest.cc @@ -186,14 +186,13 @@ TEST(FrameCombinerDeathTest, DebugBuildCrashesWithHighRate) { const std::vector frames_to_combine( all_frames.begin(), all_frames.begin() + number_of_frames); AudioFrame audio_frame_for_mixing; -#if RTC_DCHECK_IS_ON && GTEST_HAS_DEATH_TEST && !defined(WEBRTC_ANDROID) +#if GTEST_HAS_DEATH_TEST && !defined(WEBRTC_ANDROID) EXPECT_DEATH( combiner.Combine(frames_to_combine, number_of_channels, rate, frames_to_combine.size(), &audio_frame_for_mixing), - ""); -#elif !RTC_DCHECK_IS_ON - combiner.Combine(frames_to_combine, number_of_channels, rate, - frames_to_combine.size(), 
&audio_frame_for_mixing); + "") + << "number_of_channels=" << number_of_channels << ", rate=" << rate + << ", frames to combine=" << frames_to_combine.size(); #endif } } diff --git a/modules/audio_processing/agc2/limiter.cc b/modules/audio_processing/agc2/limiter.cc index 7a1e2202be..57731baddc 100644 --- a/modules/audio_processing/agc2/limiter.cc +++ b/modules/audio_processing/agc2/limiter.cc @@ -73,12 +73,13 @@ void ComputePerSampleSubframeFactors( } } -void ScaleSamples(rtc::ArrayView per_sample_scaling_factors, +void ScaleSamples(MonoView per_sample_scaling_factors, AudioFrameView signal) { const int samples_per_channel = signal.samples_per_channel(); - RTC_DCHECK_EQ(samples_per_channel, per_sample_scaling_factors.size()); + RTC_DCHECK_EQ(samples_per_channel, + SamplesPerChannel(per_sample_scaling_factors)); for (int i = 0; i < signal.num_channels(); ++i) { - rtc::ArrayView channel = signal.channel(i); + MonoView channel = signal.channel(i); for (int j = 0; j < samples_per_channel; ++j) { channel[j] = rtc::SafeClamp(channel[j] * per_sample_scaling_factors[j], kMinFloatS16Value, kMaxFloatS16Value); @@ -119,8 +120,8 @@ void Limiter::Process(AudioFrameView signal) { const int samples_per_channel = signal.samples_per_channel(); RTC_DCHECK_LE(samples_per_channel, kMaximalNumberOfSamplesPerChannel); - auto per_sample_scaling_factors = rtc::ArrayView( - &per_sample_scaling_factors_[0], samples_per_channel); + auto per_sample_scaling_factors = + MonoView(&per_sample_scaling_factors_[0], samples_per_channel); ComputePerSampleSubframeFactors(scaling_factors_, samples_per_channel, per_sample_scaling_factors); ScaleSamples(per_sample_scaling_factors, signal);