Remove all AudioBuffer code that is not related to storing audio data

This CL moves/removes all code from the AudioBuffer that:
-Is not directly handling audio data (e.g., keytaps, VAD decisions).
-Is caching aggregated versions of the rest of the audio data.
-Is not used (or only used in testing)

Bug: webrtc:10882
Change-Id: I737deb3f692748eff30f46ad806b2c6f6292802c
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/149072
Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
Commit-Queue: Per Åhgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#28866}
This commit is contained in:
Per Åhgren 2019-08-15 12:15:46 +02:00 committed by Commit Bot
parent 6e4791fe49
commit a1351271e6
10 changed files with 125 additions and 224 deletions

View File

@ -27,15 +27,6 @@ const size_t kSamplesPer16kHzChannel = 160;
const size_t kSamplesPer32kHzChannel = 320;
const size_t kSamplesPer48kHzChannel = 480;
int KeyboardChannelIndex(const StreamConfig& stream_config) {
if (!stream_config.has_keyboard()) {
RTC_NOTREACHED();
return 0;
}
return stream_config.num_channels();
}
size_t NumBandsFromSamplesPerChannel(size_t num_frames) {
size_t num_bands = 1;
if (num_frames == kSamplesPer32kHzChannel ||
@ -60,10 +51,6 @@ AudioBuffer::AudioBuffer(size_t input_num_frames,
num_channels_(num_process_channels),
num_bands_(NumBandsFromSamplesPerChannel(proc_num_frames_)),
num_split_frames_(rtc::CheckedDivExact(proc_num_frames_, num_bands_)),
mixed_low_pass_valid_(false),
reference_copied_(false),
activity_(AudioFrame::kVadUnknown),
keyboard_data_(NULL),
data_(new IFChannelBuffer(proc_num_frames_, num_proc_channels_)),
output_buffer_(new IFChannelBuffer(output_num_frames_, num_channels_)) {
RTC_DCHECK_GT(input_num_frames_, 0);
@ -118,10 +105,6 @@ void AudioBuffer::CopyFrom(const float* const* data,
new IFChannelBuffer(input_num_frames_, num_proc_channels_));
}
if (stream_config.has_keyboard()) {
keyboard_data_ = data[KeyboardChannelIndex(stream_config)];
}
// Downmix.
const float* const* data_ptr = data;
if (need_to_downmix) {
@ -179,10 +162,6 @@ void AudioBuffer::CopyTo(const StreamConfig& stream_config,
}
void AudioBuffer::InitForNewData() {
keyboard_data_ = NULL;
mixed_low_pass_valid_ = false;
reference_copied_ = false;
activity_ = AudioFrame::kVadUnknown;
num_channels_ = num_proc_channels_;
data_->set_num_channels(num_proc_channels_);
if (split_data_.get()) {
@ -195,7 +174,6 @@ const int16_t* const* AudioBuffer::channels_const() const {
}
int16_t* const* AudioBuffer::channels() {
mixed_low_pass_valid_ = false;
return data_->ibuf()->channels();
}
@ -205,7 +183,6 @@ const int16_t* const* AudioBuffer::split_bands_const(size_t channel) const {
}
int16_t* const* AudioBuffer::split_bands(size_t channel) {
mixed_low_pass_valid_ = false;
return split_data_.get() ? split_data_->ibuf()->bands(channel)
: data_->ibuf()->bands(channel);
}
@ -218,39 +195,11 @@ const int16_t* const* AudioBuffer::split_channels_const(Band band) const {
}
}
int16_t* const* AudioBuffer::split_channels(Band band) {
mixed_low_pass_valid_ = false;
if (split_data_.get()) {
return split_data_->ibuf()->channels(band);
} else {
return band == kBand0To8kHz ? data_->ibuf()->channels() : nullptr;
}
}
ChannelBuffer<int16_t>* AudioBuffer::data() {
mixed_low_pass_valid_ = false;
return data_->ibuf();
}
const ChannelBuffer<int16_t>* AudioBuffer::data() const {
return data_->ibuf_const();
}
ChannelBuffer<int16_t>* AudioBuffer::split_data() {
mixed_low_pass_valid_ = false;
return split_data_.get() ? split_data_->ibuf() : data_->ibuf();
}
const ChannelBuffer<int16_t>* AudioBuffer::split_data() const {
return split_data_.get() ? split_data_->ibuf_const() : data_->ibuf_const();
}
const float* const* AudioBuffer::channels_const_f() const {
return data_->fbuf_const()->channels();
}
float* const* AudioBuffer::channels_f() {
mixed_low_pass_valid_ = false;
return data_->fbuf()->channels();
}
@ -260,85 +209,10 @@ const float* const* AudioBuffer::split_bands_const_f(size_t channel) const {
}
float* const* AudioBuffer::split_bands_f(size_t channel) {
mixed_low_pass_valid_ = false;
return split_data_.get() ? split_data_->fbuf()->bands(channel)
: data_->fbuf()->bands(channel);
}
const float* const* AudioBuffer::split_channels_const_f(Band band) const {
if (split_data_.get()) {
return split_data_->fbuf_const()->channels(band);
} else {
return band == kBand0To8kHz ? data_->fbuf_const()->channels() : nullptr;
}
}
float* const* AudioBuffer::split_channels_f(Band band) {
mixed_low_pass_valid_ = false;
if (split_data_.get()) {
return split_data_->fbuf()->channels(band);
} else {
return band == kBand0To8kHz ? data_->fbuf()->channels() : nullptr;
}
}
ChannelBuffer<float>* AudioBuffer::data_f() {
mixed_low_pass_valid_ = false;
return data_->fbuf();
}
const ChannelBuffer<float>* AudioBuffer::data_f() const {
return data_->fbuf_const();
}
ChannelBuffer<float>* AudioBuffer::split_data_f() {
mixed_low_pass_valid_ = false;
return split_data_.get() ? split_data_->fbuf() : data_->fbuf();
}
const ChannelBuffer<float>* AudioBuffer::split_data_f() const {
return split_data_.get() ? split_data_->fbuf_const() : data_->fbuf_const();
}
const int16_t* AudioBuffer::mixed_low_pass_data() {
if (num_proc_channels_ == 1) {
return split_bands_const(0)[kBand0To8kHz];
}
if (!mixed_low_pass_valid_) {
if (!mixed_low_pass_channels_.get()) {
mixed_low_pass_channels_.reset(
new ChannelBuffer<int16_t>(num_split_frames_, 1));
}
DownmixToMono<int16_t, int32_t>(split_channels_const(kBand0To8kHz),
num_split_frames_, num_channels_,
mixed_low_pass_channels_->channels()[0]);
mixed_low_pass_valid_ = true;
}
return mixed_low_pass_channels_->channels()[0];
}
const int16_t* AudioBuffer::low_pass_reference(int channel) const {
if (!reference_copied_) {
return NULL;
}
return low_pass_reference_channels_->channels()[channel];
}
const float* AudioBuffer::keyboard_data() const {
return keyboard_data_;
}
void AudioBuffer::set_activity(AudioFrame::VADActivity activity) {
activity_ = activity;
}
AudioFrame::VADActivity AudioBuffer::activity() const {
return activity_;
}
size_t AudioBuffer::num_channels() const {
return num_channels_;
}
@ -359,17 +233,12 @@ size_t AudioBuffer::num_frames_per_band() const {
return num_split_frames_;
}
size_t AudioBuffer::num_keyboard_frames() const {
// We don't resample the keyboard channel.
return input_num_frames_;
}
size_t AudioBuffer::num_bands() const {
return num_bands_;
}
// The resampler is only for supporting 48kHz to 16kHz in the reverse stream.
void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) {
void AudioBuffer::DeinterleaveFrom(const AudioFrame* frame) {
RTC_DCHECK_EQ(frame->num_channels_, num_input_channels_);
RTC_DCHECK_EQ(frame->samples_per_channel_, input_num_frames_);
InitForNewData();
@ -378,7 +247,6 @@ void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) {
input_buffer_.reset(
new IFChannelBuffer(input_num_frames_, num_proc_channels_));
}
activity_ = frame->vad_activity_;
int16_t* const* deinterleaved;
if (input_num_frames_ == proc_num_frames_) {
@ -407,12 +275,7 @@ void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) {
}
}
void AudioBuffer::InterleaveTo(AudioFrame* frame, bool data_changed) const {
frame->vad_activity_ = activity_;
if (!data_changed) {
return;
}
void AudioBuffer::InterleaveTo(AudioFrame* frame) const {
RTC_DCHECK(frame->num_channels_ == num_channels_ || num_channels_ == 1);
RTC_DCHECK_EQ(frame->samples_per_channel_, output_num_frames_);
@ -437,21 +300,6 @@ void AudioBuffer::InterleaveTo(AudioFrame* frame, bool data_changed) const {
}
}
void AudioBuffer::CopyLowPassToReference() {
reference_copied_ = true;
if (!low_pass_reference_channels_.get() ||
low_pass_reference_channels_->num_channels() != num_channels_) {
low_pass_reference_channels_.reset(
new ChannelBuffer<int16_t>(num_split_frames_, num_proc_channels_));
}
for (size_t i = 0; i < num_proc_channels_; i++) {
memcpy(low_pass_reference_channels_->channels()[i],
split_bands_const(i)[kBand0To8kHz],
low_pass_reference_channels_->num_frames_per_band() *
sizeof(split_bands_const(i)[kBand0To8kHz][0]));
}
}
void AudioBuffer::SplitIntoFrequencyBands() {
splitting_filter_->Analysis(data_.get(), split_data_.get());
}

View File

@ -40,10 +40,10 @@ class AudioBuffer {
virtual ~AudioBuffer();
size_t num_channels() const;
size_t num_proc_channels() const { return num_proc_channels_; }
void set_num_channels(size_t num_channels);
size_t num_frames() const;
size_t num_frames_per_band() const;
size_t num_keyboard_frames() const;
size_t num_bands() const;
// Returns a pointer array to the full-band channels.
@ -76,44 +76,17 @@ class AudioBuffer {
// 0 <= band < |num_bands_|
// 0 <= channel < |num_proc_channels_|
// 0 <= sample < |num_split_frames_|
int16_t* const* split_channels(Band band);
const int16_t* const* split_channels_const(Band band) const;
float* const* split_channels_f(Band band);
const float* const* split_channels_const_f(Band band) const;
// Returns a pointer to the ChannelBuffer that encapsulates the full-band
// data.
ChannelBuffer<int16_t>* data();
const ChannelBuffer<int16_t>* data() const;
ChannelBuffer<float>* data_f();
const ChannelBuffer<float>* data_f() const;
// Returns a pointer to the ChannelBuffer that encapsulates the split data.
ChannelBuffer<int16_t>* split_data();
const ChannelBuffer<int16_t>* split_data() const;
ChannelBuffer<float>* split_data_f();
const ChannelBuffer<float>* split_data_f() const;
// Returns a pointer to the low-pass data downmixed to mono. If this data
// isn't already available it re-calculates it.
const int16_t* mixed_low_pass_data();
const int16_t* low_pass_reference(int channel) const;
const float* keyboard_data() const;
void set_activity(AudioFrame::VADActivity activity);
AudioFrame::VADActivity activity() const;
// Use for int16 interleaved data.
void DeinterleaveFrom(AudioFrame* audioFrame);
void DeinterleaveFrom(const AudioFrame* audioFrame);
// If |data_changed| is false, only the non-audio data members will be copied
// to |frame|.
void InterleaveTo(AudioFrame* frame, bool data_changed) const;
void InterleaveTo(AudioFrame* frame) const;
// Use for float deinterleaved data.
void CopyFrom(const float* const* data, const StreamConfig& stream_config);
void CopyTo(const StreamConfig& stream_config, float* const* data);
void CopyLowPassToReference();
// Splits the signal into different bands.
void SplitIntoFrequencyBands();
@ -142,16 +115,10 @@ class AudioBuffer {
size_t num_bands_;
size_t num_split_frames_;
bool mixed_low_pass_valid_;
bool reference_copied_;
AudioFrame::VADActivity activity_;
const float* keyboard_data_;
std::unique_ptr<IFChannelBuffer> data_;
std::unique_ptr<IFChannelBuffer> split_data_;
std::unique_ptr<SplittingFilter> splitting_filter_;
std::unique_ptr<ChannelBuffer<int16_t>> mixed_low_pass_channels_;
std::unique_ptr<ChannelBuffer<int16_t>> low_pass_reference_channels_;
std::unique_ptr<IFChannelBuffer> input_buffer_;
std::unique_ptr<IFChannelBuffer> output_buffer_;
std::unique_ptr<ChannelBuffer<float>> process_buffer_;

View File

@ -21,10 +21,6 @@ const size_t kStereo = 2u;
const size_t kMono = 1u;
void ExpectNumChannels(const AudioBuffer& ab, size_t num_channels) {
EXPECT_EQ(ab.data()->num_channels(), num_channels);
EXPECT_EQ(ab.data_f()->num_channels(), num_channels);
EXPECT_EQ(ab.split_data()->num_channels(), num_channels);
EXPECT_EQ(ab.split_data_f()->num_channels(), num_channels);
EXPECT_EQ(ab.num_channels(), num_channels);
}

View File

@ -949,6 +949,7 @@ int AudioProcessingImpl::ProcessStream(const float* const* src,
RecordUnprocessedCaptureStream(src);
}
capture_.keyboard_info.Extract(src, formats_.api_format.input_stream());
capture_.capture_audio->CopyFrom(src, formats_.api_format.input_stream());
RETURN_ON_ERR(ProcessCaptureStreamLocked());
capture_.capture_audio->CopyTo(formats_.api_format.output_stream(), dest);
@ -1243,11 +1244,14 @@ int AudioProcessingImpl::ProcessStream(AudioFrame* frame) {
RecordUnprocessedCaptureStream(*frame);
}
capture_.vad_activity = frame->vad_activity_;
capture_.capture_audio->DeinterleaveFrom(frame);
RETURN_ON_ERR(ProcessCaptureStreamLocked());
capture_.capture_audio->InterleaveTo(
frame, submodule_states_.CaptureMultiBandProcessingActive() ||
submodule_states_.CaptureFullBandProcessingActive());
if (submodule_states_.CaptureMultiBandProcessingActive() ||
submodule_states_.CaptureFullBandProcessingActive()) {
capture_.capture_audio->InterleaveTo(frame);
}
frame->vad_activity_ = capture_.vad_activity;
if (aec_dump_) {
RecordProcessedCaptureStream(*frame);
@ -1361,7 +1365,8 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
}
if (public_submodules_->noise_suppression->is_enabled()) {
capture_buffer->CopyLowPassToReference();
private_submodules_->echo_control_mobile->CopyLowPassReference(
capture_buffer);
}
public_submodules_->noise_suppression->ProcessCaptureAudio(capture_buffer);
@ -1393,7 +1398,15 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
public_submodules_->noise_suppression->ProcessCaptureAudio(capture_buffer);
}
public_submodules_->voice_detection->ProcessCaptureAudio(capture_buffer);
if (public_submodules_->voice_detection->is_enabled() &&
!public_submodules_->voice_detection->using_external_vad()) {
bool voice_active =
public_submodules_->voice_detection->ProcessCaptureAudio(
capture_buffer);
capture_.vad_activity =
voice_active ? AudioFrame::kVadActive : AudioFrame::kVadPassive;
}
if (config_.voice_detection.enabled) {
private_submodules_->voice_detector->ProcessCaptureAudio(capture_buffer);
capture_.stats.voice_detected =
@ -1440,8 +1453,9 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
capture_buffer->channels_f()[0], capture_buffer->num_frames(),
capture_buffer->num_channels(),
capture_buffer->split_bands_const_f(0)[kBand0To8kHz],
capture_buffer->num_frames_per_band(), capture_buffer->keyboard_data(),
capture_buffer->num_keyboard_frames(), voice_probability,
capture_buffer->num_frames_per_band(),
capture_.keyboard_info.keyboard_data,
capture_.keyboard_info.num_keyboard_frames, voice_probability,
capture_.key_pressed);
}
@ -1598,9 +1612,10 @@ int AudioProcessingImpl::ProcessReverseStream(AudioFrame* frame) {
render_.render_audio->DeinterleaveFrom(frame);
RETURN_ON_ERR(ProcessRenderStreamLocked());
render_.render_audio->InterleaveTo(
frame, submodule_states_.RenderMultiBandProcessingActive() ||
submodule_states_.RenderFullBandProcessingActive());
if (submodule_states_.RenderMultiBandProcessingActive() ||
submodule_states_.RenderFullBandProcessingActive()) {
render_.render_audio->InterleaveTo(frame);
}
return kNoError;
}
@ -2117,6 +2132,17 @@ AudioProcessingImpl::ApmCaptureState::ApmCaptureState(
AudioProcessingImpl::ApmCaptureState::~ApmCaptureState() = default;
void AudioProcessingImpl::ApmCaptureState::KeyboardInfo::Extract(
const float* const* data,
const StreamConfig& stream_config) {
if (stream_config.has_keyboard()) {
keyboard_data = data[stream_config.num_channels()];
} else {
keyboard_data = NULL;
}
num_keyboard_frames = stream_config.num_frames();
}
AudioProcessingImpl::ApmRenderState::ApmRenderState() = default;
AudioProcessingImpl::ApmRenderState::~ApmRenderState() = default;

View File

@ -394,6 +394,12 @@ class AudioProcessingImpl : public AudioProcessing {
int playout_volume;
int prev_playout_volume;
AudioProcessingStats stats;
struct KeyboardInfo {
void Extract(const float* const* data, const StreamConfig& stream_config);
size_t num_keyboard_frames = 0;
const float* keyboard_data = nullptr;
} keyboard_info;
AudioFrame::VADActivity vad_activity = AudioFrame::kVadUnknown;
} capture_ RTC_GUARDED_BY(crit_capture_);
struct ApmCaptureNonLockedState {

View File

@ -101,7 +101,10 @@ class EchoControlMobileImpl::Canceller {
};
EchoControlMobileImpl::EchoControlMobileImpl()
: routing_mode_(kSpeakerphone), comfort_noise_enabled_(false) {}
: routing_mode_(kSpeakerphone), comfort_noise_enabled_(false) {
low_pass_reference_[0].fill(0);
low_pass_reference_[1].fill(0);
}
EchoControlMobileImpl::~EchoControlMobileImpl() {}
@ -168,7 +171,9 @@ int EchoControlMobileImpl::ProcessCaptureAudio(AudioBuffer* audio,
for (size_t capture = 0; capture < audio->num_channels(); ++capture) {
// TODO(ajm): improve how this works, possibly inside AECM.
// This is kind of hacked up.
const int16_t* noisy = audio->low_pass_reference(capture);
RTC_DCHECK_LT(capture, low_pass_reference_.size());
const int16_t* noisy =
reference_copied_ ? low_pass_reference_[capture].data() : nullptr;
const int16_t* clean = audio->split_bands_const(capture)[kBand0To8kHz];
if (noisy == NULL) {
noisy = clean;
@ -195,6 +200,16 @@ int EchoControlMobileImpl::ProcessCaptureAudio(AudioBuffer* audio,
return AudioProcessing::kNoError;
}
void EchoControlMobileImpl::CopyLowPassReference(AudioBuffer* audio) {
RTC_DCHECK_LE(audio->num_channels(), low_pass_reference_.size());
reference_copied_ = true;
for (size_t capture = 0; capture < audio->num_channels(); ++capture) {
memcpy(low_pass_reference_[capture].data(),
audio->split_bands_const(capture)[kBand0To8kHz],
audio->num_frames_per_band() * sizeof(int16_t));
}
}
int EchoControlMobileImpl::set_routing_mode(RoutingMode mode) {
if (MapSetting(mode) == -1) {
return AudioProcessing::kBadParameterError;
@ -219,6 +234,9 @@ bool EchoControlMobileImpl::is_comfort_noise_enabled() const {
void EchoControlMobileImpl::Initialize(int sample_rate_hz,
size_t num_reverse_channels,
size_t num_output_channels) {
low_pass_reference_[0].fill(0);
low_pass_reference_[1].fill(0);
stream_properties_.reset(new StreamProperties(
sample_rate_hz, num_reverse_channels, num_output_channels));

View File

@ -54,6 +54,7 @@ class EchoControlMobileImpl {
void ProcessRenderAudio(rtc::ArrayView<const int16_t> packed_render_audio);
int ProcessCaptureAudio(AudioBuffer* audio, int stream_delay_ms);
void CopyLowPassReference(AudioBuffer* audio);
void Initialize(int sample_rate_hz,
size_t num_reverse_channels,
@ -78,6 +79,8 @@ class EchoControlMobileImpl {
std::vector<std::unique_ptr<Canceller>> cancellers_;
std::unique_ptr<StreamProperties> stream_properties_;
std::array<std::array<int16_t, 160>, 2> low_pass_reference_;
bool reference_copied_ = false;
};
} // namespace webrtc

View File

@ -120,10 +120,28 @@ void GainControlImpl::PackRenderAudioBuffer(
std::vector<int16_t>* packed_buffer) {
RTC_DCHECK_GE(160, audio->num_frames_per_band());
std::array<int16_t, 160> mixed_low_pass_data;
rtc::ArrayView<const int16_t> mixed_low_pass;
if (audio->num_proc_channels() == 1) {
mixed_low_pass =
rtc::ArrayView<const int16_t>(audio->split_bands_const(0)[kBand0To8kHz],
audio->num_frames_per_band());
} else {
const int num_channels = static_cast<int>(audio->num_channels());
for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
int32_t value = audio->split_channels_const(kBand0To8kHz)[0][i];
for (int j = 1; j < num_channels; ++j) {
value += audio->split_channels_const(kBand0To8kHz)[j][i];
}
mixed_low_pass_data[i] = value / num_channels;
}
mixed_low_pass = rtc::ArrayView<const int16_t>(
mixed_low_pass_data.data(), audio->num_frames_per_band());
}
packed_buffer->clear();
packed_buffer->insert(
packed_buffer->end(), audio->mixed_low_pass_data(),
(audio->mixed_low_pass_data() + audio->num_frames_per_band()));
packed_buffer->insert(packed_buffer->end(), mixed_low_pass.data(),
(mixed_low_pass.data() + audio->num_frames_per_band()));
}
int GainControlImpl::AnalyzeCaptureAudio(AudioBuffer* audio) {

View File

@ -54,30 +54,42 @@ void VoiceDetectionImpl::Initialize(int sample_rate_hz) {
set_likelihood(likelihood_);
}
void VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
bool VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
rtc::CritScope cs(crit_);
if (!enabled_) {
return;
}
if (using_external_vad_) {
using_external_vad_ = false;
return;
}
RTC_DCHECK(enabled_);
RTC_DCHECK_GE(160, audio->num_frames_per_band());
// TODO(ajm): concatenate data in frame buffer here.
int vad_ret =
WebRtcVad_Process(vad_->state(), sample_rate_hz_,
audio->mixed_low_pass_data(), frame_size_samples_);
std::array<int16_t, 160> mixed_low_pass_data;
rtc::ArrayView<const int16_t> mixed_low_pass;
if (audio->num_proc_channels() == 1) {
mixed_low_pass =
rtc::ArrayView<const int16_t>(audio->split_bands_const(0)[kBand0To8kHz],
audio->num_frames_per_band());
} else {
const int num_channels = static_cast<int>(audio->num_channels());
for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
int32_t value = audio->split_channels_const(kBand0To8kHz)[0][i];
for (int j = 1; j < num_channels; ++j) {
value += audio->split_channels_const(kBand0To8kHz)[j][i];
}
mixed_low_pass_data[i] = value / num_channels;
}
mixed_low_pass = rtc::ArrayView<const int16_t>(
mixed_low_pass_data.data(), audio->num_frames_per_band());
}
int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_,
mixed_low_pass.data(), frame_size_samples_);
if (vad_ret == 0) {
stream_has_voice_ = false;
audio->set_activity(AudioFrame::kVadPassive);
return false;
} else if (vad_ret == 1) {
stream_has_voice_ = true;
audio->set_activity(AudioFrame::kVadActive);
} else {
RTC_NOTREACHED();
}
return stream_has_voice_;
}
int VoiceDetectionImpl::Enable(bool enable) {

View File

@ -31,7 +31,14 @@ class VoiceDetectionImpl : public VoiceDetection {
// TODO(peah): Fold into ctor, once public API is removed.
void Initialize(int sample_rate_hz);
void ProcessCaptureAudio(AudioBuffer* audio);
// Returns the VAD activity.
bool ProcessCaptureAudio(AudioBuffer* audio);
bool using_external_vad() const {
rtc::CritScope cs(crit_);
return using_external_vad_;
}
// VoiceDetection implementation.
int Enable(bool enable) override;