Remove all AudioBuffer code that is not related to storing audio data

This CL moves/removes all code from the AudioBuffer that:
- Is not directly handling audio data (e.g., keytaps, VAD decisions).
- Is caching aggregated versions of the rest of the audio data (see the sketch below).
- Is not used (or only used in testing).
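
For reference, a minimal standalone sketch of the downmix-to-mono pattern that the
consumers (gain_control_impl.cc, voice_detection_impl.cc) now apply in place of the
removed AudioBuffer::mixed_low_pass_data(). The helper name and container types are
illustrative only; the real code reads from audio->split_channels_const(kBand0To8kHz):

    // Illustrative only: averages int16_t channels into a mono signal,
    // accumulating in int32_t to avoid overflow, mirroring the loops added
    // in gain_control_impl.cc and voice_detection_impl.cc.
    #include <array>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    void DownmixToMono(const std::vector<std::vector<int16_t>>& channels,
                       size_t num_frames,
                       int16_t* mixed) {
      const int num_channels = static_cast<int>(channels.size());
      for (size_t i = 0; i < num_frames; ++i) {
        int32_t value = channels[0][i];
        for (int j = 1; j < num_channels; ++j) {
          value += channels[j][i];
        }
        mixed[i] = value / num_channels;
      }
    }

    int main() {
      const std::vector<std::vector<int16_t>> channels = {{100, -200, 300},
                                                          {300, 200, -100}};
      std::array<int16_t, 3> mixed;
      DownmixToMono(channels, mixed.size(), mixed.data());
      for (int16_t s : mixed) {
        std::printf("%d ", s);  // Prints: 200 0 100
      }
      return 0;
    }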

Bug: webrtc:10882
Change-Id: I737deb3f692748eff30f46ad806b2c6f6292802c
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/149072
Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org>
Commit-Queue: Per Åhgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#28866}
diff --git a/modules/audio_processing/audio_buffer.cc b/modules/audio_processing/audio_buffer.cc
index 1a99463..584111c 100644
--- a/modules/audio_processing/audio_buffer.cc
+++ b/modules/audio_processing/audio_buffer.cc
@@ -27,15 +27,6 @@
 const size_t kSamplesPer32kHzChannel = 320;
 const size_t kSamplesPer48kHzChannel = 480;
 
-int KeyboardChannelIndex(const StreamConfig& stream_config) {
-  if (!stream_config.has_keyboard()) {
-    RTC_NOTREACHED();
-    return 0;
-  }
-
-  return stream_config.num_channels();
-}
-
 size_t NumBandsFromSamplesPerChannel(size_t num_frames) {
   size_t num_bands = 1;
   if (num_frames == kSamplesPer32kHzChannel ||
@@ -60,10 +51,6 @@
       num_channels_(num_process_channels),
       num_bands_(NumBandsFromSamplesPerChannel(proc_num_frames_)),
       num_split_frames_(rtc::CheckedDivExact(proc_num_frames_, num_bands_)),
-      mixed_low_pass_valid_(false),
-      reference_copied_(false),
-      activity_(AudioFrame::kVadUnknown),
-      keyboard_data_(NULL),
       data_(new IFChannelBuffer(proc_num_frames_, num_proc_channels_)),
       output_buffer_(new IFChannelBuffer(output_num_frames_, num_channels_)) {
   RTC_DCHECK_GT(input_num_frames_, 0);
@@ -118,10 +105,6 @@
         new IFChannelBuffer(input_num_frames_, num_proc_channels_));
   }
 
-  if (stream_config.has_keyboard()) {
-    keyboard_data_ = data[KeyboardChannelIndex(stream_config)];
-  }
-
   // Downmix.
   const float* const* data_ptr = data;
   if (need_to_downmix) {
@@ -179,10 +162,6 @@
 }
 
 void AudioBuffer::InitForNewData() {
-  keyboard_data_ = NULL;
-  mixed_low_pass_valid_ = false;
-  reference_copied_ = false;
-  activity_ = AudioFrame::kVadUnknown;
   num_channels_ = num_proc_channels_;
   data_->set_num_channels(num_proc_channels_);
   if (split_data_.get()) {
@@ -195,7 +174,6 @@
 }
 
 int16_t* const* AudioBuffer::channels() {
-  mixed_low_pass_valid_ = false;
   return data_->ibuf()->channels();
 }
 
@@ -205,7 +183,6 @@
 }
 
 int16_t* const* AudioBuffer::split_bands(size_t channel) {
-  mixed_low_pass_valid_ = false;
   return split_data_.get() ? split_data_->ibuf()->bands(channel)
                            : data_->ibuf()->bands(channel);
 }
@@ -218,39 +195,11 @@
   }
 }
 
-int16_t* const* AudioBuffer::split_channels(Band band) {
-  mixed_low_pass_valid_ = false;
-  if (split_data_.get()) {
-    return split_data_->ibuf()->channels(band);
-  } else {
-    return band == kBand0To8kHz ? data_->ibuf()->channels() : nullptr;
-  }
-}
-
-ChannelBuffer<int16_t>* AudioBuffer::data() {
-  mixed_low_pass_valid_ = false;
-  return data_->ibuf();
-}
-
-const ChannelBuffer<int16_t>* AudioBuffer::data() const {
-  return data_->ibuf_const();
-}
-
-ChannelBuffer<int16_t>* AudioBuffer::split_data() {
-  mixed_low_pass_valid_ = false;
-  return split_data_.get() ? split_data_->ibuf() : data_->ibuf();
-}
-
-const ChannelBuffer<int16_t>* AudioBuffer::split_data() const {
-  return split_data_.get() ? split_data_->ibuf_const() : data_->ibuf_const();
-}
-
 const float* const* AudioBuffer::channels_const_f() const {
   return data_->fbuf_const()->channels();
 }
 
 float* const* AudioBuffer::channels_f() {
-  mixed_low_pass_valid_ = false;
   return data_->fbuf()->channels();
 }
 
@@ -260,85 +209,10 @@
 }
 
 float* const* AudioBuffer::split_bands_f(size_t channel) {
-  mixed_low_pass_valid_ = false;
   return split_data_.get() ? split_data_->fbuf()->bands(channel)
                            : data_->fbuf()->bands(channel);
 }
 
-const float* const* AudioBuffer::split_channels_const_f(Band band) const {
-  if (split_data_.get()) {
-    return split_data_->fbuf_const()->channels(band);
-  } else {
-    return band == kBand0To8kHz ? data_->fbuf_const()->channels() : nullptr;
-  }
-}
-
-float* const* AudioBuffer::split_channels_f(Band band) {
-  mixed_low_pass_valid_ = false;
-  if (split_data_.get()) {
-    return split_data_->fbuf()->channels(band);
-  } else {
-    return band == kBand0To8kHz ? data_->fbuf()->channels() : nullptr;
-  }
-}
-
-ChannelBuffer<float>* AudioBuffer::data_f() {
-  mixed_low_pass_valid_ = false;
-  return data_->fbuf();
-}
-
-const ChannelBuffer<float>* AudioBuffer::data_f() const {
-  return data_->fbuf_const();
-}
-
-ChannelBuffer<float>* AudioBuffer::split_data_f() {
-  mixed_low_pass_valid_ = false;
-  return split_data_.get() ? split_data_->fbuf() : data_->fbuf();
-}
-
-const ChannelBuffer<float>* AudioBuffer::split_data_f() const {
-  return split_data_.get() ? split_data_->fbuf_const() : data_->fbuf_const();
-}
-
-const int16_t* AudioBuffer::mixed_low_pass_data() {
-  if (num_proc_channels_ == 1) {
-    return split_bands_const(0)[kBand0To8kHz];
-  }
-
-  if (!mixed_low_pass_valid_) {
-    if (!mixed_low_pass_channels_.get()) {
-      mixed_low_pass_channels_.reset(
-          new ChannelBuffer<int16_t>(num_split_frames_, 1));
-    }
-
-    DownmixToMono<int16_t, int32_t>(split_channels_const(kBand0To8kHz),
-                                    num_split_frames_, num_channels_,
-                                    mixed_low_pass_channels_->channels()[0]);
-    mixed_low_pass_valid_ = true;
-  }
-  return mixed_low_pass_channels_->channels()[0];
-}
-
-const int16_t* AudioBuffer::low_pass_reference(int channel) const {
-  if (!reference_copied_) {
-    return NULL;
-  }
-
-  return low_pass_reference_channels_->channels()[channel];
-}
-
-const float* AudioBuffer::keyboard_data() const {
-  return keyboard_data_;
-}
-
-void AudioBuffer::set_activity(AudioFrame::VADActivity activity) {
-  activity_ = activity;
-}
-
-AudioFrame::VADActivity AudioBuffer::activity() const {
-  return activity_;
-}
-
 size_t AudioBuffer::num_channels() const {
   return num_channels_;
 }
@@ -359,17 +233,12 @@
   return num_split_frames_;
 }
 
-size_t AudioBuffer::num_keyboard_frames() const {
-  // We don't resample the keyboard channel.
-  return input_num_frames_;
-}
-
 size_t AudioBuffer::num_bands() const {
   return num_bands_;
 }
 
 // The resampler is only for supporting 48kHz to 16kHz in the reverse stream.
-void AudioBuffer::DeinterleaveFrom(AudioFrame* frame) {
+void AudioBuffer::DeinterleaveFrom(const AudioFrame* frame) {
   RTC_DCHECK_EQ(frame->num_channels_, num_input_channels_);
   RTC_DCHECK_EQ(frame->samples_per_channel_, input_num_frames_);
   InitForNewData();
@@ -378,7 +247,6 @@
     input_buffer_.reset(
         new IFChannelBuffer(input_num_frames_, num_proc_channels_));
   }
-  activity_ = frame->vad_activity_;
 
   int16_t* const* deinterleaved;
   if (input_num_frames_ == proc_num_frames_) {
@@ -407,12 +275,7 @@
   }
 }
 
-void AudioBuffer::InterleaveTo(AudioFrame* frame, bool data_changed) const {
-  frame->vad_activity_ = activity_;
-  if (!data_changed) {
-    return;
-  }
-
+void AudioBuffer::InterleaveTo(AudioFrame* frame) const {
   RTC_DCHECK(frame->num_channels_ == num_channels_ || num_channels_ == 1);
   RTC_DCHECK_EQ(frame->samples_per_channel_, output_num_frames_);
 
@@ -437,21 +300,6 @@
   }
 }
 
-void AudioBuffer::CopyLowPassToReference() {
-  reference_copied_ = true;
-  if (!low_pass_reference_channels_.get() ||
-      low_pass_reference_channels_->num_channels() != num_channels_) {
-    low_pass_reference_channels_.reset(
-        new ChannelBuffer<int16_t>(num_split_frames_, num_proc_channels_));
-  }
-  for (size_t i = 0; i < num_proc_channels_; i++) {
-    memcpy(low_pass_reference_channels_->channels()[i],
-           split_bands_const(i)[kBand0To8kHz],
-           low_pass_reference_channels_->num_frames_per_band() *
-               sizeof(split_bands_const(i)[kBand0To8kHz][0]));
-  }
-}
-
 void AudioBuffer::SplitIntoFrequencyBands() {
   splitting_filter_->Analysis(data_.get(), split_data_.get());
 }
diff --git a/modules/audio_processing/audio_buffer.h b/modules/audio_processing/audio_buffer.h
index 8fba9f9..c1bfb63 100644
--- a/modules/audio_processing/audio_buffer.h
+++ b/modules/audio_processing/audio_buffer.h
@@ -40,10 +40,10 @@
   virtual ~AudioBuffer();
 
   size_t num_channels() const;
+  size_t num_proc_channels() const { return num_proc_channels_; }
   void set_num_channels(size_t num_channels);
   size_t num_frames() const;
   size_t num_frames_per_band() const;
-  size_t num_keyboard_frames() const;
   size_t num_bands() const;
 
   // Returns a pointer array to the full-band channels.
@@ -76,44 +76,17 @@
   // 0 <= band < |num_bands_|
   // 0 <= channel < |num_proc_channels_|
   // 0 <= sample < |num_split_frames_|
-  int16_t* const* split_channels(Band band);
   const int16_t* const* split_channels_const(Band band) const;
-  float* const* split_channels_f(Band band);
-  const float* const* split_channels_const_f(Band band) const;
-
-  // Returns a pointer to the ChannelBuffer that encapsulates the full-band
-  // data.
-  ChannelBuffer<int16_t>* data();
-  const ChannelBuffer<int16_t>* data() const;
-  ChannelBuffer<float>* data_f();
-  const ChannelBuffer<float>* data_f() const;
-
-  // Returns a pointer to the ChannelBuffer that encapsulates the split data.
-  ChannelBuffer<int16_t>* split_data();
-  const ChannelBuffer<int16_t>* split_data() const;
-  ChannelBuffer<float>* split_data_f();
-  const ChannelBuffer<float>* split_data_f() const;
-
-  // Returns a pointer to the low-pass data downmixed to mono. If this data
-  // isn't already available it re-calculates it.
-  const int16_t* mixed_low_pass_data();
-  const int16_t* low_pass_reference(int channel) const;
-
-  const float* keyboard_data() const;
-
-  void set_activity(AudioFrame::VADActivity activity);
-  AudioFrame::VADActivity activity() const;
 
   // Use for int16 interleaved data.
-  void DeinterleaveFrom(AudioFrame* audioFrame);
+  void DeinterleaveFrom(const AudioFrame* audioFrame);
   // If |data_changed| is false, only the non-audio data members will be copied
   // to |frame|.
-  void InterleaveTo(AudioFrame* frame, bool data_changed) const;
+  void InterleaveTo(AudioFrame* frame) const;
 
   // Use for float deinterleaved data.
   void CopyFrom(const float* const* data, const StreamConfig& stream_config);
   void CopyTo(const StreamConfig& stream_config, float* const* data);
-  void CopyLowPassToReference();
 
   // Splits the signal into different bands.
   void SplitIntoFrequencyBands();
@@ -142,16 +115,10 @@
 
   size_t num_bands_;
   size_t num_split_frames_;
-  bool mixed_low_pass_valid_;
-  bool reference_copied_;
-  AudioFrame::VADActivity activity_;
 
-  const float* keyboard_data_;
   std::unique_ptr<IFChannelBuffer> data_;
   std::unique_ptr<IFChannelBuffer> split_data_;
   std::unique_ptr<SplittingFilter> splitting_filter_;
-  std::unique_ptr<ChannelBuffer<int16_t>> mixed_low_pass_channels_;
-  std::unique_ptr<ChannelBuffer<int16_t>> low_pass_reference_channels_;
   std::unique_ptr<IFChannelBuffer> input_buffer_;
   std::unique_ptr<IFChannelBuffer> output_buffer_;
   std::unique_ptr<ChannelBuffer<float>> process_buffer_;
diff --git a/modules/audio_processing/audio_buffer_unittest.cc b/modules/audio_processing/audio_buffer_unittest.cc
index 5c23159..b884799 100644
--- a/modules/audio_processing/audio_buffer_unittest.cc
+++ b/modules/audio_processing/audio_buffer_unittest.cc
@@ -21,10 +21,6 @@
 const size_t kMono = 1u;
 
 void ExpectNumChannels(const AudioBuffer& ab, size_t num_channels) {
-  EXPECT_EQ(ab.data()->num_channels(), num_channels);
-  EXPECT_EQ(ab.data_f()->num_channels(), num_channels);
-  EXPECT_EQ(ab.split_data()->num_channels(), num_channels);
-  EXPECT_EQ(ab.split_data_f()->num_channels(), num_channels);
   EXPECT_EQ(ab.num_channels(), num_channels);
 }
 
diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc
index 9b4ae81..804802f 100644
--- a/modules/audio_processing/audio_processing_impl.cc
+++ b/modules/audio_processing/audio_processing_impl.cc
@@ -949,6 +949,7 @@
     RecordUnprocessedCaptureStream(src);
   }
 
+  capture_.keyboard_info.Extract(src, formats_.api_format.input_stream());
   capture_.capture_audio->CopyFrom(src, formats_.api_format.input_stream());
   RETURN_ON_ERR(ProcessCaptureStreamLocked());
   capture_.capture_audio->CopyTo(formats_.api_format.output_stream(), dest);
@@ -1243,11 +1244,14 @@
     RecordUnprocessedCaptureStream(*frame);
   }
 
+  capture_.vad_activity = frame->vad_activity_;
   capture_.capture_audio->DeinterleaveFrom(frame);
   RETURN_ON_ERR(ProcessCaptureStreamLocked());
-  capture_.capture_audio->InterleaveTo(
-      frame, submodule_states_.CaptureMultiBandProcessingActive() ||
-                 submodule_states_.CaptureFullBandProcessingActive());
+  if (submodule_states_.CaptureMultiBandProcessingActive() ||
+      submodule_states_.CaptureFullBandProcessingActive()) {
+    capture_.capture_audio->InterleaveTo(frame);
+  }
+  frame->vad_activity_ = capture_.vad_activity;
 
   if (aec_dump_) {
     RecordProcessedCaptureStream(*frame);
@@ -1361,7 +1365,8 @@
     }
 
     if (public_submodules_->noise_suppression->is_enabled()) {
-      capture_buffer->CopyLowPassToReference();
+      private_submodules_->echo_control_mobile->CopyLowPassReference(
+          capture_buffer);
     }
 
     public_submodules_->noise_suppression->ProcessCaptureAudio(capture_buffer);
@@ -1393,7 +1398,15 @@
     public_submodules_->noise_suppression->ProcessCaptureAudio(capture_buffer);
   }
 
-  public_submodules_->voice_detection->ProcessCaptureAudio(capture_buffer);
+  if (public_submodules_->voice_detection->is_enabled() &&
+      !public_submodules_->voice_detection->using_external_vad()) {
+    bool voice_active =
+        public_submodules_->voice_detection->ProcessCaptureAudio(
+            capture_buffer);
+    capture_.vad_activity =
+        voice_active ? AudioFrame::kVadActive : AudioFrame::kVadPassive;
+  }
+
   if (config_.voice_detection.enabled) {
     private_submodules_->voice_detector->ProcessCaptureAudio(capture_buffer);
     capture_.stats.voice_detected =
@@ -1440,8 +1453,9 @@
         capture_buffer->channels_f()[0], capture_buffer->num_frames(),
         capture_buffer->num_channels(),
         capture_buffer->split_bands_const_f(0)[kBand0To8kHz],
-        capture_buffer->num_frames_per_band(), capture_buffer->keyboard_data(),
-        capture_buffer->num_keyboard_frames(), voice_probability,
+        capture_buffer->num_frames_per_band(),
+        capture_.keyboard_info.keyboard_data,
+        capture_.keyboard_info.num_keyboard_frames, voice_probability,
         capture_.key_pressed);
   }
 
@@ -1598,9 +1612,10 @@
 
   render_.render_audio->DeinterleaveFrom(frame);
   RETURN_ON_ERR(ProcessRenderStreamLocked());
-  render_.render_audio->InterleaveTo(
-      frame, submodule_states_.RenderMultiBandProcessingActive() ||
-                 submodule_states_.RenderFullBandProcessingActive());
+  if (submodule_states_.RenderMultiBandProcessingActive() ||
+      submodule_states_.RenderFullBandProcessingActive()) {
+    render_.render_audio->InterleaveTo(frame);
+  }
   return kNoError;
 }
 
@@ -2117,6 +2132,17 @@
 
 AudioProcessingImpl::ApmCaptureState::~ApmCaptureState() = default;
 
+void AudioProcessingImpl::ApmCaptureState::KeyboardInfo::Extract(
+    const float* const* data,
+    const StreamConfig& stream_config) {
+  if (stream_config.has_keyboard()) {
+    keyboard_data = data[stream_config.num_channels()];
+  } else {
+    keyboard_data = NULL;
+  }
+  num_keyboard_frames = stream_config.num_frames();
+}
+
 AudioProcessingImpl::ApmRenderState::ApmRenderState() = default;
 
 AudioProcessingImpl::ApmRenderState::~ApmRenderState() = default;
diff --git a/modules/audio_processing/audio_processing_impl.h b/modules/audio_processing/audio_processing_impl.h
index 05dbb50..1539cd5 100644
--- a/modules/audio_processing/audio_processing_impl.h
+++ b/modules/audio_processing/audio_processing_impl.h
@@ -394,6 +394,12 @@
     int playout_volume;
     int prev_playout_volume;
     AudioProcessingStats stats;
+    struct KeyboardInfo {
+      void Extract(const float* const* data, const StreamConfig& stream_config);
+      size_t num_keyboard_frames = 0;
+      const float* keyboard_data = nullptr;
+    } keyboard_info;
+    AudioFrame::VADActivity vad_activity = AudioFrame::kVadUnknown;
   } capture_ RTC_GUARDED_BY(crit_capture_);
 
   struct ApmCaptureNonLockedState {
diff --git a/modules/audio_processing/echo_control_mobile_impl.cc b/modules/audio_processing/echo_control_mobile_impl.cc
index 69dfafe..c8084ea 100644
--- a/modules/audio_processing/echo_control_mobile_impl.cc
+++ b/modules/audio_processing/echo_control_mobile_impl.cc
@@ -101,7 +101,10 @@
 };
 
 EchoControlMobileImpl::EchoControlMobileImpl()
-    : routing_mode_(kSpeakerphone), comfort_noise_enabled_(false) {}
+    : routing_mode_(kSpeakerphone), comfort_noise_enabled_(false) {
+  low_pass_reference_[0].fill(0);
+  low_pass_reference_[1].fill(0);
+}
 
 EchoControlMobileImpl::~EchoControlMobileImpl() {}
 
@@ -168,7 +171,9 @@
   for (size_t capture = 0; capture < audio->num_channels(); ++capture) {
     // TODO(ajm): improve how this works, possibly inside AECM.
     //            This is kind of hacked up.
-    const int16_t* noisy = audio->low_pass_reference(capture);
+    RTC_DCHECK_LT(capture, low_pass_reference_.size());
+    const int16_t* noisy =
+        reference_copied_ ? low_pass_reference_[capture].data() : nullptr;
     const int16_t* clean = audio->split_bands_const(capture)[kBand0To8kHz];
     if (noisy == NULL) {
       noisy = clean;
@@ -195,6 +200,16 @@
   return AudioProcessing::kNoError;
 }
 
+void EchoControlMobileImpl::CopyLowPassReference(AudioBuffer* audio) {
+  RTC_DCHECK_LE(audio->num_channels(), low_pass_reference_.size());
+  reference_copied_ = true;
+  for (size_t capture = 0; capture < audio->num_channels(); ++capture) {
+    memcpy(low_pass_reference_[capture].data(),
+           audio->split_bands_const(capture)[kBand0To8kHz],
+           audio->num_frames_per_band() * sizeof(int16_t));
+  }
+}
+
 int EchoControlMobileImpl::set_routing_mode(RoutingMode mode) {
   if (MapSetting(mode) == -1) {
     return AudioProcessing::kBadParameterError;
@@ -219,6 +234,9 @@
 void EchoControlMobileImpl::Initialize(int sample_rate_hz,
                                        size_t num_reverse_channels,
                                        size_t num_output_channels) {
+  low_pass_reference_[0].fill(0);
+  low_pass_reference_[1].fill(0);
+
   stream_properties_.reset(new StreamProperties(
       sample_rate_hz, num_reverse_channels, num_output_channels));
 
diff --git a/modules/audio_processing/echo_control_mobile_impl.h b/modules/audio_processing/echo_control_mobile_impl.h
index d84a15e..718819d 100644
--- a/modules/audio_processing/echo_control_mobile_impl.h
+++ b/modules/audio_processing/echo_control_mobile_impl.h
@@ -54,6 +54,7 @@
 
   void ProcessRenderAudio(rtc::ArrayView<const int16_t> packed_render_audio);
   int ProcessCaptureAudio(AudioBuffer* audio, int stream_delay_ms);
+  void CopyLowPassReference(AudioBuffer* audio);
 
   void Initialize(int sample_rate_hz,
                   size_t num_reverse_channels,
@@ -78,6 +79,8 @@
 
   std::vector<std::unique_ptr<Canceller>> cancellers_;
   std::unique_ptr<StreamProperties> stream_properties_;
+  std::array<std::array<int16_t, 160>, 2> low_pass_reference_;
+  bool reference_copied_ = false;
 };
 }  // namespace webrtc
 
diff --git a/modules/audio_processing/gain_control_impl.cc b/modules/audio_processing/gain_control_impl.cc
index 2ca522c..5855943 100644
--- a/modules/audio_processing/gain_control_impl.cc
+++ b/modules/audio_processing/gain_control_impl.cc
@@ -120,10 +120,28 @@
     std::vector<int16_t>* packed_buffer) {
   RTC_DCHECK_GE(160, audio->num_frames_per_band());
 
+  std::array<int16_t, 160> mixed_low_pass_data;
+  rtc::ArrayView<const int16_t> mixed_low_pass;
+  if (audio->num_proc_channels() == 1) {
+    mixed_low_pass =
+        rtc::ArrayView<const int16_t>(audio->split_bands_const(0)[kBand0To8kHz],
+                                      audio->num_frames_per_band());
+  } else {
+    const int num_channels = static_cast<int>(audio->num_channels());
+    for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
+      int32_t value = audio->split_channels_const(kBand0To8kHz)[0][i];
+      for (int j = 1; j < num_channels; ++j) {
+        value += audio->split_channels_const(kBand0To8kHz)[j][i];
+      }
+      mixed_low_pass_data[i] = value / num_channels;
+    }
+    mixed_low_pass = rtc::ArrayView<const int16_t>(
+        mixed_low_pass_data.data(), audio->num_frames_per_band());
+  }
+
   packed_buffer->clear();
-  packed_buffer->insert(
-      packed_buffer->end(), audio->mixed_low_pass_data(),
-      (audio->mixed_low_pass_data() + audio->num_frames_per_band()));
+  packed_buffer->insert(packed_buffer->end(), mixed_low_pass.data(),
+                        (mixed_low_pass.data() + audio->num_frames_per_band()));
 }
 
 int GainControlImpl::AnalyzeCaptureAudio(AudioBuffer* audio) {
diff --git a/modules/audio_processing/voice_detection_impl.cc b/modules/audio_processing/voice_detection_impl.cc
index 7bf6c4a..0263de4 100644
--- a/modules/audio_processing/voice_detection_impl.cc
+++ b/modules/audio_processing/voice_detection_impl.cc
@@ -54,30 +54,42 @@
   set_likelihood(likelihood_);
 }
 
-void VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
+bool VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
   rtc::CritScope cs(crit_);
-  if (!enabled_) {
-    return;
-  }
-  if (using_external_vad_) {
-    using_external_vad_ = false;
-    return;
-  }
+  RTC_DCHECK(enabled_);
 
   RTC_DCHECK_GE(160, audio->num_frames_per_band());
-  // TODO(ajm): concatenate data in frame buffer here.
-  int vad_ret =
-      WebRtcVad_Process(vad_->state(), sample_rate_hz_,
-                        audio->mixed_low_pass_data(), frame_size_samples_);
+  std::array<int16_t, 160> mixed_low_pass_data;
+  rtc::ArrayView<const int16_t> mixed_low_pass;
+  if (audio->num_proc_channels() == 1) {
+    mixed_low_pass =
+        rtc::ArrayView<const int16_t>(audio->split_bands_const(0)[kBand0To8kHz],
+                                      audio->num_frames_per_band());
+  } else {
+    const int num_channels = static_cast<int>(audio->num_channels());
+    for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
+      int32_t value = audio->split_channels_const(kBand0To8kHz)[0][i];
+      for (int j = 1; j < num_channels; ++j) {
+        value += audio->split_channels_const(kBand0To8kHz)[j][i];
+      }
+      mixed_low_pass_data[i] = value / num_channels;
+    }
+    mixed_low_pass = rtc::ArrayView<const int16_t>(
+        mixed_low_pass_data.data(), audio->num_frames_per_band());
+  }
+
+  int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_,
+                                  mixed_low_pass.data(), frame_size_samples_);
   if (vad_ret == 0) {
     stream_has_voice_ = false;
-    audio->set_activity(AudioFrame::kVadPassive);
+    return false;
   } else if (vad_ret == 1) {
     stream_has_voice_ = true;
-    audio->set_activity(AudioFrame::kVadActive);
   } else {
     RTC_NOTREACHED();
   }
+
+  return stream_has_voice_;
 }
 
 int VoiceDetectionImpl::Enable(bool enable) {
diff --git a/modules/audio_processing/voice_detection_impl.h b/modules/audio_processing/voice_detection_impl.h
index 4007f67..7ee303f 100644
--- a/modules/audio_processing/voice_detection_impl.h
+++ b/modules/audio_processing/voice_detection_impl.h
@@ -31,7 +31,14 @@
 
   // TODO(peah): Fold into ctor, once public API is removed.
   void Initialize(int sample_rate_hz);
-  void ProcessCaptureAudio(AudioBuffer* audio);
+
+  // Returns the VAD activity.
+  bool ProcessCaptureAudio(AudioBuffer* audio);
+
+  bool using_external_vad() const {
+    rtc::CritScope cs(crit_);
+    return using_external_vad_;
+  }
 
   // VoiceDetection implementation.
   int Enable(bool enable) override;