ACM: Adding support for more than 2 channels in the send pipeline

This CL adds support in the audio coding module for sending more than
2 channels to the encoder.
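
The remixing in the send pipeline generalizes the previous
mono<->stereo UpMix()/DownMix() pair: when upmixing, the last input
channel is repeated; stereo input is downmixed to mono by averaging;
any other downmix drops the surplus channels. A minimal standalone
sketch of that policy (simplified, with hypothetical names; the actual
implementation is ReMix() in the diff below, which also handles muted
and zero-channel frames):

  #include <algorithm>
  #include <cstddef>
  #include <cstdint>

  // Interleaved |in| with |in_ch| >= 1 channels -> interleaved |out| with
  // |out_ch| channels. |out| must hold samples_per_channel * out_ch values.
  void RemixSketch(const int16_t* in, size_t samples_per_channel,
                   size_t in_ch, size_t out_ch, int16_t* out) {
    for (size_t k = 0; k < samples_per_channel; ++k) {
      if (in_ch < out_ch) {
        // Upmix: copy the input channels, then repeat the last one.
        for (size_t j = 0; j < out_ch; ++j)
          out[k * out_ch + j] = in[k * in_ch + std::min(j, in_ch - 1)];
      } else if (in_ch == 2 && out_ch == 1) {
        // Stereo-to-mono: average the two channels.
        out[k] = static_cast<int16_t>(
            (static_cast<int32_t>(in[2 * k]) + in[2 * k + 1]) >> 1);
      } else {
        // Downmix: keep the first |out_ch| channels, drop the rest.
        for (size_t j = 0; j < out_ch; ++j)
          out[k * out_ch + j] = in[k * in_ch + j];
      }
    }
  }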

Bug: webrtc:11007
Change-Id: I0909b5c37a54c9d2e1353b864e55008cda50ffae
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/155583
Reviewed-by: Henrik Andersson <henrika@webrtc.org>
Reviewed-by: Alex Loiko <aleloi@webrtc.org>
Commit-Queue: Per Åhgren <peah@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#29385}
diff --git a/modules/audio_coding/acm2/audio_coding_module.cc b/modules/audio_coding/acm2/audio_coding_module.cc
index 3f7a06a..314afd7 100644
--- a/modules/audio_coding/acm2/audio_coding_module.cc
+++ b/modules/audio_coding/acm2/audio_coding_module.cc
@@ -33,6 +33,10 @@
 
 namespace {
 
+// Initial size for the buffer in InputData. This matches 6 channels of 10 ms
+// of 48 kHz audio (6 * 480 samples).
+constexpr size_t kInitialInputDataBufferSize = 6 * 480;
+
 class AudioCodingModuleImpl final : public AudioCodingModule {
  public:
   explicit AudioCodingModuleImpl(const AudioCodingModule::Config& config);
@@ -97,15 +101,18 @@
 
  private:
   struct InputData {
+    InputData() : buffer(kInitialInputDataBufferSize) {}
     uint32_t input_timestamp;
     const int16_t* audio;
     size_t length_per_channel;
     size_t audio_channel;
     // If a re-mix is required (up or down), this buffer will store a re-mixed
     // version of the input.
-    int16_t buffer[WEBRTC_10MS_PCM_AUDIO];
+    std::vector<int16_t> buffer;
   };
 
+  InputData input_data_ RTC_GUARDED_BY(acm_crit_sect_);
+
   // This member class writes values to the named UMA histogram, but only if
   // the value has changed since the last time (and always for the first call).
   class ChangeLogger {
@@ -193,9 +200,9 @@
 }
 
 // Stereo-to-mono downmixing can be done in-place.
-int DownMix(const AudioFrame& frame,
-            size_t length_out_buff,
-            int16_t* out_buff) {
+void DownMix(const AudioFrame& frame,
+             size_t length_out_buff,
+             int16_t* out_buff) {
   RTC_DCHECK_EQ(frame.num_channels_, 2);
   RTC_DCHECK_GE(length_out_buff, frame.samples_per_channel_);
 
@@ -210,26 +217,70 @@
   } else {
     std::fill(out_buff, out_buff + frame.samples_per_channel_, 0);
   }
-  return 0;
 }
 
-// Mono-to-stereo can be used as in-place.
-int UpMix(const AudioFrame& frame, size_t length_out_buff, int16_t* out_buff) {
-  RTC_DCHECK_EQ(frame.num_channels_, 1);
-  RTC_DCHECK_GE(length_out_buff, 2 * frame.samples_per_channel_);
+// Remixes the input frame into the output vector, resizing the vector if
+// needed.
+void ReMix(const AudioFrame& input,
+           size_t num_output_channels,
+           std::vector<int16_t>* output) {
+  const size_t output_size = num_output_channels * input.samples_per_channel_;
 
-  if (!frame.muted()) {
-    const int16_t* frame_data = frame.data();
-    for (size_t n = frame.samples_per_channel_; n != 0; --n) {
-      size_t i = n - 1;
-      int16_t sample = frame_data[i];
-      out_buff[2 * i + 1] = sample;
-      out_buff[2 * i] = sample;
-    }
-  } else {
-    std::fill(out_buff, out_buff + frame.samples_per_channel_ * 2, 0);
+  if (output->size() != output_size) {
+    output->resize(output_size);
   }
-  return 0;
+
+  // For muted frames, fill the frame with zeros.
+  if (input.muted()) {
+    std::fill(output->begin(), output->end(), 0);
+    return;
+  }
+
+  // Ensure that the special case of zero input channels is handled correctly
+  // (zero samples per channel is already handled by the loops below).
+  if (input.num_channels_ == 0) {
+    return;
+  }
+
+  const int16_t* input_data = input.data();
+  size_t in_index = 0;
+  size_t out_index = 0;
+
+  // When upmixing is needed, duplicate the last channel of the input.
+  if (input.num_channels_ < num_output_channels) {
+    for (size_t k = 0; k < input.samples_per_channel_; ++k) {
+      for (size_t j = 0; j < input.num_channels_; ++j) {
+        (*output)[out_index++] = input_data[in_index++];
+      }
+      RTC_DCHECK_GT(in_index, 0);
+      const int16_t value_last_channel = input_data[in_index - 1];
+      for (size_t j = input.num_channels_; j < num_output_channels; ++j) {
+        (*output)[out_index++] = value_last_channel;
+      }
+    }
+    return;
+  }
+
+  // When downmixing is needed and the input is stereo, average the channels.
+  if (input.num_channels_ == 2) {
+    for (size_t n = 0; n < input.samples_per_channel_; ++n) {
+      (*output)[n] =
+          static_cast<int16_t>((static_cast<int32_t>(input_data[2 * n]) +
+                                static_cast<int32_t>(input_data[2 * n + 1])) >>
+                               1);
+    }
+    return;
+  }
+
+  // When downmixing is needed and the input is multichannel, drop the
+  // surplus channels.
+  const size_t num_channels_to_drop = input.num_channels_ - num_output_channels;
+  for (size_t k = 0; k < input.samples_per_channel_; ++k) {
+    for (size_t j = 0; j < num_output_channels; ++j) {
+      (*output)[out_index++] = input_data[in_index++];
+    }
+    in_index += num_channels_to_drop;
+  }
 }
 
 void AudioCodingModuleImpl::ChangeLogger::MaybeLog(int value) {
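
For concreteness, ReMix() above behaves as follows for a few made-up
inputs (sample values are hypothetical; samples are interleaved):

  Mono {10, 20} (2 samples/channel) remixed to 3 channels:
    {10, 10, 10, 20, 20, 20}   // the last (only) channel is repeated
  Stereo {100, 200} (1 sample/channel) remixed to mono:
    {150}                      // (100 + 200) >> 1
  4-channel {a0, b0, c0, d0, a1, b1, c1, d1} remixed to stereo:
    {a0, b0, a1, b1}           // surplus channels c and d are dropped
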
@@ -367,10 +418,9 @@
 
 // Add 10MS of raw (PCM) audio data to the encoder.
 int AudioCodingModuleImpl::Add10MsData(const AudioFrame& audio_frame) {
-  InputData input_data;
   rtc::CritScope lock(&acm_crit_sect_);
-  int r = Add10MsDataInternal(audio_frame, &input_data);
-  return r < 0 ? r : Encode(input_data);
+  int r = Add10MsDataInternal(audio_frame, &input_data_);
+  return r < 0 ? r : Encode(input_data_);
 }
 
 int AudioCodingModuleImpl::Add10MsDataInternal(const AudioFrame& audio_frame,
@@ -421,30 +471,26 @@
   const bool same_num_channels =
       ptr_frame->num_channels_ == current_num_channels;
 
-  if (!same_num_channels) {
-    if (ptr_frame->num_channels_ == 1) {
-      if (UpMix(*ptr_frame, WEBRTC_10MS_PCM_AUDIO, input_data->buffer) < 0)
-        return -1;
-    } else {
-      if (DownMix(*ptr_frame, WEBRTC_10MS_PCM_AUDIO, input_data->buffer) < 0)
-        return -1;
-    }
-  }
-
-  // When adding data to encoders this pointer is pointing to an audio buffer
-  // with correct number of channels.
-  const int16_t* ptr_audio = ptr_frame->data();
-
-  // For pushing data to primary, point the |ptr_audio| to correct buffer.
-  if (!same_num_channels)
-    ptr_audio = input_data->buffer;
-
   // TODO(yujo): Skip encode of muted frames.
   input_data->input_timestamp = ptr_frame->timestamp_;
-  input_data->audio = ptr_audio;
   input_data->length_per_channel = ptr_frame->samples_per_channel_;
   input_data->audio_channel = current_num_channels;
 
+  if (!same_num_channels) {
+    // Remix the input frame into |input_data->buffer|, resizing the buffer if
+    // needed.
+    ReMix(*ptr_frame, current_num_channels, &input_data->buffer);
+
+    // Point |input_data->audio| at the remixed buffer.
+    input_data->audio = input_data->buffer.data();
+    RTC_DCHECK_GE(input_data->buffer.size(),
+                  input_data->length_per_channel * input_data->audio_channel);
+  } else {
+    // When adding data to the encoder, this pointer points to an audio buffer
+    // with the correct number of channels.
+    input_data->audio = ptr_frame->data();
+  }
+
   return 0;
 }
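
Moving InputData from a stack local in Add10MsData() to the
acm_crit_sect_-guarded member input_data_ lets the heap-backed remix
buffer be reused across 10 ms frames instead of being reallocated on
every call. A minimal sketch of that pattern, with hypothetical names:

  #include <algorithm>
  #include <cstddef>
  #include <cstdint>
  #include <mutex>
  #include <vector>

  class FrameSender {
   public:
    void Send10Ms(const int16_t* audio, size_t num_samples) {
      std::lock_guard<std::mutex> lock(mutex_);
      // resize() only reallocates when the frame grows beyond the current
      // capacity; steady-state calls reuse the existing allocation.
      scratch_.resize(num_samples);
      std::copy(audio, audio + num_samples, scratch_.begin());
      // ... remix/encode from scratch_ ...
    }

   private:
    std::mutex mutex_;
    std::vector<int16_t> scratch_;  // Reused across calls.
  };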
 
@@ -508,8 +554,7 @@
     // local buffer, otherwise, it will be written to the output frame.
     int16_t* dest_ptr_audio =
         resample ? audio : preprocess_frame_.mutable_data();
-    if (DownMix(in_frame, WEBRTC_10MS_PCM_AUDIO, dest_ptr_audio) < 0)
-      return -1;
+    DownMix(in_frame, WEBRTC_10MS_PCM_AUDIO, dest_ptr_audio);
     preprocess_frame_.num_channels_ = 1;
     // Set the input of the resampler to the down-mixed signal.
     src_ptr_audio = audio;
diff --git a/modules/audio_coding/acm2/audio_coding_module_unittest.cc b/modules/audio_coding/acm2/audio_coding_module_unittest.cc
index a5946f9..9f026e8 100644
--- a/modules/audio_coding/acm2/audio_coding_module_unittest.cc
+++ b/modules/audio_coding/acm2/audio_coding_module_unittest.cc
@@ -1634,6 +1634,96 @@
   RunInner(40000, 60000);
 }
 
+// Verify that the send pipeline works when the input audio is mono and the
+// encoder is configured to send surround audio.
+TEST_F(AudioCodingModuleTestOldApi, SendingMultiChannelForMonoInput) {
+  constexpr int kSampleRateHz = 48000;
+  constexpr int kSamplesPerChannel = (kSampleRateHz * 10) / 1000;
+
+  audio_format_ = SdpAudioFormat({"multiopus",
+                                  kSampleRateHz,
+                                  6,
+                                  {{"minptime", "10"},
+                                   {"useinbandfec", "1"},
+                                   {"channel_mapping", "0,4,1,2,3,5"},
+                                   {"num_streams", "4"},
+                                   {"coupled_streams", "2"}}});
+
+  RegisterCodec();
+
+  input_frame_.sample_rate_hz_ = kSampleRateHz;
+  input_frame_.num_channels_ = 1;
+  input_frame_.samples_per_channel_ = kSamplesPerChannel;
+  for (size_t k = 0; k < 10; ++k) {
+    ASSERT_GE(acm_->Add10MsData(input_frame_), 0);
+    input_frame_.timestamp_ += kSamplesPerChannel;
+  }
+}
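+
+// Note: in the multiopus format above, each coupled stream carries two
+// channels, so num_streams=4 and coupled_streams=2 yield 2 * 2 + (4 - 2) = 6
+// channels, matching the 6-entry channel_mapping.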
+
+// Verify that the send pipeline works when the input audio is stereo and the
+// encoder is configured to send surround audio.
+TEST_F(AudioCodingModuleTestOldApi, SendingMultiChannelForStereoInput) {
+  constexpr int kSampleRateHz = 48000;
+  constexpr int kSamplesPerChannel = (kSampleRateHz * 10) / 1000;
+
+  audio_format_ = SdpAudioFormat({"multiopus",
+                                  kSampleRateHz,
+                                  6,
+                                  {{"minptime", "10"},
+                                   {"useinbandfec", "1"},
+                                   {"channel_mapping", "0,4,1,2,3,5"},
+                                   {"num_streams", "4"},
+                                   {"coupled_streams", "2"}}});
+
+  RegisterCodec();
+
+  input_frame_.sample_rate_hz_ = kSampleRateHz;
+  input_frame_.num_channels_ = 2;
+  input_frame_.samples_per_channel_ = kSamplesPerChannel;
+  for (size_t k = 0; k < 10; ++k) {
+    ASSERT_GE(acm_->Add10MsData(input_frame_), 0);
+    input_frame_.timestamp_ += kSamplesPerChannel;
+  }
+}
+
+// Verify that the send pipeline works when the input audio is mono and the
+// encoder is configured to send stereo audio.
+TEST_F(AudioCodingModuleTestOldApi, SendingStereoForMonoInput) {
+  constexpr int kSampleRateHz = 48000;
+  constexpr int kSamplesPerChannel = (kSampleRateHz * 10) / 1000;
+
+  audio_format_ = SdpAudioFormat("opus", kSampleRateHz, 2);
+
+  RegisterCodec();
+
+  input_frame_.sample_rate_hz_ = kSampleRateHz;
+  input_frame_.num_channels_ = 1;
+  input_frame_.samples_per_channel_ = kSamplesPerChannel;
+  for (size_t k = 0; k < 10; ++k) {
+    ASSERT_GE(acm_->Add10MsData(input_frame_), 0);
+    input_frame_.timestamp_ += kSamplesPerChannel;
+  }
+}
+
+// Verify that the send pipeline works when the input audio is stereo and the
+// encoder is configured to send mono audio.
+TEST_F(AudioCodingModuleTestOldApi, SendingMonoForStereoInput) {
+  constexpr int kSampleRateHz = 48000;
+  constexpr int kSamplesPerChannel = (kSampleRateHz * 10) / 1000;
+
+  audio_format_ = SdpAudioFormat("L16", kSampleRateHz, 1);
+
+  RegisterCodec();
+
+  input_frame_.sample_rate_hz_ = kSampleRateHz;
+  input_frame_.num_channels_ = 2;
+  input_frame_.samples_per_channel_ = kSamplesPerChannel;
+  for (size_t k = 0; k < 10; ++k) {
+    ASSERT_GE(acm_->Add10MsData(input_frame_), 0);
+    input_frame_.timestamp_ += kSamplesPerChannel;
+  }
+}
+
 // The result on the Android platforms is inconsistent for this test case.
 // On android_rel the result is different from android and android arm64 rel.
 #if defined(WEBRTC_ANDROID)