Avoid flagging Opus DTX frames as speech.

Background: After 20 consecutive DTX frames, Opus encodes the background
noise in a normal frame and then goes back to outputting DTX frames.

Currently all Opus frames are flagged as containing speech.

This CL has two effects on outgoing Opus packets:
1. DTX frames are flagged as non-speech.
2. A non-DTX frame that follows 20 consecutive DTX frames is flagged as
   non-speech.

Bug: webrtc:8088
Change-Id: Ic36cf8c9d0a34f55ed4e57858362ad91e3897dda
Reviewed-on: https://webrtc-review.googlesource.com/23760
Commit-Queue: Gustaf Ullberg <gustaf@webrtc.org>
Reviewed-by: Henrik Lundin <henrik.lundin@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#20794}
diff --git a/modules/audio_coding/codecs/opus/audio_encoder_opus.cc b/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
index 5a5ac34..f07cd42 100644
--- a/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
@@ -380,7 +380,8 @@
       inst_(nullptr),
       packet_loss_fraction_smoother_(new PacketLossFractionSmoother()),
       audio_network_adaptor_creator_(audio_network_adaptor_creator),
-      bitrate_smoother_(std::move(bitrate_smoother)) {
+      bitrate_smoother_(std::move(bitrate_smoother)),
+      consecutive_dtx_frames_(0) {
   RTC_DCHECK(0 <= payload_type && payload_type <= 127);
 
   // Sanity check of the redundant payload type field that we want to get rid
@@ -603,14 +604,23 @@
           });
   input_buffer_.clear();
 
+  bool dtx_frame = (info.encoded_bytes <= 2);
+
   // Will use new packet size for next encoding.
   config_.frame_size_ms = next_frame_length_ms_;
 
   info.encoded_timestamp = first_timestamp_in_buffer_;
   info.payload_type = payload_type_;
   info.send_even_if_empty = true;  // Allows Opus to send empty packets.
-  info.speech = (info.encoded_bytes > 0);
+  // After 20 DTX frames (MAX_CONSECUTIVE_DTX) Opus will send a frame
+  // coding the background noise. Avoid flagging this frame as speech
+  // (even though there is a probability of the frame being speech).
+  info.speech = !dtx_frame && (consecutive_dtx_frames_ != 20);
   info.encoder_type = CodecType::kOpus;
+
+  // Increase or reset DTX counter.
+  consecutive_dtx_frames_ = (dtx_frame) ? (consecutive_dtx_frames_ + 1) : (0);
+
   return info;
 }
 
diff --git a/modules/audio_coding/codecs/opus/audio_encoder_opus.h b/modules/audio_coding/codecs/opus/audio_encoder_opus.h
index 8e51dbd..22967c4 100644
--- a/modules/audio_coding/codecs/opus/audio_encoder_opus.h
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus.h
@@ -161,6 +161,7 @@
   rtc::Optional<size_t> overhead_bytes_per_packet_;
   const std::unique_ptr<SmoothingFilter> bitrate_smoother_;
   rtc::Optional<int64_t> bitrate_smoother_last_update_time_;
+  int consecutive_dtx_frames_;
 
   friend struct AudioEncoderOpus;
   RTC_DISALLOW_COPY_AND_ASSIGN(AudioEncoderOpusImpl);
diff --git a/modules/audio_coding/codecs/opus/audio_encoder_opus_unittest.cc b/modules/audio_coding/codecs/opus/audio_encoder_opus_unittest.cc
index 868de8c..c3ad488 100644
--- a/modules/audio_coding/codecs/opus/audio_encoder_opus_unittest.cc
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus_unittest.cc
@@ -753,4 +753,66 @@
   EXPECT_EQ(64000, config.bitrate_bps);
 }
 
+TEST(AudioEncoderOpusTest, OpusFlagDtxAsNonSpeech) {
+  // Create encoder with DTX enabled.
+  AudioEncoderOpusConfig config;
+  config.dtx_enabled = true;
+  constexpr int payload_type = 17;
+  const auto encoder = AudioEncoderOpus::MakeAudioEncoder(config, payload_type);
+
+  // Open file containing speech and silence.
+  const std::string kInputFileName =
+      webrtc::test::ResourcePath("audio_coding/testfile32kHz", "pcm");
+  test::AudioLoop audio_loop;
+  // Use the file as if it were sampled at 48 kHz.
+  constexpr int kSampleRateHz = 48000;
+  EXPECT_EQ(kSampleRateHz, encoder->SampleRateHz());
+  constexpr size_t kMaxLoopLengthSamples =
+      kSampleRateHz * 10;  // Max 10 second loop.
+  constexpr size_t kInputBlockSizeSamples =
+      10 * kSampleRateHz / 1000;  // 10 ms.
+  EXPECT_TRUE(audio_loop.Init(kInputFileName, kMaxLoopLengthSamples,
+                              kInputBlockSizeSamples));
+
+  // Run 500 iterations, tracking runs of DTX and non-speech frames.
+  AudioEncoder::EncodedInfo info;
+  rtc::Buffer encoded(500);
+  int nonspeech_frames = 0;
+  int max_nonspeech_frames = 0;
+  int dtx_frames = 0;
+  int max_dtx_frames = 0;
+  uint32_t rtp_timestamp = 0u;
+  for (size_t i = 0; i < 500; ++i) {
+    encoded.Clear();
+
+    // Every second call to the encoder will generate an Opus packet.
+    for (int j = 0; j < 2; j++) {
+      info =
+          encoder->Encode(rtp_timestamp, audio_loop.GetNextBlock(), &encoded);
+      rtp_timestamp += kInputBlockSizeSamples;
+    }
+
+    // Track consecutive DTX frames (payload <= 2 bytes); never asserted below.
+    if (info.encoded_bytes <= 2) {
+      ++dtx_frames;
+    } else {
+      if (dtx_frames > max_dtx_frames)
+        max_dtx_frames = dtx_frames;
+      dtx_frames = 0;
+    }
+
+    // Track runs of non-speech frames; a run is recorded once speech resumes.
+    if (info.speech == 0) {
+      ++nonspeech_frames;
+    } else {
+      if (nonspeech_frames > max_nonspeech_frames)
+        max_nonspeech_frames = nonspeech_frames;
+      nonspeech_frames = 0;
+    }
+  }
+
+  // Exceeding 20 requires the frame after 20 DTX frames to be non-speech.
+  EXPECT_GT(max_nonspeech_frames, 20);
+}
+
 }  // namespace webrtc
diff --git a/modules/audio_coding/test/TestVADDTX.cc b/modules/audio_coding/test/TestVADDTX.cc
index 628582d..8064448 100644
--- a/modules/audio_coding/test/TestVADDTX.cc
+++ b/modules/audio_coding/test/TestVADDTX.cc
@@ -257,6 +257,7 @@
 
   EXPECT_EQ(0, acm_send_->EnableOpusDtx());
   expects[kEmptyFrame] = 1;
+  expects[kAudioFrameCN] = 1;
   Run(webrtc::test::ResourcePath("audio_coding/testfile32kHz", "pcm"),
       32000, 1, out_filename, true, expects);
 
@@ -265,12 +266,14 @@
   RegisterCodec(kOpusStereo);
   EXPECT_EQ(0, acm_send_->DisableOpusDtx());
   expects[kEmptyFrame] = 0;
+  expects[kAudioFrameCN] = 0;
   Run(webrtc::test::ResourcePath("audio_coding/teststereo32kHz", "pcm"),
       32000, 2, out_filename, false, expects);
 
   EXPECT_EQ(0, acm_send_->EnableOpusDtx());
 
   expects[kEmptyFrame] = 1;
+  expects[kAudioFrameCN] = 1;
   Run(webrtc::test::ResourcePath("audio_coding/teststereo32kHz", "pcm"),
       32000, 2, out_filename, true, expects);
 #endif