Refactor NetEq fake decode from file.

The output is bit-exact with the previous implementation except for one
difference: we no longer seek in the input file before returning silence
for DTX packets.

Bug: webrtc:13322
Change-Id: I147b70d4a0f2c78719c9673b55df6617e064bd61
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/301104
Commit-Queue: Jakob Ivarsson <jakobi@webrtc.org>
Reviewed-by: Henrik Lundin <henrik.lundin@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#39851}
diff --git a/modules/audio_coding/neteq/tools/fake_decode_from_file.cc b/modules/audio_coding/neteq/tools/fake_decode_from_file.cc
index 6c5e5ac..ad52239 100644
--- a/modules/audio_coding/neteq/tools/fake_decode_from_file.cc
+++ b/modules/audio_coding/neteq/tools/fake_decode_from_file.cc
@@ -21,47 +21,55 @@
 
 class FakeEncodedFrame : public AudioDecoder::EncodedAudioFrame {
  public:
-  FakeEncodedFrame(AudioDecoder* decoder, rtc::Buffer&& payload)
-      : decoder_(decoder), payload_(std::move(payload)) {}
+  FakeEncodedFrame(FakeDecodeFromFile* decoder,
+                   uint32_t timestamp,
+                   size_t duration,
+                   bool is_dtx)
+      : decoder_(decoder),
+        timestamp_(timestamp),
+        duration_(duration),
+        is_dtx_(is_dtx) {}
 
-  size_t Duration() const override {
-    const int ret = decoder_->PacketDuration(payload_.data(), payload_.size());
-    return ret < 0 ? 0 : static_cast<size_t>(ret);
-  }
+  size_t Duration() const override { return duration_; }
 
   absl::optional<DecodeResult> Decode(
       rtc::ArrayView<int16_t> decoded) const override {
-    auto speech_type = AudioDecoder::kSpeech;
-    const int ret = decoder_->Decode(
-        payload_.data(), payload_.size(), decoder_->SampleRateHz(),
-        decoded.size() * sizeof(int16_t), decoded.data(), &speech_type);
-    return ret < 0 ? absl::nullopt
-                   : absl::optional<DecodeResult>(
-                         {static_cast<size_t>(ret), speech_type});
+    if (is_dtx_) {
+      std::fill_n(decoded.data(), duration_, 0);
+      return DecodeResult{duration_, AudioDecoder::kComfortNoise};
+    }
+
+    decoder_->ReadFromFile(timestamp_, duration_, decoded.data());
+    return DecodeResult{Duration(), AudioDecoder::kSpeech};
   }
 
-  // This is to mimic OpusFrame.
-  bool IsDtxPacket() const override {
-    uint32_t original_payload_size_bytes =
-        ByteReader<uint32_t>::ReadLittleEndian(&payload_.data()[8]);
-    return original_payload_size_bytes <= 2;
-  }
+  bool IsDtxPacket() const override { return is_dtx_; }
 
  private:
-  AudioDecoder* const decoder_;
-  const rtc::Buffer payload_;
+  FakeDecodeFromFile* const decoder_;
+  const uint32_t timestamp_;
+  const size_t duration_;
+  const bool is_dtx_;
 };
 
 }  // namespace
 
-std::vector<AudioDecoder::ParseResult> FakeDecodeFromFile::ParsePayload(
-    rtc::Buffer&& payload,
-    uint32_t timestamp) {
-  std::vector<ParseResult> results;
-  std::unique_ptr<EncodedAudioFrame> frame(
-      new FakeEncodedFrame(this, std::move(payload)));
-  results.emplace_back(timestamp, 0, std::move(frame));
-  return results;
+void FakeDecodeFromFile::ReadFromFile(uint32_t timestamp,
+                                      size_t samples,
+                                      int16_t* destination) {
+  if (next_timestamp_from_input_ && timestamp != *next_timestamp_from_input_) {
+    // A gap in the timestamp sequence is detected. Skip the same number of
+    // samples from the file.
+    uint32_t jump = timestamp - *next_timestamp_from_input_;
+    RTC_CHECK(input_->Seek(jump));
+  }
+
+  next_timestamp_from_input_ = timestamp + samples;
+  RTC_CHECK(input_->Read(static_cast<size_t>(samples), destination));
+
+  if (stereo_) {
+    InputAudioFile::DuplicateInterleaved(destination, samples, 2, destination);
+  }
 }
 
 int FakeDecodeFromFile::DecodeInternal(const uint8_t* encoded,
@@ -69,90 +77,18 @@
                                        int sample_rate_hz,
                                        int16_t* decoded,
                                        SpeechType* speech_type) {
+  // This call is only used to produce codec-internal comfort noise.
   RTC_DCHECK_EQ(sample_rate_hz, SampleRateHz());
+  RTC_DCHECK_EQ(encoded_len, 0);
+  RTC_DCHECK(!encoded);  // NetEq always sends nullptr in this case.
 
-  const int samples_to_decode = PacketDuration(encoded, encoded_len);
+  const int samples_to_decode = rtc::CheckedDivExact(SampleRateHz(), 100);
   const int total_samples_to_decode = samples_to_decode * (stereo_ ? 2 : 1);
-
-  if (encoded_len == 0) {
-    // Decoder is asked to produce codec-internal comfort noise.
-    RTC_DCHECK(!encoded);  // NetEq always sends nullptr in this case.
-    RTC_DCHECK(cng_mode_);
-    RTC_DCHECK_GT(total_samples_to_decode, 0);
-    std::fill_n(decoded, total_samples_to_decode, 0);
-    *speech_type = kComfortNoise;
-    return rtc::dchecked_cast<int>(total_samples_to_decode);
-  }
-
-  RTC_CHECK_GE(encoded_len, 12);
-  uint32_t timestamp_to_decode =
-      ByteReader<uint32_t>::ReadLittleEndian(encoded);
-
-  if (next_timestamp_from_input_ &&
-      timestamp_to_decode != *next_timestamp_from_input_) {
-    // A gap in the timestamp sequence is detected. Skip the same number of
-    // samples from the file.
-    uint32_t jump = timestamp_to_decode - *next_timestamp_from_input_;
-    RTC_CHECK(input_->Seek(jump));
-  }
-
-  next_timestamp_from_input_ = timestamp_to_decode + samples_to_decode;
-
-  uint32_t original_payload_size_bytes =
-      ByteReader<uint32_t>::ReadLittleEndian(&encoded[8]);
-  if (original_payload_size_bytes <= 2) {
-    // This is a comfort noise payload.
-    RTC_DCHECK_GT(total_samples_to_decode, 0);
-    std::fill_n(decoded, total_samples_to_decode, 0);
-    *speech_type = kComfortNoise;
-    cng_mode_ = true;
-    return rtc::dchecked_cast<int>(total_samples_to_decode);
-  }
-
-  cng_mode_ = false;
-  RTC_CHECK(input_->Read(static_cast<size_t>(samples_to_decode), decoded));
-
-  if (stereo_) {
-    InputAudioFile::DuplicateInterleaved(decoded, samples_to_decode, 2,
-                                         decoded);
-  }
-
-  *speech_type = kSpeech;
-  last_decoded_length_ = samples_to_decode;
+  std::fill_n(decoded, total_samples_to_decode, 0);
+  *speech_type = kComfortNoise;
   return rtc::dchecked_cast<int>(total_samples_to_decode);
 }
 
-int FakeDecodeFromFile::PacketDuration(const uint8_t* encoded,
-                                       size_t encoded_len) const {
-  const uint32_t original_payload_size_bytes =
-      encoded_len < 8 + sizeof(uint32_t)
-          ? 0
-          : ByteReader<uint32_t>::ReadLittleEndian(&encoded[8]);
-  const uint32_t samples_to_decode =
-      encoded_len < 4 + sizeof(uint32_t)
-          ? 0
-          : ByteReader<uint32_t>::ReadLittleEndian(&encoded[4]);
-  if (encoded_len == 0) {
-    // Decoder is asked to produce codec-internal comfort noise.
-    return rtc::CheckedDivExact(SampleRateHz(), 100);
-  }
-  bool is_dtx_payload =
-      original_payload_size_bytes <= 2 || samples_to_decode == 0;
-  bool has_error_duration =
-      samples_to_decode % rtc::CheckedDivExact(SampleRateHz(), 100) != 0;
-  if (is_dtx_payload || has_error_duration) {
-    if (last_decoded_length_ > 0) {
-      // Use length of last decoded packet.
-      return rtc::dchecked_cast<int>(last_decoded_length_);
-    } else {
-      // This is the first packet to decode, and we do not know the length of
-      // it. Set it to 10 ms.
-      return rtc::CheckedDivExact(SampleRateHz(), 100);
-    }
-  }
-  return samples_to_decode;
-}
-
 void FakeDecodeFromFile::PrepareEncoded(uint32_t timestamp,
                                         size_t samples,
                                         size_t original_payload_size_bytes,
@@ -165,5 +101,22 @@
       &encoded[8], rtc::checked_cast<uint32_t>(original_payload_size_bytes));
 }
 
+std::vector<AudioDecoder::ParseResult> FakeDecodeFromFile::ParsePayload(
+    rtc::Buffer&& payload,
+    uint32_t timestamp) {
+  RTC_CHECK_GE(payload.size(), 12);
+  // Parse payload encoded in PrepareEncoded.
+  RTC_CHECK_EQ(timestamp, ByteReader<uint32_t>::ReadLittleEndian(&payload[0]));
+  size_t samples = ByteReader<uint32_t>::ReadLittleEndian(&payload[4]);
+  size_t original_payload_size_bytes =
+      ByteReader<uint32_t>::ReadLittleEndian(&payload[8]);
+  bool opus_dtx = original_payload_size_bytes <= 2;
+  std::vector<ParseResult> results;
+  results.emplace_back(
+      timestamp, 0,
+      std::make_unique<FakeEncodedFrame>(this, timestamp, samples, opus_dtx));
+  return results;
+}
+
 }  // namespace test
 }  // namespace webrtc
diff --git a/modules/audio_coding/neteq/tools/fake_decode_from_file.h b/modules/audio_coding/neteq/tools/fake_decode_from_file.h
index 7b53653..050a29d 100644
--- a/modules/audio_coding/neteq/tools/fake_decode_from_file.h
+++ b/modules/audio_coding/neteq/tools/fake_decode_from_file.h
@@ -52,7 +52,9 @@
                      int16_t* decoded,
                      SpeechType* speech_type) override;
 
-  int PacketDuration(const uint8_t* encoded, size_t encoded_len) const override;
+  // Reads `samples` from the input file and writes the results to
+  // `destination`. Location in file is determined by `timestamp`.
+  void ReadFromFile(uint32_t timestamp, size_t samples, int16_t* destination);
 
   // Helper method. Writes `timestamp`, `samples` and
   // `original_payload_size_bytes` to `encoded` in a format that the
@@ -68,8 +70,6 @@
   absl::optional<uint32_t> next_timestamp_from_input_;
   const int sample_rate_hz_;
   const bool stereo_;
-  size_t last_decoded_length_ = 0;
-  bool cng_mode_ = false;
 };
 
 }  // namespace test
diff --git a/modules/audio_coding/neteq/tools/neteq_replacement_input.cc b/modules/audio_coding/neteq/tools/neteq_replacement_input.cc
index 9436b68..081bd96 100644
--- a/modules/audio_coding/neteq/tools/neteq_replacement_input.cc
+++ b/modules/audio_coding/neteq/tools/neteq_replacement_input.cc
@@ -105,8 +105,9 @@
   uint32_t input_frame_size_timestamps = last_frame_size_timestamps_;
   const uint32_t timestamp_diff =
       next_hdr->timestamp - packet_->header.timestamp;
+  const bool opus_dtx = packet_->payload.size() <= 2;
   if (next_hdr->sequenceNumber == packet_->header.sequenceNumber + 1 &&
-      timestamp_diff <= 120 * 48) {
+      timestamp_diff <= 120 * 48 && !opus_dtx) {
     // Packets are in order and the timestamp diff is less than 5760 samples.
     // Accept the timestamp diff as a valid frame size.
     input_frame_size_timestamps = timestamp_diff;