Introduce capture_time_identifier in webrtc::EncodedImage

This CL takes the capture_time_identifier introduced in
webrtc::VideoFrame and propagates it to EncodedImage. For use cases
involving encoded transforms, the identifier is further propagated to
TransformableVideoSenderFrame.
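
As a minimal sketch (not part of this CL), a registered frame
transformer could read the identifier from the transformable frame once
an encoder has set it; LogCaptureTimeIdentifier is a hypothetical helper
used only for illustration:

  #include "absl/types/optional.h"
  #include "api/frame_transformer_interface.h"
  #include "api/units/timestamp.h"
  #include "rtc_base/logging.h"

  void LogCaptureTimeIdentifier(
      const webrtc::TransformableFrameInterface& frame) {
    // Defaults to absl::nullopt until the sending encoder populates it.
    absl::optional<webrtc::Timestamp> id = frame.GetCaptureTimeIdentifier();
    if (id.has_value()) {
      RTC_LOG(LS_INFO) << "capture_time_identifier (us): " << id->us();
    }
  }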

The VideoEncoder::Encode function is overridden by each encoder. Each of
these overrides needs to be changed so that it handles the new
identifier and propagates its value to the created EncodedImage.
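
A minimal sketch of the propagation each Encode() override needs,
assuming the headers below; CopyCaptureInfo is a hypothetical helper,
not part of this CL (the real changes live in the libaom/libvpx
wrappers touched by the diff):

  #include "api/video/encoded_image.h"
  #include "api/video/video_frame.h"

  void CopyCaptureInfo(const webrtc::VideoFrame& input,
                       webrtc::EncodedImage& output) {
    output.SetTimestamp(input.timestamp());
    // New in this CL: keep the capture-time identifier attached to the
    // encoded image so it survives into TransformableVideoSenderFrame.
    output.SetCaptureTimeIdentifier(input.capture_time_identifier());
  }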

Change-Id: I5bea4c5a3fe714f1198e497a4bcb5fd059afe516
Bug: webrtc:14878
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/291800
Reviewed-by: Tony Herre <herre@google.com>
Reviewed-by: Harald Alvestrand <hta@webrtc.org>
Commit-Queue: Palak Agarwal <agpalak@google.com>
Cr-Commit-Position: refs/heads/main@{#39374}
diff --git a/api/frame_transformer_interface.h b/api/frame_transformer_interface.h
index 92832c0..16a869e 100644
--- a/api/frame_transformer_interface.h
+++ b/api/frame_transformer_interface.h
@@ -36,6 +36,11 @@
   virtual uint8_t GetPayloadType() const = 0;
   virtual uint32_t GetSsrc() const = 0;
   virtual uint32_t GetTimestamp() const = 0;
+  // TODO(https://bugs.webrtc.org/14878): Change this to pure virtual after it
+  // is implemented everywhere.
+  virtual absl::optional<Timestamp> GetCaptureTimeIdentifier() const {
+    return absl::nullopt;
+  }
 
   enum class Direction {
     kUnknown,
diff --git a/api/video/encoded_image.h b/api/video/encoded_image.h
index 9046167..7b2f5c8 100644
--- a/api/video/encoded_image.h
+++ b/api/video/encoded_image.h
@@ -98,6 +98,14 @@
     simulcast_index_ = simulcast_index;
   }
 
+  const absl::optional<webrtc::Timestamp>& CaptureTimeIdentifier() const {
+    return capture_time_identifier_;
+  }
+  void SetCaptureTimeIdentifier(
+      const absl::optional<webrtc::Timestamp>& capture_time_identifier) {
+    capture_time_identifier_ = capture_time_identifier;
+  }
+
   // Encoded images can have dependencies between spatial and/or temporal
   // layers, depending on the scalability mode used by the encoder. See diagrams
   // at https://w3c.github.io/webrtc-svc/#dependencydiagrams*.
@@ -217,6 +225,7 @@
   size_t size_ = 0;  // Size of encoded frame data.
   uint32_t timestamp_rtp_ = 0;
   absl::optional<int> simulcast_index_;
+  absl::optional<webrtc::Timestamp> capture_time_identifier_;
   absl::optional<int> spatial_index_;
   absl::optional<int> temporal_index_;
   std::map<int, size_t> spatial_layer_frame_size_bytes_;
diff --git a/modules/rtp_rtcp/source/rtp_sender_video_frame_transformer_delegate.cc b/modules/rtp_rtcp/source/rtp_sender_video_frame_transformer_delegate.cc
index b34d2f7..aeda625 100644
--- a/modules/rtp_rtcp/source/rtp_sender_video_frame_transformer_delegate.cc
+++ b/modules/rtp_rtcp/source/rtp_sender_video_frame_transformer_delegate.cc
@@ -41,6 +41,7 @@
         codec_type_(codec_type),
         timestamp_(rtp_timestamp),
         capture_time_ms_(encoded_image.capture_time_ms_),
+        capture_time_identifier_(encoded_image.CaptureTimeIdentifier()),
         expected_retransmission_time_ms_(expected_retransmission_time_ms) {
     RTC_DCHECK_GE(payload_type_, 0);
     RTC_DCHECK_LE(payload_type_, 127);
@@ -87,6 +88,9 @@
   uint8_t GetPayloadType() const override { return payload_type_; }
   absl::optional<VideoCodecType> GetCodecType() const { return codec_type_; }
   int64_t GetCaptureTimeMs() const { return capture_time_ms_; }
+  absl::optional<Timestamp> GetCaptureTimeIdentifier() const override {
+    return capture_time_identifier_;
+  }
 
   const absl::optional<int64_t>& GetExpectedRetransmissionTimeMs() const {
     return expected_retransmission_time_ms_;
@@ -107,6 +111,7 @@
   const absl::optional<VideoCodecType> codec_type_ = absl::nullopt;
   const uint32_t timestamp_;
   const int64_t capture_time_ms_;
+  const absl::optional<Timestamp> capture_time_identifier_;
   const absl::optional<int64_t> expected_retransmission_time_ms_;
 };
 }  // namespace
diff --git a/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc b/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc
index 883b98a..9f44d78 100644
--- a/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc
+++ b/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc
@@ -1654,6 +1654,30 @@
 }
 
 TEST_F(RtpSenderVideoWithFrameTransformerTest,
+       TransformableFrameHasCorrectCaptureIdentifier) {
+  auto mock_frame_transformer =
+      rtc::make_ref_counted<NiceMock<MockFrameTransformer>>();
+  std::unique_ptr<RTPSenderVideo> rtp_sender_video =
+      CreateSenderWithFrameTransformer(mock_frame_transformer);
+  auto encoded_image = CreateDefaultEncodedImage();
+  encoded_image->SetCaptureTimeIdentifier(Timestamp::Millis(1));
+  RTPVideoHeader video_header;
+
+  EXPECT_CALL(*mock_frame_transformer, Transform)
+      .WillOnce([&encoded_image](std::unique_ptr<TransformableFrameInterface>
+                                     transformable_frame) {
+        auto* frame = static_cast<TransformableVideoFrameInterface*>(
+            transformable_frame.get());
+        ASSERT_TRUE(frame);
+        EXPECT_EQ(frame->GetCaptureTimeIdentifier(),
+                  encoded_image->CaptureTimeIdentifier());
+      });
+  rtp_sender_video->SendEncodedImage(kPayload, kType, kTimestamp,
+                                     *encoded_image, video_header,
+                                     kDefaultExpectedRetransmissionTimeMs);
+}
+
+TEST_F(RtpSenderVideoWithFrameTransformerTest,
        OnTransformedFrameSendsVideoWhenCloned) {
   auto mock_frame_transformer =
       rtc::make_ref_counted<NiceMock<MockFrameTransformer>>();
diff --git a/modules/video_coding/codecs/av1/libaom_av1_encoder.cc b/modules/video_coding/codecs/av1/libaom_av1_encoder.cc
index e89dc29e..13ffc93 100644
--- a/modules/video_coding/codecs/av1/libaom_av1_encoder.cc
+++ b/modules/video_coding/codecs/av1/libaom_av1_encoder.cc
@@ -678,6 +678,7 @@
                                        ? VideoFrameType::kVideoFrameKey
                                        : VideoFrameType::kVideoFrameDelta;
         encoded_image.SetTimestamp(frame.timestamp());
+        encoded_image.SetCaptureTimeIdentifier(frame.capture_time_identifier());
         encoded_image.capture_time_ms_ = frame.render_time_ms();
         encoded_image.rotation_ = frame.rotation();
         encoded_image.content_type_ = VideoContentType::UNSPECIFIED;
diff --git a/modules/video_coding/codecs/av1/libaom_av1_encoder_unittest.cc b/modules/video_coding/codecs/av1/libaom_av1_encoder_unittest.cc
index 1dcf53e..5b569fa 100644
--- a/modules/video_coding/codecs/av1/libaom_av1_encoder_unittest.cc
+++ b/modules/video_coding/codecs/av1/libaom_av1_encoder_unittest.cc
@@ -263,5 +263,37 @@
               Eq(VideoFrameType::kVideoFrameDelta));
 }
 
+TEST(LibaomAv1EncoderTest, TestCaptureTimeId) {
+  std::unique_ptr<VideoEncoder> encoder = CreateLibaomAv1Encoder();
+  const Timestamp capture_time_id = Timestamp::Micros(2000);
+  VideoCodec codec_settings = DefaultCodecSettings();
+  codec_settings.SetScalabilityMode(ScalabilityMode::kL2T1);
+  ASSERT_EQ(encoder->InitEncode(&codec_settings, DefaultEncoderSettings()),
+            WEBRTC_VIDEO_CODEC_OK);
+
+  VideoEncoder::RateControlParameters rate_parameters;
+  rate_parameters.framerate_fps = 30;
+  rate_parameters.bitrate.SetBitrate(/*spatial_index=*/0, /*temporal_index=*/0,
+                                     300'000);
+  rate_parameters.bitrate.SetBitrate(/*spatial_index=*/1, /*temporal_index=*/0,
+                                     300'000);
+  encoder->SetRates(rate_parameters);
+
+  std::vector<EncodedVideoFrameProducer::EncodedFrame> encoded_frames =
+      EncodedVideoFrameProducer(*encoder)
+          .SetNumInputFrames(1)
+          .SetCaptureTimeIdentifier(capture_time_id)
+          .Encode();
+  ASSERT_THAT(encoded_frames, SizeIs(2));
+  ASSERT_TRUE(
+      encoded_frames[0].encoded_image.CaptureTimeIdentifier().has_value());
+  ASSERT_TRUE(
+      encoded_frames[1].encoded_image.CaptureTimeIdentifier().has_value());
+  EXPECT_EQ(encoded_frames[0].encoded_image.CaptureTimeIdentifier()->us(),
+            capture_time_id.us());
+  EXPECT_EQ(encoded_frames[1].encoded_image.CaptureTimeIdentifier()->us(),
+            capture_time_id.us());
+}
+
 }  // namespace
 }  // namespace webrtc
diff --git a/modules/video_coding/codecs/test/encoded_video_frame_producer.cc b/modules/video_coding/codecs/test/encoded_video_frame_producer.cc
index 899826e..be2f2bf 100644
--- a/modules/video_coding/codecs/test/encoded_video_frame_producer.cc
+++ b/modules/video_coding/codecs/test/encoded_video_frame_producer.cc
@@ -62,6 +62,7 @@
         VideoFrame::Builder()
             .set_video_frame_buffer(frame_buffer_generator->NextFrame().buffer)
             .set_timestamp_rtp(rtp_timestamp_)
+            .set_capture_time_identifier(capture_time_identifier_)
             .build();
     rtp_timestamp_ += rtp_tick;
     RTC_CHECK_EQ(encoder_.Encode(frame, &next_frame_type_),
diff --git a/modules/video_coding/codecs/test/encoded_video_frame_producer.h b/modules/video_coding/codecs/test/encoded_video_frame_producer.h
index 04f4a64..063cfd4 100644
--- a/modules/video_coding/codecs/test/encoded_video_frame_producer.h
+++ b/modules/video_coding/codecs/test/encoded_video_frame_producer.h
@@ -49,15 +49,18 @@
 
   EncodedVideoFrameProducer& SetRtpTimestamp(uint32_t value);
 
-  // Generates input video frames and encodes them with `encoder` provided in
-  // the constructor. Returns frame passed to the `OnEncodedImage` by wraping
-  // `EncodedImageCallback` underneath.
+  EncodedVideoFrameProducer& SetCaptureTimeIdentifier(Timestamp value);
+
+  // Generates input video frames and encodes them with `encoder` provided
+  // in the constructor. Returns the frames passed to `OnEncodedImage` by
+  // wrapping `EncodedImageCallback` underneath.
   std::vector<EncodedFrame> Encode();
 
  private:
   VideoEncoder& encoder_;
 
   uint32_t rtp_timestamp_ = 1000;
+  Timestamp capture_time_identifier_ = Timestamp::Micros(1000);
   int num_input_frames_ = 1;
   int framerate_fps_ = 30;
   RenderResolution resolution_ = {320, 180};
@@ -96,5 +99,10 @@
   return *this;
 }
 
+inline EncodedVideoFrameProducer&
+EncodedVideoFrameProducer::SetCaptureTimeIdentifier(Timestamp value) {
+  capture_time_identifier_ = value;
+  return *this;
+}
 }  // namespace webrtc
 #endif  // MODULES_VIDEO_CODING_CODECS_TEST_ENCODED_VIDEO_FRAME_PRODUCER_H_
diff --git a/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.cc b/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.cc
index 20bfc65..5457402 100644
--- a/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.cc
+++ b/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.cc
@@ -1180,6 +1180,8 @@
       }
     }
     encoded_images_[encoder_idx].SetTimestamp(input_image.timestamp());
+    encoded_images_[encoder_idx].SetCaptureTimeIdentifier(
+        input_image.capture_time_identifier());
     encoded_images_[encoder_idx].SetColorSpace(input_image.color_space());
     encoded_images_[encoder_idx].SetRetransmissionAllowed(
         retransmission_allowed);
diff --git a/modules/video_coding/codecs/vp8/test/vp8_impl_unittest.cc b/modules/video_coding/codecs/vp8/test/vp8_impl_unittest.cc
index 3dba397..839d696 100644
--- a/modules/video_coding/codecs/vp8/test/vp8_impl_unittest.cc
+++ b/modules/video_coding/codecs/vp8/test/vp8_impl_unittest.cc
@@ -249,10 +249,12 @@
 }
 
 TEST_F(TestVp8Impl, OnEncodedImageReportsInfo) {
+  constexpr Timestamp kCaptureTimeIdentifier = Timestamp::Micros(1000);
   VideoFrame input_frame = NextInputFrame();
   input_frame.set_timestamp(kInitialTimestampRtp);
   input_frame.set_timestamp_us(kInitialTimestampMs *
                                rtc::kNumMicrosecsPerMillisec);
+  input_frame.set_capture_time_identifier(kCaptureTimeIdentifier);
   EncodedImage encoded_frame;
   CodecSpecificInfo codec_specific_info;
   EncodeAndWaitForFrame(input_frame, &encoded_frame, &codec_specific_info);
@@ -260,6 +262,9 @@
   EXPECT_EQ(kInitialTimestampRtp, encoded_frame.Timestamp());
   EXPECT_EQ(kWidth, static_cast<int>(encoded_frame._encodedWidth));
   EXPECT_EQ(kHeight, static_cast<int>(encoded_frame._encodedHeight));
+  ASSERT_TRUE(encoded_frame.CaptureTimeIdentifier().has_value());
+  EXPECT_EQ(kCaptureTimeIdentifier.us(),
+            encoded_frame.CaptureTimeIdentifier()->us());
 }
 
 TEST_F(TestVp8Impl,
diff --git a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
index c2884c0..e054289 100644
--- a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
+++ b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
@@ -1754,6 +1754,8 @@
 
   TRACE_COUNTER1("webrtc", "EncodedFrameSize", encoded_image_.size());
   encoded_image_.SetTimestamp(input_image_->timestamp());
+  encoded_image_.SetCaptureTimeIdentifier(
+      input_image_->capture_time_identifier());
   encoded_image_.SetColorSpace(input_image_->color_space());
   encoded_image_._encodedHeight =
       pkt->data.frame.height[layer_id.spatial_layer_id];
diff --git a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
index b6293a3..a74dfa4 100644
--- a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
+++ b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
@@ -218,6 +218,19 @@
   EXPECT_EQ(encoded_frame.qp_, *decoded_qp);
 }
 
+TEST_P(TestVp9ImplForPixelFormat, CheckCaptureTimeID) {
+  constexpr Timestamp kCaptureTimeIdentifier = Timestamp::Micros(1000);
+  VideoFrame input_frame = NextInputFrame();
+  input_frame.set_capture_time_identifier(kCaptureTimeIdentifier);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK, encoder_->Encode(input_frame, nullptr));
+  EncodedImage encoded_frame;
+  CodecSpecificInfo codec_specific_info;
+  ASSERT_TRUE(WaitForEncodedFrame(&encoded_frame, &codec_specific_info));
+  ASSERT_TRUE(encoded_frame.CaptureTimeIdentifier().has_value());
+  EXPECT_EQ(kCaptureTimeIdentifier.us(),
+            encoded_frame.CaptureTimeIdentifier()->us());
+}
+
 TEST_F(TestVp9Impl, SwitchInputPixelFormatsWithoutReconfigure) {
   EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK, encoder_->Encode(NextInputFrame(), nullptr));
   EncodedImage encoded_frame;
diff --git a/video/video_stream_encoder.cc b/video/video_stream_encoder.cc
index b0666a4..7e10e77 100644
--- a/video/video_stream_encoder.cc
+++ b/video/video_stream_encoder.cc
@@ -1501,6 +1501,11 @@
   incoming_frame.set_timestamp(
       kMsToRtpTimestamp * static_cast<uint32_t>(incoming_frame.ntp_time_ms()));
 
+  // The capture time identifier should remain the same for the newly
+  // produced incoming frame and the received `video_frame`.
+  incoming_frame.set_capture_time_identifier(
+      video_frame.capture_time_identifier());
+
   if (incoming_frame.ntp_time_ms() <= last_captured_timestamp_) {
     // We don't allow the same capture time for two frames, drop this one.
     RTC_LOG(LS_WARNING) << "Same/old NTP timestamp ("
@@ -1962,6 +1967,8 @@
     out_frame.set_video_frame_buffer(cropped_buffer);
     out_frame.set_update_rect(update_rect);
     out_frame.set_ntp_time_ms(video_frame.ntp_time_ms());
+    out_frame.set_capture_time_identifier(
+        video_frame.capture_time_identifier());
     // Since accumulated_update_rect_ is constructed before cropping,
     // we can't trust it. If any changes were pending, we invalidate whole
     // frame here.