Make RtpPayloadParams::MinimalisticVp9Structure codec agnostic.

Rename it to RtpPayloadParams::MinimalisticStructure and take the number
of spatial and temporal layers as parameters instead of a
CodecSpecificInfoVP9, so the same template structure can be reused for
generic codecs. Spatial layer resolutions are now appended by the VP9
call site in RtpVideoSender. The generic-codec path is gated by the new
WebRTC-GenericCodecDependencyDescriptor field trial, and generic frames
now populate decode_target_indications and chain_diffs.

Bug: none
Change-Id: I97f603aad53933b09c761da954130b06ea5a5501
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/230760
Commit-Queue: Philip Eliasson <philipel@webrtc.org>
Reviewed-by: Erik Språng <sprang@webrtc.org>
Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#34894}
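
An illustrative sketch (not part of the patch below; names are taken from
this CL) of how the renamed helper is intended to be used by the new
generic-codec path in rtp_video_sender.cc when the
WebRTC-GenericCodecDependencyDescriptor field trial is Enabled:

  // One spatial and one temporal layer: the resulting structure has a single
  // decode target protected by a single chain, with kSwitch indications.
  FrameDependencyStructure structure =
      RtpPayloadParams::MinimalisticStructure(/*num_spatial_layers=*/1,
                                              /*num_temporal_layers=*/1);
  sender_video.SetVideoStructure(&structure);

The VP9 call site builds the same structure with
MinimalisticStructure(vp9.num_spatial_layers, kMaxTemporalStreams) and now
appends the spatial layer resolutions itself, since the helper no longer
receives CodecSpecificInfoVP9.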
diff --git a/call/rtp_payload_params.cc b/call/rtp_payload_params.cc
index af3874b..3f98a58 100644
--- a/call/rtp_payload_params.cc
+++ b/call/rtp_payload_params.cc
@@ -308,13 +308,16 @@
       rtp_video_header->generic.emplace();
 
   generic.frame_id = shared_frame_id;
+  generic.decode_target_indications.push_back(DecodeTargetIndication::kSwitch);
 
   if (is_keyframe) {
+    generic.chain_diffs.push_back(0);
     last_shared_frame_id_[0].fill(-1);
   } else {
     int64_t frame_id = last_shared_frame_id_[0][0];
     RTC_DCHECK_NE(frame_id, -1);
     RTC_DCHECK_LT(frame_id, shared_frame_id);
+    generic.chain_diffs.push_back(shared_frame_id - frame_id);
     generic.dependencies.push_back(frame_id);
   }
 
@@ -408,10 +411,10 @@
   }
 }
 
-FrameDependencyStructure RtpPayloadParams::MinimalisticVp9Structure(
-    const CodecSpecificInfoVP9& vp9) {
-  const int num_spatial_layers = vp9.num_spatial_layers;
-  const int num_temporal_layers = kMaxTemporalStreams;
+FrameDependencyStructure RtpPayloadParams::MinimalisticStructure(
+    int num_spatial_layers,
+    int num_temporal_layers) {
+  RTC_DCHECK_LE(num_spatial_layers * num_temporal_layers, 32);
   FrameDependencyStructure structure;
   structure.num_decode_targets = num_spatial_layers * num_temporal_layers;
   structure.num_chains = num_spatial_layers;
@@ -423,10 +426,10 @@
       a_template.temporal_id = tid;
       for (int s = 0; s < num_spatial_layers; ++s) {
         for (int t = 0; t < num_temporal_layers; ++t) {
-          // Prefer kSwitch for indication frame is part of the decode target
-          // because RtpPayloadParams::Vp9ToGeneric uses that indication more
-          // often that kRequired, increasing chance custom dti need not to
-          // use more bits in dependency descriptor on the wire.
+          // Prefer kSwitch indication for frames that are part of the decode
+          // target because the dependency descriptor info generated in this
+          // class uses kSwitch indications more often than kRequired,
+          // increasing the chance of a good (or complete) template match.
           a_template.decode_target_indications.push_back(
               sid <= s && tid <= t ? DecodeTargetIndication::kSwitch
                                    : DecodeTargetIndication::kNotPresent);
@@ -440,9 +443,6 @@
 
       structure.decode_target_protected_by_chain.push_back(sid);
     }
-    if (vp9.ss_data_available && vp9.spatial_layer_resolution_present) {
-      structure.resolutions.emplace_back(vp9.width[sid], vp9.height[sid]);
-    }
   }
   return structure;
 }
diff --git a/call/rtp_payload_params.h b/call/rtp_payload_params.h
index da53cbc..23827dd 100644
--- a/call/rtp_payload_params.h
+++ b/call/rtp_payload_params.h
@@ -42,13 +42,16 @@
                                    const CodecSpecificInfo* codec_specific_info,
                                    int64_t shared_frame_id);
 
-  // Returns structure that aligns with simulated generic info for VP9.
-  // The templates allow to produce valid dependency descriptor for any vp9
-  // stream with up to 4 temporal layers. The set of the templates is not tuned
-  // for any paricular structure thus dependency descriptor would use more bytes
-  // on the wire than with tuned templates.
-  static FrameDependencyStructure MinimalisticVp9Structure(
-      const CodecSpecificInfoVP9& vp9);
+  // Returns a structure that aligns with the simulated generic info. The
+  // templates allow producing a valid dependency descriptor for any stream
+  // where `num_spatial_layers` * `num_temporal_layers` <= 32 (limited by
+  // https://aomediacodec.github.io/av1-rtp-spec/#a82-syntax, see
+  // template_fdiffs()). The set of templates is not tuned for any particular
+  // structure, so the dependency descriptor would use more bytes on the wire
+  // than with tuned templates.
+  static FrameDependencyStructure MinimalisticStructure(
+      int num_spatial_layers,
+      int num_temporal_layers);
 
   uint32_t ssrc() const;
 
diff --git a/call/rtp_payload_params_unittest.cc b/call/rtp_payload_params_unittest.cc
index 59c8f23..8b22716 100644
--- a/call/rtp_payload_params_unittest.cc
+++ b/call/rtp_payload_params_unittest.cc
@@ -33,6 +33,7 @@
 
 using ::testing::Each;
 using ::testing::ElementsAre;
+using ::testing::Eq;
 using ::testing::IsEmpty;
 using ::testing::SizeIs;
 
@@ -302,7 +303,7 @@
 }
 
 TEST(RtpPayloadParamsTest, GenericDescriptorForGenericCodec) {
-  RtpPayloadState state{};
+  RtpPayloadState state;
 
   EncodedImage encoded_image;
   encoded_image._frameType = VideoFrameType::kVideoFrameKey;
@@ -313,16 +314,27 @@
   RTPVideoHeader header =
       params.GetRtpVideoHeader(encoded_image, &codec_info, 0);
 
-  EXPECT_EQ(kVideoCodecGeneric, header.codec);
+  EXPECT_THAT(header.codec, Eq(kVideoCodecGeneric));
+
   ASSERT_TRUE(header.generic);
-  EXPECT_EQ(0, header.generic->frame_id);
+  EXPECT_THAT(header.generic->frame_id, Eq(0));
+  EXPECT_THAT(header.generic->spatial_index, Eq(0));
+  EXPECT_THAT(header.generic->temporal_index, Eq(0));
+  EXPECT_THAT(header.generic->decode_target_indications,
+              ElementsAre(DecodeTargetIndication::kSwitch));
   EXPECT_THAT(header.generic->dependencies, IsEmpty());
+  EXPECT_THAT(header.generic->chain_diffs, ElementsAre(0));
 
   encoded_image._frameType = VideoFrameType::kVideoFrameDelta;
-  header = params.GetRtpVideoHeader(encoded_image, &codec_info, 1);
+  header = params.GetRtpVideoHeader(encoded_image, &codec_info, 3);
   ASSERT_TRUE(header.generic);
-  EXPECT_EQ(1, header.generic->frame_id);
+  EXPECT_THAT(header.generic->frame_id, Eq(3));
+  EXPECT_THAT(header.generic->spatial_index, Eq(0));
+  EXPECT_THAT(header.generic->temporal_index, Eq(0));
   EXPECT_THAT(header.generic->dependencies, ElementsAre(0));
+  EXPECT_THAT(header.generic->decode_target_indications,
+              ElementsAre(DecodeTargetIndication::kSwitch));
+  EXPECT_THAT(header.generic->chain_diffs, ElementsAre(3));
 }
 
 TEST(RtpPayloadParamsTest, SetsGenericFromGenericFrameInfo) {
diff --git a/call/rtp_video_sender.cc b/call/rtp_video_sender.cc
index 16f4f25..39296cf 100644
--- a/call/rtp_video_sender.cc
+++ b/call/rtp_video_sender.cc
@@ -370,6 +370,9 @@
       simulate_vp9_structure_(!absl::StartsWith(
           field_trials_.Lookup("WebRTC-Vp9DependencyDescriptor"),
           "Disabled")),
+      simulate_generic_structure_(absl::StartsWith(
+          field_trials_.Lookup("WebRTC-GenericCodecDependencyDescriptor"),
+          "Enabled")),
       active_(false),
       suspended_ssrcs_(std::move(suspended_ssrcs)),
       fec_controller_(std::move(fec_controller)),
@@ -575,9 +578,23 @@
       sender_video.SetVideoStructure(&*codec_specific_info->template_structure);
     } else if (simulate_vp9_structure_ && codec_specific_info &&
                codec_specific_info->codecType == kVideoCodecVP9) {
+      const CodecSpecificInfoVP9& vp9 = codec_specific_info->codecSpecific.VP9;
+
       FrameDependencyStructure structure =
-          RtpPayloadParams::MinimalisticVp9Structure(
-              codec_specific_info->codecSpecific.VP9);
+          RtpPayloadParams::MinimalisticStructure(vp9.num_spatial_layers,
+                                                  kMaxTemporalStreams);
+      if (vp9.ss_data_available && vp9.spatial_layer_resolution_present) {
+        for (size_t i = 0; i < vp9.num_spatial_layers; ++i) {
+          structure.resolutions.emplace_back(vp9.width[i], vp9.height[i]);
+        }
+      }
+      sender_video.SetVideoStructure(&structure);
+    } else if (simulate_generic_structure_ && codec_specific_info &&
+               codec_specific_info->codecType == kVideoCodecGeneric) {
+      FrameDependencyStructure structure =
+          RtpPayloadParams::MinimalisticStructure(
+              /*num_spatial_layers=*/1,
+              /*num_temporal_layers=*/1);
       sender_video.SetVideoStructure(&structure);
     } else {
       sender_video.SetVideoStructure(nullptr);
diff --git a/call/rtp_video_sender.h b/call/rtp_video_sender.h
index c725214..d7e1d75 100644
--- a/call/rtp_video_sender.h
+++ b/call/rtp_video_sender.h
@@ -169,6 +169,7 @@
   const bool use_frame_rate_for_overhead_;
   const bool has_packet_feedback_;
   const bool simulate_vp9_structure_;
+  const bool simulate_generic_structure_;
 
   // TODO(holmer): Remove mutex_ once RtpVideoSender runs on the
   // transport task queue.
diff --git a/call/rtp_video_sender_unittest.cc b/call/rtp_video_sender_unittest.cc
index a45473f..689a61d 100644
--- a/call/rtp_video_sender_unittest.cc
+++ b/call/rtp_video_sender_unittest.cc
@@ -824,6 +824,54 @@
   EXPECT_TRUE(sent_packets[1].HasExtension<RtpDependencyDescriptorExtension>());
 }
 
+TEST(RtpVideoSenderTest, GenerateDependencyDescriptorForGenericCodecs) {
+  test::ScopedFieldTrials field_trials(
+      "WebRTC-GenericCodecDependencyDescriptor/Enabled/");
+  RtpVideoSenderTestFixture test({kSsrc1}, {}, kPayloadType, {});
+  test.router()->SetActive(true);
+
+  RtpHeaderExtensionMap extensions;
+  extensions.Register<RtpDependencyDescriptorExtension>(
+      kDependencyDescriptorExtensionId);
+  std::vector<RtpPacket> sent_packets;
+  ON_CALL(test.transport(), SendRtp)
+      .WillByDefault([&](const uint8_t* packet, size_t length,
+                         const PacketOptions& options) {
+        sent_packets.emplace_back(&extensions);
+        EXPECT_TRUE(sent_packets.back().Parse(packet, length));
+        return true;
+      });
+
+  const uint8_t kPayload[1] = {'a'};
+  EncodedImage encoded_image;
+  encoded_image.SetTimestamp(1);
+  encoded_image.capture_time_ms_ = 2;
+  encoded_image._frameType = VideoFrameType::kVideoFrameKey;
+  encoded_image._encodedWidth = 320;
+  encoded_image._encodedHeight = 180;
+  encoded_image.SetEncodedData(
+      EncodedImageBuffer::Create(kPayload, sizeof(kPayload)));
+
+  CodecSpecificInfo codec_specific;
+  codec_specific.codecType = VideoCodecType::kVideoCodecGeneric;
+  codec_specific.end_of_picture = true;
+
+  // Send two tiny images, each mapping to a single RTP packet.
+  EXPECT_EQ(test.router()->OnEncodedImage(encoded_image, &codec_specific).error,
+            EncodedImageCallback::Result::OK);
+
+  // Send in 2nd picture.
+  encoded_image._frameType = VideoFrameType::kVideoFrameDelta;
+  encoded_image.SetTimestamp(3000);
+  EXPECT_EQ(test.router()->OnEncodedImage(encoded_image, &codec_specific).error,
+            EncodedImageCallback::Result::OK);
+
+  test.AdvanceTime(TimeDelta::Millis(33));
+  ASSERT_THAT(sent_packets, SizeIs(2));
+  EXPECT_TRUE(sent_packets[0].HasExtension<RtpDependencyDescriptorExtension>());
+  EXPECT_TRUE(sent_packets[1].HasExtension<RtpDependencyDescriptorExtension>());
+}
+
 TEST(RtpVideoSenderTest, SupportsStoppingUsingDependencyDescriptor) {
   RtpVideoSenderTestFixture test({kSsrc1}, {}, kPayloadType, {});
   test.router()->SetActive(true);