For VP9, assume max number of spatial layers to simulate generic descriptor

VP9 allows the number of spatial layers to be increased on a delta
frame, which is not supported by the dependency descriptor.
Thus, to generate a DD-compatible generic header, the simulator sets
the max number of spatial layers, while the number of active spatial
layers is communicated via the active_decode_targets bitmask.
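
For illustration only, a minimal sketch (not part of this change) of how
the simulated structure relates to the bitmask, assuming
kMaxSimulatedSpatialLayers == 3 and kMaxTemporalStreams == 4; the helper
ActiveDecodeTargetsMask is hypothetical and just mirrors the expression
used in Vp9ToGeneric:

  #include <cstdint>

  // Decode targets of the simulated structure are laid out per spatial
  // layer: S0T0..S0T3, S1T0..S1T3, S2T0..S2T3 (3 * 4 = 12 targets).
  // Only the decode targets of the currently active spatial layers are
  // marked active, so enabling one more layer on a delta frame only
  // grows the mask; the structure itself never changes.
  uint32_t ActiveDecodeTargetsMask(int num_active_spatial_layers,
                                   int num_temporal_layers) {
    return (uint32_t{1} << (num_temporal_layers * num_active_spatial_layers)) -
           1;
  }
  // ActiveDecodeTargetsMask(1, 4) == 0b0000'0000'1111  (only S0 active)
  // ActiveDecodeTargetsMask(2, 4) == 0b0000'1111'1111  (S0 and S1 active)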

Bug: webrtc:14042
Change-Id: I4da63fa7c38b0f17758a7a6243640f444470b40c
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/265164
Commit-Queue: Danil Chapovalov <danilchap@webrtc.org>
Reviewed-by: Philip Eliasson <philipel@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#37151}
diff --git a/call/rtp_payload_params.cc b/call/rtp_payload_params.cc
index 470d96a..6ff7549 100644
--- a/call/rtp_payload_params.cc
+++ b/call/rtp_payload_params.cc
@@ -30,8 +30,10 @@
 #include "rtc_base/time_utils.h"
 
 namespace webrtc {
-
 namespace {
+
+constexpr int kMaxSimulatedSpatialLayers = 3;
+
 void PopulateRtpWithCodecSpecifics(const CodecSpecificInfo& info,
                                    absl::optional<int> spatial_index,
                                    RTPVideoHeader* rtp) {
@@ -123,6 +125,50 @@
   timing->network2_timestamp_delta_ms = 0;
   timing->flags = image.timing_.flags;
 }
+
+// Returns a structure that aligns with the simulated generic info. The
+// templates allow producing a valid dependency descriptor for any stream
+// where `num_spatial_layers` * `num_temporal_layers` <= 32 (limited by
+// https://aomediacodec.github.io/av1-rtp-spec/#a82-syntax, see
+// template_fdiffs()). The set of templates is not tuned for any particular
+// structure, thus the dependency descriptor will use more bytes on the wire
+// than it would with tuned templates.
+FrameDependencyStructure MinimalisticStructure(int num_spatial_layers,
+                                               int num_temporal_layers) {
+  RTC_DCHECK_LE(num_spatial_layers, DependencyDescriptor::kMaxSpatialIds);
+  RTC_DCHECK_LE(num_temporal_layers, DependencyDescriptor::kMaxTemporalIds);
+  RTC_DCHECK_LE(num_spatial_layers * num_temporal_layers, 32);
+  FrameDependencyStructure structure;
+  structure.num_decode_targets = num_spatial_layers * num_temporal_layers;
+  structure.num_chains = num_spatial_layers;
+  structure.templates.reserve(num_spatial_layers * num_temporal_layers);
+  for (int sid = 0; sid < num_spatial_layers; ++sid) {
+    for (int tid = 0; tid < num_temporal_layers; ++tid) {
+      FrameDependencyTemplate a_template;
+      a_template.spatial_id = sid;
+      a_template.temporal_id = tid;
+      for (int s = 0; s < num_spatial_layers; ++s) {
+        for (int t = 0; t < num_temporal_layers; ++t) {
+          // Prefer the kSwitch indication for frames that are part of the
+          // decode target, because the dependency descriptor information
+          // generated in this class uses kSwitch more often than kRequired,
+          // increasing the chance of a good (or complete) template match.
+          a_template.decode_target_indications.push_back(
+              sid <= s && tid <= t ? DecodeTargetIndication::kSwitch
+                                   : DecodeTargetIndication::kNotPresent);
+        }
+      }
+      a_template.frame_diffs.push_back(tid == 0 ? num_spatial_layers *
+                                                      num_temporal_layers
+                                                : num_spatial_layers);
+      a_template.chain_diffs.assign(structure.num_chains, 1);
+      structure.templates.push_back(a_template);
+
+      structure.decode_target_protected_by_chain.push_back(sid);
+    }
+  }
+  return structure;
+}
 }  // namespace
 
 RtpPayloadParams::RtpPayloadParams(const uint32_t ssrc,
@@ -131,7 +177,10 @@
     : ssrc_(ssrc),
       generic_picture_id_experiment_(
           absl::StartsWith(trials.Lookup("WebRTC-GenericPictureId"),
-                           "Enabled")) {
+                           "Enabled")),
+      simulate_generic_structure_(absl::StartsWith(
+          trials.Lookup("WebRTC-GenericCodecDependencyDescriptor"),
+          "Enabled")) {
   for (auto& spatial_layer : last_shared_frame_id_)
     spatial_layer.fill(-1);
 
@@ -298,6 +347,69 @@
   RTC_DCHECK_NOTREACHED() << "Unsupported codec.";
 }
 
+absl::optional<FrameDependencyStructure> RtpPayloadParams::GenericStructure(
+    const CodecSpecificInfo* codec_specific_info) {
+  if (codec_specific_info == nullptr) {
+    return absl::nullopt;
+  }
+  // This helper shouldn't be used when the template structure is specified
+  // explicitly.
+  RTC_DCHECK(!codec_specific_info->template_structure.has_value());
+  switch (codec_specific_info->codecType) {
+    case VideoCodecType::kVideoCodecGeneric:
+      if (simulate_generic_structure_) {
+        return MinimalisticStructure(/*num_spatial_layers=*/1,
+                                     /*num_temporal_layers=*/1);
+      }
+      return absl::nullopt;
+    case VideoCodecType::kVideoCodecVP8:
+      return MinimalisticStructure(/*num_spatial_layers=*/1,
+                                   /*num_temporal_layers=*/kMaxTemporalStreams);
+    case VideoCodecType::kVideoCodecVP9: {
+      absl::optional<FrameDependencyStructure> structure =
+          MinimalisticStructure(
+              /*num_spatial_layers=*/kMaxSimulatedSpatialLayers,
+              /*num_temporal_layers=*/kMaxTemporalStreams);
+      const CodecSpecificInfoVP9& vp9 = codec_specific_info->codecSpecific.VP9;
+      if (vp9.ss_data_available && vp9.spatial_layer_resolution_present) {
+        RenderResolution first_valid;
+        RenderResolution last_valid;
+        for (size_t i = 0; i < vp9.num_spatial_layers; ++i) {
+          RenderResolution r(vp9.width[i], vp9.height[i]);
+          if (r.Valid()) {
+            if (!first_valid.Valid()) {
+              first_valid = r;
+            }
+            last_valid = r;
+          }
+          structure->resolutions.push_back(r);
+        }
+        if (!last_valid.Valid()) {
+          // No valid resolution found. Do not send resolutions.
+          structure->resolutions.clear();
+        } else {
+          structure->resolutions.resize(kMaxSimulatedSpatialLayers, last_valid);
+          // The VP9 encoder wrapper may disable the first few spatial layers
+          // by setting an invalid resolution (0, 0). `structure->resolutions`
+          // doesn't support invalid resolutions, so reset them to something
+          // valid.
+          for (RenderResolution& r : structure->resolutions) {
+            if (!r.Valid()) {
+              r = first_valid;
+            }
+          }
+        }
+      }
+      return structure;
+    }
+    case VideoCodecType::kVideoCodecAV1:
+    case VideoCodecType::kVideoCodecH264:
+    case VideoCodecType::kVideoCodecMultiplex:
+      return absl::nullopt;
+  }
+  RTC_DCHECK_NOTREACHED() << "Unsupported codec.";
+}
+
 void RtpPayloadParams::GenericToGeneric(int64_t shared_frame_id,
                                         bool is_keyframe,
                                         RTPVideoHeader* rtp_video_header) {
@@ -426,49 +538,20 @@
   }
 }
 
-FrameDependencyStructure RtpPayloadParams::MinimalisticStructure(
-    int num_spatial_layers,
-    int num_temporal_layers) {
-  RTC_DCHECK_LE(num_spatial_layers * num_temporal_layers, 32);
-  FrameDependencyStructure structure;
-  structure.num_decode_targets = num_spatial_layers * num_temporal_layers;
-  structure.num_chains = num_spatial_layers;
-  structure.templates.reserve(num_spatial_layers * num_temporal_layers);
-  for (int sid = 0; sid < num_spatial_layers; ++sid) {
-    for (int tid = 0; tid < num_temporal_layers; ++tid) {
-      FrameDependencyTemplate a_template;
-      a_template.spatial_id = sid;
-      a_template.temporal_id = tid;
-      for (int s = 0; s < num_spatial_layers; ++s) {
-        for (int t = 0; t < num_temporal_layers; ++t) {
-          // Prefer kSwitch indication for frames that is part of the decode
-          // target because dependency descriptor information generated in this
-          // class use kSwitch indications more often that kRequired, increasing
-          // the chance of a good (or complete) template match.
-          a_template.decode_target_indications.push_back(
-              sid <= s && tid <= t ? DecodeTargetIndication::kSwitch
-                                   : DecodeTargetIndication::kNotPresent);
-        }
-      }
-      a_template.frame_diffs.push_back(tid == 0 ? num_spatial_layers *
-                                                      num_temporal_layers
-                                                : num_spatial_layers);
-      a_template.chain_diffs.assign(structure.num_chains, 1);
-      structure.templates.push_back(a_template);
-
-      structure.decode_target_protected_by_chain.push_back(sid);
-    }
-  }
-  return structure;
-}
-
 void RtpPayloadParams::Vp9ToGeneric(const CodecSpecificInfoVP9& vp9_info,
                                     int64_t shared_frame_id,
                                     RTPVideoHeader& rtp_video_header) {
   const auto& vp9_header =
       absl::get<RTPVideoHeaderVP9>(rtp_video_header.video_type_header);
-  const int num_spatial_layers = vp9_header.num_spatial_layers;
+  const int num_spatial_layers = kMaxSimulatedSpatialLayers;
+  const int num_active_spatial_layers = vp9_header.num_spatial_layers;
   const int num_temporal_layers = kMaxTemporalStreams;
+  static_assert(num_spatial_layers <=
+                RtpGenericFrameDescriptor::kMaxSpatialLayers);
+  static_assert(num_temporal_layers <=
+                RtpGenericFrameDescriptor::kMaxTemporalLayers);
+  static_assert(num_spatial_layers <= DependencyDescriptor::kMaxSpatialIds);
+  static_assert(num_temporal_layers <= DependencyDescriptor::kMaxTemporalIds);
 
   int spatial_index =
       vp9_header.spatial_idx != kNoSpatialIdx ? vp9_header.spatial_idx : 0;
@@ -477,7 +560,7 @@
 
   if (spatial_index >= num_spatial_layers ||
       temporal_index >= num_temporal_layers ||
-      num_spatial_layers > RtpGenericFrameDescriptor::kMaxSpatialLayers) {
+      num_active_spatial_layers > num_spatial_layers) {
     // Prefer to generate no generic layering than an inconsistent one.
     return;
   }
@@ -541,6 +624,9 @@
   last_vp9_frame_id_[vp9_header.picture_id % kPictureDiffLimit][spatial_index] =
       shared_frame_id;
 
+  result.active_decode_targets =
+      ((uint32_t{1} << num_temporal_layers * num_active_spatial_layers) - 1);
+
   // Calculate chains, asuming chain includes all frames with temporal_id = 0
   if (!vp9_header.inter_pic_predicted && !vp9_header.inter_layer_predicted) {
     // Assume frames without dependencies also reset chains.
@@ -548,8 +634,8 @@
       chain_last_frame_id_[sid] = -1;
     }
   }
-  result.chain_diffs.resize(num_spatial_layers);
-  for (int sid = 0; sid < num_spatial_layers; ++sid) {
+  result.chain_diffs.resize(num_spatial_layers, 0);
+  for (int sid = 0; sid < num_active_spatial_layers; ++sid) {
     if (chain_last_frame_id_[sid] == -1) {
       result.chain_diffs[sid] = 0;
       continue;
diff --git a/call/rtp_payload_params.h b/call/rtp_payload_params.h
index ff2de73..5feee11 100644
--- a/call/rtp_payload_params.h
+++ b/call/rtp_payload_params.h
@@ -26,8 +26,6 @@
 
 namespace webrtc {
 
-class RtpRtcp;
-
 // State for setting picture id and tl0 pic idx, for VP8 and VP9
 // TODO(nisse): Make these properties not codec specific.
 class RtpPayloadParams final {
@@ -42,16 +40,10 @@
                                    const CodecSpecificInfo* codec_specific_info,
                                    int64_t shared_frame_id);
 
-  // Returns structure that aligns with simulated generic info. The templates
-  // allow to produce valid dependency descriptor for any stream where
-  // `num_spatial_layers` * `num_temporal_layers` <= 32 (limited by
-  // https://aomediacodec.github.io/av1-rtp-spec/#a82-syntax, see
-  // template_fdiffs()). The set of the templates is not tuned for any paricular
-  // structure thus dependency descriptor would use more bytes on the wire than
-  // with tuned templates.
-  static FrameDependencyStructure MinimalisticStructure(
-      int num_spatial_layers,
-      int num_temporal_layers);
+  // Returns a structure that aligns with the simulated generic info generated
+  // by `GetRtpVideoHeader` for the given `codec_specific_info`.
+  absl::optional<FrameDependencyStructure> GenericStructure(
+      const CodecSpecificInfo* codec_specific_info);
 
   uint32_t ssrc() const;
 
@@ -136,6 +128,7 @@
   RtpPayloadState state_;
 
   const bool generic_picture_id_experiment_;
+  const bool simulate_generic_structure_;
 };
 }  // namespace webrtc
 #endif  // CALL_RTP_PAYLOAD_PARAMS_H_
diff --git a/call/rtp_payload_params_unittest.cc b/call/rtp_payload_params_unittest.cc
index 169a82d..6a54ac8 100644
--- a/call/rtp_payload_params_unittest.cc
+++ b/call/rtp_payload_params_unittest.cc
@@ -587,7 +587,8 @@
   EXPECT_EQ(header.generic->decode_target_indications[0],
             DecodeTargetIndication::kSwitch);
   EXPECT_THAT(header.generic->dependencies, IsEmpty());
-  EXPECT_THAT(header.generic->chain_diffs, ElementsAre(0));
+  ASSERT_THAT(header.generic->chain_diffs, Not(IsEmpty()));
+  EXPECT_EQ(header.generic->chain_diffs[0], 0);
 
   // Delta frame.
   encoded_image._frameType = VideoFrameType::kVideoFrameDelta;
@@ -605,8 +606,9 @@
   EXPECT_EQ(header.generic->decode_target_indications[0],
             DecodeTargetIndication::kSwitch);
   EXPECT_THAT(header.generic->dependencies, ElementsAre(1));
+  ASSERT_THAT(header.generic->chain_diffs, Not(IsEmpty()));
   // previous frame in the chain was frame#1,
-  EXPECT_THAT(header.generic->chain_diffs, ElementsAre(3 - 1));
+  EXPECT_EQ(header.generic->chain_diffs[0], 3 - 1);
 }
 
 TEST(RtpPayloadParamsVp9ToGenericTest, TemporalScalabilityWith2Layers) {
@@ -670,7 +672,9 @@
 
   ASSERT_TRUE(headers[0].generic);
   int num_decode_targets = headers[0].generic->decode_target_indications.size();
+  int num_chains = headers[0].generic->chain_diffs.size();
   ASSERT_GE(num_decode_targets, 2);
+  ASSERT_GE(num_chains, 1);
 
   for (int frame_idx = 0; frame_idx < 6; ++frame_idx) {
     const RTPVideoHeader& header = headers[frame_idx];
@@ -680,6 +684,7 @@
     EXPECT_EQ(header.generic->frame_id, 1 + 2 * frame_idx);
     ASSERT_THAT(header.generic->decode_target_indications,
                 SizeIs(num_decode_targets));
+    ASSERT_THAT(header.generic->chain_diffs, SizeIs(num_chains));
     // Expect only T0 frames are needed for the 1st decode target.
     if (header.generic->temporal_index == 0) {
       EXPECT_NE(header.generic->decode_target_indications[0],
@@ -694,10 +699,14 @@
   }
 
   // Expect switch at every beginning of the pattern.
-  EXPECT_THAT(headers[0].generic->decode_target_indications,
-              Each(DecodeTargetIndication::kSwitch));
-  EXPECT_THAT(headers[4].generic->decode_target_indications,
-              Each(DecodeTargetIndication::kSwitch));
+  EXPECT_THAT(headers[0].generic->decode_target_indications[0],
+              DecodeTargetIndication::kSwitch);
+  EXPECT_THAT(headers[0].generic->decode_target_indications[1],
+              DecodeTargetIndication::kSwitch);
+  EXPECT_THAT(headers[4].generic->decode_target_indications[0],
+              DecodeTargetIndication::kSwitch);
+  EXPECT_THAT(headers[4].generic->decode_target_indications[1],
+              DecodeTargetIndication::kSwitch);
 
   EXPECT_THAT(headers[0].generic->dependencies, IsEmpty());          // T0, 1
   EXPECT_THAT(headers[1].generic->dependencies, ElementsAre(1));     // T1, 3
@@ -706,12 +715,12 @@
   EXPECT_THAT(headers[4].generic->dependencies, ElementsAre(5));     // T0, 9
   EXPECT_THAT(headers[5].generic->dependencies, ElementsAre(9));     // T1, 11
 
-  EXPECT_THAT(headers[0].generic->chain_diffs, ElementsAre(0));
-  EXPECT_THAT(headers[1].generic->chain_diffs, ElementsAre(2));
-  EXPECT_THAT(headers[2].generic->chain_diffs, ElementsAre(4));
-  EXPECT_THAT(headers[3].generic->chain_diffs, ElementsAre(2));
-  EXPECT_THAT(headers[4].generic->chain_diffs, ElementsAre(4));
-  EXPECT_THAT(headers[5].generic->chain_diffs, ElementsAre(2));
+  EXPECT_THAT(headers[0].generic->chain_diffs[0], Eq(0));
+  EXPECT_THAT(headers[1].generic->chain_diffs[0], Eq(2));
+  EXPECT_THAT(headers[2].generic->chain_diffs[0], Eq(4));
+  EXPECT_THAT(headers[3].generic->chain_diffs[0], Eq(2));
+  EXPECT_THAT(headers[4].generic->chain_diffs[0], Eq(4));
+  EXPECT_THAT(headers[5].generic->chain_diffs[0], Eq(2));
 }
 
 TEST(RtpPayloadParamsVp9ToGenericTest, TemporalScalabilityWith3Layers) {
@@ -792,7 +801,9 @@
 
   ASSERT_TRUE(headers[0].generic);
   int num_decode_targets = headers[0].generic->decode_target_indications.size();
+  int num_chains = headers[0].generic->chain_diffs.size();
   ASSERT_GE(num_decode_targets, 3);
+  ASSERT_GE(num_chains, 1);
 
   for (int frame_idx = 0; frame_idx < 9; ++frame_idx) {
     const RTPVideoHeader& header = headers[frame_idx];
@@ -801,6 +812,7 @@
     EXPECT_EQ(header.generic->frame_id, 1 + 2 * frame_idx);
     ASSERT_THAT(header.generic->decode_target_indications,
                 SizeIs(num_decode_targets));
+    ASSERT_THAT(header.generic->chain_diffs, SizeIs(num_chains));
     // Expect only T0 frames are needed for the 1st decode target.
     if (header.generic->temporal_index == 0) {
       EXPECT_NE(header.generic->decode_target_indications[0],
@@ -835,8 +847,12 @@
   // Expect switch at every beginning of the pattern.
   EXPECT_THAT(headers[0].generic->decode_target_indications,
               Each(DecodeTargetIndication::kSwitch));
-  EXPECT_THAT(headers[8].generic->decode_target_indications,
-              Each(DecodeTargetIndication::kSwitch));
+  EXPECT_THAT(headers[8].generic->decode_target_indications[0],
+              DecodeTargetIndication::kSwitch);
+  EXPECT_THAT(headers[8].generic->decode_target_indications[1],
+              DecodeTargetIndication::kSwitch);
+  EXPECT_THAT(headers[8].generic->decode_target_indications[2],
+              DecodeTargetIndication::kSwitch);
 
   EXPECT_THAT(headers[0].generic->dependencies, IsEmpty());          // T0, 1
   EXPECT_THAT(headers[1].generic->dependencies, ElementsAre(1));     // T2, 3
@@ -848,15 +864,15 @@
   EXPECT_THAT(headers[7].generic->dependencies, ElementsAre(13));    // T2, 15
   EXPECT_THAT(headers[8].generic->dependencies, ElementsAre(9));     // T0, 17
 
-  EXPECT_THAT(headers[0].generic->chain_diffs, ElementsAre(0));
-  EXPECT_THAT(headers[1].generic->chain_diffs, ElementsAre(2));
-  EXPECT_THAT(headers[2].generic->chain_diffs, ElementsAre(4));
-  EXPECT_THAT(headers[3].generic->chain_diffs, ElementsAre(6));
-  EXPECT_THAT(headers[4].generic->chain_diffs, ElementsAre(8));
-  EXPECT_THAT(headers[5].generic->chain_diffs, ElementsAre(2));
-  EXPECT_THAT(headers[6].generic->chain_diffs, ElementsAre(4));
-  EXPECT_THAT(headers[7].generic->chain_diffs, ElementsAre(6));
-  EXPECT_THAT(headers[8].generic->chain_diffs, ElementsAre(8));
+  EXPECT_THAT(headers[0].generic->chain_diffs[0], Eq(0));
+  EXPECT_THAT(headers[1].generic->chain_diffs[0], Eq(2));
+  EXPECT_THAT(headers[2].generic->chain_diffs[0], Eq(4));
+  EXPECT_THAT(headers[3].generic->chain_diffs[0], Eq(6));
+  EXPECT_THAT(headers[4].generic->chain_diffs[0], Eq(8));
+  EXPECT_THAT(headers[5].generic->chain_diffs[0], Eq(2));
+  EXPECT_THAT(headers[6].generic->chain_diffs[0], Eq(4));
+  EXPECT_THAT(headers[7].generic->chain_diffs[0], Eq(6));
+  EXPECT_THAT(headers[8].generic->chain_diffs[0], Eq(8));
 }
 
 TEST(RtpPayloadParamsVp9ToGenericTest, SpatialScalabilityKSvc) {
@@ -916,7 +932,9 @@
   // Rely on implementation detail there are always kMaxTemporalStreams temporal
   // layers assumed, in particular assume Decode Target#0 matches layer S0T0,
   // and Decode Target#kMaxTemporalStreams matches layer S1T0.
-  ASSERT_EQ(num_decode_targets, kMaxTemporalStreams * 2);
+  ASSERT_GE(num_decode_targets, kMaxTemporalStreams * 2);
+  int num_chains = headers[0].generic->chain_diffs.size();
+  ASSERT_GE(num_chains, 2);
 
   for (int frame_idx = 0; frame_idx < 4; ++frame_idx) {
     const RTPVideoHeader& header = headers[frame_idx];
@@ -926,6 +944,7 @@
     EXPECT_EQ(header.generic->frame_id, 1 + 2 * frame_idx);
     ASSERT_THAT(header.generic->decode_target_indications,
                 SizeIs(num_decode_targets));
+    ASSERT_THAT(header.generic->chain_diffs, SizeIs(num_chains));
   }
 
   // Expect S0 key frame is switch for both Decode Targets.
@@ -953,10 +972,114 @@
   EXPECT_THAT(headers[2].generic->dependencies, ElementsAre(1));  // S0, 5
   EXPECT_THAT(headers[3].generic->dependencies, ElementsAre(3));  // S1, 7
 
-  EXPECT_THAT(headers[0].generic->chain_diffs, ElementsAre(0, 0));
-  EXPECT_THAT(headers[1].generic->chain_diffs, ElementsAre(2, 2));
-  EXPECT_THAT(headers[2].generic->chain_diffs, ElementsAre(4, 2));
-  EXPECT_THAT(headers[3].generic->chain_diffs, ElementsAre(2, 4));
+  EXPECT_THAT(headers[0].generic->chain_diffs[0], Eq(0));
+  EXPECT_THAT(headers[0].generic->chain_diffs[1], Eq(0));
+  EXPECT_THAT(headers[1].generic->chain_diffs[0], Eq(2));
+  EXPECT_THAT(headers[1].generic->chain_diffs[1], Eq(2));
+  EXPECT_THAT(headers[2].generic->chain_diffs[0], Eq(4));
+  EXPECT_THAT(headers[2].generic->chain_diffs[1], Eq(2));
+  EXPECT_THAT(headers[3].generic->chain_diffs[0], Eq(2));
+  EXPECT_THAT(headers[3].generic->chain_diffs[1], Eq(4));
+}
+
+TEST(RtpPayloadParamsVp9ToGenericTest,
+     IncreaseNumberOfSpatialLayersOnDeltaFrame) {
+  // S1     5--
+  //        | ...
+  // S0 1---3--
+  RtpPayloadState state;
+  RtpPayloadParams params(/*ssrc=*/123, &state, FieldTrialBasedConfig());
+
+  EncodedImage image;
+  CodecSpecificInfo info;
+  info.codecType = kVideoCodecVP9;
+  info.codecSpecific.VP9.num_spatial_layers = 1;
+  info.codecSpecific.VP9.first_frame_in_picture = true;
+
+  RTPVideoHeader headers[3];
+  // Key frame.
+  image._frameType = VideoFrameType::kVideoFrameKey;
+  image.SetSpatialIndex(0);
+  info.codecSpecific.VP9.inter_pic_predicted = false;
+  info.codecSpecific.VP9.inter_layer_predicted = false;
+  info.codecSpecific.VP9.non_ref_for_inter_layer_pred = true;
+  info.codecSpecific.VP9.num_ref_pics = 0;
+  info.codecSpecific.VP9.first_frame_in_picture = true;
+  info.end_of_picture = true;
+  headers[0] = params.GetRtpVideoHeader(image, &info, /*shared_frame_id=*/1);
+
+  // S0 delta frame.
+  image._frameType = VideoFrameType::kVideoFrameDelta;
+  info.codecSpecific.VP9.num_spatial_layers = 2;
+  info.codecSpecific.VP9.non_ref_for_inter_layer_pred = false;
+  info.codecSpecific.VP9.first_frame_in_picture = true;
+  info.codecSpecific.VP9.inter_pic_predicted = true;
+  info.codecSpecific.VP9.num_ref_pics = 1;
+  info.codecSpecific.VP9.p_diff[0] = 1;
+  info.end_of_picture = false;
+  headers[1] = params.GetRtpVideoHeader(image, &info, /*shared_frame_id=*/3);
+
+  // S1 delta frame.
+  image.SetSpatialIndex(1);
+  info.codecSpecific.VP9.inter_layer_predicted = true;
+  info.codecSpecific.VP9.non_ref_for_inter_layer_pred = true;
+  info.codecSpecific.VP9.first_frame_in_picture = false;
+  info.codecSpecific.VP9.inter_pic_predicted = false;
+  info.end_of_picture = true;
+  headers[2] = params.GetRtpVideoHeader(image, &info, /*shared_frame_id=*/5);
+
+  ASSERT_TRUE(headers[0].generic);
+  int num_decode_targets = headers[0].generic->decode_target_indications.size();
+  int num_chains = headers[0].generic->chain_diffs.size();
+  // Rely on the implementation detail that there are always
+  // kMaxTemporalStreams temporal layers; in particular, Decode Target#0
+  // matches layer S0T0 and Decode Target#kMaxTemporalStreams matches S1T0.
+  static constexpr int kS0T0 = 0;
+  static constexpr int kS1T0 = kMaxTemporalStreams;
+  ASSERT_GE(num_decode_targets, 2);
+  ASSERT_GE(num_chains, 2);
+
+  for (int frame_idx = 0; frame_idx < 3; ++frame_idx) {
+    const RTPVideoHeader& header = headers[frame_idx];
+    ASSERT_TRUE(header.generic);
+    EXPECT_EQ(header.generic->temporal_index, 0);
+    EXPECT_EQ(header.generic->frame_id, 1 + 2 * frame_idx);
+    ASSERT_THAT(header.generic->decode_target_indications,
+                SizeIs(num_decode_targets));
+    ASSERT_THAT(header.generic->chain_diffs, SizeIs(num_chains));
+  }
+
+  EXPECT_TRUE(headers[0].generic->active_decode_targets[kS0T0]);
+  EXPECT_FALSE(headers[0].generic->active_decode_targets[kS1T0]);
+
+  EXPECT_TRUE(headers[1].generic->active_decode_targets[kS0T0]);
+  EXPECT_TRUE(headers[1].generic->active_decode_targets[kS1T0]);
+
+  EXPECT_TRUE(headers[2].generic->active_decode_targets[kS0T0]);
+  EXPECT_TRUE(headers[2].generic->active_decode_targets[kS1T0]);
+
+  EXPECT_EQ(headers[0].generic->decode_target_indications[kS0T0],
+            DecodeTargetIndication::kSwitch);
+
+  EXPECT_EQ(headers[1].generic->decode_target_indications[kS0T0],
+            DecodeTargetIndication::kSwitch);
+
+  EXPECT_EQ(headers[2].generic->decode_target_indications[kS0T0],
+            DecodeTargetIndication::kNotPresent);
+  EXPECT_EQ(headers[2].generic->decode_target_indications[kS1T0],
+            DecodeTargetIndication::kSwitch);
+
+  EXPECT_THAT(headers[0].generic->dependencies, IsEmpty());       // S0, 1
+  EXPECT_THAT(headers[1].generic->dependencies, ElementsAre(1));  // S0, 3
+  EXPECT_THAT(headers[2].generic->dependencies, ElementsAre(3));  // S1, 5
+
+  EXPECT_EQ(headers[0].generic->chain_diffs[0], 0);
+
+  EXPECT_EQ(headers[1].generic->chain_diffs[0], 2);
+  EXPECT_EQ(headers[1].generic->chain_diffs[1], 0);
+
+  EXPECT_EQ(headers[2].generic->chain_diffs[0], 2);
+  EXPECT_EQ(headers[2].generic->chain_diffs[1], 2);
 }
 
 class RtpPayloadParamsH264ToGenericTest : public ::testing::Test {
diff --git a/call/rtp_video_sender.cc b/call/rtp_video_sender.cc
index 1930036..1f55eb8 100644
--- a/call/rtp_video_sender.cc
+++ b/call/rtp_video_sender.cc
@@ -378,9 +378,6 @@
           field_trials_.Lookup("WebRTC-Video-UseFrameRateForOverhead"),
           "Enabled")),
       has_packet_feedback_(TransportSeqNumExtensionConfigured(rtp_config)),
-      simulate_generic_structure_(absl::StartsWith(
-          field_trials_.Lookup("WebRTC-GenericCodecDependencyDescriptor"),
-          "Enabled")),
       active_(false),
       fec_controller_(std::move(fec_controller)),
       fec_allowed_(true),
@@ -603,32 +600,10 @@
     RTPSenderVideo& sender_video = *rtp_streams_[stream_index].sender_video;
     if (codec_specific_info && codec_specific_info->template_structure) {
       sender_video.SetVideoStructure(&*codec_specific_info->template_structure);
-    } else if (codec_specific_info &&
-               codec_specific_info->codecType == kVideoCodecVP8) {
-      FrameDependencyStructure structure =
-          RtpPayloadParams::MinimalisticStructure(/*num_spatial_layers=*/1,
-                                                  kMaxTemporalStreams);
-      sender_video.SetVideoStructure(&structure);
-    } else if (codec_specific_info &&
-               codec_specific_info->codecType == kVideoCodecVP9) {
-      const CodecSpecificInfoVP9& vp9 = codec_specific_info->codecSpecific.VP9;
-
-      FrameDependencyStructure structure =
-          RtpPayloadParams::MinimalisticStructure(vp9.num_spatial_layers,
-                                                  kMaxTemporalStreams);
-      if (vp9.ss_data_available && vp9.spatial_layer_resolution_present) {
-        for (size_t i = 0; i < vp9.num_spatial_layers; ++i) {
-          structure.resolutions.emplace_back(vp9.width[i], vp9.height[i]);
-        }
-      }
-      sender_video.SetVideoStructure(&structure);
-    } else if (simulate_generic_structure_ && codec_specific_info &&
-               codec_specific_info->codecType == kVideoCodecGeneric) {
-      FrameDependencyStructure structure =
-          RtpPayloadParams::MinimalisticStructure(
-              /*num_spatial_layers=*/1,
-              /*num_temporal_layers=*/1);
-      sender_video.SetVideoStructure(&structure);
+    } else if (absl::optional<FrameDependencyStructure> structure =
+                   params_[stream_index].GenericStructure(
+                       codec_specific_info)) {
+      sender_video.SetVideoStructure(&*structure);
     } else {
       sender_video.SetVideoStructure(nullptr);
     }
diff --git a/call/rtp_video_sender.h b/call/rtp_video_sender.h
index e177bc4..d762624 100644
--- a/call/rtp_video_sender.h
+++ b/call/rtp_video_sender.h
@@ -170,7 +170,6 @@
   const bool send_side_bwe_with_overhead_;
   const bool use_frame_rate_for_overhead_;
   const bool has_packet_feedback_;
-  const bool simulate_generic_structure_;
 
   // Semantically equivalent to checking for `transport_->GetWorkerQueue()`
   // but some tests need to be updated to call from the correct context.