In VP9 wrapper fill information required to produce Dependency Descriptor

Bug: webrtc:11999
Change-Id: Id20575fca5b9279adccf1498165815aa16e044af
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/187340
Commit-Queue: Danil Chapovalov <danilchap@webrtc.org>
Reviewed-by: Philip Eliasson <philipel@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#32421}
diff --git a/modules/rtp_rtcp/source/rtp_sender_video.cc b/modules/rtp_rtcp/source/rtp_sender_video.cc
index 8294891..7a75973 100644
--- a/modules/rtp_rtcp/source/rtp_sender_video.cc
+++ b/modules/rtp_rtcp/source/rtp_sender_video.cc
@@ -382,10 +382,15 @@
         descriptor.active_decode_targets_bitmask =
             active_decode_targets_tracker_.ActiveDecodeTargetsBitmask();
       }
-      // To avoid extra structure copy, temporary share ownership of the
-      // video_structure with the dependency descriptor.
+      // VP9 marks all layer frames of the first picture as kVideoFrameKey.
+      // The structure should be attached to the descriptor of the lowest
+      // spatial layer when inter-layer dependency is used (L structures), or
+      // to all layers when it is not used (S structures).
+      // Distinguish these two cases by checking if there are any dependencies.
       if (video_header.frame_type == VideoFrameType::kVideoFrameKey &&
-          first_packet) {
+          video_header.generic->dependencies.empty() && first_packet) {
+        // To avoid extra structure copy, temporarily share ownership of the
+        // video_structure with the dependency descriptor.
         descriptor.attached_structure =
             absl::WrapUnique(video_structure_.get());
       }
diff --git a/modules/video_coding/BUILD.gn b/modules/video_coding/BUILD.gn
index 5dae630..f81387f 100644
--- a/modules/video_coding/BUILD.gn
+++ b/modules/video_coding/BUILD.gn
@@ -565,6 +565,8 @@
     "../../rtc_base/synchronization:mutex",
     "../../system_wrappers:field_trial",
     "../rtp_rtcp:rtp_rtcp_format",
+    "svc:scalability_structures",
+    "svc:scalable_video_controller",
     "//third_party/libyuv",
   ]
   absl_deps = [
diff --git a/modules/video_coding/codecs/test/encoded_video_frame_producer.cc b/modules/video_coding/codecs/test/encoded_video_frame_producer.cc
index 277033f..899826e 100644
--- a/modules/video_coding/codecs/test/encoded_video_frame_producer.cc
+++ b/modules/video_coding/codecs/test/encoded_video_frame_producer.cc
@@ -57,7 +57,6 @@
                WEBRTC_VIDEO_CODEC_OK);
 
   uint32_t rtp_tick = 90000 / framerate_fps_;
-  std::vector<VideoFrameType> frame_types = {VideoFrameType::kVideoFrameDelta};
   for (int i = 0; i < num_input_frames_; ++i) {
     VideoFrame frame =
         VideoFrame::Builder()
@@ -65,7 +64,9 @@
             .set_timestamp_rtp(rtp_timestamp_)
             .build();
     rtp_timestamp_ += rtp_tick;
-    RTC_CHECK_EQ(encoder_.Encode(frame, &frame_types), WEBRTC_VIDEO_CODEC_OK);
+    RTC_CHECK_EQ(encoder_.Encode(frame, &next_frame_type_),
+                 WEBRTC_VIDEO_CODEC_OK);
+    next_frame_type_[0] = VideoFrameType::kVideoFrameDelta;
   }
 
   RTC_CHECK_EQ(encoder_.RegisterEncodeCompleteCallback(nullptr),
diff --git a/modules/video_coding/codecs/test/encoded_video_frame_producer.h b/modules/video_coding/codecs/test/encoded_video_frame_producer.h
index 1b1b901..2216287 100644
--- a/modules/video_coding/codecs/test/encoded_video_frame_producer.h
+++ b/modules/video_coding/codecs/test/encoded_video_frame_producer.h
@@ -40,6 +40,8 @@
 
   // Number of the input frames to pass to the encoder.
   EncodedVideoFrameProducer& SetNumInputFrames(int value);
+  // Encode next frame as key frame.
+  EncodedVideoFrameProducer& ForceKeyFrame();
   // Resolution of the input frames.
   EncodedVideoFrameProducer& SetResolution(RenderResolution value);
 
@@ -57,6 +59,8 @@
   int num_input_frames_ = 1;
   int framerate_fps_ = 30;
   RenderResolution resolution_ = {320, 180};
+  std::vector<VideoFrameType> next_frame_type_ = {
+      VideoFrameType::kVideoFrameKey};
 };
 
 inline EncodedVideoFrameProducer& EncodedVideoFrameProducer::SetNumInputFrames(
@@ -66,6 +70,11 @@
   return *this;
 }
 
+inline EncodedVideoFrameProducer& EncodedVideoFrameProducer::ForceKeyFrame() {
+  next_frame_type_ = {VideoFrameType::kVideoFrameKey};
+  return *this;
+}
+
 inline EncodedVideoFrameProducer& EncodedVideoFrameProducer::SetResolution(
     RenderResolution value) {
   resolution_ = value;
diff --git a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
index a55b110..d3b7b94 100644
--- a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
+++ b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
@@ -28,7 +28,9 @@
 namespace webrtc {
 namespace {
 
+using ::testing::ElementsAre;
 using ::testing::ElementsAreArray;
+using ::testing::IsEmpty;
 using ::testing::SizeIs;
 using ::testing::UnorderedElementsAreArray;
 using EncoderInfo = webrtc::VideoEncoder::EncoderInfo;
@@ -53,6 +55,21 @@
   return codec_settings;
 }
 
+void ConfigureSvc(VideoCodec& codec_settings,
+                  int num_spatial_layers,
+                  int num_temporal_layers = 1) {
+  codec_settings.VP9()->numberOfSpatialLayers = num_spatial_layers;
+  codec_settings.VP9()->numberOfTemporalLayers = num_temporal_layers;
+  codec_settings.VP9()->frameDroppingOn = false;
+
+  std::vector<SpatialLayer> layers = GetSvcConfig(
+      codec_settings.width, codec_settings.height, codec_settings.maxFramerate,
+      /*first_active_layer=*/0, num_spatial_layers, num_temporal_layers, false);
+  for (size_t i = 0; i < layers.size(); ++i) {
+    codec_settings.spatialLayers[i] = layers[i];
+  }
+}
+
 }  // namespace
 
 class TestVp9Impl : public VideoCodecUnitTest {
@@ -72,21 +89,6 @@
     codec_settings->VP9()->numberOfTemporalLayers = 1;
     codec_settings->VP9()->numberOfSpatialLayers = 1;
   }
-
-  void ConfigureSvc(size_t num_spatial_layers, size_t num_temporal_layers = 1) {
-    codec_settings_.VP9()->numberOfSpatialLayers =
-        static_cast<unsigned char>(num_spatial_layers);
-    codec_settings_.VP9()->numberOfTemporalLayers = num_temporal_layers;
-    codec_settings_.VP9()->frameDroppingOn = false;
-
-    std::vector<SpatialLayer> layers =
-        GetSvcConfig(codec_settings_.width, codec_settings_.height,
-                     codec_settings_.maxFramerate, /*first_active_layer=*/0,
-                     num_spatial_layers, num_temporal_layers, false);
-    for (size_t i = 0; i < layers.size(); ++i) {
-      codec_settings_.spatialLayers[i] = layers[i];
-    }
-  }
 };
 
 class TestVp9ImplForPixelFormat
@@ -204,6 +206,28 @@
   EXPECT_EQ(encoded_frame.qp_, qp);
 }
 
+TEST(Vp9ImplTest, EncodeAttachesTemplateStructureWithSvcController) {
+  test::ScopedFieldTrials override_field_trials(
+      "WebRTC-Vp9DependencyDescriptor/Enabled/");
+  std::unique_ptr<VideoEncoder> encoder = VP9Encoder::Create();
+  VideoCodec codec_settings = DefaultCodecSettings();
+  EXPECT_EQ(encoder->InitEncode(&codec_settings, kSettings),
+            WEBRTC_VIDEO_CODEC_OK);
+
+  std::vector<EncodedVideoFrameProducer::EncodedFrame> frames =
+      EncodedVideoFrameProducer(*encoder)
+          .SetNumInputFrames(2)
+          .SetResolution({kWidth, kHeight})
+          .Encode();
+
+  ASSERT_THAT(frames, SizeIs(2));
+  EXPECT_TRUE(frames[0].codec_specific_info.template_structure);
+  EXPECT_TRUE(frames[0].codec_specific_info.generic_frame_info);
+
+  EXPECT_FALSE(frames[1].codec_specific_info.template_structure);
+  EXPECT_TRUE(frames[1].codec_specific_info.generic_frame_info);
+}
+
 TEST(Vp9ImplTest, EncoderWith2TemporalLayers) {
   std::unique_ptr<VideoEncoder> encoder = VP9Encoder::Create();
   VideoCodec codec_settings = DefaultCodecSettings();
@@ -226,6 +250,37 @@
   EXPECT_EQ(frames[3].codec_specific_info.codecSpecific.VP9.temporal_idx, 1);
 }
 
+TEST(Vp9ImplTest, EncodeTemporalLayersWithSvcController) {
+  test::ScopedFieldTrials override_field_trials(
+      "WebRTC-Vp9DependencyDescriptor/Enabled/");
+  std::unique_ptr<VideoEncoder> encoder = VP9Encoder::Create();
+  VideoCodec codec_settings = DefaultCodecSettings();
+  codec_settings.VP9()->numberOfTemporalLayers = 2;
+  EXPECT_EQ(encoder->InitEncode(&codec_settings, kSettings),
+            WEBRTC_VIDEO_CODEC_OK);
+
+  std::vector<EncodedVideoFrameProducer::EncodedFrame> frames =
+      EncodedVideoFrameProducer(*encoder)
+          .SetNumInputFrames(4)
+          .SetResolution({kWidth, kHeight})
+          .Encode();
+
+  ASSERT_THAT(frames, SizeIs(4));
+  EXPECT_EQ(frames[0].codec_specific_info.codecSpecific.VP9.temporal_idx, 0);
+  EXPECT_EQ(frames[1].codec_specific_info.codecSpecific.VP9.temporal_idx, 1);
+  EXPECT_EQ(frames[2].codec_specific_info.codecSpecific.VP9.temporal_idx, 0);
+  EXPECT_EQ(frames[3].codec_specific_info.codecSpecific.VP9.temporal_idx, 1);
+  // Verify codec agnostic part
+  ASSERT_TRUE(frames[0].codec_specific_info.generic_frame_info);
+  ASSERT_TRUE(frames[1].codec_specific_info.generic_frame_info);
+  ASSERT_TRUE(frames[2].codec_specific_info.generic_frame_info);
+  ASSERT_TRUE(frames[3].codec_specific_info.generic_frame_info);
+  EXPECT_EQ(frames[0].codec_specific_info.generic_frame_info->temporal_id, 0);
+  EXPECT_EQ(frames[1].codec_specific_info.generic_frame_info->temporal_id, 1);
+  EXPECT_EQ(frames[2].codec_specific_info.generic_frame_info->temporal_id, 0);
+  EXPECT_EQ(frames[3].codec_specific_info.generic_frame_info->temporal_id, 1);
+}
+
 TEST(Vp9ImplTest, EncoderWith2SpatialLayers) {
   std::unique_ptr<VideoEncoder> encoder = VP9Encoder::Create();
   VideoCodec codec_settings = DefaultCodecSettings();
@@ -244,6 +299,37 @@
   EXPECT_EQ(frames[1].encoded_image.SpatialIndex(), 1);
 }
 
+TEST(Vp9ImplTest, EncodeSpatialLayersWithSvcController) {
+  test::ScopedFieldTrials override_field_trials(
+      "WebRTC-Vp9DependencyDescriptor/Enabled/");
+  std::unique_ptr<VideoEncoder> encoder = VP9Encoder::Create();
+  VideoCodec codec_settings = DefaultCodecSettings();
+  codec_settings.VP9()->numberOfSpatialLayers = 2;
+  EXPECT_EQ(encoder->InitEncode(&codec_settings, kSettings),
+            WEBRTC_VIDEO_CODEC_OK);
+
+  std::vector<EncodedVideoFrameProducer::EncodedFrame> frames =
+      EncodedVideoFrameProducer(*encoder)
+          .SetNumInputFrames(2)
+          .SetResolution({kWidth, kHeight})
+          .Encode();
+
+  ASSERT_THAT(frames, SizeIs(4));
+  EXPECT_EQ(frames[0].encoded_image.SpatialIndex(), 0);
+  EXPECT_EQ(frames[1].encoded_image.SpatialIndex(), 1);
+  EXPECT_EQ(frames[2].encoded_image.SpatialIndex(), 0);
+  EXPECT_EQ(frames[3].encoded_image.SpatialIndex(), 1);
+  // Verify codec agnostic part
+  ASSERT_TRUE(frames[0].codec_specific_info.generic_frame_info);
+  ASSERT_TRUE(frames[1].codec_specific_info.generic_frame_info);
+  ASSERT_TRUE(frames[2].codec_specific_info.generic_frame_info);
+  ASSERT_TRUE(frames[3].codec_specific_info.generic_frame_info);
+  EXPECT_EQ(frames[0].codec_specific_info.generic_frame_info->spatial_id, 0);
+  EXPECT_EQ(frames[1].codec_specific_info.generic_frame_info->spatial_id, 1);
+  EXPECT_EQ(frames[2].codec_specific_info.generic_frame_info->spatial_id, 0);
+  EXPECT_EQ(frames[3].codec_specific_info.generic_frame_info->spatial_id, 1);
+}
+
 TEST_F(TestVp9Impl, EncoderExplicitLayering) {
   // Override default settings.
   codec_settings_.VP9()->numberOfTemporalLayers = 1;
@@ -304,7 +390,7 @@
   const size_t num_spatial_layers = 3;
   const size_t num_frames_to_encode = 5;
 
-  ConfigureSvc(num_spatial_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers);
   codec_settings_.VP9()->frameDroppingOn = true;
 
   EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
@@ -350,6 +436,68 @@
   }
 }
 
+TEST(Vp9ImplTest, EnableDisableSpatialLayersWithSvcController) {
+  test::ScopedFieldTrials override_field_trials(
+      "WebRTC-Vp9DependencyDescriptor/Enabled/");
+  const int num_spatial_layers = 3;
+  // Configure encoder to produce 3 spatial layers. Encode frames of layer 0,
+  // then enable layer 1 and encode more frames, and so on.
+  // Then disable layers one by one in the same way.
+  // Note: bit rate allocation is high to avoid frame dropping due to rate
+  // control, so the encoder should always produce a frame. A dropped
+  // frame indicates a problem and the test will fail.
+  std::unique_ptr<VideoEncoder> encoder = VP9Encoder::Create();
+  VideoCodec codec_settings = DefaultCodecSettings();
+  ConfigureSvc(codec_settings, num_spatial_layers);
+  codec_settings.VP9()->frameDroppingOn = true;
+  EXPECT_EQ(encoder->InitEncode(&codec_settings, kSettings),
+            WEBRTC_VIDEO_CODEC_OK);
+
+  EncodedVideoFrameProducer producer(*encoder);
+  producer.SetResolution({kWidth, kHeight});
+
+  // Encode a key frame to validate all other frames are delta frames.
+  std::vector<EncodedVideoFrameProducer::EncodedFrame> frames =
+      producer.SetNumInputFrames(1).Encode();
+  ASSERT_THAT(frames, Not(IsEmpty()));
+  EXPECT_TRUE(frames[0].codec_specific_info.template_structure);
+
+  const size_t num_frames_to_encode = 5;
+
+  VideoBitrateAllocation bitrate_allocation;
+  for (size_t sl_idx = 0; sl_idx < num_spatial_layers; ++sl_idx) {
+    // Allocate high bit rate to avoid frame dropping due to rate control.
+    bitrate_allocation.SetBitrate(
+        sl_idx, 0,
+        codec_settings.spatialLayers[sl_idx].targetBitrate * 1000 * 2);
+    encoder->SetRates(VideoEncoder::RateControlParameters(
+        bitrate_allocation, codec_settings.maxFramerate));
+
+    frames = producer.SetNumInputFrames(num_frames_to_encode).Encode();
+    // With (sl_idx+1) spatial layers expect (sl_idx+1) frames per input frame.
+    ASSERT_THAT(frames, SizeIs(num_frames_to_encode * (sl_idx + 1)));
+    for (size_t i = 0; i < frames.size(); ++i) {
+      EXPECT_TRUE(frames[i].codec_specific_info.generic_frame_info);
+      EXPECT_FALSE(frames[i].codec_specific_info.template_structure);
+    }
+  }
+
+  for (int sl_idx = num_spatial_layers - 1; sl_idx > 0; --sl_idx) {
+    bitrate_allocation.SetBitrate(sl_idx, 0, 0);
+    encoder->SetRates(VideoEncoder::RateControlParameters(
+        bitrate_allocation, codec_settings.maxFramerate));
+
+    frames = producer.SetNumInputFrames(num_frames_to_encode).Encode();
+    // With |sl_idx| spatial layer disabled, there are |sl_idx| spatial layers
+    // left.
+    ASSERT_THAT(frames, SizeIs(num_frames_to_encode * sl_idx));
+    for (size_t i = 0; i < frames.size(); ++i) {
+      EXPECT_TRUE(frames[i].codec_specific_info.generic_frame_info);
+      EXPECT_FALSE(frames[i].codec_specific_info.template_structure);
+    }
+  }
+}
+
 TEST_F(TestVp9Impl, DisableEnableBaseLayerTriggersKeyFrame) {
   // Configure encoder to produce N spatial layers. Encode frames for all
   // layers. Then disable all but the last layer. Then reenable all back again.
@@ -360,7 +508,7 @@
   // Must not be multiple of temporal period to exercise all code paths.
   const size_t num_frames_to_encode = 5;
 
-  ConfigureSvc(num_spatial_layers, num_temporal_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers, num_temporal_layers);
   codec_settings_.VP9()->frameDroppingOn = false;
   codec_settings_.VP9()->flexibleMode = false;
   codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOnKeyPic;
@@ -506,13 +654,133 @@
   }
 }
 
+TEST(Vp9ImplTest, DisableEnableBaseLayerWithSvcControllerTriggersKeyFrame) {
+  // Configure encoder to produce N spatial layers. Encode frames for all
+  // layers. Then disable all but the last layer. Then reenable all back again.
+  test::ScopedFieldTrials override_field_trials(
+      "WebRTC-Vp9DependencyDescriptor/Enabled/");
+  const size_t num_spatial_layers = 3;
+  const size_t num_temporal_layers = 3;
+  // Must not be multiple of temporal period to exercise all code paths.
+  const size_t num_frames_to_encode = 5;
+
+  std::unique_ptr<VideoEncoder> encoder = VP9Encoder::Create();
+  VideoCodec codec_settings = DefaultCodecSettings();
+  ConfigureSvc(codec_settings, num_spatial_layers, num_temporal_layers);
+  codec_settings.VP9()->frameDroppingOn = false;
+  codec_settings.VP9()->flexibleMode = false;
+  codec_settings.mode = VideoCodecMode::kRealtimeVideo;
+
+  EXPECT_EQ(encoder->InitEncode(&codec_settings, kSettings),
+            WEBRTC_VIDEO_CODEC_OK);
+
+  VideoBitrateAllocation bitrate_allocation;
+  for (size_t sl_idx = 0; sl_idx < num_spatial_layers; ++sl_idx) {
+    for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) {
+      // Allocate high bit rate to avoid frame dropping due to rate control.
+      bitrate_allocation.SetBitrate(
+          sl_idx, tl_idx,
+          codec_settings.spatialLayers[sl_idx].targetBitrate * 1000 * 2);
+    }
+  }
+  encoder->SetRates(VideoEncoder::RateControlParameters(
+      bitrate_allocation, codec_settings.maxFramerate));
+
+  EncodedVideoFrameProducer producer(*encoder);
+  producer.SetResolution({kWidth, kHeight});
+
+  std::vector<EncodedVideoFrameProducer::EncodedFrame> frames =
+      producer.SetNumInputFrames(num_frames_to_encode).Encode();
+  ASSERT_THAT(frames, SizeIs(num_frames_to_encode * num_spatial_layers));
+
+  // Disable all but top spatial layer.
+  for (size_t sl_idx = 0; sl_idx < num_spatial_layers - 1; ++sl_idx) {
+    for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) {
+      bitrate_allocation.SetBitrate(sl_idx, tl_idx, 0);
+    }
+  }
+  encoder->SetRates(VideoEncoder::RateControlParameters(
+      bitrate_allocation, codec_settings.maxFramerate));
+
+  frames = producer.SetNumInputFrames(num_frames_to_encode).Encode();
+  EXPECT_THAT(frames, SizeIs(num_frames_to_encode));
+  for (const auto& frame : frames) {
+    // Expect no key-frames generated.
+    EXPECT_FALSE(frame.codec_specific_info.template_structure);
+    ASSERT_TRUE(frame.codec_specific_info.generic_frame_info);
+    EXPECT_EQ(frame.codec_specific_info.generic_frame_info->spatial_id, 2);
+  }
+
+  frames = producer.ForceKeyFrame().SetNumInputFrames(1).Encode();
+  ASSERT_THAT(frames, SizeIs(1));
+  // Key-frame should be produced.
+  EXPECT_EQ(frames[0].encoded_image._frameType, VideoFrameType::kVideoFrameKey);
+  ASSERT_TRUE(frames[0].codec_specific_info.template_structure);
+  ASSERT_TRUE(frames[0].codec_specific_info.generic_frame_info);
+  EXPECT_EQ(frames[0].codec_specific_info.generic_frame_info->spatial_id, 2);
+
+  frames = producer.SetNumInputFrames(num_frames_to_encode).Encode();
+  ASSERT_THAT(frames, SizeIs(num_frames_to_encode));
+  for (const auto& frame : frames) {
+    EXPECT_EQ(frame.encoded_image._frameType, VideoFrameType::kVideoFrameDelta);
+    EXPECT_FALSE(frame.codec_specific_info.template_structure);
+    ASSERT_TRUE(frame.codec_specific_info.generic_frame_info);
+    EXPECT_EQ(frame.codec_specific_info.generic_frame_info->spatial_id, 2);
+  }
+
+  // Re-enable the second spatial layer, restoring its own target bitrate.
+  // Allocate high bit rate to avoid frame dropping due to rate control.
+  for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) {
+    bitrate_allocation.SetBitrate(
+        1, tl_idx, codec_settings.spatialLayers[1].targetBitrate * 1000 * 2);
+  }
+  encoder->SetRates(VideoEncoder::RateControlParameters(
+      bitrate_allocation, codec_settings.maxFramerate));
+
+  frames = producer.SetNumInputFrames(num_frames_to_encode).Encode();
+  ASSERT_THAT(frames, SizeIs(num_frames_to_encode * 2));
+  EXPECT_EQ(frames[0].encoded_image._frameType, VideoFrameType::kVideoFrameKey);
+  EXPECT_TRUE(frames[0].codec_specific_info.template_structure);
+  ASSERT_TRUE(frames[0].codec_specific_info.generic_frame_info);
+  EXPECT_EQ(frames[0].codec_specific_info.generic_frame_info->spatial_id, 1);
+  for (size_t i = 1; i < frames.size(); ++i) {
+    EXPECT_EQ(frames[i].encoded_image._frameType,
+              VideoFrameType::kVideoFrameDelta);
+    EXPECT_FALSE(frames[i].codec_specific_info.template_structure);
+    ASSERT_TRUE(frames[i].codec_specific_info.generic_frame_info);
+    EXPECT_EQ(frames[i].codec_specific_info.generic_frame_info->spatial_id,
+              1 + static_cast<int>(i % 2));
+  }
+
+  // Re-enable the first spatial layer, restoring its own target bitrate.
+  // Allocate high bit rate to avoid frame dropping due to rate control.
+  for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) {
+    bitrate_allocation.SetBitrate(
+        0, tl_idx, codec_settings.spatialLayers[0].targetBitrate * 1000 * 2);
+  }
+  encoder->SetRates(VideoEncoder::RateControlParameters(
+      bitrate_allocation, codec_settings.maxFramerate));
+
+  frames = producer.SetNumInputFrames(num_frames_to_encode).Encode();
+  ASSERT_THAT(frames, SizeIs(num_frames_to_encode * 3));
+  EXPECT_TRUE(frames[0].codec_specific_info.template_structure);
+  ASSERT_TRUE(frames[0].codec_specific_info.generic_frame_info);
+  EXPECT_EQ(frames[0].codec_specific_info.generic_frame_info->spatial_id, 0);
+  for (size_t i = 1; i < frames.size(); ++i) {
+    EXPECT_FALSE(frames[i].codec_specific_info.template_structure);
+    ASSERT_TRUE(frames[i].codec_specific_info.generic_frame_info);
+    EXPECT_EQ(frames[i].codec_specific_info.generic_frame_info->spatial_id,
+              static_cast<int>(i % 3));
+  }
+}
+
 TEST_F(TestVp9Impl, DisableEnableBaseLayerTriggersKeyFrameForScreenshare) {
   // Configure encoder to produce N spatial layers. Encode frames for all
   // layers. Then disable all but the last layer. Then reenable all back again.
   const size_t num_spatial_layers = 3;
   const size_t num_frames_to_encode = 5;
 
-  ConfigureSvc(num_spatial_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers);
   codec_settings_.VP9()->frameDroppingOn = false;
   codec_settings_.mode = VideoCodecMode::kScreensharing;
   codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOn;
@@ -630,7 +898,7 @@
 
 TEST_F(TestVp9Impl, EndOfPicture) {
   const size_t num_spatial_layers = 2;
-  ConfigureSvc(num_spatial_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers);
 
   EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
             encoder_->InitEncode(&codec_settings_, kSettings));
@@ -671,7 +939,7 @@
 
 TEST_F(TestVp9Impl, InterLayerPred) {
   const size_t num_spatial_layers = 2;
-  ConfigureSvc(num_spatial_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers);
   codec_settings_.VP9()->frameDroppingOn = false;
 
   VideoBitrateAllocation bitrate_allocation;
@@ -746,7 +1014,7 @@
   const size_t num_spatial_layers = 3;
   const size_t num_frames_to_encode = 2;
 
-  ConfigureSvc(num_spatial_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers);
   codec_settings_.VP9()->frameDroppingOn = false;
 
   const std::vector<InterLayerPredMode> inter_layer_pred_modes = {
@@ -803,7 +1071,7 @@
   const size_t num_spatial_layers = 3;
   const size_t num_frames_to_encode = 2;
 
-  ConfigureSvc(num_spatial_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers);
   codec_settings_.VP9()->frameDroppingOn = false;
   codec_settings_.VP9()->flexibleMode = false;
 
@@ -858,7 +1126,7 @@
   const size_t num_spatial_layers = 2;
   const size_t num_temporal_layers = 2;
 
-  ConfigureSvc(num_spatial_layers, num_temporal_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers, num_temporal_layers);
   codec_settings_.VP9()->frameDroppingOn = false;
   codec_settings_.VP9()->flexibleMode = false;
 
@@ -930,7 +1198,7 @@
   const size_t num_spatial_layers = 2;
   const size_t num_temporal_layers = 2;
 
-  ConfigureSvc(num_spatial_layers, num_temporal_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers, num_temporal_layers);
   codec_settings_.VP9()->frameDroppingOn = false;
   codec_settings_.VP9()->flexibleMode = false;
 
@@ -1010,7 +1278,7 @@
   const size_t num_frames_to_encode_before_drop = 1;
 
   codec_settings_.maxFramerate = 30;
-  ConfigureSvc(num_spatial_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers);
   codec_settings_.spatialLayers[0].maxFramerate = 5.0;
   // use 30 for the SL 1 instead of 10, so even if SL 0 frame is dropped due to
   // framerate capping we would still get back at least a middle layer. It
@@ -1069,7 +1337,7 @@
   const int num_frames_to_detect_drops = 2;
 
   codec_settings_.maxFramerate = 30;
-  ConfigureSvc(num_spatial_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers);
   // use 30 for the SL0 and SL1 because it simplifies the test.
   codec_settings_.spatialLayers[0].maxFramerate = 30.0;
   codec_settings_.spatialLayers[1].maxFramerate = 30.0;
@@ -1159,7 +1427,7 @@
   const size_t num_dropped_frames = 5;
 
   codec_settings_.maxFramerate = 30;
-  ConfigureSvc(num_spatial_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers);
   codec_settings_.spatialLayers[0].maxFramerate = 5.0;
   // use 30 for the SL 1 instead of 5, so even if SL 0 frame is dropped due to
   // framerate capping we would still get back at least a middle layer. It
@@ -1246,7 +1514,7 @@
   const size_t num_temporal_layers = 2;
   // Chosen by hand, the 2nd frame is dropped with configured per-layer max
   // framerate.
-  ConfigureSvc(num_spatial_layers, num_temporal_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers, num_temporal_layers);
   codec_settings_.VP9()->frameDroppingOn = false;
   codec_settings_.mode = VideoCodecMode::kRealtimeVideo;
   codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOnKeyPic;
@@ -1305,7 +1573,7 @@
 
 TEST_F(TestVp9Impl,
        LowLayerMarkedAsRefIfHighLayerNotEncodedAndInterLayerPredIsEnabled) {
-  ConfigureSvc(3);
+  ConfigureSvc(codec_settings_, 3);
   codec_settings_.VP9()->frameDroppingOn = false;
   codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOn;
 
@@ -1710,7 +1978,7 @@
   // Force low frame-rate, so all layers are present for all frames.
   codec_settings_.maxFramerate = 5;
 
-  ConfigureSvc(num_spatial_layers);
+  ConfigureSvc(codec_settings_, num_spatial_layers);
 
   EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
             encoder_->InitEncode(&codec_settings_, kSettings));
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.cc b/modules/video_coding/codecs/vp9/vp9_impl.cc
index 49fa92f..599a4d5 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.cc
+++ b/modules/video_coding/codecs/vp9/vp9_impl.cc
@@ -27,12 +27,16 @@
 #include "common_video/libyuv/include/webrtc_libyuv.h"
 #include "modules/rtp_rtcp/include/rtp_rtcp_defines.h"
 #include "modules/video_coding/codecs/vp9/svc_rate_allocator.h"
+#include "modules/video_coding/svc/create_scalability_structure.h"
+#include "modules/video_coding/svc/scalable_video_controller.h"
+#include "modules/video_coding/svc/scalable_video_controller_no_layering.h"
 #include "modules/video_coding/utility/vp9_uncompressed_header_parser.h"
 #include "rtc_base/checks.h"
 #include "rtc_base/experiments/field_trial_parser.h"
 #include "rtc_base/experiments/rate_control_settings.h"
 #include "rtc_base/keep_ref_until_done.h"
 #include "rtc_base/logging.h"
+#include "rtc_base/strings/string_builder.h"
 #include "rtc_base/time_utils.h"
 #include "rtc_base/trace_event.h"
 #include "third_party/libyuv/include/libyuv/convert.h"
@@ -214,6 +218,107 @@
   config->rc_dropframe_thresh = new_settings.rc_dropframe_thresh;
 }
 
+std::unique_ptr<ScalableVideoController> CreateVp9ScalabilityStructure(
+    const VideoCodec& codec) {
+  int num_spatial_layers = codec.VP9().numberOfSpatialLayers;
+  int num_temporal_layers =
+      std::max(1, int{codec.VP9().numberOfTemporalLayers});
+  if (num_spatial_layers == 1 && num_temporal_layers == 1) {
+    return std::make_unique<ScalableVideoControllerNoLayering>();
+  }
+
+  if (codec.VP9().interLayerPred != InterLayerPredMode::kOn ||
+      codec.mode == VideoCodecMode::kScreensharing) {
+    // TODO(bugs.webrtc.org/11999): Return names of the structure when they are
+    // implemented and support frame skipping.
+    return nullptr;
+  }
+
+  char name[20];
+  rtc::SimpleStringBuilder ss(name);
+  ss << "L" << num_spatial_layers << "T" << num_temporal_layers;
+
+  // Check spatial ratio.
+  if (num_spatial_layers > 1 && codec.spatialLayers[0].targetBitrate > 0) {
+    if (codec.width != codec.spatialLayers[num_spatial_layers - 1].width ||
+        codec.height != codec.spatialLayers[num_spatial_layers - 1].height) {
+      RTC_LOG(LS_WARNING)
+          << "Top layer resolution expected to match overall resolution";
+      return nullptr;
+    }
+    // Check if the ratio is one of the supported.
+    int numerator;
+    int denominator;
+    if (codec.spatialLayers[1].width == 2 * codec.spatialLayers[0].width) {
+      numerator = 1;
+      denominator = 2;
+      // no suffix for 1:2 ratio.
+    } else if (2 * codec.spatialLayers[1].width ==
+               3 * codec.spatialLayers[0].width) {
+      numerator = 2;
+      denominator = 3;
+      ss << "h";
+    } else {
+      RTC_LOG(LS_WARNING) << "Unsupported scalability ratio "
+                          << codec.spatialLayers[0].width << ":"
+                          << codec.spatialLayers[1].width;
+      return nullptr;
+    }
+    // Validate ratio is consistent for all spatial layer transitions.
+    for (int sid = 1; sid < num_spatial_layers; ++sid) {
+      if (codec.spatialLayers[sid].width * numerator !=
+              codec.spatialLayers[sid - 1].width * denominator ||
+          codec.spatialLayers[sid].height * numerator !=
+              codec.spatialLayers[sid - 1].height * denominator) {
+        RTC_LOG(LS_WARNING) << "Inconsistent scalability ratio " << numerator
+                            << ":" << denominator;
+        return nullptr;
+      }
+    }
+  }
+
+  auto scalability_structure_controller = CreateScalabilityStructure(name);
+  if (scalability_structure_controller == nullptr) {
+    RTC_LOG(LS_WARNING) << "Unsupported scalability structure " << name;
+  } else {
+    RTC_LOG(LS_INFO) << "Created scalability structure " << name;
+  }
+  return scalability_structure_controller;
+}
+
+vpx_svc_ref_frame_config_t Vp9References(
+    rtc::ArrayView<const ScalableVideoController::LayerFrameConfig> layers) {
+  vpx_svc_ref_frame_config_t ref_config = {};
+  for (const ScalableVideoController::LayerFrameConfig& layer_frame : layers) {
+    const auto& buffers = layer_frame.Buffers();
+    RTC_DCHECK_LE(buffers.size(), 3);
+    int sid = layer_frame.SpatialId();
+    if (!buffers.empty()) {
+      ref_config.lst_fb_idx[sid] = buffers[0].id;
+      ref_config.reference_last[sid] = buffers[0].referenced;
+      if (buffers[0].updated) {
+        ref_config.update_buffer_slot[sid] |= (1 << buffers[0].id);
+      }
+    }
+    if (buffers.size() > 1) {
+      ref_config.gld_fb_idx[sid] = buffers[1].id;
+      ref_config.reference_golden[sid] = buffers[1].referenced;
+      if (buffers[1].updated) {
+        ref_config.update_buffer_slot[sid] |= (1 << buffers[1].id);
+      }
+    }
+    if (buffers.size() > 2) {
+      ref_config.alt_fb_idx[sid] = buffers[2].id;
+      ref_config.reference_alt_ref[sid] = buffers[2].referenced;
+      if (buffers[2].updated) {
+        ref_config.update_buffer_slot[sid] |= (1 << buffers[2].id);
+      }
+    }
+  }
+  // TODO(bugs.webrtc.org/11999): Fill ref_config.duration
+  return ref_config;
+}
+
 }  // namespace
 
 void VP9EncoderImpl::EncoderOutputCodedPacketCallback(vpx_codec_cx_pkt* pkt,
@@ -262,6 +367,9 @@
       first_frame_in_picture_(true),
       ss_info_needed_(false),
       force_all_active_layers_(false),
+      use_svc_controller_(
+          absl::StartsWith(trials.Lookup("WebRTC-Vp9DependencyDescriptor"),
+                           "Enabled")),
       is_flexible_mode_(false),
       variable_framerate_experiment_(ParseVariableFramerateConfig(trials)),
       variable_framerate_controller_(
@@ -438,6 +546,18 @@
     force_all_active_layers_ = true;
   }
 
+  if (svc_controller_) {
+    VideoBitrateAllocation allocation;
+    for (int sid = 0; sid < num_spatial_layers_; ++sid) {
+      for (int tid = 0; tid < num_temporal_layers_; ++tid) {
+        allocation.SetBitrate(
+            sid, tid,
+            config_->layer_target_bitrate[sid * num_temporal_layers_ + tid] *
+                1000);
+      }
+    }
+    svc_controller_->OnRatesUpdated(allocation);
+  }
   current_bitrate_allocation_ = bitrate_allocation;
   cpu_speed_ = GetCpuSpeed(highest_active_width, highest_active_height);
   config_changed_ = true;
@@ -528,6 +648,9 @@
     num_temporal_layers_ = 1;
   }
 
+  if (use_svc_controller_) {
+    svc_controller_ = CreateVp9ScalabilityStructure(*inst);
+  }
   framerate_controller_ = std::vector<FramerateController>(
       num_spatial_layers_, FramerateController(codec_.maxFramerate));
 
@@ -706,7 +829,13 @@
     svc_params_.min_quantizers[i] = config_->rc_min_quantizer;
   }
   config_->ss_number_layers = num_spatial_layers_;
-  if (ExplicitlyConfiguredSpatialLayers()) {
+  if (svc_controller_) {
+    auto stream_config = svc_controller_->StreamConfig();
+    for (int i = 0; i < stream_config.num_spatial_layers; ++i) {
+      svc_params_.scaling_factor_num[i] = stream_config.scaling_factor_num[i];
+      svc_params_.scaling_factor_den[i] = stream_config.scaling_factor_den[i];
+    }
+  } else if (ExplicitlyConfiguredSpatialLayers()) {
     for (int i = 0; i < num_spatial_layers_; ++i) {
       const auto& layer = codec_.spatialLayers[i];
       RTC_CHECK_GT(layer.width, 0);
@@ -920,6 +1049,13 @@
     force_key_frame_ = true;
   }
 
+  if (svc_controller_) {
+    layer_frames_ = svc_controller_->NextFrameConfig(force_key_frame_);
+    if (layer_frames_.empty()) {
+      return WEBRTC_VIDEO_CODEC_ERROR;
+    }
+  }
+
   vpx_svc_layer_id_t layer_id = {0};
   if (!force_key_frame_) {
     const size_t gof_idx = (pics_since_key_ + 1) % gof_.num_frames_in_gof;
@@ -991,6 +1127,15 @@
     layer_id.spatial_layer_id = first_active_layer_;
   }
 
+  if (svc_controller_) {
+    layer_id.spatial_layer_id = layer_frames_.front().SpatialId();
+    layer_id.temporal_layer_id = layer_frames_.front().TemporalId();
+    for (const auto& layer : layer_frames_) {
+      layer_id.temporal_layer_id_per_spatial[layer.SpatialId()] =
+          layer.TemporalId();
+    }
+  }
+
   vpx_codec_control(encoder_, VP9E_SET_SVC_LAYER_ID, &layer_id);
 
   if (num_spatial_layers_ > 1) {
@@ -1086,7 +1231,10 @@
     flags = VPX_EFLAG_FORCE_KF;
   }
 
-  if (external_ref_control_) {
+  if (svc_controller_) {
+    vpx_svc_ref_frame_config_t ref_config = Vp9References(layer_frames_);
+    vpx_codec_control(encoder_, VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_config);
+  } else if (external_ref_control_) {
     vpx_svc_ref_frame_config_t ref_config =
         SetReferences(force_key_frame_, layer_id.spatial_layer_id);
 
@@ -1252,6 +1400,31 @@
   }
 
   first_frame_in_picture_ = false;
+
+  // Populate codec-agnostic section in the codec specific structure.
+  if (svc_controller_) {
+    auto it = absl::c_find_if(
+        layer_frames_,
+        [&](const ScalableVideoController::LayerFrameConfig& config) {
+          return config.SpatialId() == spatial_idx->value_or(0);
+        });
+    RTC_CHECK(it != layer_frames_.end())
+        << "Failed to find spatial id " << spatial_idx->value_or(0);
+    codec_specific->generic_frame_info = svc_controller_->OnEncodeDone(*it);
+    if (is_key_frame) {
+      codec_specific->template_structure =
+          svc_controller_->DependencyStructure();
+      auto& resolutions = codec_specific->template_structure->resolutions;
+      resolutions.resize(num_spatial_layers_);
+      for (int sid = 0; sid < num_spatial_layers_; ++sid) {
+        resolutions[sid] = RenderResolution(
+            /*width=*/codec_.width * svc_params_.scaling_factor_num[sid] /
+                svc_params_.scaling_factor_den[sid],
+            /*height=*/codec_.height * svc_params_.scaling_factor_num[sid] /
+                svc_params_.scaling_factor_den[sid]);
+      }
+    }
+  }
 }
 
 void VP9EncoderImpl::FillReferenceIndices(const vpx_codec_cx_pkt& pkt,
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.h b/modules/video_coding/codecs/vp9/vp9_impl.h
index 6e23dc6..7ba6a1b 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.h
+++ b/modules/video_coding/codecs/vp9/vp9_impl.h
@@ -26,6 +26,7 @@
 #include "media/base/vp9_profile.h"
 #include "modules/video_coding/codecs/vp9/include/vp9.h"
 #include "modules/video_coding/codecs/vp9/vp9_frame_buffer_pool.h"
+#include "modules/video_coding/svc/scalable_video_controller.h"
 #include "modules/video_coding/utility/framerate_controller.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_decoder.h"
@@ -139,7 +140,9 @@
   VideoBitrateAllocation current_bitrate_allocation_;
   bool ss_info_needed_;
   bool force_all_active_layers_;
+  const bool use_svc_controller_;
 
+  std::unique_ptr<ScalableVideoController> svc_controller_;
   std::vector<FramerateController> framerate_controller_;
 
   // Used for flexible mode.
@@ -163,6 +166,7 @@
     size_t temporal_layer_id = 0;
   };
   std::map<size_t, RefFrameBuffer> ref_buf_;
+  std::vector<ScalableVideoController::LayerFrameConfig> layer_frames_;
 
   // Variable frame-rate related fields and methods.
   const struct VariableFramerateExperiment {