Various VP9 high fps fixes

- Enable flexible mode in loopback tools and quality tests
- Ensure the sender does not set duplicate references in the video header
- Reset the first active spatial layer on keyframes in the encoder
- Make the VP9 encoder not generate spatial references for the first active
  layer when external reference control is used in SVC flexible mode

Bug: webrtc:10049
Change-Id: If9ff576ea8a1a2fef6116b17b5b5adff08c5f8c6
Reviewed-on: https://webrtc-review.googlesource.com/c/112080
Commit-Queue: Ilya Nikolaevskiy <ilnik@webrtc.org>
Reviewed-by: Sergey Silkin <ssilkin@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#25795}
diff --git a/modules/video_coding/codecs/vp9/svc_config.cc b/modules/video_coding/codecs/vp9/svc_config.cc
index 6807698..3e92280 100644
--- a/modules/video_coding/codecs/vp9/svc_config.cc
+++ b/modules/video_coding/codecs/vp9/svc_config.cc
@@ -22,9 +22,9 @@
 namespace {
 const size_t kMinVp9SvcBitrateKbps = 30;
 
-const size_t kMaxNumLayersForScreenSharing = 2;
-const float kMaxScreenSharingLayerFramerateFps[] = {5.0, 5.0};
-const size_t kMaxScreenSharingLayerBitrateKbps[] = {200, 500};
+const size_t kMaxNumLayersForScreenSharing = 3;
+const float kMaxScreenSharingLayerFramerateFps[] = {5.0, 5.0, 30.0};
+const size_t kMaxScreenSharingLayerBitrateKbps[] = {200, 500, 1250};
 }  // namespace
 
 std::vector<SpatialLayer> ConfigureSvcScreenSharing(size_t input_width,
diff --git a/modules/video_coding/codecs/vp9/svc_config_unittest.cc b/modules/video_coding/codecs/vp9/svc_config_unittest.cc
index 257c5df..b997767 100644
--- a/modules/video_coding/codecs/vp9/svc_config_unittest.cc
+++ b/modules/video_coding/codecs/vp9/svc_config_unittest.cc
@@ -49,12 +49,13 @@
   std::vector<SpatialLayer> spatial_layers =
       GetSvcConfig(1920, 1080, 30, 3, 3, true);
 
-  EXPECT_EQ(spatial_layers.size(), 2UL);
+  EXPECT_EQ(spatial_layers.size(), 3UL);
 
-  for (const SpatialLayer& layer : spatial_layers) {
+  for (size_t i = 0; i < 3; ++i) {
+    const SpatialLayer& layer = spatial_layers[i];
     EXPECT_EQ(layer.width, 1920);
     EXPECT_EQ(layer.height, 1080);
-    EXPECT_EQ(layer.maxFramerate, 5);
+    EXPECT_EQ(layer.maxFramerate, (i < 2) ? 5 : 30);
     EXPECT_EQ(layer.numberOfTemporalLayers, 1);
     EXPECT_LE(layer.minBitrate, layer.maxBitrate);
     EXPECT_LE(layer.minBitrate, layer.targetBitrate);
diff --git a/modules/video_coding/codecs/vp9/svc_rate_allocator_unittest.cc b/modules/video_coding/codecs/vp9/svc_rate_allocator_unittest.cc
index 048bf7d..eec2b9d 100644
--- a/modules/video_coding/codecs/vp9/svc_rate_allocator_unittest.cc
+++ b/modules/video_coding/codecs/vp9/svc_rate_allocator_unittest.cc
@@ -149,7 +149,7 @@
 
   const SpatialLayer* layers = codec.spatialLayers;
 
-  EXPECT_LE(codec.VP9()->numberOfSpatialLayers, 2U);
+  EXPECT_LE(codec.VP9()->numberOfSpatialLayers, 3U);
 
   VideoBitrateAllocation allocation =
       allocator.GetAllocation(layers[0].minBitrate * 1000, 30);
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.cc b/modules/video_coding/codecs/vp9/vp9_impl.cc
index cec7d9e..61542c5 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.cc
+++ b/modules/video_coding/codecs/vp9/vp9_impl.cc
@@ -696,30 +696,33 @@
     }
   }
 
-  if (VideoCodecMode::kScreensharing == codec_.mode && !force_key_frame_) {
-    // Skip encoding spatial layer frames if their target frame rate is lower
-    // than actual input frame rate.
+  size_t first_active_spatial_layer_id = 0;
+  if (VideoCodecMode::kScreensharing == codec_.mode) {
     vpx_svc_layer_id_t layer_id = {0};
-    const size_t gof_idx = (pics_since_key_ + 1) % gof_.num_frames_in_gof;
-    layer_id.temporal_layer_id = gof_.temporal_idx[gof_idx];
+    if (!force_key_frame_) {
+      // Skip encoding spatial layer frames if their target frame rate is lower
+      // than actual input frame rate.
+      const size_t gof_idx = (pics_since_key_ + 1) % gof_.num_frames_in_gof;
+      layer_id.temporal_layer_id = gof_.temporal_idx[gof_idx];
 
-    const uint32_t frame_timestamp_ms =
-        1000 * input_image.timestamp() / kVideoPayloadTypeFrequency;
+      const uint32_t frame_timestamp_ms =
+          1000 * input_image.timestamp() / kVideoPayloadTypeFrequency;
 
-    for (uint8_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
-      if (framerate_controller_[sl_idx].DropFrame(frame_timestamp_ms)) {
-        ++layer_id.spatial_layer_id;
-      } else {
-        break;
+      for (uint8_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
+        if (framerate_controller_[sl_idx].DropFrame(frame_timestamp_ms)) {
+          ++layer_id.spatial_layer_id;
+        } else {
+          break;
+        }
+      }
+
+      RTC_DCHECK_LE(layer_id.spatial_layer_id, num_active_spatial_layers_);
+      if (layer_id.spatial_layer_id >= num_active_spatial_layers_) {
+        // Drop entire picture.
+        return WEBRTC_VIDEO_CODEC_OK;
       }
     }
-
-    RTC_DCHECK_LE(layer_id.spatial_layer_id, num_active_spatial_layers_);
-    if (layer_id.spatial_layer_id >= num_active_spatial_layers_) {
-      // Drop entire picture.
-      return WEBRTC_VIDEO_CODEC_OK;
-    }
-
+    first_active_spatial_layer_id = layer_id.spatial_layer_id;
     vpx_codec_control(encoder_, VP9E_SET_SVC_LAYER_ID, &layer_id);
   }
 
@@ -780,7 +783,8 @@
   }
 
   if (external_ref_control_) {
-    vpx_svc_ref_frame_config_t ref_config = SetReferences(force_key_frame_);
+    vpx_svc_ref_frame_config_t ref_config =
+        SetReferences(force_key_frame_, first_active_spatial_layer_id);
 
     if (VideoCodecMode::kScreensharing == codec_.mode) {
       for (uint8_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
@@ -985,6 +989,8 @@
 
   size_t max_ref_temporal_layer_id = 0;
 
+  std::vector<size_t> ref_pid_list;
+
   vp9_info->num_ref_pics = 0;
   for (const RefFrameBuffer& ref_buf : ref_buf_list) {
     RTC_DCHECK_LE(ref_buf.pic_num, pic_num);
@@ -997,6 +1003,16 @@
       }
       RTC_DCHECK_LE(ref_buf.temporal_layer_id, layer_id.temporal_layer_id);
 
+      // The encoder may reference several spatial layers of the same previous
+      // frame if some spatial layers are skipped on the current frame.
+      // We shouldn't emit duplicate references, as they may break some old
+      // clients and aren't RTP compatible.
+      if (std::find(ref_pid_list.begin(), ref_pid_list.end(),
+                    ref_buf.pic_num) != ref_pid_list.end()) {
+        continue;
+      }
+      ref_pid_list.push_back(ref_buf.pic_num);
+
       const size_t p_diff = pic_num - ref_buf.pic_num;
       RTC_DCHECK_LE(p_diff, 127UL);
 
@@ -1061,7 +1077,9 @@
   }
 }
 
-vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(bool is_key_pic) {
+vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(
+    bool is_key_pic,
+    size_t first_active_spatial_layer_id) {
   // kRefBufIdx, kUpdBufIdx need to be updated to support longer GOFs.
   RTC_DCHECK_LE(gof_.num_frames_in_gof, 4);
 
@@ -1113,13 +1131,14 @@
       }
     }
 
-    if (is_inter_layer_pred_allowed && sl_idx > 0) {
+    if (is_inter_layer_pred_allowed && sl_idx > first_active_spatial_layer_id) {
       // Set up spatial reference.
       RTC_DCHECK(last_updated_buf_idx);
       ref_config.gld_fb_idx[sl_idx] = *last_updated_buf_idx;
       ref_config.reference_golden[sl_idx] = 1;
     } else {
-      RTC_DCHECK(ref_config.reference_last[sl_idx] != 0 || sl_idx == 0 ||
+      RTC_DCHECK(ref_config.reference_last[sl_idx] != 0 ||
+                 sl_idx == first_active_spatial_layer_id ||
                  inter_layer_pred_ == InterLayerPredMode::kOff);
     }
 
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.h b/modules/video_coding/codecs/vp9/vp9_impl.h
index 33f41fd..3bfab9a 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.h
+++ b/modules/video_coding/codecs/vp9/vp9_impl.h
@@ -70,7 +70,9 @@
                             CodecSpecificInfoVP9* vp9_info);
   void UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
                               const size_t pic_num);
-  vpx_svc_ref_frame_config_t SetReferences(bool is_key_pic);
+  vpx_svc_ref_frame_config_t SetReferences(
+      bool is_key_pic,
+      size_t first_active_spatial_layer_id);
 
   bool ExplicitlyConfiguredSpatialLayers() const;
   bool SetSvcRates(const VideoBitrateAllocation& bitrate_allocation);
diff --git a/video/video_quality_test.cc b/video/video_quality_test.cc
index 3261d41..d6ccb65 100644
--- a/video/video_quality_test.cc
+++ b/video/video_quality_test.cc
@@ -670,6 +670,10 @@
         vp9_settings.numberOfSpatialLayers = static_cast<unsigned char>(
             params_.ss[video_idx].num_spatial_layers);
         vp9_settings.interLayerPred = params_.ss[video_idx].inter_layer_pred;
+        // High FPS vp9 screenshare requires flexible mode.
+        if (params_.video[video_idx].fps > 5) {
+          vp9_settings.flexibleMode = true;
+        }
         video_encoder_configs_[video_idx].encoder_specific_settings =
             new rtc::RefCountedObject<
                 VideoEncoderConfig::Vp9EncoderSpecificSettings>(vp9_settings);