Update how VP9 temporal up switch is populated

This CL updates both the static GOF pattern, which now has the correct
flags for temporal_up_switch, and the flexible mode logic, which now bases
the flag on dependency descriptors instead of reference buffers.

Bug: webrtc:13576
Change-Id: I578f744bec51d1f3531da5f4a89d12f05a16a6c0
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/247187
Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
Commit-Queue: Erik Språng <sprang@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#35741}
diff --git a/modules/video_coding/codecs/vp9/include/vp9_globals.h b/modules/video_coding/codecs/vp9/include/vp9_globals.h
index 87dafe4..e6f644e 100644
--- a/modules/video_coding/codecs/vp9/include/vp9_globals.h
+++ b/modules/video_coding/codecs/vp9/include/vp9_globals.h
@@ -46,14 +46,14 @@
       case kTemporalStructureMode1:
         num_frames_in_gof = 1;
         temporal_idx[0] = 0;
-        temporal_up_switch[0] = false;
+        temporal_up_switch[0] = true;
         num_ref_pics[0] = 1;
         pid_diff[0][0] = 1;
         break;
       case kTemporalStructureMode2:
         num_frames_in_gof = 2;
         temporal_idx[0] = 0;
-        temporal_up_switch[0] = false;
+        temporal_up_switch[0] = true;
         num_ref_pics[0] = 1;
         pid_diff[0][0] = 2;
 
@@ -65,7 +65,7 @@
       case kTemporalStructureMode3:
         num_frames_in_gof = 4;
         temporal_idx[0] = 0;
-        temporal_up_switch[0] = false;
+        temporal_up_switch[0] = true;
         num_ref_pics[0] = 1;
         pid_diff[0][0] = 4;
 
@@ -87,7 +87,7 @@
       case kTemporalStructureMode4:
         num_frames_in_gof = 8;
         temporal_idx[0] = 0;
-        temporal_up_switch[0] = false;
+        temporal_up_switch[0] = true;
         num_ref_pics[0] = 1;
         pid_diff[0][0] = 4;
 
@@ -97,12 +97,12 @@
         pid_diff[1][0] = 1;
 
         temporal_idx[2] = 1;
-        temporal_up_switch[2] = true;
+        temporal_up_switch[2] = false;
         num_ref_pics[2] = 1;
         pid_diff[2][0] = 2;
 
         temporal_idx[3] = 2;
-        temporal_up_switch[3] = false;
+        temporal_up_switch[3] = true;
         num_ref_pics[3] = 2;
         pid_diff[3][0] = 1;
         pid_diff[3][1] = 2;
@@ -113,7 +113,7 @@
         pid_diff[4][0] = 4;
 
         temporal_idx[5] = 2;
-        temporal_up_switch[5] = false;
+        temporal_up_switch[5] = true;
         num_ref_pics[5] = 2;
         pid_diff[5][0] = 1;
         pid_diff[5][1] = 2;
@@ -125,7 +125,7 @@
         pid_diff[6][1] = 4;
 
         temporal_idx[7] = 2;
-        temporal_up_switch[7] = false;
+        temporal_up_switch[7] = true;
         num_ref_pics[7] = 2;
         pid_diff[7][0] = 1;
         pid_diff[7][1] = 2;
@@ -195,7 +195,10 @@
   uint8_t temporal_idx;     // Temporal layer index, or kNoTemporalIdx.
   uint8_t spatial_idx;      // Spatial layer index, or kNoSpatialIdx.
   bool temporal_up_switch;  // True if upswitch to higher frame rate is possible
-                            // starting from this frame.
+                            // meaning subsequent higher temporal layer pictures
+                            // will not depend on any picture before the current
+                            // picture (in coding order) with temporal layer ID
+                            // greater than `temporal_idx` of this frame.
   bool inter_layer_predicted;  // Frame is dependent on directly lower spatial
                                // layer frame.
 
diff --git a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
index 0c3196c..99680cb 100644
--- a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
+++ b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
@@ -959,7 +959,7 @@
     const size_t gof_idx = (pics_since_key_ + 1) % gof_.num_frames_in_gof;
     layer_id.temporal_layer_id = gof_.temporal_idx[gof_idx];
 
-    if (VideoCodecMode::kScreensharing == codec_.mode) {
+    if (codec_.mode == VideoCodecMode::kScreensharing) {
       const uint32_t frame_timestamp_ms =
           1000 * input_image.timestamp() / kVideoPayloadTypeFrequency;
 
@@ -1212,8 +1212,7 @@
 
 bool LibvpxVp9Encoder::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
                                              absl::optional<int>* spatial_idx,
-                                             const vpx_codec_cx_pkt& pkt,
-                                             uint32_t timestamp) {
+                                             const vpx_codec_cx_pkt& pkt) {
   RTC_CHECK(codec_specific != nullptr);
   codec_specific->codecType = kVideoCodecVP9;
   CodecSpecificInfoVP9* vp9_info = &(codec_specific->codecSpecific.VP9);
@@ -1248,9 +1247,6 @@
     *spatial_idx = layer_id.spatial_layer_id;
   }
 
-  // TODO(asapersson): this info has to be obtained from the encoder.
-  vp9_info->temporal_up_switch = false;
-
   const bool is_key_pic = (pics_since_key_ == 0);
   const bool is_inter_layer_pred_allowed =
       (inter_layer_pred_ == InterLayerPredMode::kOn ||
@@ -1283,6 +1279,20 @@
                        vp9_info);
   if (vp9_info->flexible_mode) {
     vp9_info->gof_idx = kNoGofIdx;
+    if (!svc_controller_) {
+      if (num_temporal_layers_ == 1) {
+        vp9_info->temporal_up_switch = true;
+      } else {
+        // In flexible mode with > 1 temporal layer but no SVC controller we
+        // can't technically determine if a frame is an upswitch point, use
+        // gof-based data as proxy for now.
+        // TODO(sprang): Remove once SVC controller is the only choice.
+        vp9_info->gof_idx =
+            static_cast<uint8_t>(pics_since_key_ % gof_.num_frames_in_gof);
+        vp9_info->temporal_up_switch =
+            gof_.temporal_up_switch[vp9_info->gof_idx];
+      }
+    }
   } else {
     vp9_info->gof_idx =
         static_cast<uint8_t>(pics_since_key_ % gof_.num_frames_in_gof);
@@ -1353,6 +1363,23 @@
                 svc_params_.scaling_factor_den[sid]);
       }
     }
+    if (is_flexible_mode_) {
+      // Populate data for legacy temporal-upswitch state.
+      // We can switch up to a higher temporal layer only if all temporal layers
+      // higher than this (within the current spatial layer) are switch points.
+      vp9_info->temporal_up_switch = true;
+      for (int i = layer_id.temporal_layer_id + 1; i < num_temporal_layers_;
+           ++i) {
+        // Assumes decode targets are always ordered first by spatial then by
+        // temporal id.
+        size_t dti_index =
+            (layer_id.spatial_layer_id * num_temporal_layers_) + i;
+        vp9_info->temporal_up_switch &=
+            (codec_specific->generic_frame_info
+                 ->decode_target_indications[dti_index] ==
+             DecodeTargetIndication::kSwitch);
+      }
+    }
   }
   return true;
 }
@@ -1428,8 +1455,6 @@
     ref_buf_list.push_back(ref_buf_.at(0));
   }
 
-  size_t max_ref_temporal_layer_id = 0;
-
   std::vector<size_t> ref_pid_list;
 
   vp9_info->num_ref_pics = 0;
@@ -1461,9 +1486,6 @@
 
       vp9_info->p_diff[vp9_info->num_ref_pics] = static_cast<uint8_t>(p_diff);
       ++vp9_info->num_ref_pics;
-
-      max_ref_temporal_layer_id =
-          std::max(max_ref_temporal_layer_id, ref_buf.temporal_layer_id);
     } else {
       RTC_DCHECK(inter_layer_predicted);
       // RTP spec only allows to use previous spatial layer for inter-layer
@@ -1471,10 +1493,6 @@
       RTC_DCHECK_EQ(ref_buf.spatial_layer_id + 1, layer_id.spatial_layer_id);
     }
   }
-
-  vp9_info->temporal_up_switch =
-      (max_ref_temporal_layer_id <
-       static_cast<size_t>(layer_id.temporal_layer_id));
 }
 
 void LibvpxVp9Encoder::UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
@@ -1636,8 +1654,7 @@
 
   codec_specific_ = {};
   absl::optional<int> spatial_index;
-  if (!PopulateCodecSpecific(&codec_specific_, &spatial_index, *pkt,
-                             input_image_->timestamp())) {
+  if (!PopulateCodecSpecific(&codec_specific_, &spatial_index, *pkt)) {
     // Drop the frame.
     encoded_image_.set_size(0);
     return;
diff --git a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h
index d2f98c1..93b2a59 100644
--- a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h
+++ b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h
@@ -67,8 +67,7 @@
 
   bool PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
                              absl::optional<int>* spatial_idx,
-                             const vpx_codec_cx_pkt& pkt,
-                             uint32_t timestamp);
+                             const vpx_codec_cx_pkt& pkt);
   void FillReferenceIndices(const vpx_codec_cx_pkt& pkt,
                             const size_t pic_num,
                             const bool inter_layer_predicted,
diff --git a/video/video_send_stream_tests.cc b/video/video_send_stream_tests.cc
index f948b03..a7ba243 100644
--- a/video/video_send_stream_tests.cc
+++ b/video/video_send_stream_tests.cc
@@ -3089,20 +3089,20 @@
   void VerifyTemporalLayerStructure0(const RTPVideoHeaderVP9& vp9) const {
     EXPECT_EQ(kNoTl0PicIdx, vp9.tl0_pic_idx);
     EXPECT_EQ(kNoTemporalIdx, vp9.temporal_idx);  // no tid
+    // Technically true, but layer indices not available.
     EXPECT_FALSE(vp9.temporal_up_switch);
   }
 
   void VerifyTemporalLayerStructure1(const RTPVideoHeaderVP9& vp9) const {
     EXPECT_NE(kNoTl0PicIdx, vp9.tl0_pic_idx);
     EXPECT_EQ(0, vp9.temporal_idx);  // 0,0,0,...
-    EXPECT_FALSE(vp9.temporal_up_switch);
   }
 
   void VerifyTemporalLayerStructure2(const RTPVideoHeaderVP9& vp9) const {
     EXPECT_NE(kNoTl0PicIdx, vp9.tl0_pic_idx);
     EXPECT_GE(vp9.temporal_idx, 0);  // 0,1,0,1,... (tid reset on I-frames).
     EXPECT_LE(vp9.temporal_idx, 1);
-    EXPECT_EQ(vp9.temporal_idx > 0, vp9.temporal_up_switch);
+    EXPECT_TRUE(vp9.temporal_up_switch);
     if (IsNewPictureId(vp9)) {
       uint8_t expected_tid =
           (!vp9.inter_pic_predicted || last_vp9_.temporal_idx == 1) ? 0 : 1;
@@ -3116,18 +3116,16 @@
     EXPECT_LE(vp9.temporal_idx, 2);
     if (IsNewPictureId(vp9) && vp9.inter_pic_predicted) {
       EXPECT_NE(vp9.temporal_idx, last_vp9_.temporal_idx);
+      EXPECT_TRUE(vp9.temporal_up_switch);
       switch (vp9.temporal_idx) {
         case 0:
-          EXPECT_EQ(2, last_vp9_.temporal_idx);
-          EXPECT_FALSE(vp9.temporal_up_switch);
+          EXPECT_EQ(last_vp9_.temporal_idx, 2);
           break;
         case 1:
-          EXPECT_EQ(2, last_vp9_.temporal_idx);
-          EXPECT_TRUE(vp9.temporal_up_switch);
+          EXPECT_EQ(last_vp9_.temporal_idx, 2);
           break;
         case 2:
           EXPECT_LT(last_vp9_.temporal_idx, 2);
-          EXPECT_TRUE(vp9.temporal_up_switch);
           break;
       }
     }
@@ -3192,8 +3190,12 @@
       EXPECT_FALSE(vp9.inter_pic_predicted);  // P
 
     if (!vp9.inter_pic_predicted) {
-      EXPECT_TRUE(vp9.temporal_idx == 0 || vp9.temporal_idx == kNoTemporalIdx);
-      EXPECT_FALSE(vp9.temporal_up_switch);
+      if (vp9.temporal_idx == kNoTemporalIdx) {
+        EXPECT_FALSE(vp9.temporal_up_switch);
+      } else {
+        EXPECT_EQ(vp9.temporal_idx, 0);
+        EXPECT_TRUE(vp9.temporal_up_switch);
+      }
     }
   }