Reland "Mark frames with inter_layer_predicted=true as delta frames"

This is a reland of commit 7ae48c452abf8694a1b0a7a9a2aef13a9d10298a with  updated RtpVp9RefFinder

RtpVp9RefFinder relied on the fact that frames with (inter_pic_predicted=true && inter_layer_predicted=true) were marked as keyframes. Since this is not the case anymore, the related code paths in RtpVp9RefFinder have been deleted.

Calculation of gof_info_[] index for non-keyframes has been updated to account for that fact it is now possible to received multiple T0 frames belonging to the same temporal unit (we don't need to do "unwrapped_tl0 - 1" in this case).

Original change's description:
> Mark frames with inter_layer_predicted=true as delta frames
>
> As it is currently implemented, the VP9 depacketizer decides packet's frame type based on p_bit ("Inter-picture predicted layer frame"). p_bit is set to 0 for upper spatial layer frames of keyframe since they do not have temporal refs. This results in marking packets of upper spatial layer frames, and, eventually these frames, of SVC keyframes as "keyframe" while they are in fact delta frames.
>
> Normally spatial layer frames are merged into a superframe and the superframe is passed to decoder. But passing individual layers to a single decoder instance is a valid scenario too and is used in downstream projects. In this case, an upper layer frame marked as keyframe may cause decoder reset [2] and break decoding.
>
> This CL changes frame type decision logic in the VP9 depacketizer such that only packets with both P and D (inter-layer predicted) bits unset are considered as keyframe packets.
>
> When spatial layer frames are merged into a superframe in CombineAndDeleteFrames [1], frame type of the superframe is inferred from the lowest spatial layer frame.
>
> [1] https://source.chromium.org/chromium/chromium/src/+/main:third_party/webrtc/modules/video_coding/frame_helpers.cc;l=53
>
> [2] https://source.corp.google.com/piper///depot/google3/third_party/webrtc/files/stable/webrtc/modules/video_coding/codecs/vp9/libvpx_vp9_decoder.cc;l=209
>
> Bug: webrtc:15827
> Change-Id: Idc3445636f0eae0192dac998876fedec48628560
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/343342
> Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
> Commit-Queue: Sergey Silkin <ssilkin@webrtc.org>
> Cr-Commit-Position: refs/heads/main@{#41939}

Bug: webrtc:15827
Change-Id: Ic69b94989919cf6d353bceea85d0eba63bc500ee
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/344144
Reviewed-by: Philip Eliasson <philipel@webrtc.org>
Commit-Queue: Sergey Silkin <ssilkin@webrtc.org>
Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#41985}
diff --git a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc
index 41f363d..a8b04db 100644
--- a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc
+++ b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc
@@ -180,9 +180,6 @@
   video_header->simulcastIdx = 0;
   video_header->codec = kVideoCodecVP9;
 
-  video_header->frame_type =
-      p_bit ? VideoFrameType::kVideoFrameDelta : VideoFrameType::kVideoFrameKey;
-
   auto& vp9_header =
       video_header->video_type_header.emplace<RTPVideoHeaderVP9>();
   vp9_header.InitRTPVideoHeaderVP9();
@@ -211,6 +208,9 @@
       video_header->height = vp9_header.height[0];
     }
   }
+  video_header->frame_type = p_bit || vp9_header.inter_layer_predicted
+                                 ? VideoFrameType::kVideoFrameDelta
+                                 : VideoFrameType::kVideoFrameKey;
   video_header->is_first_packet_in_frame = b_bit;
   video_header->is_last_packet_in_frame = e_bit;
 
diff --git a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc
index 36af59a..7f89812 100644
--- a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc
+++ b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc
@@ -369,5 +369,18 @@
   // Compare pointers to check there was no copy on write buffer unsharing.
   EXPECT_EQ(parsed->video_payload.cdata(), rtp_payload.cdata() + kHeaderSize);
 }
+
+TEST(VideoRtpDepacketizerVp9Test, InterLayerPredOnlyFrameMerkedAsDelta) {
+  // Set P=0 and D=1 and vefify that the depaketizers marks this packet as a
+  // part of a delta frame (not a keyframe).
+  uint8_t packet[13] = {0};
+  packet[0] = 0b0010'0000;  // I:0 P:0 L:1 F:0 B:0 E:0 V:0 Z:0
+  packet[1] = 0b0000'0001;  // T:000 U:0 S:000 D:1
+  packet[2] = 0;            // TL0PICIDX
+
+  RTPVideoHeader video_header;
+  VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header);
+  EXPECT_EQ(video_header.frame_type, VideoFrameType::kVideoFrameDelta);
+}
 }  // namespace
 }  // namespace webrtc
diff --git a/modules/video_coding/rtp_vp9_ref_finder.cc b/modules/video_coding/rtp_vp9_ref_finder.cc
index 175ed34..f0d51ad 100644
--- a/modules/video_coding/rtp_vp9_ref_finder.cc
+++ b/modules/video_coding/rtp_vp9_ref_finder.cc
@@ -132,24 +132,20 @@
       FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted);
       return kHandOff;
     }
-  } else if (frame->frame_type() == VideoFrameType::kVideoFrameKey) {
-    if (frame->SpatialIndex() == 0) {
+  } else {
+    if (frame->frame_type() == VideoFrameType::kVideoFrameKey) {
       RTC_LOG(LS_WARNING) << "Received keyframe without scalability structure";
       return kDrop;
     }
-    const auto gof_info_it = gof_info_.find(unwrapped_tl0);
-    if (gof_info_it == gof_info_.end())
-      return kStash;
 
-    info = &gof_info_it->second;
-
-    frame->num_references = 0;
-    FrameReceivedVp9(frame->Id(), info);
-    FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted);
-    return kHandOff;
-  } else {
-    auto gof_info_it = gof_info_.find(
-        (codec_header.temporal_idx == 0) ? unwrapped_tl0 - 1 : unwrapped_tl0);
+    // tl0_idx is incremented on temporal_idx=0 frames of the lowest spatial
+    // layer (which spatial_idx is not necessarily zero). Upper spatial layer
+    // frames with inter-layer prediction use GOF info of their base spatial
+    // layer frames.
+    const bool use_prev_gof =
+        codec_header.temporal_idx == 0 && !codec_header.inter_layer_predicted;
+    auto gof_info_it =
+        gof_info_.find(use_prev_gof ? unwrapped_tl0 - 1 : unwrapped_tl0);
 
     // Gof info for this frame is not available yet, stash this frame.
     if (gof_info_it == gof_info_.end())
@@ -185,29 +181,29 @@
   auto up_switch_erase_to = up_switch_.lower_bound(old_picture_id);
   up_switch_.erase(up_switch_.begin(), up_switch_erase_to);
 
-  size_t diff =
-      ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start, frame->Id());
-  size_t gof_idx = diff % info->gof->num_frames_in_gof;
+  if (codec_header.inter_pic_predicted) {
+    size_t diff = ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start,
+                                                        frame->Id());
+    size_t gof_idx = diff % info->gof->num_frames_in_gof;
 
-  if (info->gof->num_ref_pics[gof_idx] > EncodedFrame::kMaxFrameReferences) {
-    return kDrop;
-  }
-  // Populate references according to the scalability structure.
-  frame->num_references = info->gof->num_ref_pics[gof_idx];
-  for (size_t i = 0; i < frame->num_references; ++i) {
-    frame->references[i] =
-        Subtract<kFrameIdLength>(frame->Id(), info->gof->pid_diff[gof_idx][i]);
-
-    // If this is a reference to a frame earlier than the last up switch point,
-    // then ignore this reference.
-    if (UpSwitchInIntervalVp9(frame->Id(), codec_header.temporal_idx,
-                              frame->references[i])) {
-      --frame->num_references;
+    if (info->gof->num_ref_pics[gof_idx] > EncodedFrame::kMaxFrameReferences) {
+      return kDrop;
     }
-  }
 
-  // Override GOF references.
-  if (!codec_header.inter_pic_predicted) {
+    // Populate references according to the scalability structure.
+    frame->num_references = info->gof->num_ref_pics[gof_idx];
+    for (size_t i = 0; i < frame->num_references; ++i) {
+      frame->references[i] = Subtract<kFrameIdLength>(
+          frame->Id(), info->gof->pid_diff[gof_idx][i]);
+
+      // If this is a reference to a frame earlier than the last up switch
+      // point, then ignore this reference.
+      if (UpSwitchInIntervalVp9(frame->Id(), codec_header.temporal_idx,
+                                frame->references[i])) {
+        --frame->num_references;
+      }
+    }
+  } else {
     frame->num_references = 0;
   }
 
diff --git a/modules/video_coding/rtp_vp9_ref_finder_unittest.cc b/modules/video_coding/rtp_vp9_ref_finder_unittest.cc
index a3cb31a..23b6cfc 100644
--- a/modules/video_coding/rtp_vp9_ref_finder_unittest.cc
+++ b/modules/video_coding/rtp_vp9_ref_finder_unittest.cc
@@ -360,6 +360,18 @@
   EXPECT_THAT(frames_, HasFrameWithIdAndRefs(35, {30}));
 }
 
+TEST_F(RtpVp9RefFinderTest, GofInterLayerPredS0KeyS1Delta) {
+  GofInfoVP9 ss;
+  ss.SetGofInfoVP9(kTemporalStructureMode1);
+
+  Insert(Frame().Pid(1).SidAndTid(0, 0).Tl0(0).AsKeyFrame().Gof(&ss));
+  Insert(Frame().Pid(1).SidAndTid(1, 0).Tl0(0).AsInterLayer().NotAsInterPic());
+
+  ASSERT_EQ(2UL, frames_.size());
+  EXPECT_THAT(frames_, HasFrameWithIdAndRefs(5, {}));
+  EXPECT_THAT(frames_, HasFrameWithIdAndRefs(6, {5}));
+}
+
 TEST_F(RtpVp9RefFinderTest, GofTemporalLayers_01) {
   GofInfoVP9 ss;
   ss.SetGofInfoVP9(kTemporalStructureMode2);  // 0101 pattern