Support conversion of VP9 non-flexible mode to generic descriptor for non-layered streams only.

When VP9 HW encoders don't provide any metadata a minimal non-flexible mode structure is generated for the stream: (https://source.chromium.org/chromium/chromium/src/+/refs/heads/main:third_party/blink/renderer/platform/peerconnection/rtc_video_encoder.cc;l=1275-1298;drc=f80633b34538615fcb73515ad8c4bc56a748abfe).

Bug: chromium:1455428, b/286993839, b/287458300
Change-Id: I72628f20927d685e9c8ba1744126d763896bd804
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/309380
Commit-Queue: Philip Eliasson <philipel@webrtc.org>
Reviewed-by: Erik Språng <sprang@webrtc.org>
Reviewed-by: Henrik Boström <hbos@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#40316}
diff --git a/call/rtp_payload_params.cc b/call/rtp_payload_params.cc
index 18e6d91..f0347bc 100644
--- a/call/rtp_payload_params.cc
+++ b/call/rtp_payload_params.cc
@@ -604,23 +604,42 @@
     // Create the array only if it is ever used.
     last_vp9_frame_id_.resize(kPictureDiffLimit);
   }
-  if (vp9_header.inter_layer_predicted && spatial_index > 0) {
-    result.dependencies.push_back(
-        last_vp9_frame_id_[vp9_header.picture_id % kPictureDiffLimit]
-                          [spatial_index - 1]);
-  }
-  if (vp9_header.inter_pic_predicted) {
-    for (size_t i = 0; i < vp9_header.num_ref_pics; ++i) {
-      // picture_id is 15 bit number that wraps around. Though undeflow may
-      // produce picture that exceeds 2^15, it is ok because in this
-      // code block only last 7 bits of the picture_id are used.
-      uint16_t depend_on = vp9_header.picture_id - vp9_header.pid_diff[i];
+
+  if (vp9_header.flexible_mode) {
+    if (vp9_header.inter_layer_predicted && spatial_index > 0) {
       result.dependencies.push_back(
-          last_vp9_frame_id_[depend_on % kPictureDiffLimit][spatial_index]);
+          last_vp9_frame_id_[vp9_header.picture_id % kPictureDiffLimit]
+                            [spatial_index - 1]);
     }
+    if (vp9_header.inter_pic_predicted) {
+      for (size_t i = 0; i < vp9_header.num_ref_pics; ++i) {
+        // picture_id is 15 bit number that wraps around. Though undeflow may
+        // produce picture that exceeds 2^15, it is ok because in this
+        // code block only last 7 bits of the picture_id are used.
+        uint16_t depend_on = vp9_header.picture_id - vp9_header.pid_diff[i];
+        result.dependencies.push_back(
+            last_vp9_frame_id_[depend_on % kPictureDiffLimit][spatial_index]);
+      }
+    }
+    last_vp9_frame_id_[vp9_header.picture_id % kPictureDiffLimit]
+                      [spatial_index] = shared_frame_id;
+  } else {
+    // Implementing general conversion logic for non-flexible mode requires some
+    // work and we will almost certainly never need it, so for now support only
+    // non-layerd streams.
+    if (spatial_index > 0 || temporal_index > 0) {
+      // Prefer to generate no generic layering than an inconsistent one.
+      rtp_video_header.generic.reset();
+      return;
+    }
+
+    if (vp9_header.inter_pic_predicted) {
+      // Since we only support non-scalable streams we only need to save the
+      // last frame id.
+      result.dependencies.push_back(last_vp9_frame_id_[0][0]);
+    }
+    last_vp9_frame_id_[0][0] = shared_frame_id;
   }
-  last_vp9_frame_id_[vp9_header.picture_id % kPictureDiffLimit][spatial_index] =
-      shared_frame_id;
 
   result.active_decode_targets =
       ((uint32_t{1} << num_temporal_layers * num_active_spatial_layers) - 1);