In RtpDepacketizerAV1 use aggregation header to detect key frames

instead of guessing based on presence of the sequence header OBU.

Bug: webrtc:11042
Change-Id: I9a0674249feceebb07299ea965c62e87499f6baf
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/161013
Reviewed-by: Philip Eliasson <philipel@webrtc.org>
Commit-Queue: Danil Chapovalov <danilchap@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#29958}
diff --git a/modules/rtp_rtcp/source/rtp_depacketizer_av1.cc b/modules/rtp_rtcp/source/rtp_depacketizer_av1.cc
index 52c62f8..71890e9 100644
--- a/modules/rtp_rtcp/source/rtp_depacketizer_av1.cc
+++ b/modules/rtp_rtcp/source/rtp_depacketizer_av1.cc
@@ -26,7 +26,7 @@
 // RTP payload syntax:
 //     0 1 2 3 4 5 6 7
 //    +-+-+-+-+-+-+-+-+
-//    |Z|Y| W |-|-|-|-| (REQUIRED)
+//    |Z|Y| W |N|-|-|-| (REQUIRED)
 //    +=+=+=+=+=+=+=+=+ (REPEATED W-1 times, or any times if W = 0)
 //    |1|             |
 //    +-+ OBU fragment|
@@ -154,7 +154,6 @@
 // also has Sequence Header OBU.
 using VectorObuInfo = absl::InlinedVector<ObuInfo, 4>;
 
-constexpr int kObuTypeSequenceHeader = 1;
 constexpr uint8_t kObuSizePresentBit = 0b0'0000'010;
 
 bool ObuHasExtension(uint8_t obu_header) {
@@ -165,10 +164,6 @@
   return obu_header & kObuSizePresentBit;
 }
 
-int ObuType(uint8_t obu_header) {
-  return (obu_header & 0b0'1111'000u) >> 3;
-}
-
 bool RtpStartsWithFragment(uint8_t aggregation_header) {
   return aggregation_header & 0b1000'0000u;
 }
@@ -178,6 +173,9 @@
 int RtpNumObus(uint8_t aggregation_header) {  // 0 for any number of obus.
   return (aggregation_header & 0b0011'0000u) >> 4;
 }
+int RtpStartsNewCodedVideoSequence(uint8_t aggregation_header) {
+  return aggregation_header & 0b0000'1000u;
+}
 
 // Reorgonizes array of rtp payloads into array of obus:
 // fills ObuInfo::data field.
@@ -373,16 +371,18 @@
     RTC_DLOG(LS_ERROR) << "Empty rtp payload.";
     return false;
   }
+  uint8_t aggregation_header = payload_data[0];
+  if (RtpStartsNewCodedVideoSequence(aggregation_header) &&
+      RtpStartsWithFragment(aggregation_header)) {
+    // new coded video sequence can't start from an OBU fragment.
+    return false;
+  }
+
   // To assemble frame, all of the rtp payload is required, including
   // aggregation header.
   parsed_payload->payload = payload_data;
   parsed_payload->payload_length = payload_data_length;
 
-  rtc::ByteBufferReader payload(reinterpret_cast<const char*>(payload_data),
-                                payload_data_length);
-  uint8_t aggregation_header;
-  RTC_CHECK(payload.ReadUInt8(&aggregation_header));
-
   parsed_payload->video.codec = VideoCodecType::kVideoCodecAV1;
   // These are not accurate since frame may consist of several packet aligned
   // chunks of obus, but should be good enough for most cases. It might produce
@@ -393,57 +393,11 @@
       !RtpStartsWithFragment(aggregation_header);
   parsed_payload->video.is_last_packet_in_frame =
       !RtpEndsWithFragment(aggregation_header);
-  parsed_payload->video.frame_type = VideoFrameType::kVideoFrameDelta;
-  // If packet starts a frame, check if it contains Sequence Header OBU.
-  // In that case treat it as key frame packet.
-  if (parsed_payload->video.is_first_packet_in_frame) {
-    int num_expected_obus = RtpNumObus(aggregation_header);
 
-    // The only OBU that can preceed SequenceHeader is a TemporalDelimiter OBU,
-    // so check no more than two OBUs while searching for SH.
-    for (int obu_index = 1; payload.Length() > 0 && obu_index <= 2;
-         ++obu_index) {
-      uint64_t fragment_size;
-      // When num_expected_obus > 0, last OBU (fragment) is not preceeded by
-      // the size field. See W field in
-      // https://aomediacodec.github.io/av1-rtp-spec/#43-av1-aggregation-header
-      bool has_fragment_size = (obu_index != num_expected_obus);
-      if (has_fragment_size) {
-        if (!payload.ReadUVarint(&fragment_size)) {
-          RTC_DLOG(LS_WARNING)
-              << "Failed to read OBU fragment size for OBU#" << obu_index;
-          return false;
-        }
-        if (fragment_size > payload.Length()) {
-          RTC_DLOG(LS_WARNING) << "OBU fragment size " << fragment_size
-                               << " exceeds remaining payload size "
-                               << payload.Length() << " for OBU#" << obu_index;
-          // Malformed input: written size is larger than remaining buffer.
-          return false;
-        }
-      } else {
-        fragment_size = payload.Length();
-      }
-      // Though it is inpractical to pass empty fragments, it is allowed.
-      if (fragment_size == 0) {
-        RTC_LOG(LS_WARNING)
-            << "Weird obu of size 0 at offset "
-            << (payload_data_length - payload.Length()) << ", skipping.";
-        continue;
-      }
-      uint8_t obu_header = *reinterpret_cast<const uint8_t*>(payload.Data());
-      if (ObuType(obu_header) == kObuTypeSequenceHeader) {
-        // TODO(bugs.webrtc.org/11042): Check frame_header OBU and/or frame OBU
-        // too for other conditions of the start of a new coded video sequence.
-        // For proper checks checking single packet might not be enough. See
-        // https://aomediacodec.github.io/av1-spec/av1-spec.pdf section 7.5
-        parsed_payload->video.frame_type = VideoFrameType::kVideoFrameKey;
-        break;
-      }
-      payload.Consume(fragment_size);
-    }
-  }
-
+  parsed_payload->video.frame_type =
+      RtpStartsNewCodedVideoSequence(aggregation_header)
+          ? VideoFrameType::kVideoFrameKey
+          : VideoFrameType::kVideoFrameDelta;
   return true;
 }
 
diff --git a/modules/rtp_rtcp/source/rtp_depacketizer_av1_unittest.cc b/modules/rtp_rtcp/source/rtp_depacketizer_av1_unittest.cc
index cf55aae..d0d0670 100644
--- a/modules/rtp_rtcp/source/rtp_depacketizer_av1_unittest.cc
+++ b/modules/rtp_rtcp/source/rtp_depacketizer_av1_unittest.cc
@@ -19,12 +19,9 @@
 using ::testing::ElementsAre;
 
 // Signals number of the OBU (fragments) in the packet.
-constexpr uint8_t kObuCountAny = 0b0000'0000;
-constexpr uint8_t kObuCountOne = 0b0001'0000;
-constexpr uint8_t kObuCountTwo = 0b0010'0000;
+constexpr uint8_t kObuCountOne = 0b00'01'0000;
 
 constexpr uint8_t kObuHeaderSequenceHeader = 0b0'0001'000;
-constexpr uint8_t kObuHeaderTemporalDelimiter = 0b0'0010'000;
 constexpr uint8_t kObuHeaderFrame = 0b0'0110'000;
 
 constexpr uint8_t kObuHeaderHasSize = 0b0'0000'010;
@@ -73,131 +70,37 @@
   EXPECT_TRUE(parsed.video.is_last_packet_in_frame);
 }
 
-TEST(RtpDepacketizerAv1Test, ParseTreatsStartOfSequenceHeaderAsKeyFrame) {
-  const uint8_t packet[] = {kObuCountOne, kObuHeaderSequenceHeader};
-  RtpDepacketizerAv1 depacketizer;
-  RtpDepacketizer::ParsedPayload parsed;
-  ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet)));
-  EXPECT_TRUE(parsed.video.is_first_packet_in_frame);
-  EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey);
-}
-
-TEST(RtpDepacketizerAv1Test, ParseTreatsNotStartOfFrameAsDeltaFrame) {
-  const uint8_t packet[] = {
-      (uint8_t{1} << 7) | kObuCountOne,
-      // Byte that look like start of sequence header, but since it is not start
-      // of an OBU, it is actually not a start of sequence header.
-      kObuHeaderSequenceHeader};
-  RtpDepacketizerAv1 depacketizer;
-  RtpDepacketizer::ParsedPayload parsed;
-  ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet)));
-  EXPECT_FALSE(parsed.video.is_first_packet_in_frame);
-  EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameDelta);
-}
-
 TEST(RtpDepacketizerAv1Test,
-     ParseTreatsStartOfFrameWithoutSequenceHeaderAsDeltaFrame) {
-  const uint8_t packet[] = {kObuCountOne, kObuHeaderFrame};
-  RtpDepacketizerAv1 depacketizer;
-  RtpDepacketizer::ParsedPayload parsed;
-  ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet)));
-  EXPECT_TRUE(parsed.video.is_first_packet_in_frame);
-  EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameDelta);
-}
-
-TEST(RtpDepacketizerAv1Test, ParseFindsSequenceHeaderBehindFragmentSize1) {
-  const uint8_t packet[] = {kObuCountAny,
-                            1,  // size of the next fragment
+     ParseUsesNewCodedVideoSequenceBitAsKeyFrameIndidcator) {
+  const uint8_t packet[] = {(uint8_t{1} << 3) | kObuCountOne,
                             kObuHeaderSequenceHeader};
   RtpDepacketizerAv1 depacketizer;
   RtpDepacketizer::ParsedPayload parsed;
   ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet)));
-  EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey);
-}
-
-TEST(RtpDepacketizerAv1Test, ParseFindsSequenceHeaderBehindFragmentSize2) {
-  const uint8_t packet[] = {kObuCountTwo,
-                            2,  // size of the next fragment
-                            kObuHeaderSequenceHeader,
-                            42,  // SH payload.
-                            kObuHeaderFrame};
-  RtpDepacketizerAv1 depacketizer;
-  RtpDepacketizer::ParsedPayload parsed;
-  ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet)));
+  EXPECT_TRUE(parsed.video.is_first_packet_in_frame);
   EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey);
 }
 
 TEST(RtpDepacketizerAv1Test,
-     ParseFindsSequenceHeaderBehindMultiByteFragmentSize) {
-  const uint8_t packet[] = {kObuCountTwo,
-                            0b1000'0101,  // leb128 encoded value of 5
-                            0b1000'0000,  // using 3 bytes
-                            0b0000'0000,  // to encode the value.
-                            kObuHeaderSequenceHeader,
-                            8,  // 4 bytes of SH payload.
-                            0,
-                            0,
-                            0,
-                            kObuHeaderFrame};
+     ParseUsesUnsetNewCodedVideoSequenceBitAsDeltaFrameIndidcator) {
+  const uint8_t packet[] = {(uint8_t{0} << 3) | kObuCountOne,
+                            kObuHeaderSequenceHeader};
   RtpDepacketizerAv1 depacketizer;
   RtpDepacketizer::ParsedPayload parsed;
   ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet)));
-  EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey);
-}
-
-TEST(RtpDepacketizerAv1Test, ParseFindsSequenceHeaderBehindTemporalDelimiter) {
-  const uint8_t packet[] = {kObuCountTwo,
-                            1,  // size of the next fragment
-                            kObuHeaderTemporalDelimiter,
-                            kObuHeaderSequenceHeader,
-                            8,  // 4 bytes of SH payload.
-                            0,
-                            0,
-                            0};
-  RtpDepacketizerAv1 depacketizer;
-  RtpDepacketizer::ParsedPayload parsed;
-  ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet)));
-  EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey);
-}
-
-TEST(RtpDepacketizerAv1Test,
-     ParseFindsSequenceHeaderBehindTemporalDelimiterAndSize) {
-  const uint8_t packet[] = {kObuCountAny,
-                            1,  // size of the next fragment
-                            kObuHeaderTemporalDelimiter,
-                            5,  // size of the next fragment
-                            kObuHeaderSequenceHeader,
-                            8,  // 4 bytes of SH payload.
-                            0,
-                            0,
-                            0,
-                            1,  // size of the next fragment
-                            kObuHeaderFrame};
-  RtpDepacketizerAv1 depacketizer;
-  RtpDepacketizer::ParsedPayload parsed;
-  ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet)));
-  EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameKey);
-}
-
-TEST(RtpDepacketizerAv1Test, ParseSkipsEmptyFragments) {
-  static_assert(kObuHeaderSequenceHeader == 8, "");
-  const uint8_t packet[] = {kObuCountAny,
-                            0,  // size of the next fragment
-                            8,  // size of the next fragment that look like SH
-                            kObuHeaderFrame,
-                            1,
-                            2,
-                            3,
-                            4,
-                            5,
-                            6,
-                            7};
-  RtpDepacketizerAv1 depacketizer;
-  RtpDepacketizer::ParsedPayload parsed;
-  ASSERT_TRUE(depacketizer.Parse(&parsed, packet, sizeof(packet)));
+  EXPECT_TRUE(parsed.video.is_first_packet_in_frame);
   EXPECT_TRUE(parsed.video.frame_type == VideoFrameType::kVideoFrameDelta);
 }
 
+TEST(RtpDepacketizerAv1Test,
+     ParseRejectsPacketWithNewCVSAndContinuationFlagsBothSet) {
+  const uint8_t packet[] = {0b10'00'1000 | kObuCountOne,
+                            kObuHeaderSequenceHeader};
+  RtpDepacketizerAv1 depacketizer;
+  RtpDepacketizer::ParsedPayload parsed;
+  EXPECT_FALSE(depacketizer.Parse(&parsed, packet, sizeof(packet)));
+}
+
 TEST(RtpDepacketizerAv1Test, AssembleFrameSetsOBUPayloadSizeWhenAbsent) {
   const uint8_t payload1[] = {0b00'01'0000,  // aggregation header
                               0b0'0110'000,  // /  Frame