Use CodecBufferUsage to determine dependencies.

In this CL:
 - Assign frame IDs so that simulcast streams share one frame ID space.
 - Added a CodecBufferUsage class that represent how a particular buffer
   was used (updated, referenced or both).
 - Calculate frame dependencies based on the CodecBufferUsage information.

Bug: webrtc:10342
Change-Id: I4ed5ad703f9376a7d995c04bb757c7d214865ddb
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/131287
Reviewed-by: Niels Moller <nisse@webrtc.org>
Reviewed-by: Ilya Nikolaevskiy <ilnik@webrtc.org>
Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
Commit-Queue: Philip Eliasson <philipel@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#27784}
diff --git a/api/video/video_codec_constants.h b/api/video/video_codec_constants.h
index a3f7cd2..e4e4b0f 100644
--- a/api/video/video_codec_constants.h
+++ b/api/video/video_codec_constants.h
@@ -13,6 +13,7 @@
 
 namespace webrtc {
 
+enum : int { kMaxEncoderBuffers = 8 };
 enum : int { kMaxSimulcastStreams = 4 };
 enum : int { kMaxSpatialLayers = 5 };
 enum : int { kMaxTemporalStreams = 4 };
diff --git a/common_video/generic_frame_descriptor/BUILD.gn b/common_video/generic_frame_descriptor/BUILD.gn
index 6c69b64..dc9461a 100644
--- a/common_video/generic_frame_descriptor/BUILD.gn
+++ b/common_video/generic_frame_descriptor/BUILD.gn
@@ -16,6 +16,7 @@
 
   deps = [
     "../../api:array_view",
+    "../../api/video:video_codec_constants",
     "../../rtc_base:checks",
     "//third_party/abseil-cpp/absl/container:inlined_vector",
     "//third_party/abseil-cpp/absl/strings",
diff --git a/common_video/generic_frame_descriptor/generic_frame_info.h b/common_video/generic_frame_descriptor/generic_frame_info.h
index 7909291..d12ae88 100644
--- a/common_video/generic_frame_descriptor/generic_frame_info.h
+++ b/common_video/generic_frame_descriptor/generic_frame_info.h
@@ -17,9 +17,20 @@
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/string_view.h"
 #include "api/array_view.h"
+#include "api/video/video_codec_constants.h"
 
 namespace webrtc {
 
+// Describes how a certain encoder buffer was used when encoding a frame.
+struct CodecBufferUsage {
+  CodecBufferUsage(int id, bool referenced, bool updated)
+      : id(id), referenced(referenced), updated(updated) {}
+
+  int id = 0;
+  bool referenced = false;
+  bool updated = false;
+};
+
 struct GenericFrameInfo {
   enum class DecodeTargetIndication {
     kNotPresent,   // DecodeTargetInfo symbol '-'
@@ -37,10 +48,12 @@
   GenericFrameInfo(const GenericFrameInfo&);
   ~GenericFrameInfo();
 
+  int64_t frame_id = 0;
   int temporal_id = 0;
   int spatial_id = 0;
   absl::InlinedVector<int, 10> frame_diffs;
   absl::InlinedVector<DecodeTargetIndication, 10> decode_target_indications;
+  absl::InlinedVector<CodecBufferUsage, kMaxEncoderBuffers> encoder_buffers;
 };
 
 class GenericFrameInfo::Builder {
diff --git a/modules/video_coding/codecs/vp8/default_temporal_layers.cc b/modules/video_coding/codecs/vp8/default_temporal_layers.cc
index e315dc4..5a434d9 100644
--- a/modules/video_coding/codecs/vp8/default_temporal_layers.cc
+++ b/modules/video_coding/codecs/vp8/default_temporal_layers.cc
@@ -505,11 +505,17 @@
   RTC_DCHECK_EQ(vp8_info.referencedBuffersCount, 0u);
   RTC_DCHECK_EQ(vp8_info.updatedBuffersCount, 0u);
 
+  GenericFrameInfo& generic_frame_info = info->generic_frame_info.emplace();
+
   for (int i = 0; i < static_cast<int>(Vp8FrameConfig::Buffer::kCount); ++i) {
+    bool references = false;
+    bool updates = is_keyframe;
+
     if (!is_keyframe &&
         frame_config.References(static_cast<Vp8FrameConfig::Buffer>(i))) {
       RTC_DCHECK_LT(vp8_info.referencedBuffersCount,
                     arraysize(CodecSpecificInfoVP8::referencedBuffers));
+      references = true;
       vp8_info.referencedBuffers[vp8_info.referencedBuffersCount++] = i;
     }
 
@@ -517,8 +523,12 @@
         frame_config.Updates(static_cast<Vp8FrameConfig::Buffer>(i))) {
       RTC_DCHECK_LT(vp8_info.updatedBuffersCount,
                     arraysize(CodecSpecificInfoVP8::updatedBuffers));
+      updates = true;
       vp8_info.updatedBuffers[vp8_info.updatedBuffersCount++] = i;
     }
+
+    if (references || updates)
+      generic_frame_info.encoder_buffers.emplace_back(i, references, updates);
   }
 
   // The templates are always present on keyframes, and then refered to by
@@ -526,10 +536,9 @@
   if (is_keyframe) {
     info->template_structure = GetTemplateStructure(num_layers_);
   }
-
-  GenericFrameInfo& generic_frame_info = info->generic_frame_info.emplace();
   generic_frame_info.decode_target_indications =
       frame.dependency_info.decode_target_indications;
+  generic_frame_info.temporal_id = frame_config.packetizer_temporal_idx;
 
   if (!frame.expired) {
     for (Vp8BufferReference buffer : kAllBuffers) {
diff --git a/modules/video_coding/codecs/vp8/screenshare_layers.cc b/modules/video_coding/codecs/vp8/screenshare_layers.cc
index 352300b..2f4b482 100644
--- a/modules/video_coding/codecs/vp8/screenshare_layers.cc
+++ b/modules/video_coding/codecs/vp8/screenshare_layers.cc
@@ -305,6 +305,8 @@
     vp8_info.layerSync = false;
     generic_frame_info.decode_target_indications =
         GenericFrameInfo::DecodeTargetInfo("S");
+    generic_frame_info.encoder_buffers.emplace_back(
+        0, /*referenced=*/!is_keyframe, /*updated=*/true);
   } else {
     int64_t unwrapped_timestamp = time_wrap_handler_.Unwrap(rtp_timestamp);
     if (dependency_info) {
@@ -340,10 +342,13 @@
     // Note that |frame_config| is not derefernced if |is_keyframe|,
     // meaning it's never dereferenced if the optional may be unset.
     for (int i = 0; i < static_cast<int>(Vp8FrameConfig::Buffer::kCount); ++i) {
+      bool references = false;
+      bool updates = is_keyframe;
       if (!is_keyframe && dependency_info->frame_config.References(
                               static_cast<Vp8FrameConfig::Buffer>(i))) {
         RTC_DCHECK_LT(vp8_info.referencedBuffersCount,
                       arraysize(CodecSpecificInfoVP8::referencedBuffers));
+        references = true;
         vp8_info.referencedBuffers[vp8_info.referencedBuffersCount++] = i;
       }
 
@@ -351,8 +356,12 @@
                              static_cast<Vp8FrameConfig::Buffer>(i))) {
         RTC_DCHECK_LT(vp8_info.updatedBuffersCount,
                       arraysize(CodecSpecificInfoVP8::updatedBuffers));
+        updates = true;
         vp8_info.updatedBuffers[vp8_info.updatedBuffersCount++] = i;
       }
+
+      if (references || updates)
+        generic_frame_info.encoder_buffers.emplace_back(i, references, updates);
     }
   }
 
diff --git a/video/video_stream_encoder.cc b/video/video_stream_encoder.cc
index dbe2d00..35a9106 100644
--- a/video/video_stream_encoder.cc
+++ b/video/video_stream_encoder.cc
@@ -515,12 +515,16 @@
       next_frame_types_(1, VideoFrameType::kVideoFrameDelta),
       frame_encoder_timer_(this),
       experiment_groups_(GetExperimentGroups()),
+      next_frame_id_(0),
       encoder_queue_(task_queue_factory->CreateTaskQueue(
           "EncoderQueue",
           TaskQueueFactory::Priority::NORMAL)) {
   RTC_DCHECK(encoder_stats_observer);
   RTC_DCHECK(overuse_detector_);
   RTC_DCHECK_GE(number_of_cores, 1);
+
+  for (auto& state : encoder_buffer_state_)
+    state.fill(std::numeric_limits<int64_t>::max());
 }
 
 VideoStreamEncoder::~VideoStreamEncoder() {
@@ -1450,8 +1454,61 @@
   // running in parallel on different threads.
   encoder_stats_observer_->OnSendEncodedImage(image_copy, codec_specific_info);
 
-  EncodedImageCallback::Result result =
-      sink_->OnEncodedImage(image_copy, codec_specific_info, fragmentation);
+  // The simulcast id is signaled in the SpatialIndex. This makes it impossible
+  // to do simulcast for codecs that actually support spatial layers since we
+  // can't distinguish between an actual spatial layer and a simulcast stream.
+  // TODO(bugs.webrtc.org/10520): Signal the simulcast id explicitly.
+  int simulcast_id = 0;
+  if (codec_specific_info &&
+      (codec_specific_info->codecType == kVideoCodecVP8 ||
+       codec_specific_info->codecType == kVideoCodecH264 ||
+       codec_specific_info->codecType == kVideoCodecGeneric)) {
+    simulcast_id = encoded_image.SpatialIndex().value_or(0);
+  }
+
+  std::unique_ptr<CodecSpecificInfo> codec_info_copy;
+  {
+    rtc::CritScope cs(&encoded_image_lock_);
+
+    if (codec_specific_info && codec_specific_info->generic_frame_info) {
+      codec_info_copy =
+          absl::make_unique<CodecSpecificInfo>(*codec_specific_info);
+      GenericFrameInfo& generic_info = *codec_info_copy->generic_frame_info;
+      generic_info.frame_id = next_frame_id_++;
+
+      if (encoder_buffer_state_.size() <= static_cast<size_t>(simulcast_id)) {
+        RTC_LOG(LS_ERROR) << "At most " << encoder_buffer_state_.size()
+                          << " simulcast streams supported.";
+      } else {
+        std::array<int64_t, kMaxEncoderBuffers>& state =
+            encoder_buffer_state_[simulcast_id];
+        for (const CodecBufferUsage& buffer : generic_info.encoder_buffers) {
+          if (state.size() <= static_cast<size_t>(buffer.id)) {
+            RTC_LOG(LS_ERROR)
+                << "At most " << state.size() << " encoder buffers supported.";
+            break;
+          }
+
+          if (buffer.referenced) {
+            int64_t diff = generic_info.frame_id - state[buffer.id];
+            if (diff <= 0) {
+              RTC_LOG(LS_ERROR) << "Invalid frame diff: " << diff << ".";
+            } else if (absl::c_find(generic_info.frame_diffs, diff) ==
+                       generic_info.frame_diffs.end()) {
+              generic_info.frame_diffs.push_back(diff);
+            }
+          }
+
+          if (buffer.updated)
+            state[buffer.id] = generic_info.frame_id;
+        }
+      }
+    }
+  }
+
+  EncodedImageCallback::Result result = sink_->OnEncodedImage(
+      image_copy, codec_info_copy ? codec_info_copy.get() : codec_specific_info,
+      fragmentation);
 
   // We are only interested in propagating the meta-data about the image, not
   // encoded data itself, to the post encode function. Since we cannot be sure
diff --git a/video/video_stream_encoder.h b/video/video_stream_encoder.h
index 45ef430..ec172e8 100644
--- a/video/video_stream_encoder.h
+++ b/video/video_stream_encoder.h
@@ -28,6 +28,7 @@
 #include "modules/video_coding/utility/frame_dropper.h"
 #include "modules/video_coding/utility/quality_scaler.h"
 #include "modules/video_coding/video_coding_impl.h"
+#include "rtc_base/critical_section.h"
 #include "rtc_base/event.h"
 #include "rtc_base/experiments/rate_control_settings.h"
 #include "rtc_base/race_checker.h"
@@ -348,6 +349,17 @@
   // experiment group numbers incremented by 1.
   const std::array<uint8_t, 2> experiment_groups_;
 
+  // TODO(philipel): Remove this lock and run on |encoder_queue_| instead.
+  rtc::CriticalSection encoded_image_lock_;
+
+  int64_t next_frame_id_ RTC_GUARDED_BY(encoded_image_lock_);
+
+  // This array is used as a map from simulcast id to an encoder's buffer
+  // state. For every buffer of the encoder we keep track of the last frame id
+  // that updated that buffer.
+  std::array<std::array<int64_t, kMaxEncoderBuffers>, kMaxSimulcastStreams>
+      encoder_buffer_state_ RTC_GUARDED_BY(encoded_image_lock_);
+
   // All public methods are proxied to |encoder_queue_|. It must must be
   // destroyed first to make sure no tasks are run that use other members.
   rtc::TaskQueue encoder_queue_;