Set correct number of spatial layers in FrameEncodeMetadataWriter

This CL sets the spatial ID in LibaomAv1Encoder and sets the correct number
of spatial layers for AV1 in FrameEncodeMetadataWriter.

Bug: None
Change-Id: I40092e45be88ec9ab75f228d9ca84c44e3cad326
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/237662
Reviewed-by: Ilya Nikolaevskiy <ilnik@webrtc.org>
Commit-Queue: Zhaoliang Ma <zhaoliang.ma@intel.com>
Cr-Commit-Position: refs/heads/main@{#35339}
diff --git a/modules/video_coding/codecs/av1/libaom_av1_encoder.cc b/modules/video_coding/codecs/av1/libaom_av1_encoder.cc
index 8a100e8..11aa3a9 100644
--- a/modules/video_coding/codecs/av1/libaom_av1_encoder.cc
+++ b/modules/video_coding/codecs/av1/libaom_av1_encoder.cc
@@ -701,6 +701,7 @@
           int d = svc_params_->scaling_factor_den[layer_frame->SpatialId()];
           encoded_image._encodedWidth = cfg_.g_w * n / d;
           encoded_image._encodedHeight = cfg_.g_h * n / d;
+          encoded_image.SetSpatialIndex(layer_frame->SpatialId());
         } else {
           encoded_image._encodedWidth = cfg_.g_w;
           encoded_image._encodedHeight = cfg_.g_h;
diff --git a/video/BUILD.gn b/video/BUILD.gn
index d19d217..4821699 100644
--- a/video/BUILD.gn
+++ b/video/BUILD.gn
@@ -324,6 +324,7 @@
     "../modules/video_coding:video_codec_interface",
     "../modules/video_coding:video_coding_utility",
     "../modules/video_coding:webrtc_vp9_helpers",
+    "../modules/video_coding/svc:scalability_structures",
     "../modules/video_coding/svc:svc_rate_allocator",
     "../rtc_base:checks",
     "../rtc_base:criticalsection",
diff --git a/video/frame_encode_metadata_writer.cc b/video/frame_encode_metadata_writer.cc
index b5eb5cd..ff2034e 100644
--- a/video/frame_encode_metadata_writer.cc
+++ b/video/frame_encode_metadata_writer.cc
@@ -11,11 +11,13 @@
 #include "video/frame_encode_metadata_writer.h"
 
 #include <algorithm>
+#include <memory>
 #include <utility>
 
 #include "common_video/h264/sps_vui_rewriter.h"
 #include "modules/include/module_common_types_public.h"
 #include "modules/video_coding/include/video_coding_defines.h"
+#include "modules/video_coding/svc/create_scalability_structure.h"
 #include "rtc_base/logging.h"
 #include "rtc_base/ref_counted_object.h"
 #include "rtc_base/time_utils.h"
@@ -62,6 +64,20 @@
   MutexLock lock(&lock_);
   codec_settings_ = codec;
   internal_source_ = internal_source;
+
+  size_t num_spatial_layers = codec_settings_.numberOfSimulcastStreams;
+  if (codec_settings_.codecType == kVideoCodecVP9) {
+    num_spatial_layers = std::max(
+        num_spatial_layers,
+        static_cast<size_t>(codec_settings_.VP9()->numberOfSpatialLayers));
+  } else if (codec_settings_.codecType == kVideoCodecAV1 &&
+             codec_settings_.ScalabilityMode() != "") {
+    std::unique_ptr<ScalableVideoController> structure =
+        CreateScalabilityStructure(codec_settings_.ScalabilityMode());
+    RTC_DCHECK(structure);
+    num_spatial_layers = structure->StreamConfig().num_spatial_layers;
+  }
+  num_spatial_layers_ = std::max(num_spatial_layers, size_t{1});
 }
 
 void FrameEncodeMetadataWriter::OnSetRates(
@@ -69,11 +85,10 @@
     uint32_t framerate_fps) {
   MutexLock lock(&lock_);
   framerate_fps_ = framerate_fps;
-  const size_t num_spatial_layers = NumSpatialLayers();
-  if (timing_frames_info_.size() < num_spatial_layers) {
-    timing_frames_info_.resize(num_spatial_layers);
+  if (timing_frames_info_.size() < num_spatial_layers_) {
+    timing_frames_info_.resize(num_spatial_layers_);
   }
-  for (size_t i = 0; i < num_spatial_layers; ++i) {
+  for (size_t i = 0; i < num_spatial_layers_; ++i) {
     timing_frames_info_[i].target_bitrate_bytes_per_sec =
         bitrate_allocation.GetSpatialLayerSum(i) / 8;
   }
@@ -85,8 +100,7 @@
     return;
   }
 
-  const size_t num_spatial_layers = NumSpatialLayers();
-  timing_frames_info_.resize(num_spatial_layers);
+  timing_frames_info_.resize(num_spatial_layers_);
   FrameMetadata metadata;
   metadata.rtp_timestamp = frame.timestamp();
   metadata.encode_start_time_ms = rtc::TimeMillis();
@@ -95,7 +109,7 @@
   metadata.rotation = frame.rotation();
   metadata.color_space = frame.color_space();
   metadata.packet_infos = frame.packet_infos();
-  for (size_t si = 0; si < num_spatial_layers; ++si) {
+  for (size_t si = 0; si < num_spatial_layers_; ++si) {
     RTC_DCHECK(timing_frames_info_[si].frames.empty() ||
                rtc::TimeDiff(
                    frame.render_time_ms(),
@@ -283,14 +297,4 @@
   return result;
 }
 
-size_t FrameEncodeMetadataWriter::NumSpatialLayers() const {
-  size_t num_spatial_layers = codec_settings_.numberOfSimulcastStreams;
-  if (codec_settings_.codecType == kVideoCodecVP9) {
-    num_spatial_layers = std::max(
-        num_spatial_layers,
-        static_cast<size_t>(codec_settings_.VP9().numberOfSpatialLayers));
-  }
-  return std::max(num_spatial_layers, size_t{1});
-}
-
 }  // namespace webrtc
diff --git a/video/frame_encode_metadata_writer.h b/video/frame_encode_metadata_writer.h
index 541ed98..80e5c5e 100644
--- a/video/frame_encode_metadata_writer.h
+++ b/video/frame_encode_metadata_writer.h
@@ -42,8 +42,6 @@
   void Reset();
 
  private:
-  size_t NumSpatialLayers() const RTC_EXCLUSIVE_LOCKS_REQUIRED(lock_);
-
   // For non-internal-source encoders, returns encode started time and fixes
   // capture timestamp for the frame, if corrupted by the encoder.
   absl::optional<int64_t> ExtractEncodeStartTimeAndFillMetadata(
@@ -72,6 +70,7 @@
   bool internal_source_ RTC_GUARDED_BY(&lock_);
   uint32_t framerate_fps_ RTC_GUARDED_BY(&lock_);
 
+  size_t num_spatial_layers_ RTC_GUARDED_BY(&lock_);
   // Separate instance for each simulcast stream or spatial layer.
   std::vector<TimingFramesLayerInfo> timing_frames_info_ RTC_GUARDED_BY(&lock_);
   int64_t last_timing_frame_time_ms_ RTC_GUARDED_BY(&lock_);