Communicate encoder resolutions via rtc::VideoSinkWants.

This will allow us to optimize the internal buffers of
webrtc::VideoFrame for the resolution(s) that we actually want to
encode.
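
As an illustration of the consumer side, here is a minimal sketch of a
source that records the new hint (HintedSource is a hypothetical class,
not part of this CL; only wants.resolutions is new here):

    #include <vector>

    #include "api/video/video_frame.h"
    #include "api/video/video_source_interface.h"

    // Hypothetical source that stores the sink's encode resolutions. A
    // real source would use them to decide which scaled buffers to
    // prepare when constructing each webrtc::VideoFrame.
    class HintedSource
        : public rtc::VideoSourceInterface<webrtc::VideoFrame> {
     public:
      void AddOrUpdateSink(rtc::VideoSinkInterface<webrtc::VideoFrame>* sink,
                           const rtc::VideoSinkWants& wants) override {
        // |resolutions| is a hint and may change while frames are in
        // flight; it must not be treated as a hard guarantee.
        encode_hints_ = wants.resolutions;
      }
      void RemoveSink(
          rtc::VideoSinkInterface<webrtc::VideoFrame>* sink) override {}

     private:
      std::vector<rtc::VideoSinkWants::FrameSize> encode_hints_;
    };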

Bug: webrtc:12469, chromium:1157072
Change-Id: If378b52b5e35aa9a9800c1f7dfe189437ce43253
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/208540
Reviewed-by: Niels Möller <nisse@webrtc.org>
Reviewed-by: Harald Alvestrand <hta@webrtc.org>
Reviewed-by: Ilya Nikolaevskiy <ilnik@webrtc.org>
Commit-Queue: Henrik Boström <hbos@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#33342}
diff --git a/api/video/video_source_interface.h b/api/video/video_source_interface.h
index b03d7c5..8b5823f 100644
--- a/api/video/video_source_interface.h
+++ b/api/video/video_source_interface.h
@@ -12,6 +12,7 @@
 #define API_VIDEO_VIDEO_SOURCE_INTERFACE_H_
 
 #include <limits>
+#include <vector>
 
 #include "absl/types/optional.h"
 #include "api/video/video_sink_interface.h"
@@ -22,6 +23,15 @@
 // VideoSinkWants is used for notifying the source of properties a video frame
 // should have when it is delivered to a certain sink.
 struct RTC_EXPORT VideoSinkWants {
+  struct FrameSize {
+    FrameSize(int width, int height) : width(width), height(height) {}
+    FrameSize(const FrameSize&) = default;
+    ~FrameSize() = default;
+
+    int width;
+    int height;
+  };
+
   VideoSinkWants();
   VideoSinkWants(const VideoSinkWants&);
   ~VideoSinkWants();
@@ -49,8 +59,34 @@
   // Note that this field is unrelated to any horizontal or vertical stride
   // requirements the encoder has on the incoming video frame buffers.
   int resolution_alignment = 1;
+
+  // The resolutions that the sink is configured to consume. If the sink is
+  // an encoder, this is what the encoder is configured to encode. In
+  // singlecast we only encode one resolution, but in simulcast and SVC this
+  // can mean multiple resolutions per frame.
+  //
+  // The sink is always configured to consume resolutions at or below the
+  // webrtc::VideoFrame's resolution. In the case of encoding we usually encode
+  // at the webrtc::VideoFrame's resolution, but this may not always be the
+  // case due to scaleResolutionDownBy or turning off simulcast or SVC layers.
+  //
+  // For example, we may capture at 720p and due to adaptation (e.g. applying
+  // |max_pixel_count| constraints) create webrtc::VideoFrames of size 480p, but
+  // if we do scaleResolutionDownBy:2 then the only resolution we end up
+  // encoding is 240p. In this case we still need to provide webrtc::VideoFrames
+  // of size 480p but we can optimize internal buffers for 240p, avoiding
+  // downsampling to 480p if possible.
+  //
+  // Note that the |resolutions| can change while frames are in flight and
+  // should only be used as a hint when constructing the webrtc::VideoFrame.
+  std::vector<FrameSize> resolutions;
 };
 
+inline bool operator==(const VideoSinkWants::FrameSize& a,
+                       const VideoSinkWants::FrameSize& b) {
+  return a.width == b.width && a.height == b.height;
+}
+
 template <typename VideoFrameT>
 class VideoSourceInterface {
  public:
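
The inline operator== above is what makes whole-vector comparisons of
FrameSize work: std::vector's == and != compare sizes first and then
elements via the element operator==, which the video_stream_encoder.cc
change further down relies on. A quick sketch of that behavior:

    #include <vector>

    #include "api/video/video_source_interface.h"

    // True if the set of configured resolutions changed. std::vector's
    // operator!= is defined via elementwise operator==, so this picks
    // up added, removed, and resized layers.
    bool ResolutionsChanged(
        const std::vector<rtc::VideoSinkWants::FrameSize>& old_res,
        const std::vector<rtc::VideoSinkWants::FrameSize>& new_res) {
      return old_res != new_res;
    }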
diff --git a/call/call_perf_tests.cc b/call/call_perf_tests.cc
index 6591ab5..4cb9766 100644
--- a/call/call_perf_tests.cc
+++ b/call/call_perf_tests.cc
@@ -561,6 +561,18 @@
     // TODO(sprang): Add integration test for maintain-framerate mode?
     void OnSinkWantsChanged(rtc::VideoSinkInterface<VideoFrame>* sink,
                             const rtc::VideoSinkWants& wants) override {
+      // The sink wants can change either because an adaptation happened (i.e.
+      // the pixel count or frame rate changed) or for other reasons, such as
+      // encoded resolutions being communicated (which happens whenever we
+      // capture a new frame size). This test only cares about adaptations.
+      bool did_adapt =
+          last_wants_.max_pixel_count != wants.max_pixel_count ||
+          last_wants_.target_pixel_count != wants.target_pixel_count ||
+          last_wants_.max_framerate_fps != wants.max_framerate_fps;
+      last_wants_ = wants;
+      if (!did_adapt) {
+        return;
+      }
       // At kStart expect CPU overuse. Then expect CPU underuse when the encoder
       // delay has been decreased.
       switch (test_phase_) {
@@ -625,6 +637,9 @@
       kAdaptedDown,
       kAdaptedUp
     } test_phase_;
+
+   private:
+    rtc::VideoSinkWants last_wants_;
   } test;
 
   RunBaseTest(&test);
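
The "did this wants update represent an adaptation?" check above is
repeated in video_stream_encoder_unittest.cc below; if more call sites
appear it could be factored into a small helper along these lines
(IsAdaptationChange is a hypothetical name):

    // Returns true if |wants| differs from |prev| in one of the fields
    // that adaptation modifies; other fields, such as |resolutions|,
    // are deliberately ignored.
    bool IsAdaptationChange(const rtc::VideoSinkWants& prev,
                            const rtc::VideoSinkWants& wants) {
      return prev.max_pixel_count != wants.max_pixel_count ||
             prev.target_pixel_count != wants.target_pixel_count ||
             prev.max_framerate_fps != wants.max_framerate_fps;
    }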
diff --git a/video/video_source_sink_controller.cc b/video/video_source_sink_controller.cc
index 376eb85..4cd12d8 100644
--- a/video/video_source_sink_controller.cc
+++ b/video/video_source_sink_controller.cc
@@ -29,7 +29,14 @@
      << " max_pixel_count=" << wants.max_pixel_count << " target_pixel_count="
      << (wants.target_pixel_count.has_value()
              ? std::to_string(wants.target_pixel_count.value())
-             : "null");
+             : "null")
+     << " resolutions={";
+  for (size_t i = 0; i < wants.resolutions.size(); ++i) {
+    if (i != 0)
+      ss << ",";
+    ss << wants.resolutions[i].width << "x" << wants.resolutions[i].height;
+  }
+  ss << "}";
 
   return ss.Release();
 }
@@ -104,6 +111,12 @@
   return resolution_alignment_;
 }
 
+const std::vector<rtc::VideoSinkWants::FrameSize>&
+VideoSourceSinkController::resolutions() const {
+  RTC_DCHECK_RUN_ON(&sequence_checker_);
+  return resolutions_;
+}
+
 void VideoSourceSinkController::SetRestrictions(
     VideoSourceRestrictions restrictions) {
   RTC_DCHECK_RUN_ON(&sequence_checker_);
@@ -133,6 +146,12 @@
   resolution_alignment_ = resolution_alignment;
 }
 
+void VideoSourceSinkController::SetResolutions(
+    std::vector<rtc::VideoSinkWants::FrameSize> resolutions) {
+  RTC_DCHECK_RUN_ON(&sequence_checker_);
+  resolutions_ = std::move(resolutions);
+}
+
 // RTC_EXCLUSIVE_LOCKS_REQUIRED(sequence_checker_)
 rtc::VideoSinkWants VideoSourceSinkController::CurrentSettingsToSinkWants()
     const {
@@ -161,6 +180,7 @@
                frame_rate_upper_limit_.has_value()
                    ? static_cast<int>(frame_rate_upper_limit_.value())
                    : std::numeric_limits<int>::max());
+  wants.resolutions = resolutions_;
   return wants;
 }
 
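With this change the wants log line gains a resolutions list, e.g. a
two-layer configuration would end in resolutions={160x90,320x180}. A
self-contained sketch of just the formatting loop, using
std::ostringstream in place of the string builder used above:

    #include <sstream>
    #include <string>
    #include <vector>

    struct FrameSize {
      int width;
      int height;
    };

    // Mirrors the loop above: comma-separated WxH entries in braces.
    std::string FormatResolutions(const std::vector<FrameSize>& resolutions) {
      std::ostringstream ss;
      ss << "resolutions={";
      for (size_t i = 0; i < resolutions.size(); ++i) {
        if (i != 0)
          ss << ",";
        ss << resolutions[i].width << "x" << resolutions[i].height;
      }
      ss << "}";
      return ss.str();
    }

    // FormatResolutions({{160, 90}, {320, 180}}) returns
    // "resolutions={160x90,320x180}".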
diff --git a/video/video_source_sink_controller.h b/video/video_source_sink_controller.h
index 29a9588..c61084f 100644
--- a/video/video_source_sink_controller.h
+++ b/video/video_source_sink_controller.h
@@ -12,6 +12,7 @@
 #define VIDEO_VIDEO_SOURCE_SINK_CONTROLLER_H_
 
 #include <string>
+#include <vector>
 
 #include "absl/types/optional.h"
 #include "api/sequence_checker.h"
@@ -46,6 +47,7 @@
   absl::optional<double> frame_rate_upper_limit() const;
   bool rotation_applied() const;
   int resolution_alignment() const;
+  const std::vector<rtc::VideoSinkWants::FrameSize>& resolutions() const;
 
   // Updates the settings stored internally. In order for these settings to be
   // applied to the sink, PushSourceSinkSettings() must subsequently be called.
@@ -55,6 +57,7 @@
   void SetFrameRateUpperLimit(absl::optional<double> frame_rate_upper_limit);
   void SetRotationApplied(bool rotation_applied);
   void SetResolutionAlignment(int resolution_alignment);
+  void SetResolutions(std::vector<rtc::VideoSinkWants::FrameSize> resolutions);
 
  private:
   rtc::VideoSinkWants CurrentSettingsToSinkWants() const
@@ -79,6 +82,8 @@
       RTC_GUARDED_BY(&sequence_checker_);
   bool rotation_applied_ RTC_GUARDED_BY(&sequence_checker_) = false;
   int resolution_alignment_ RTC_GUARDED_BY(&sequence_checker_) = 1;
+  std::vector<rtc::VideoSinkWants::FrameSize> resolutions_
+      RTC_GUARDED_BY(&sequence_checker_);
 };
 
 }  // namespace webrtc
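
The new getter/setter pair follows the controller's existing pattern:
setters only mutate state under the sequence checker, and nothing
reaches the sink until PushSourceSinkSettings() rebuilds the
VideoSinkWants. A hedged usage sketch (ApplyEncoderResolutions is a
hypothetical helper; the controller API is as declared above):

    #include <utility>
    #include <vector>

    #include "video/video_source_sink_controller.h"

    // Must run on the controller's sequence.
    void ApplyEncoderResolutions(
        webrtc::VideoSourceSinkController& controller,
        std::vector<rtc::VideoSinkWants::FrameSize> encoder_resolutions) {
      if (encoder_resolutions != controller.resolutions()) {
        controller.SetResolutions(std::move(encoder_resolutions));
        // The sink only observes the change once settings are pushed.
        controller.PushSourceSinkSettings();
      }
    }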
diff --git a/video/video_stream_encoder.cc b/video/video_stream_encoder.cc
index 63770c4..ae58725 100644
--- a/video/video_stream_encoder.cc
+++ b/video/video_stream_encoder.cc
@@ -991,14 +991,29 @@
     max_framerate = std::max(stream.max_framerate, max_framerate);
   }
 
-  main_queue_->PostTask(
-      ToQueuedTask(task_safety_, [this, max_framerate, alignment]() {
+  // The resolutions that we are actually encoding at.
+  std::vector<rtc::VideoSinkWants::FrameSize> encoder_resolutions;
+  // TODO(hbos): For the case of SVC, also make use of |codec.spatialLayers|.
+  // For now, SVC layers are handled by the VP9 encoder.
+  for (const auto& simulcastStream : codec.simulcastStream) {
+    if (!simulcastStream.active)
+      continue;
+    encoder_resolutions.emplace_back(simulcastStream.width,
+                                     simulcastStream.height);
+  }
+  main_queue_->PostTask(ToQueuedTask(
+      task_safety_, [this, max_framerate, alignment,
+                     encoder_resolutions = std::move(encoder_resolutions)]() {
         RTC_DCHECK_RUN_ON(main_queue_);
         if (max_framerate !=
                 video_source_sink_controller_.frame_rate_upper_limit() ||
-            alignment != video_source_sink_controller_.resolution_alignment()) {
+            alignment != video_source_sink_controller_.resolution_alignment() ||
+            encoder_resolutions !=
+                video_source_sink_controller_.resolutions()) {
           video_source_sink_controller_.SetFrameRateUpperLimit(max_framerate);
           video_source_sink_controller_.SetResolutionAlignment(alignment);
+          video_source_sink_controller_.SetResolutions(
+              std::move(encoder_resolutions));
           video_source_sink_controller_.PushSourceSinkSettings();
         }
       }));
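
The PostTask above uses a C++14 init-capture
(encoder_resolutions = std::move(encoder_resolutions)) to move the
vector into the closure instead of copying it when the task is created.
One C++ subtlety worth noting: a lambda's call operator is const unless
declared mutable, so the std::move inside the lambda binds as a const
rvalue and SetResolutions() still receives a copy; the capture-time
move is what avoids the extra copy. A minimal standalone sketch of the
pattern (Post and Example are hypothetical stand-ins):

    #include <functional>
    #include <utility>
    #include <vector>

    // Stand-in for posting work to another task queue.
    void Post(std::function<void()> task) { task(); }

    void Example() {
      std::vector<int> resolutions = {180, 360, 720};
      // Init-capture moves |resolutions| into the closure; no copy is
      // made when handing the data to the task.
      Post([resolutions = std::move(resolutions)]() {
        // Without |mutable| the capture is const here, so a further
        // std::move would decay to a copy when passed by value.
        for (int r : resolutions) {
          (void)r;  // Consume the hint.
        }
      });
    }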
diff --git a/video/video_stream_encoder_unittest.cc b/video/video_stream_encoder_unittest.cc
index f7a3621..d74ebe8 100644
--- a/video/video_stream_encoder_unittest.cc
+++ b/video/video_stream_encoder_unittest.cc
@@ -461,6 +461,10 @@
     return adaptation_enabled_;
   }
 
+  // The "last wants" is a snapshot of the previous rtc::VideoSinkWants where
+  // the resolution or frame rate was different from what it is currently. If
+  // something else changes, such as encoder resolutions, while the resolution
+  // and frame rate stay the same, the last wants is not updated.
   rtc::VideoSinkWants last_wants() const {
     MutexLock lock(&mutex_);
     return last_wants_;
@@ -519,7 +523,14 @@
   void AddOrUpdateSink(rtc::VideoSinkInterface<VideoFrame>* sink,
                        const rtc::VideoSinkWants& wants) override {
     MutexLock lock(&mutex_);
-    last_wants_ = sink_wants_locked();
+    rtc::VideoSinkWants prev_wants = sink_wants_locked();
+    bool did_adapt =
+        prev_wants.max_pixel_count != wants.max_pixel_count ||
+        prev_wants.target_pixel_count != wants.target_pixel_count ||
+        prev_wants.max_framerate_fps != wants.max_framerate_fps;
+    if (did_adapt) {
+      last_wants_ = prev_wants;
+    }
     adapter_.OnSinkWants(wants);
     test::FrameForwarder::AddOrUpdateSinkLocked(sink, wants);
   }
@@ -7611,4 +7622,105 @@
   video_stream_encoder_->Stop();
 }
 
+TEST_F(VideoStreamEncoderTest, EncoderResolutionsExposedInSinglecast) {
+  const int kFrameWidth = 1280;
+  const int kFrameHeight = 720;
+
+  SetUp();
+  video_stream_encoder_->OnBitrateUpdatedAndWaitForManagedResources(
+      DataRate::BitsPerSec(kTargetBitrateBps),
+      DataRate::BitsPerSec(kTargetBitrateBps),
+      DataRate::BitsPerSec(kTargetBitrateBps), 0, 0, 0);
+
+  // Capturing a frame should reconfigure the encoder and expose the encoder
+  // resolution, which is the same as the input frame.
+  int64_t timestamp_ms = kFrameIntervalMs;
+  video_source_.IncomingCapturedFrame(
+      CreateFrame(timestamp_ms, kFrameWidth, kFrameHeight));
+  WaitForEncodedFrame(timestamp_ms);
+  video_stream_encoder_->WaitUntilTaskQueueIsIdle();
+  EXPECT_THAT(video_source_.sink_wants().resolutions,
+              ::testing::ElementsAreArray(
+                  {rtc::VideoSinkWants::FrameSize(kFrameWidth, kFrameHeight)}));
+
+  video_stream_encoder_->Stop();
+}
+
+TEST_F(VideoStreamEncoderTest, EncoderResolutionsExposedInSimulcast) {
+  // Pick downscale factors such that we never encode at full resolution - this
+  // is an interesting use case. The frame resolution influences the encoder
+  // resolutions, but if no layer has |scale_resolution_down_by| == 1 then the
+  // encoder should not ask for the frame resolution. This allows video frames
+  // to have the appearance of one resolution but optimize their internal
+  // buffers for what is actually encoded.
+  const size_t kNumSimulcastLayers = 3u;
+  const float kDownscaleFactors[] = {8.0, 4.0, 2.0};
+  const int kFrameWidth = 1280;
+  const int kFrameHeight = 720;
+  const rtc::VideoSinkWants::FrameSize kLayer0Size(
+      kFrameWidth / kDownscaleFactors[0], kFrameHeight / kDownscaleFactors[0]);
+  const rtc::VideoSinkWants::FrameSize kLayer1Size(
+      kFrameWidth / kDownscaleFactors[1], kFrameHeight / kDownscaleFactors[1]);
+  const rtc::VideoSinkWants::FrameSize kLayer2Size(
+      kFrameWidth / kDownscaleFactors[2], kFrameHeight / kDownscaleFactors[2]);
+
+  VideoEncoderConfig config;
+  test::FillEncoderConfiguration(kVideoCodecVP8, kNumSimulcastLayers, &config);
+  for (size_t i = 0; i < kNumSimulcastLayers; ++i) {
+    config.simulcast_layers[i].scale_resolution_down_by = kDownscaleFactors[i];
+    config.simulcast_layers[i].active = true;
+  }
+  config.video_stream_factory =
+      new rtc::RefCountedObject<cricket::EncoderStreamFactory>(
+          "VP8", /*max qp*/ 56, /*screencast*/ false,
+          /*screenshare enabled*/ false);
+  video_stream_encoder_->OnBitrateUpdatedAndWaitForManagedResources(
+      DataRate::BitsPerSec(kSimulcastTargetBitrateBps),
+      DataRate::BitsPerSec(kSimulcastTargetBitrateBps),
+      DataRate::BitsPerSec(kSimulcastTargetBitrateBps), 0, 0, 0);
+
+  // Capture a frame with all layers active.
+  int64_t timestamp_ms = kFrameIntervalMs;
+  sink_.SetNumExpectedLayers(kNumSimulcastLayers);
+  video_stream_encoder_->ConfigureEncoder(config.Copy(), kMaxPayloadLength);
+  video_source_.IncomingCapturedFrame(
+      CreateFrame(timestamp_ms, kFrameWidth, kFrameHeight));
+  WaitForEncodedFrame(timestamp_ms);
+  // Expect encoded resolutions to match the expected simulcast layers.
+  video_stream_encoder_->WaitUntilTaskQueueIsIdle();
+  EXPECT_THAT(
+      video_source_.sink_wants().resolutions,
+      ::testing::ElementsAreArray({kLayer0Size, kLayer1Size, kLayer2Size}));
+
+  // Capture a frame with one of the layers inactive.
+  timestamp_ms += kFrameIntervalMs;
+  config.simulcast_layers[2].active = false;
+  sink_.SetNumExpectedLayers(kNumSimulcastLayers - 1);
+  video_stream_encoder_->ConfigureEncoder(config.Copy(), kMaxPayloadLength);
+  video_source_.IncomingCapturedFrame(
+      CreateFrame(timestamp_ms, kFrameWidth, kFrameHeight));
+  WaitForEncodedFrame(timestamp_ms);
+
+  // Expect encoded resolutions to match the expected simulcast layers.
+  video_stream_encoder_->WaitUntilTaskQueueIsIdle();
+  EXPECT_THAT(video_source_.sink_wants().resolutions,
+              ::testing::ElementsAreArray({kLayer0Size, kLayer1Size}));
+
+  // Capture a frame with all but one layer turned off.
+  timestamp_ms += kFrameIntervalMs;
+  config.simulcast_layers[1].active = false;
+  sink_.SetNumExpectedLayers(kNumSimulcastLayers - 2);
+  video_stream_encoder_->ConfigureEncoder(config.Copy(), kMaxPayloadLength);
+  video_source_.IncomingCapturedFrame(
+      CreateFrame(timestamp_ms, kFrameWidth, kFrameHeight));
+  WaitForEncodedFrame(timestamp_ms);
+
+  // Expect encoded resolutions to match the expected simulcast layers.
+  video_stream_encoder_->WaitUntilTaskQueueIsIdle();
+  EXPECT_THAT(video_source_.sink_wants().resolutions,
+              ::testing::ElementsAreArray({kLayer0Size}));
+
+  video_stream_encoder_->Stop();
+}
+
 }  // namespace webrtc