Update how VP9 temporal up switch is populated
This CL updates both the static GOF patterns, which now carry the correct
temporal_up_switch flags, and the flexible mode logic, which now bases the
flag on dependency descriptors instead of reference buffers.
Bug: webrtc:13576
Change-Id: I578f744bec51d1f3531da5f4a89d12f05a16a6c0
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/247187
Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
Commit-Queue: Erik Språng <sprang@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#35741}
diff --git a/modules/video_coding/codecs/vp9/include/vp9_globals.h b/modules/video_coding/codecs/vp9/include/vp9_globals.h
index 87dafe4..e6f644e 100644
--- a/modules/video_coding/codecs/vp9/include/vp9_globals.h
+++ b/modules/video_coding/codecs/vp9/include/vp9_globals.h
@@ -46,14 +46,14 @@
case kTemporalStructureMode1:
num_frames_in_gof = 1;
temporal_idx[0] = 0;
- temporal_up_switch[0] = false;
+ temporal_up_switch[0] = true;
num_ref_pics[0] = 1;
pid_diff[0][0] = 1;
break;
case kTemporalStructureMode2:
num_frames_in_gof = 2;
temporal_idx[0] = 0;
- temporal_up_switch[0] = false;
+ temporal_up_switch[0] = true;
num_ref_pics[0] = 1;
pid_diff[0][0] = 2;
@@ -65,7 +65,7 @@
case kTemporalStructureMode3:
num_frames_in_gof = 4;
temporal_idx[0] = 0;
- temporal_up_switch[0] = false;
+ temporal_up_switch[0] = true;
num_ref_pics[0] = 1;
pid_diff[0][0] = 4;
@@ -87,7 +87,7 @@
case kTemporalStructureMode4:
num_frames_in_gof = 8;
temporal_idx[0] = 0;
- temporal_up_switch[0] = false;
+ temporal_up_switch[0] = true;
num_ref_pics[0] = 1;
pid_diff[0][0] = 4;
@@ -97,12 +97,12 @@
pid_diff[1][0] = 1;
temporal_idx[2] = 1;
- temporal_up_switch[2] = true;
+ temporal_up_switch[2] = false;
num_ref_pics[2] = 1;
pid_diff[2][0] = 2;
temporal_idx[3] = 2;
- temporal_up_switch[3] = false;
+ temporal_up_switch[3] = true;
num_ref_pics[3] = 2;
pid_diff[3][0] = 1;
pid_diff[3][1] = 2;
@@ -113,7 +113,7 @@
pid_diff[4][0] = 4;
temporal_idx[5] = 2;
- temporal_up_switch[5] = false;
+ temporal_up_switch[5] = true;
num_ref_pics[5] = 2;
pid_diff[5][0] = 1;
pid_diff[5][1] = 2;
@@ -125,7 +125,7 @@
pid_diff[6][1] = 4;
temporal_idx[7] = 2;
- temporal_up_switch[7] = false;
+ temporal_up_switch[7] = true;
num_ref_pics[7] = 2;
pid_diff[7][0] = 1;
pid_diff[7][1] = 2;
@@ -195,7 +195,10 @@
uint8_t temporal_idx; // Temporal layer index, or kNoTemporalIdx.
uint8_t spatial_idx; // Spatial layer index, or kNoSpatialIdx.
bool temporal_up_switch; // True if upswitch to higher frame rate is possible
- // starting from this frame.
+ // meaning subsequent higher temporal layer pictures
+ // will not depend on any picture before the current
+ // picture (in coding order) with temporal layer ID
+ // greater than `temporal_idx` of this frame.
bool inter_layer_predicted; // Frame is dependent on directly lower spatial
// layer frame.
diff --git a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
index 0c3196c..99680cb 100644
--- a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
+++ b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc
@@ -959,7 +959,7 @@
const size_t gof_idx = (pics_since_key_ + 1) % gof_.num_frames_in_gof;
layer_id.temporal_layer_id = gof_.temporal_idx[gof_idx];
- if (VideoCodecMode::kScreensharing == codec_.mode) {
+ if (codec_.mode == VideoCodecMode::kScreensharing) {
const uint32_t frame_timestamp_ms =
1000 * input_image.timestamp() / kVideoPayloadTypeFrequency;
@@ -1212,8 +1212,7 @@
bool LibvpxVp9Encoder::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
absl::optional<int>* spatial_idx,
- const vpx_codec_cx_pkt& pkt,
- uint32_t timestamp) {
+ const vpx_codec_cx_pkt& pkt) {
RTC_CHECK(codec_specific != nullptr);
codec_specific->codecType = kVideoCodecVP9;
CodecSpecificInfoVP9* vp9_info = &(codec_specific->codecSpecific.VP9);
@@ -1248,9 +1247,6 @@
*spatial_idx = layer_id.spatial_layer_id;
}
- // TODO(asapersson): this info has to be obtained from the encoder.
- vp9_info->temporal_up_switch = false;
-
const bool is_key_pic = (pics_since_key_ == 0);
const bool is_inter_layer_pred_allowed =
(inter_layer_pred_ == InterLayerPredMode::kOn ||
@@ -1283,6 +1279,20 @@
vp9_info);
if (vp9_info->flexible_mode) {
vp9_info->gof_idx = kNoGofIdx;
+ if (!svc_controller_) {
+ if (num_temporal_layers_ == 1) {
+ vp9_info->temporal_up_switch = true;
+ } else {
+ // In flexible mode with > 1 temporal layer but no SVC controller we
+ // can't technically determine if a frame is an upswitch point, use
+ // gof-based data as proxy for now.
+ // TODO(sprang): Remove once SVC controller is the only choice.
+ vp9_info->gof_idx =
+ static_cast<uint8_t>(pics_since_key_ % gof_.num_frames_in_gof);
+ vp9_info->temporal_up_switch =
+ gof_.temporal_up_switch[vp9_info->gof_idx];
+ }
+ }
} else {
vp9_info->gof_idx =
static_cast<uint8_t>(pics_since_key_ % gof_.num_frames_in_gof);
@@ -1353,6 +1363,23 @@
svc_params_.scaling_factor_den[sid]);
}
}
+ if (is_flexible_mode_) {
+ // Populate data for legacy temporal-upswitch state.
+ // We can switch up to a higher temporal layer only if all temporal layers
+ // higher than this (within the current spatial layer) are switch points.
+ vp9_info->temporal_up_switch = true;
+ for (int i = layer_id.temporal_layer_id + 1; i < num_temporal_layers_;
+ ++i) {
+ // Assumes decode targets are always ordered first by spatial then by
+ // temporal id.
+ size_t dti_index =
+ (layer_id.spatial_layer_id * num_temporal_layers_) + i;
+ vp9_info->temporal_up_switch &=
+ (codec_specific->generic_frame_info
+ ->decode_target_indications[dti_index] ==
+ DecodeTargetIndication::kSwitch);
+ }
+ }
}
return true;
}
@@ -1428,8 +1455,6 @@
ref_buf_list.push_back(ref_buf_.at(0));
}
- size_t max_ref_temporal_layer_id = 0;
-
std::vector<size_t> ref_pid_list;
vp9_info->num_ref_pics = 0;
@@ -1461,9 +1486,6 @@
vp9_info->p_diff[vp9_info->num_ref_pics] = static_cast<uint8_t>(p_diff);
++vp9_info->num_ref_pics;
-
- max_ref_temporal_layer_id =
- std::max(max_ref_temporal_layer_id, ref_buf.temporal_layer_id);
} else {
RTC_DCHECK(inter_layer_predicted);
// RTP spec only allows to use previous spatial layer for inter-layer
@@ -1471,10 +1493,6 @@
RTC_DCHECK_EQ(ref_buf.spatial_layer_id + 1, layer_id.spatial_layer_id);
}
}
-
- vp9_info->temporal_up_switch =
- (max_ref_temporal_layer_id <
- static_cast<size_t>(layer_id.temporal_layer_id));
}
void LibvpxVp9Encoder::UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
@@ -1636,8 +1654,7 @@
codec_specific_ = {};
absl::optional<int> spatial_index;
- if (!PopulateCodecSpecific(&codec_specific_, &spatial_index, *pkt,
- input_image_->timestamp())) {
+ if (!PopulateCodecSpecific(&codec_specific_, &spatial_index, *pkt)) {
// Drop the frame.
encoded_image_.set_size(0);
return;
diff --git a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h
index d2f98c1..93b2a59 100644
--- a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h
+++ b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h
@@ -67,8 +67,7 @@
bool PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
absl::optional<int>* spatial_idx,
- const vpx_codec_cx_pkt& pkt,
- uint32_t timestamp);
+ const vpx_codec_cx_pkt& pkt);
void FillReferenceIndices(const vpx_codec_cx_pkt& pkt,
const size_t pic_num,
const bool inter_layer_predicted,
diff --git a/video/video_send_stream_tests.cc b/video/video_send_stream_tests.cc
index f948b03..a7ba243 100644
--- a/video/video_send_stream_tests.cc
+++ b/video/video_send_stream_tests.cc
@@ -3089,20 +3089,20 @@
void VerifyTemporalLayerStructure0(const RTPVideoHeaderVP9& vp9) const {
EXPECT_EQ(kNoTl0PicIdx, vp9.tl0_pic_idx);
EXPECT_EQ(kNoTemporalIdx, vp9.temporal_idx); // no tid
+ // Technically true, but layer indices not available.
EXPECT_FALSE(vp9.temporal_up_switch);
}
void VerifyTemporalLayerStructure1(const RTPVideoHeaderVP9& vp9) const {
EXPECT_NE(kNoTl0PicIdx, vp9.tl0_pic_idx);
EXPECT_EQ(0, vp9.temporal_idx); // 0,0,0,...
- EXPECT_FALSE(vp9.temporal_up_switch);
}
void VerifyTemporalLayerStructure2(const RTPVideoHeaderVP9& vp9) const {
EXPECT_NE(kNoTl0PicIdx, vp9.tl0_pic_idx);
EXPECT_GE(vp9.temporal_idx, 0); // 0,1,0,1,... (tid reset on I-frames).
EXPECT_LE(vp9.temporal_idx, 1);
- EXPECT_EQ(vp9.temporal_idx > 0, vp9.temporal_up_switch);
+ EXPECT_TRUE(vp9.temporal_up_switch);
if (IsNewPictureId(vp9)) {
uint8_t expected_tid =
(!vp9.inter_pic_predicted || last_vp9_.temporal_idx == 1) ? 0 : 1;
@@ -3116,18 +3116,16 @@
EXPECT_LE(vp9.temporal_idx, 2);
if (IsNewPictureId(vp9) && vp9.inter_pic_predicted) {
EXPECT_NE(vp9.temporal_idx, last_vp9_.temporal_idx);
+ EXPECT_TRUE(vp9.temporal_up_switch);
switch (vp9.temporal_idx) {
case 0:
- EXPECT_EQ(2, last_vp9_.temporal_idx);
- EXPECT_FALSE(vp9.temporal_up_switch);
+ EXPECT_EQ(last_vp9_.temporal_idx, 2);
break;
case 1:
- EXPECT_EQ(2, last_vp9_.temporal_idx);
- EXPECT_TRUE(vp9.temporal_up_switch);
+ EXPECT_EQ(last_vp9_.temporal_idx, 2);
break;
case 2:
EXPECT_LT(last_vp9_.temporal_idx, 2);
- EXPECT_TRUE(vp9.temporal_up_switch);
break;
}
}
@@ -3192,8 +3190,12 @@
EXPECT_FALSE(vp9.inter_pic_predicted); // P
if (!vp9.inter_pic_predicted) {
- EXPECT_TRUE(vp9.temporal_idx == 0 || vp9.temporal_idx == kNoTemporalIdx);
- EXPECT_FALSE(vp9.temporal_up_switch);
+ if (vp9.temporal_idx == kNoTemporalIdx) {
+ EXPECT_FALSE(vp9.temporal_up_switch);
+ } else {
+ EXPECT_EQ(vp9.temporal_idx, 0);
+ EXPECT_TRUE(vp9.temporal_up_switch);
+ }
}
}