// blob: 3d041ca218694977c52e94576643e1c8584f3543 [file] [log] [blame]
/*
* Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "api/video/rtp_video_frame_assembler.h"
#include <algorithm>
#include <cstdint>
#include <map>
#include <memory>
#include <utility>
#include <vector>
#include "absl/container/inlined_vector.h"
#include "absl/types/optional.h"
#include "modules/rtp_rtcp/source/frame_object.h"
#include "modules/rtp_rtcp/source/rtp_dependency_descriptor_extension.h"
#include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor_extension.h"
#include "modules/rtp_rtcp/source/rtp_packet_received.h"
#include "modules/rtp_rtcp/source/video_rtp_depacketizer_av1.h"
#include "modules/rtp_rtcp/source/video_rtp_depacketizer_generic.h"
#include "modules/rtp_rtcp/source/video_rtp_depacketizer_h264.h"
#include "modules/rtp_rtcp/source/video_rtp_depacketizer_raw.h"
#include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp8.h"
#include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h"
#include "modules/video_coding/packet_buffer.h"
#include "modules/video_coding/rtp_frame_reference_finder.h"
#include "rtc_base/logging.h"
#include "rtc_base/numerics/sequence_number_unwrapper.h"
namespace webrtc {
namespace {
// Returns a newly allocated depacketizer matching `payload_format`.
//
// The switch deliberately has no `default` label; a value that matches none
// of the listed enumerators falls through to the DCHECK below and yields
// nullptr.
std::unique_ptr<VideoRtpDepacketizer> CreateDepacketizer(
    RtpVideoFrameAssembler::PayloadFormat payload_format) {
  using Assembler = RtpVideoFrameAssembler;

  switch (payload_format) {
    case Assembler::kRaw: {
      return std::make_unique<VideoRtpDepacketizerRaw>();
    }
    case Assembler::kH264: {
      return std::make_unique<VideoRtpDepacketizerH264>();
    }
    case Assembler::kVp8: {
      return std::make_unique<VideoRtpDepacketizerVp8>();
    }
    case Assembler::kVp9: {
      return std::make_unique<VideoRtpDepacketizerVp9>();
    }
    case Assembler::kAv1: {
      return std::make_unique<VideoRtpDepacketizerAv1>();
    }
    case Assembler::kGeneric: {
      return std::make_unique<VideoRtpDepacketizerGeneric>();
    }
  }

  // Only reached when `payload_format` holds a value outside the cases above.
  RTC_DCHECK_NOTREACHED();
  return nullptr;
}
} // namespace
// Private implementation of RtpVideoFrameAssembler.
//
// Holds the depacketizer, the packet buffer and the frame reference finder,
// plus the state needed to interpret the dependency descriptor / generic frame
// descriptor RTP header extensions.
class RtpVideoFrameAssembler::Impl {
public:
// Takes ownership of `depacketizer`, which is used to parse every payload and
// to assemble the bitstream of each complete frame.
explicit Impl(std::unique_ptr<VideoRtpDepacketizer> depacketizer);
~Impl() = default;
// Feeds one received RTP packet into the assembler and returns the frames
// that became available as a result (possibly none).
FrameVector InsertPacket(const RtpPacketReceived& packet);
private:
// Frames that have been assembled from packets but have not yet been run
// through the reference finder.
using RtpFrameVector =
absl::InlinedVector<std::unique_ptr<RtpFrameObject>, 3>;
// Turns the packets handed back by the packet buffer into RtpFrameObjects.
RtpFrameVector AssembleFrames(
video_coding::PacketBuffer::InsertResult insert_result);
// Passes `frames` to the reference finder and collects the frames it returns.
FrameVector FindReferences(RtpFrameVector frames);
// Handles a packet without payload: notifies both the packet buffer and the
// reference finder about padding at `seq_num`.
FrameVector UpdateWithPadding(uint16_t seq_num);
// Fill `video_header` from the corresponding RTP header extension. Return
// false when the packet should be dropped.
bool ParseDependenciesDescriptorExtension(const RtpPacketReceived& rtp_packet,
RTPVideoHeader& video_header);
bool ParseGenericDescriptorExtension(const RtpPacketReceived& rtp_packet,
RTPVideoHeader& video_header);
// Asks the packet buffer and the reference finder to clear state up to a
// sequence number a fixed distance behind `incoming_seq_num`.
void ClearOldData(uint16_t incoming_seq_num);
// Most recent dependency structure taken from a dependency descriptor; passed
// back in when parsing the descriptors of later packets.
std::unique_ptr<FrameDependencyStructure> video_structure_;
// Unwraps the 16-bit frame numbers carried in the descriptors into int64_t
// frame ids.
SeqNumUnwrapper<uint16_t> frame_id_unwrapper_;
// Unwrapped frame id of the frame that delivered `video_structure_`; unset
// until a structure has been stored.
absl::optional<int64_t> video_structure_frame_id_;
// NOTE: members are initialized in declaration order; do not reorder the
// members below without checking the constructor.
std::unique_ptr<VideoRtpDepacketizer> depacketizer_;
video_coding::PacketBuffer packet_buffer_;
RtpFrameReferenceFinder reference_finder_;
};
namespace {
// Value handed to the packet buffer as both its start size and its max size.
constexpr int kAssemblerPacketBufferSize = 2048;
}  // namespace

// Stores `depacketizer` and sets up the packet buffer; the remaining members
// are default-initialized.
RtpVideoFrameAssembler::Impl::Impl(
    std::unique_ptr<VideoRtpDepacketizer> depacketizer)
    : depacketizer_(std::move(depacketizer)),
      packet_buffer_(/*start_buffer_size=*/kAssemblerPacketBufferSize,
                     /*max_buffer_size=*/kAssemblerPacketBufferSize) {}
// Processes one received RTP packet.
//
// A packet with an empty payload is treated as padding. Otherwise the payload
// is depacketized, the video header is completed from the dependency
// descriptor or generic frame descriptor extension (when present), and the
// packet is inserted into the packet buffer. Returns every frame that became
// available, or an empty vector if the packet could not be parsed.
RtpVideoFrameAssembler::FrameVector RtpVideoFrameAssembler::Impl::InsertPacket(
    const RtpPacketReceived& rtp_packet) {
  const uint16_t seq_num = rtp_packet.SequenceNumber();

  // No payload: nothing to depacketize, only bookkeeping for padding.
  if (rtp_packet.payload_size() == 0) {
    ClearOldData(seq_num);
    return UpdateWithPadding(seq_num);
  }

  absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> parsed_payload =
      depacketizer_->Parse(rtp_packet.PayloadBuffer());
  if (!parsed_payload.has_value()) {
    return {};
  }
  RTPVideoHeader& header = parsed_payload->video_header;

  // The dependency descriptor takes precedence over the generic frame
  // descriptor when a packet carries both.
  bool descriptor_ok = true;
  if (rtp_packet.HasExtension<RtpDependencyDescriptorExtension>()) {
    descriptor_ok = ParseDependenciesDescriptorExtension(rtp_packet, header);
  } else if (rtp_packet.HasExtension<RtpGenericFrameDescriptorExtension00>()) {
    descriptor_ok = ParseGenericDescriptorExtension(rtp_packet, header);
  }
  if (!descriptor_ok) {
    return {};
  }

  // The RTP marker bit also terminates a frame.
  if (rtp_packet.Marker()) {
    header.is_last_packet_in_frame = true;
  }

  auto buffered_packet =
      std::make_unique<video_coding::PacketBuffer::Packet>(rtp_packet, header);
  buffered_packet->video_payload = std::move(parsed_payload->video_payload);

  ClearOldData(seq_num);

  video_coding::PacketBuffer::InsertResult inserted =
      packet_buffer_.InsertPacket(std::move(buffered_packet));
  return FindReferences(AssembleFrames(std::move(inserted)));
}
// Asks the packet buffer and the reference finder to clear their state up to
// the sequence number lying `kOldSeqNumThreshold` behind `incoming_seq_num`.
void RtpVideoFrameAssembler::Impl::ClearOldData(uint16_t incoming_seq_num) {
  constexpr uint16_t kOldSeqNumThreshold = 2000;

  // The conversion back to uint16_t makes the subtraction wrap modulo 2^16,
  // so sequence numbers below the threshold map to the top of the range.
  const auto clear_to_seq_num =
      static_cast<uint16_t>(incoming_seq_num - kOldSeqNumThreshold);

  packet_buffer_.ClearTo(clear_to_seq_num);
  reference_finder_.ClearTo(clear_to_seq_num);
}
// Builds RtpFrameObjects out of the packets returned by the packet buffer.
//
// Packets are walked in order. Payloads are collected from each packet flagged
// as first-in-frame up to the packet flagged as last-in-frame, at which point
// the depacketizer assembles the bitstream. Frames for which the depacketizer
// returns no bitstream are skipped.
RtpVideoFrameAssembler::Impl::RtpFrameVector
RtpVideoFrameAssembler::Impl::AssembleFrames(
    video_coding::PacketBuffer::InsertResult insert_result) {
  using Packet = video_coding::PacketBuffer::Packet;

  Packet* frame_start = nullptr;
  std::vector<rtc::ArrayView<const uint8_t>> frame_payloads;
  RtpFrameVector assembled;

  for (auto& current : insert_result.packets) {
    if (current->is_first_packet_in_frame()) {
      // A new frame begins: drop whatever was gathered before.
      frame_start = current.get();
      frame_payloads.clear();
    }
    frame_payloads.emplace_back(current->video_payload);

    if (!current->is_last_packet_in_frame()) {
      continue;
    }

    rtc::scoped_refptr<EncodedImageBuffer> bitstream =
        depacketizer_->AssembleFrame(frame_payloads);
    if (!bitstream) {
      continue;
    }

    // Some fields are taken from the first packet of the frame and others
    // from the last; the receive-time, NACK and NTP fields are zeroed.
    const Packet& frame_end = *current;
    assembled.push_back(std::make_unique<RtpFrameObject>(
        frame_start->seq_num,                 //
        frame_end.seq_num,                    //
        frame_end.marker_bit,                 //
        /*times_nacked=*/0,                   //
        /*first_packet_received_time=*/0,     //
        /*last_packet_received_time=*/0,      //
        frame_start->timestamp,               //
        /*ntp_time_ms=*/0,                    //
        /*timing=*/VideoSendTiming(),         //
        frame_start->payload_type,            //
        frame_start->codec(),                 //
        frame_end.video_header.rotation,      //
        frame_end.video_header.content_type,  //
        frame_start->video_header,            //
        frame_end.video_header.color_space,   //
        /*packet_infos=*/RtpPacketInfos(),    //
        std::move(bitstream)));
  }

  return assembled;
}
// Hands each assembled frame to the reference finder and wraps every frame it
// returns together with the frame's first and last RTP sequence numbers.
RtpVideoFrameAssembler::FrameVector
RtpVideoFrameAssembler::Impl::FindReferences(RtpFrameVector frames) {
  FrameVector output;

  for (auto& assembled_frame : frames) {
    for (std::unique_ptr<RtpFrameObject>& ready :
         reference_finder_.ManageFrame(std::move(assembled_frame))) {
      // Read the sequence numbers before ownership of the frame is given up.
      const uint16_t first_seq_num = ready->first_seq_num();
      const uint16_t last_seq_num = ready->last_seq_num();
      output.emplace_back(first_seq_num, last_seq_num, std::move(ready));
    }
  }

  return output;
}
// Handles padding received at `seq_num`.
//
// The packet buffer is informed first and any frames it yields are assembled
// and run through the reference finder; only then is the reference finder
// itself told about the padding. Frames from both steps are returned, in that
// order.
RtpVideoFrameAssembler::FrameVector
RtpVideoFrameAssembler::Impl::UpdateWithPadding(uint16_t seq_num) {
  video_coding::PacketBuffer::InsertResult from_buffer =
      packet_buffer_.InsertPadding(seq_num);
  FrameVector output = FindReferences(AssembleFrames(std::move(from_buffer)));

  auto from_reference_finder = reference_finder_.PaddingReceived(seq_num);
  for (std::unique_ptr<RtpFrameObject>& ready : from_reference_finder) {
    // Read the sequence numbers before ownership of the frame is given up.
    const uint16_t first_seq_num = ready->first_seq_num();
    const uint16_t last_seq_num = ready->last_seq_num();
    output.emplace_back(first_seq_num, last_seq_num, std::move(ready));
  }

  return output;
}
// Fills `video_header` from the dependency descriptor extension carried by
// `rtp_packet`.
//
// Returns false, meaning the caller should drop the packet, when:
//  * the descriptor cannot be parsed with the currently held structure,
//  * a structure is attached to a packet that is not first in its frame, or
//  * the packet carries a structure but its frame id is older than the frame
//    id of the structure already held.
// In the last case `video_header` has already been partially written and the
// frame id unwrapper has already been advanced when false is returned.
//
// On success, a packet with an attached structure is marked as a key frame and
// its structure replaces `video_structure_`; any other packet is marked as a
// delta frame.
bool RtpVideoFrameAssembler::Impl::ParseDependenciesDescriptorExtension(
    const RtpPacketReceived& rtp_packet,
    RTPVideoHeader& video_header) {
  webrtc::DependencyDescriptor dependency_descriptor;

  if (!rtp_packet.GetExtension<RtpDependencyDescriptorExtension>(
          video_structure_.get(), &dependency_descriptor)) {
    // Descriptor is either malformed, or the template referenced is not in
    // the `video_structure_` currently being held.
    // TODO(bugs.webrtc.org/10342): Improve packet reordering behavior.
    RTC_LOG(LS_WARNING) << "ssrc: " << rtp_packet.Ssrc()
                        << " Failed to parse dependency descriptor.";
    return false;
  }

  if (dependency_descriptor.attached_structure != nullptr &&
      !dependency_descriptor.first_packet_in_frame) {
    // Leading space keeps the ssrc value separated from the message, matching
    // the warning above.
    RTC_LOG(LS_WARNING) << "ssrc: " << rtp_packet.Ssrc()
                        << " Invalid dependency descriptor: structure "
                           "attached to non first packet of a frame.";
    return false;
  }

  video_header.is_first_packet_in_frame =
      dependency_descriptor.first_packet_in_frame;
  video_header.is_last_packet_in_frame =
      dependency_descriptor.last_packet_in_frame;

  int64_t frame_id =
      frame_id_unwrapper_.Unwrap(dependency_descriptor.frame_number);
  auto& generic_descriptor_info = video_header.generic.emplace();
  generic_descriptor_info.frame_id = frame_id;
  generic_descriptor_info.spatial_index =
      dependency_descriptor.frame_dependencies.spatial_id;
  generic_descriptor_info.temporal_index =
      dependency_descriptor.frame_dependencies.temporal_id;

  // Dependencies are stored as absolute frame ids rather than as differences.
  for (int fdiff : dependency_descriptor.frame_dependencies.frame_diffs) {
    generic_descriptor_info.dependencies.push_back(frame_id - fdiff);
  }
  // NOTE(review): this stores `frame_id - cdiff` in a field named
  // `chain_diffs`; confirm against consumers of GenericDescriptorInfo whether
  // the raw diff is expected here instead. Behavior is intentionally left
  // unchanged.
  for (int cdiff : dependency_descriptor.frame_dependencies.chain_diffs) {
    generic_descriptor_info.chain_diffs.push_back(frame_id - cdiff);
  }
  generic_descriptor_info.decode_target_indications =
      dependency_descriptor.frame_dependencies.decode_target_indications;

  if (dependency_descriptor.resolution) {
    video_header.width = dependency_descriptor.resolution->Width();
    video_header.height = dependency_descriptor.resolution->Height();
  }
  if (dependency_descriptor.active_decode_targets_bitmask.has_value()) {
    generic_descriptor_info.active_decode_targets =
        *dependency_descriptor.active_decode_targets_bitmask;
  }

  // FrameDependencyStructure is sent in the dependency descriptor of the first
  // packet of a key frame and is required to parse all subsequent packets until
  // the next key frame.
  if (dependency_descriptor.attached_structure) {
    RTC_DCHECK(dependency_descriptor.first_packet_in_frame);
    // An unset `video_structure_frame_id_` never compares greater, so the very
    // first structure is always accepted. When the comparison is true both
    // `video_structure_frame_id_` and `video_structure_` are populated (they
    // are only ever assigned together below), so dereferencing them is safe.
    if (video_structure_frame_id_ > frame_id) {
      RTC_LOG(LS_WARNING)
          << "Arrived key frame with id " << frame_id << " and structure id "
          << dependency_descriptor.attached_structure->structure_id
          << " is older than the latest received key frame with id "
          << *video_structure_frame_id_ << " and structure id "
          << video_structure_->structure_id;
      return false;
    }
    video_structure_ = std::move(dependency_descriptor.attached_structure);
    video_structure_frame_id_ = frame_id;
    video_header.frame_type = VideoFrameType::kVideoFrameKey;
  } else {
    video_header.frame_type = VideoFrameType::kVideoFrameDelta;
  }
  return true;
}
bool RtpVideoFrameAssembler::Impl::ParseGenericDescriptorExtension(
const RtpPacketReceived& rtp_packet,
RTPVideoHeader& video_header) {
RtpGenericFrameDescriptor generic_frame_descriptor;
if (!rtp_packet.GetExtension<RtpGenericFrameDescriptorExtension00>(
&generic_frame_descriptor)) {
return false;
}
video_header.is_first_packet_in_frame =
generic_frame_descriptor.FirstPacketInSubFrame();
video_header.is_last_packet_in_frame =
generic_frame_descriptor.LastPacketInSubFrame();
if (generic_frame_descriptor.FirstPacketInSubFrame()) {
video_header.frame_type =
generic_frame_descriptor.FrameDependenciesDiffs().empty()
? VideoFrameType::kVideoFrameKey
: VideoFrameType::kVideoFrameDelta;
auto& generic_descriptor_info = video_header.generic.emplace();
int64_t frame_id =
frame_id_unwrapper_.Unwrap(generic_frame_descriptor.FrameId());
generic_descriptor_info.frame_id = frame_id;
generic_descriptor_info.spatial_index =
generic_frame_descriptor.SpatialLayer();
generic_descriptor_info.temporal_index =
generic_frame_descriptor.TemporalLayer();
for (uint16_t fdiff : generic_frame_descriptor.FrameDependenciesDiffs()) {
generic_descriptor_info.dependencies.push_back(frame_id - fdiff);
}
}
video_header.width = generic_frame_descriptor.Width();
video_header.height = generic_frame_descriptor.Height();
return true;
}
// Creates the implementation object, equipped with the depacketizer that
// CreateDepacketizer() returns for `payload_format`.
RtpVideoFrameAssembler::RtpVideoFrameAssembler(PayloadFormat payload_format)
: impl_(std::make_unique<Impl>(CreateDepacketizer(payload_format))) {}
// Defaulted here, in the file where Impl is fully defined.
// NOTE(review): presumably required because the header only forward-declares
// Impl — confirm against the header.
RtpVideoFrameAssembler::~RtpVideoFrameAssembler() = default;
// Public entry point: forwards `packet` to the implementation object and
// returns whatever frames it produced.
RtpVideoFrameAssembler::FrameVector RtpVideoFrameAssembler::InsertPacket(
    const RtpPacketReceived& packet) {
  FrameVector frames = impl_->InsertPacket(packet);
  return frames;
}
} // namespace webrtc