diff --git a/modules/rtp_rtcp/BUILD.gn b/modules/rtp_rtcp/BUILD.gn index daaac94d68..099c0663d2 100644 --- a/modules/rtp_rtcp/BUILD.gn +++ b/modules/rtp_rtcp/BUILD.gn @@ -251,6 +251,7 @@ rtc_library("rtp_rtcp") { "../../api/rtc_event_log", "../../api/transport:field_trial_based_config", "../../api/transport:webrtc_key_value_config", + "../../api/transport/rtp:dependency_descriptor", "../../api/transport/rtp:rtp_source", "../../api/units:data_rate", "../../api/units:time_delta", @@ -332,6 +333,7 @@ rtc_library("rtp_video_header") { ] deps = [ "../../:webrtc_common", + "../../api/transport/rtp:dependency_descriptor", "../../api/video:video_frame", "../../api/video:video_frame_type", "../../api/video:video_rtp_headers", @@ -508,6 +510,7 @@ if (rtc_include_tests) { "../../api:transport_api", "../../api/rtc_event_log", "../../api/transport:field_trial_based_config", + "../../api/transport/rtp:dependency_descriptor", "../../api/units:timestamp", "../../api/video:encoded_image", "../../api/video:video_bitrate_allocation", @@ -518,6 +521,7 @@ if (rtc_include_tests) { "../../api/video_codecs:video_codecs_api", "../../call:rtp_receiver", "../../common_video", + "../../common_video/generic_frame_descriptor", "../../common_video/test:utilities", "../../logging:mocks", "../../rtc_base:checks", diff --git a/modules/rtp_rtcp/source/rtp_sender_video.cc b/modules/rtp_rtcp/source/rtp_sender_video.cc index 9779df1361..fc176c96cd 100644 --- a/modules/rtp_rtcp/source/rtp_sender_video.cc +++ b/modules/rtp_rtcp/source/rtp_sender_video.cc @@ -18,12 +18,15 @@ #include #include +#include "absl/memory/memory.h" #include "absl/strings/match.h" #include "api/crypto/frame_encryptor_interface.h" +#include "api/transport/rtp/dependency_descriptor.h" #include "modules/remote_bitrate_estimator/test/bwe_test_logging.h" #include "modules/rtp_rtcp/include/rtp_rtcp_defines.h" #include "modules/rtp_rtcp/source/absolute_capture_time_sender.h" #include "modules/rtp_rtcp/source/byte_io.h" +#include 
"modules/rtp_rtcp/source/rtp_dependency_descriptor_extension.h" #include "modules/rtp_rtcp/source/rtp_format.h" #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor_extension.h" #include "modules/rtp_rtcp/source/rtp_header_extensions.h" @@ -72,6 +75,7 @@ void AddRtpHeaderExtensions( const RTPVideoHeader& video_header, const absl::optional& playout_delay, const absl::optional& absolute_capture_time, + FrameDependencyStructure* video_structure, bool set_video_rotation, bool set_color_space, bool set_frame_marking, @@ -115,34 +119,71 @@ void AddRtpHeaderExtensions( } if (video_header.generic) { - RtpGenericFrameDescriptor generic_descriptor; - generic_descriptor.SetFirstPacketInSubFrame(first_packet); - generic_descriptor.SetLastPacketInSubFrame(last_packet); - generic_descriptor.SetDiscardable(video_header.generic->discardable); - - if (first_packet) { - generic_descriptor.SetFrameId( - static_cast(video_header.generic->frame_id)); + bool extension_is_set = false; + if (video_structure != nullptr) { + DependencyDescriptor descriptor; + descriptor.first_packet_in_frame = first_packet; + descriptor.last_packet_in_frame = last_packet; + descriptor.frame_number = video_header.generic->frame_id & 0xFFFF; + descriptor.frame_dependencies.spatial_id = + video_header.generic->spatial_index; + descriptor.frame_dependencies.temporal_id = + video_header.generic->temporal_index; for (int64_t dep : video_header.generic->dependencies) { - generic_descriptor.AddFrameDependencyDiff( + descriptor.frame_dependencies.frame_diffs.push_back( video_header.generic->frame_id - dep); } + descriptor.frame_dependencies.decode_target_indications = + video_header.generic->decode_target_indications; + RTC_DCHECK_EQ( + descriptor.frame_dependencies.decode_target_indications.size(), + video_structure->num_decode_targets); - uint8_t spatial_bimask = 1 << video_header.generic->spatial_index; - generic_descriptor.SetSpatialLayersBitmask(spatial_bimask); - - 
generic_descriptor.SetTemporalLayer(video_header.generic->temporal_index); - - if (video_header.frame_type == VideoFrameType::kVideoFrameKey) { - generic_descriptor.SetResolution(video_header.width, - video_header.height); + // To avoid an extra structure copy, temporarily share ownership of the + // video_structure with the dependency descriptor. + if (video_header.frame_type == VideoFrameType::kVideoFrameKey && + first_packet) { + descriptor.attached_structure = absl::WrapUnique(video_structure); } + extension_is_set = packet->SetExtension( + *video_structure, descriptor); + + // Remove the temporary shared ownership. + descriptor.attached_structure.release(); } - if (!packet->SetExtension( - generic_descriptor)) { - packet->SetExtension( - generic_descriptor); + // Do not use v0/v1 generic frame descriptor when v2 is stored. + if (!extension_is_set) { + RtpGenericFrameDescriptor generic_descriptor; + generic_descriptor.SetFirstPacketInSubFrame(first_packet); + generic_descriptor.SetLastPacketInSubFrame(last_packet); + generic_descriptor.SetDiscardable(video_header.generic->discardable); + + if (first_packet) { + generic_descriptor.SetFrameId( + static_cast(video_header.generic->frame_id)); + for (int64_t dep : video_header.generic->dependencies) { + generic_descriptor.AddFrameDependencyDiff( + video_header.generic->frame_id - dep); + } + + uint8_t spatial_bimask = 1 << video_header.generic->spatial_index; + generic_descriptor.SetSpatialLayersBitmask(spatial_bimask); + + generic_descriptor.SetTemporalLayer( + video_header.generic->temporal_index); + + if (video_header.frame_type == VideoFrameType::kVideoFrameKey) { + generic_descriptor.SetResolution(video_header.width, + video_header.height); + } + } + + if (!packet->SetExtension( + generic_descriptor)) { + packet->SetExtension( + generic_descriptor); + } } } } @@ -417,6 +458,38 @@ absl::optional RTPSenderVideo::FlexfecSsrc() const { return absl::nullopt; } +void RTPSenderVideo::SetVideoStructure( + const 
FrameDependencyStructure* video_structure) { + RTC_DCHECK_RUNS_SERIALIZED(&send_checker_); + if (video_structure == nullptr) { + video_structure_ = nullptr; + return; + } + // Simple sanity checks that the video structure is set up. + RTC_DCHECK_GT(video_structure->num_decode_targets, 0); + RTC_DCHECK_GT(video_structure->templates.size(), 0); + + int structure_id = 0; + if (video_structure_) { + if (*video_structure_ == *video_structure) { + // Same structure (just a new key frame), no update required. + return; + } + // When setting a different video structure, make sure structure_id is updated + // so that templates from different structures do not collide. + static constexpr int kMaxTemplates = 64; + structure_id = + (video_structure_->structure_id + video_structure_->templates.size()) % + kMaxTemplates; + } + + video_structure_ = + std::make_unique(*video_structure); + video_structure_->structure_id = structure_id; + // TODO(bugs.webrtc.org/10342): Support chains. + video_structure_->num_chains = 0; +} + bool RTPSenderVideo::SendVideo( int payload_type, absl::optional codec_type, @@ -523,16 +596,20 @@ bool RTPSenderVideo::SendVideo( auto last_packet = std::make_unique(*single_packet); // Simplest way to estimate how much extensions would occupy is to set them. 
AddRtpHeaderExtensions(video_header, playout_delay, absolute_capture_time, - set_video_rotation, set_color_space, set_frame_marking, + video_structure_.get(), set_video_rotation, + set_color_space, set_frame_marking, /*first=*/true, /*last=*/true, single_packet.get()); AddRtpHeaderExtensions(video_header, playout_delay, absolute_capture_time, - set_video_rotation, set_color_space, set_frame_marking, + video_structure_.get(), set_video_rotation, + set_color_space, set_frame_marking, /*first=*/true, /*last=*/false, first_packet.get()); AddRtpHeaderExtensions(video_header, playout_delay, absolute_capture_time, - set_video_rotation, set_color_space, set_frame_marking, + video_structure_.get(), set_video_rotation, + set_color_space, set_frame_marking, /*first=*/false, /*last=*/false, middle_packet.get()); AddRtpHeaderExtensions(video_header, playout_delay, absolute_capture_time, - set_video_rotation, set_color_space, set_frame_marking, + video_structure_.get(), set_video_rotation, + set_color_space, set_frame_marking, /*first=*/false, /*last=*/true, last_packet.get()); RTC_DCHECK_GT(packet_capacity, single_packet->headers_size()); diff --git a/modules/rtp_rtcp/source/rtp_sender_video.h b/modules/rtp_rtcp/source/rtp_sender_video.h index 3f4c676435..053877ef28 100644 --- a/modules/rtp_rtcp/source/rtp_sender_video.h +++ b/modules/rtp_rtcp/source/rtp_sender_video.h @@ -18,6 +18,7 @@ #include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "api/array_view.h" +#include "api/transport/rtp/dependency_descriptor.h" #include "api/video/video_codec_type.h" #include "api/video/video_frame_type.h" #include "modules/include/module_common_types.h" @@ -103,6 +104,13 @@ class RTPSenderVideo { const RTPFragmentationHeader* fragmentation, RTPVideoHeader video_header, absl::optional expected_retransmission_time_ms); + // Configures video structures produced by encoder to send using the + // dependency descriptor rtp header extension. 
Next call to SendVideo should + // have video_header.frame_type == kVideoFrameKey. + // All calls to SendVideo after this call must use video_header compatible + // with the video_structure. + void SetVideoStructure(const FrameDependencyStructure* video_structure); + // FlexFEC/ULPFEC. // Set FEC rates, max frames before FEC is sent, and type of FEC masks. // Returns false on failure. @@ -184,6 +192,8 @@ class RTPSenderVideo { VideoRotation last_rotation_ RTC_GUARDED_BY(send_checker_); absl::optional last_color_space_ RTC_GUARDED_BY(send_checker_); bool transmit_color_space_next_frame_ RTC_GUARDED_BY(send_checker_); + std::unique_ptr video_structure_ + RTC_GUARDED_BY(send_checker_); // Tracks the current request for playout delay limits from application // and decides whether the current RTP frame should include the playout diff --git a/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc b/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc index 7ccd0ac028..867e05b60d 100644 --- a/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc +++ b/modules/rtp_rtcp/source/rtp_sender_video_unittest.cc @@ -10,15 +10,20 @@ #include "modules/rtp_rtcp/source/rtp_sender_video.h" +#include #include +#include #include +#include "api/transport/rtp/dependency_descriptor.h" #include "api/video/video_codec_constants.h" #include "api/video/video_timing.h" +#include "common_video/generic_frame_descriptor/generic_frame_info.h" #include "modules/rtp_rtcp/include/rtp_cvo.h" #include "modules/rtp_rtcp/include/rtp_header_extension_map.h" #include "modules/rtp_rtcp/include/rtp_rtcp.h" #include "modules/rtp_rtcp/include/rtp_rtcp_defines.h" +#include "modules/rtp_rtcp/source/rtp_dependency_descriptor_extension.h" #include "modules/rtp_rtcp/source/rtp_format_video_generic.h" #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor.h" #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor_extension.h" @@ -35,12 +40,15 @@ namespace webrtc { namespace { using ::testing::ElementsAre; 
+using ::testing::IsEmpty; +using ::testing::SizeIs; enum : int { // The first valid value is 1. kAbsoluteSendTimeExtensionId = 1, kFrameMarkingExtensionId, kGenericDescriptorId00, kGenericDescriptorId01, + kGenericDescriptorId02, kTransmissionTimeOffsetExtensionId, kTransportSequenceNumberExtensionId, kVideoRotationExtensionId, @@ -73,6 +81,8 @@ class LoopbackTransportTest : public webrtc::Transport { kGenericDescriptorId00); receivers_extensions_.Register( kGenericDescriptorId01); + receivers_extensions_.Register( + kGenericDescriptorId02); receivers_extensions_.Register( kFrameMarkingExtensionId); receivers_extensions_.Register( @@ -522,6 +532,148 @@ TEST_P(RtpSenderVideoTest, ConditionalRetransmitLimit) { EXPECT_TRUE(rtp_sender_video_.AllowRetransmission(header, kSettings, kRttMs)); } +TEST_P(RtpSenderVideoTest, SendsDependencyDescriptorWhenVideoStructureIsSet) { + const int64_t kFrameId = 100000; + uint8_t kFrame[100]; + rtp_module_->RegisterRtpHeaderExtension( + RtpDependencyDescriptorExtension::kUri, kGenericDescriptorId02); + FrameDependencyStructure video_structure; + video_structure.num_decode_targets = 2; + video_structure.templates = { + GenericFrameInfo::Builder().S(0).T(0).Dtis("SS").Build(), + GenericFrameInfo::Builder().S(1).T(0).Dtis("-S").Build(), + GenericFrameInfo::Builder().S(1).T(1).Dtis("-D").Build(), + }; + rtp_sender_video_.SetVideoStructure(&video_structure); + + // Send key frame. 
+ RTPVideoHeader hdr; + RTPVideoHeader::GenericDescriptorInfo& generic = hdr.generic.emplace(); + generic.frame_id = kFrameId; + generic.temporal_index = 0; + generic.spatial_index = 0; + generic.decode_target_indications = {DecodeTargetIndication::kSwitch, + DecodeTargetIndication::kSwitch}; + hdr.frame_type = VideoFrameType::kVideoFrameKey; + rtp_sender_video_.SendVideo(kPayload, kType, kTimestamp, 0, kFrame, nullptr, + hdr, kDefaultExpectedRetransmissionTimeMs); + + ASSERT_EQ(transport_.packets_sent(), 1); + DependencyDescriptor descriptor_key; + ASSERT_TRUE(transport_.last_sent_packet() + .GetExtension( + nullptr, &descriptor_key)); + ASSERT_TRUE(descriptor_key.attached_structure); + EXPECT_EQ(descriptor_key.attached_structure->num_decode_targets, 2); + EXPECT_THAT(descriptor_key.attached_structure->templates, SizeIs(3)); + EXPECT_EQ(descriptor_key.frame_number, kFrameId & 0xFFFF); + EXPECT_EQ(descriptor_key.frame_dependencies.spatial_id, 0); + EXPECT_EQ(descriptor_key.frame_dependencies.temporal_id, 0); + EXPECT_EQ(descriptor_key.frame_dependencies.decode_target_indications, + generic.decode_target_indications); + EXPECT_THAT(descriptor_key.frame_dependencies.frame_diffs, IsEmpty()); + + // Send delta frame. 
+ generic.frame_id = kFrameId + 1; + generic.temporal_index = 1; + generic.spatial_index = 1; + generic.dependencies = {kFrameId, kFrameId - 500}; + generic.decode_target_indications = {DecodeTargetIndication::kNotPresent, + DecodeTargetIndication::kRequired}; + hdr.frame_type = VideoFrameType::kVideoFrameDelta; + rtp_sender_video_.SendVideo(kPayload, kType, kTimestamp, 0, kFrame, nullptr, + hdr, kDefaultExpectedRetransmissionTimeMs); + + EXPECT_EQ(transport_.packets_sent(), 2); + DependencyDescriptor descriptor_delta; + ASSERT_TRUE( + transport_.last_sent_packet() + .GetExtension( + descriptor_key.attached_structure.get(), &descriptor_delta)); + EXPECT_EQ(descriptor_delta.attached_structure, nullptr); + EXPECT_EQ(descriptor_delta.frame_number, (kFrameId + 1) & 0xFFFF); + EXPECT_EQ(descriptor_delta.frame_dependencies.spatial_id, 1); + EXPECT_EQ(descriptor_delta.frame_dependencies.temporal_id, 1); + EXPECT_EQ(descriptor_delta.frame_dependencies.decode_target_indications, + generic.decode_target_indications); + EXPECT_THAT(descriptor_delta.frame_dependencies.frame_diffs, + ElementsAre(1, 501)); +} + +TEST_P(RtpSenderVideoTest, + SetDiffentVideoStructureAvoidsCollisionWithThePreviousStructure) { + const int64_t kFrameId = 100000; + uint8_t kFrame[100]; + rtp_module_->RegisterRtpHeaderExtension( + RtpDependencyDescriptorExtension::kUri, kGenericDescriptorId02); + FrameDependencyStructure video_structure1; + video_structure1.num_decode_targets = 2; + video_structure1.templates = { + GenericFrameInfo::Builder().S(0).T(0).Dtis("SS").Build(), + GenericFrameInfo::Builder().S(0).T(1).Dtis("D-").Build(), + }; + FrameDependencyStructure video_structure2; + video_structure2.num_decode_targets = 2; + video_structure2.templates = { + GenericFrameInfo::Builder().S(0).T(0).Dtis("SS").Build(), + GenericFrameInfo::Builder().S(0).T(1).Dtis("R-").Build(), + }; + + // Send 1st key frame. 
+ RTPVideoHeader hdr; + RTPVideoHeader::GenericDescriptorInfo& generic = hdr.generic.emplace(); + generic.frame_id = kFrameId; + generic.decode_target_indications = {DecodeTargetIndication::kSwitch, + DecodeTargetIndication::kSwitch}; + hdr.frame_type = VideoFrameType::kVideoFrameKey; + rtp_sender_video_.SetVideoStructure(&video_structure1); + rtp_sender_video_.SendVideo(kPayload, kType, kTimestamp, 0, kFrame, nullptr, + hdr, kDefaultExpectedRetransmissionTimeMs); + // Parse 1st extension. + ASSERT_EQ(transport_.packets_sent(), 1); + DependencyDescriptor descriptor_key1; + ASSERT_TRUE(transport_.last_sent_packet() + .GetExtension( + nullptr, &descriptor_key1)); + ASSERT_TRUE(descriptor_key1.attached_structure); + + // Send the delta frame. + generic.frame_id = kFrameId + 1; + generic.temporal_index = 1; + generic.decode_target_indications = {DecodeTargetIndication::kDiscardable, + DecodeTargetIndication::kNotPresent}; + hdr.frame_type = VideoFrameType::kVideoFrameDelta; + rtp_sender_video_.SendVideo(kPayload, kType, kTimestamp, 0, kFrame, nullptr, + hdr, kDefaultExpectedRetransmissionTimeMs); + + ASSERT_EQ(transport_.packets_sent(), 2); + RtpPacket delta_packet = transport_.last_sent_packet(); + + // Send 2nd key frame. + generic.frame_id = kFrameId + 2; + generic.decode_target_indications = {DecodeTargetIndication::kSwitch, + DecodeTargetIndication::kSwitch}; + hdr.frame_type = VideoFrameType::kVideoFrameKey; + rtp_sender_video_.SetVideoStructure(&video_structure2); + rtp_sender_video_.SendVideo(kPayload, kType, kTimestamp, 0, kFrame, nullptr, + hdr, kDefaultExpectedRetransmissionTimeMs); + // Parse the 2nd key frame. + ASSERT_EQ(transport_.packets_sent(), 3); + DependencyDescriptor descriptor_key2; + ASSERT_TRUE(transport_.last_sent_packet() + .GetExtension( + nullptr, &descriptor_key2)); + ASSERT_TRUE(descriptor_key2.attached_structure); + + // Try to parse the 1st delta frame. 
It should be parseable using the structure + // from the 1st key frame, but not using the structure from the 2nd key frame. + DependencyDescriptor descriptor_delta; + EXPECT_TRUE(delta_packet.GetExtension( + descriptor_key1.attached_structure.get(), &descriptor_delta)); + EXPECT_FALSE(delta_packet.GetExtension( + descriptor_key2.attached_structure.get(), &descriptor_delta)); +} + void RtpSenderVideoTest::PopulateGenericFrameDescriptor(int version) { const absl::string_view ext_uri = (version == 0) ? RtpGenericFrameDescriptorExtension00::kUri diff --git a/modules/rtp_rtcp/source/rtp_video_header.h b/modules/rtp_rtcp/source/rtp_video_header.h index b66cba8404..714d1eb08c 100644 --- a/modules/rtp_rtcp/source/rtp_video_header.h +++ b/modules/rtp_rtcp/source/rtp_video_header.h @@ -15,6 +15,7 @@ #include "absl/container/inlined_vector.h" #include "absl/types/optional.h" #include "absl/types/variant.h" +#include "api/transport/rtp/dependency_descriptor.h" #include "api/video/color_space.h" #include "api/video/video_codec_type.h" #include "api/video/video_content_type.h" @@ -50,6 +51,7 @@ struct RTPVideoHeader { int64_t frame_id = 0; int spatial_index = 0; int temporal_index = 0; + absl::InlinedVector decode_target_indications; absl::InlinedVector dependencies; bool discardable = false; };