diff --git a/video/rtp_video_stream_receiver.cc b/video/rtp_video_stream_receiver.cc index e78bc58774..52f1014746 100644 --- a/video/rtp_video_stream_receiver.cc +++ b/video/rtp_video_stream_receiver.cc @@ -27,6 +27,7 @@ #include "modules/rtp_rtcp/include/rtp_rtcp.h" #include "modules/rtp_rtcp/include/ulpfec_receiver.h" #include "modules/rtp_rtcp/source/create_video_rtp_depacketizer.h" +#include "modules/rtp_rtcp/source/rtp_dependency_descriptor_extension.h" #include "modules/rtp_rtcp/source/rtp_format.h" #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor.h" #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor_extension.h" @@ -331,45 +332,119 @@ RtpVideoStreamReceiver::ParseGenericDependenciesResult RtpVideoStreamReceiver::ParseGenericDependenciesExtension( const RtpPacketReceived& rtp_packet, RTPVideoHeader* video_header) { + if (rtp_packet.HasExtension()) { + webrtc::DependencyDescriptor dependency_descriptor; + if (!rtp_packet.GetExtension( + video_structure_.get(), &dependency_descriptor)) { + // Descriptor is there, but failed to parse. Either it is invalid, + // or too old packet (after relevant video_structure_ changed), + // or too new packet (before relevant video_structure_ arrived). + // Drop such packet to be on the safe side. + // TODO(bugs.webrtc.org/10342): Stash too new packet. + RTC_LOG(LS_WARNING) << "ssrc: " << rtp_packet.Ssrc() + << " Failed to parse dependency descriptor."; + return kDropPacket; + } + if (dependency_descriptor.attached_structure != nullptr && + !dependency_descriptor.first_packet_in_frame) { + RTC_LOG(LS_WARNING) << "ssrc: " << rtp_packet.Ssrc() + << "Invalid dependency descriptor: structure " + "attached to non first packet of a frame."; + return kDropPacket; + } + video_header->is_first_packet_in_frame = + dependency_descriptor.first_packet_in_frame; + video_header->is_last_packet_in_frame = + dependency_descriptor.last_packet_in_frame; + + int64_t frame_id = + frame_id_unwrapper_.Unwrap(dependency_descriptor.frame_number); + auto& generic_descriptor_info = video_header->generic.emplace(); + generic_descriptor_info.frame_id = frame_id; + generic_descriptor_info.spatial_index = + dependency_descriptor.frame_dependencies.spatial_id; + generic_descriptor_info.temporal_index = + dependency_descriptor.frame_dependencies.temporal_id; + for (int fdiff : dependency_descriptor.frame_dependencies.frame_diffs) { + generic_descriptor_info.dependencies.push_back(frame_id - fdiff); + } + generic_descriptor_info.decode_target_indications = + dependency_descriptor.frame_dependencies.decode_target_indications; + generic_descriptor_info.discardable = + absl::c_linear_search(generic_descriptor_info.decode_target_indications, + DecodeTargetIndication::kDiscardable); + if (dependency_descriptor.resolution) { + video_header->width = dependency_descriptor.resolution->Width(); + video_header->height = dependency_descriptor.resolution->Height(); + } + + // FrameDependencyStructure is sent in dependency descriptor of the first + // packet of a key frame and required for parsed dependency descriptor in + // all the following packets until next key frame. + // Save it if there is a (potentially) new structure. + if (dependency_descriptor.attached_structure) { + RTC_DCHECK(dependency_descriptor.first_packet_in_frame); + if (video_structure_frame_id_ > frame_id) { + RTC_LOG(LS_WARNING) + << "Arrived key frame with id " << frame_id << " and structure id " + << dependency_descriptor.attached_structure->structure_id + << " is older than the latest received key frame with id " + << *video_structure_frame_id_ << " and structure id " + << video_structure_->structure_id; + return kDropPacket; + } + video_structure_ = std::move(dependency_descriptor.attached_structure); + video_structure_frame_id_ = frame_id; + video_header->frame_type = VideoFrameType::kVideoFrameKey; + } else { + video_header->frame_type = VideoFrameType::kVideoFrameDelta; + } + return kHasGenericDescriptor; + } + if (rtp_packet.HasExtension() && rtp_packet.HasExtension()) { RTC_LOG(LS_WARNING) << "RTP packet had two different GFD versions."; return kDropPacket; } - RtpGenericFrameDescriptor generic_descriptor; + RtpGenericFrameDescriptor generic_frame_descriptor; bool has_generic_descriptor = rtp_packet.GetExtension( - &generic_descriptor) || + &generic_frame_descriptor) || rtp_packet.GetExtension( - &generic_descriptor); + &generic_frame_descriptor); if (!has_generic_descriptor) { return kNoGenericDescriptor; } video_header->is_first_packet_in_frame = - generic_descriptor.FirstPacketInSubFrame(); + generic_frame_descriptor.FirstPacketInSubFrame(); video_header->is_last_packet_in_frame = - generic_descriptor.LastPacketInSubFrame(); + generic_frame_descriptor.LastPacketInSubFrame(); - if (generic_descriptor.FirstPacketInSubFrame()) { + if (generic_frame_descriptor.FirstPacketInSubFrame()) { video_header->frame_type = - generic_descriptor.FrameDependenciesDiffs().empty() + generic_frame_descriptor.FrameDependenciesDiffs().empty() ? VideoFrameType::kVideoFrameKey : VideoFrameType::kVideoFrameDelta; - auto& descriptor = video_header->generic.emplace(); - int64_t frame_id = frame_id_unwrapper_.Unwrap(generic_descriptor.FrameId()); - descriptor.frame_id = frame_id; - descriptor.spatial_index = generic_descriptor.SpatialLayer(); - descriptor.temporal_index = generic_descriptor.TemporalLayer(); - descriptor.discardable = generic_descriptor.Discardable().value_or(false); - for (uint16_t fdiff : generic_descriptor.FrameDependenciesDiffs()) { - descriptor.dependencies.push_back(frame_id - fdiff); + auto& generic_descriptor_info = video_header->generic.emplace(); + int64_t frame_id = + frame_id_unwrapper_.Unwrap(generic_frame_descriptor.FrameId()); + generic_descriptor_info.frame_id = frame_id; + generic_descriptor_info.spatial_index = + generic_frame_descriptor.SpatialLayer(); + generic_descriptor_info.temporal_index = + generic_frame_descriptor.TemporalLayer(); + generic_descriptor_info.discardable = + generic_frame_descriptor.Discardable().value_or(false); + for (uint16_t fdiff : generic_frame_descriptor.FrameDependenciesDiffs()) { + generic_descriptor_info.dependencies.push_back(frame_id - fdiff); } } - video_header->width = generic_descriptor.Width(); - video_header->height = generic_descriptor.Height(); + video_header->width = generic_frame_descriptor.Width(); + video_header->height = generic_frame_descriptor.Height(); return kHasGenericDescriptor; } diff --git a/video/rtp_video_stream_receiver.h b/video/rtp_video_stream_receiver.h index e6a8068003..c0271f4cb1 100644 --- a/video/rtp_video_stream_receiver.h +++ b/video/rtp_video_stream_receiver.h @@ -32,6 +32,7 @@ #include "modules/rtp_rtcp/include/rtp_rtcp.h" #include "modules/rtp_rtcp/include/rtp_rtcp_defines.h" #include "modules/rtp_rtcp/source/absolute_capture_time_receiver.h" +#include "modules/rtp_rtcp/source/rtp_dependency_descriptor_extension.h" #include "modules/rtp_rtcp/source/rtp_packet_received.h" #include "modules/rtp_rtcp/source/rtp_video_header.h" #include "modules/rtp_rtcp/source/video_rtp_depacketizer.h" @@ -288,6 +289,16 @@ class RtpVideoStreamReceiver : public LossNotificationSender, SeqNumUnwrapper frame_id_unwrapper_ RTC_GUARDED_BY(worker_task_checker_); + // Video structure provided in the dependency descriptor in a first packet + // of a key frame. It is required to parse dependency descriptor in the + // following delta packets. + std::unique_ptr video_structure_ + RTC_GUARDED_BY(worker_task_checker_); + // Frame id of the last frame with the attached video structure. + // absl::nullopt when `video_structure_ == nullptr`; + absl::optional video_structure_frame_id_ + RTC_GUARDED_BY(worker_task_checker_); + rtc::CriticalSection reference_finder_lock_; std::unique_ptr reference_finder_ RTC_GUARDED_BY(reference_finder_lock_); diff --git a/video/rtp_video_stream_receiver_unittest.cc b/video/rtp_video_stream_receiver_unittest.cc index d5d0be5e94..98f6663fec 100644 --- a/video/rtp_video_stream_receiver_unittest.cc +++ b/video/rtp_video_stream_receiver_unittest.cc @@ -148,9 +148,7 @@ class RtpVideoStreamReceiverTest : public ::testing::Test { explicit RtpVideoStreamReceiverTest(std::string field_trials) : override_field_trials_(field_trials), config_(CreateConfig()), - process_thread_(ProcessThread::Create("TestThread")) {} - - void SetUp() { + process_thread_(ProcessThread::Create("TestThread")) { rtp_receive_statistics_ = ReceiveStatistics::Create(Clock::GetRealTimeClock()); rtp_video_stream_receiver_ = std::make_unique( @@ -1024,6 +1022,170 @@ TEST_P(RtpVideoStreamReceiverGenericDescriptorTest, UnwrapsFrameId) { inject_packet(/*wrapped_frame_id=*/0x0002); } +class RtpVideoStreamReceiverDependencyDescriptorTest + : public RtpVideoStreamReceiverTest { + public: + RtpVideoStreamReceiverDependencyDescriptorTest() { + VideoCodec codec; + codec.plType = payload_type_; + rtp_video_stream_receiver_->AddReceiveCodec(codec, {}, + /*raw_payload=*/true); + extension_map_.Register(7); + rtp_video_stream_receiver_->StartReceive(); + } + + // Returns some valid structure for the DependencyDescriptors. + // First template of that structure always fit for a key frame. + static FrameDependencyStructure CreateStreamStructure() { + FrameDependencyStructure stream_structure; + stream_structure.num_decode_targets = 1; + stream_structure.templates = { + GenericFrameInfo::Builder().Dtis("S").Build(), + GenericFrameInfo::Builder().Dtis("S").Fdiffs({1}).Build(), + }; + return stream_structure; + } + + void InjectPacketWith(const FrameDependencyStructure& stream_structure, + const DependencyDescriptor& dependency_descriptor) { + const std::vector data = {0, 1, 2, 3, 4}; + RtpPacketReceived rtp_packet(&extension_map_); + ASSERT_TRUE(rtp_packet.SetExtension( + stream_structure, dependency_descriptor)); + uint8_t* payload = rtp_packet.SetPayloadSize(data.size()); + ASSERT_TRUE(payload); + memcpy(payload, data.data(), data.size()); + mock_on_complete_frame_callback_.ClearExpectedBitstream(); + mock_on_complete_frame_callback_.AppendExpectedBitstream(data.data(), + data.size()); + rtp_packet.SetMarker(true); + rtp_packet.SetPayloadType(payload_type_); + rtp_packet.SetSequenceNumber(++rtp_sequence_number_); + rtp_video_stream_receiver_->OnRtpPacket(rtp_packet); + } + + private: + const int payload_type_ = 123; + RtpHeaderExtensionMap extension_map_; + uint16_t rtp_sequence_number_ = 321; +}; + +TEST_F(RtpVideoStreamReceiverDependencyDescriptorTest, UnwrapsFrameId) { + FrameDependencyStructure stream_structure = CreateStreamStructure(); + + DependencyDescriptor keyframe_descriptor; + keyframe_descriptor.attached_structure = + std::make_unique(stream_structure); + keyframe_descriptor.frame_dependencies = stream_structure.templates[0]; + keyframe_descriptor.frame_number = 0xfff0; + // DependencyDescriptor doesn't support reordering delta frame before + // keyframe. Thus feed a key frame first, then test reodered delta frames. + int64_t first_picture_id; + EXPECT_CALL(mock_on_complete_frame_callback_, DoOnCompleteFrame) + .WillOnce([&](video_coding::EncodedFrame* frame) { + first_picture_id = frame->id.picture_id; + }); + InjectPacketWith(stream_structure, keyframe_descriptor); + + DependencyDescriptor deltaframe1_descriptor; + deltaframe1_descriptor.frame_dependencies = stream_structure.templates[1]; + deltaframe1_descriptor.frame_number = 0xfffe; + + DependencyDescriptor deltaframe2_descriptor; + deltaframe1_descriptor.frame_dependencies = stream_structure.templates[1]; + deltaframe2_descriptor.frame_number = 0x0002; + + // Parser should unwrap frame ids correctly even if packets were reordered by + // the network. + EXPECT_CALL(mock_on_complete_frame_callback_, DoOnCompleteFrame) + .WillOnce([&](video_coding::EncodedFrame* frame) { + // 0x0002 - 0xfff0 + EXPECT_EQ(frame->id.picture_id - first_picture_id, 18); + }) + .WillOnce([&](video_coding::EncodedFrame* frame) { + // 0xfffe - 0xfff0 + EXPECT_EQ(frame->id.picture_id - first_picture_id, 14); + }); + InjectPacketWith(stream_structure, deltaframe2_descriptor); + InjectPacketWith(stream_structure, deltaframe1_descriptor); +} + +TEST_F(RtpVideoStreamReceiverDependencyDescriptorTest, + DropsLateDeltaFramePacketWithDependencyDescriptorExtension) { + FrameDependencyStructure stream_structure1 = CreateStreamStructure(); + FrameDependencyStructure stream_structure2 = CreateStreamStructure(); + // Make sure template ids for these two structures do not collide: + // adjust structure_id (that is also used as template id offset). + stream_structure1.structure_id = 13; + stream_structure2.structure_id = + stream_structure1.structure_id + stream_structure1.templates.size(); + + DependencyDescriptor keyframe1_descriptor; + keyframe1_descriptor.attached_structure = + std::make_unique(stream_structure1); + keyframe1_descriptor.frame_dependencies = stream_structure1.templates[0]; + keyframe1_descriptor.frame_number = 1; + EXPECT_CALL(mock_on_complete_frame_callback_, DoOnCompleteFrame); + InjectPacketWith(stream_structure1, keyframe1_descriptor); + + // Pass in 2nd key frame with different structure. + DependencyDescriptor keyframe2_descriptor; + keyframe2_descriptor.attached_structure = + std::make_unique(stream_structure2); + keyframe2_descriptor.frame_dependencies = stream_structure2.templates[0]; + keyframe2_descriptor.frame_number = 3; + EXPECT_CALL(mock_on_complete_frame_callback_, DoOnCompleteFrame); + InjectPacketWith(stream_structure2, keyframe2_descriptor); + + // Pass in late delta frame that uses structure of the 1st key frame. + DependencyDescriptor deltaframe_descriptor; + deltaframe_descriptor.frame_dependencies = stream_structure1.templates[0]; + deltaframe_descriptor.frame_number = 2; + EXPECT_CALL(mock_on_complete_frame_callback_, DoOnCompleteFrame).Times(0); + InjectPacketWith(stream_structure1, deltaframe_descriptor); +} + +TEST_F(RtpVideoStreamReceiverDependencyDescriptorTest, + DropsLateKeyFramePacketWithDependencyDescriptorExtension) { + FrameDependencyStructure stream_structure1 = CreateStreamStructure(); + FrameDependencyStructure stream_structure2 = CreateStreamStructure(); + // Make sure template ids for these two structures do not collide: + // adjust structure_id (that is also used as template id offset). + stream_structure1.structure_id = 13; + stream_structure2.structure_id = + stream_structure1.structure_id + stream_structure1.templates.size(); + + DependencyDescriptor keyframe1_descriptor; + keyframe1_descriptor.attached_structure = + std::make_unique(stream_structure1); + keyframe1_descriptor.frame_dependencies = stream_structure1.templates[0]; + keyframe1_descriptor.frame_number = 1; + + DependencyDescriptor keyframe2_descriptor; + keyframe2_descriptor.attached_structure = + std::make_unique(stream_structure2); + keyframe2_descriptor.frame_dependencies = stream_structure2.templates[0]; + keyframe2_descriptor.frame_number = 3; + + EXPECT_CALL(mock_on_complete_frame_callback_, DoOnCompleteFrame) + .WillOnce([&](video_coding::EncodedFrame* frame) { + EXPECT_EQ(frame->id.picture_id & 0xFFFF, 3); + }); + InjectPacketWith(stream_structure2, keyframe2_descriptor); + InjectPacketWith(stream_structure1, keyframe1_descriptor); + + // Pass in delta frame that uses structure of the 2nd key frame. Late key + // frame shouldn't block it. + DependencyDescriptor deltaframe_descriptor; + deltaframe_descriptor.frame_dependencies = stream_structure2.templates[0]; + deltaframe_descriptor.frame_number = 4; + EXPECT_CALL(mock_on_complete_frame_callback_, DoOnCompleteFrame) + .WillOnce([&](video_coding::EncodedFrame* frame) { + EXPECT_EQ(frame->id.picture_id & 0xFFFF, 4); + }); + InjectPacketWith(stream_structure2, deltaframe_descriptor); +} + #if RTC_DCHECK_IS_ON && GTEST_HAS_DEATH_TEST && !defined(WEBRTC_ANDROID) TEST_F(RtpVideoStreamReceiverTest, RepeatedSecondarySinkDisallowed) { MockRtpPacketSink secondary_sink;