diff --git a/modules/rtp_rtcp/source/rtp_format_video_generic.cc b/modules/rtp_rtcp/source/rtp_format_video_generic.cc index edd1e3c530..6ea13b7fce 100644 --- a/modules/rtp_rtcp/source/rtp_format_video_generic.cc +++ b/modules/rtp_rtcp/source/rtp_format_video_generic.cc @@ -119,6 +119,14 @@ bool RtpDepacketizerGeneric::Parse(ParsedPayload* parsed_payload, parsed_payload->video_header().generic.emplace(); parsed_payload->video_header().generic->frame_id = ((payload_data[0] & 0x7F) << 8) | payload_data[1]; + + // The old generic format (this format) does not include spatial and + // temporal layer information. To distinguish which format was actually + // used we set the spatial and temporal layer indices to -1. + // TODO(bugs.webrtc.org/9772): Remove the old format. + parsed_payload->video_header().generic->spatial_index = -1; + parsed_payload->video_header().generic->temporal_index = -1; + payload_data += kExtendedHeaderLength; payload_data_length -= kExtendedHeaderLength; } diff --git a/modules/video_coding/rtp_frame_reference_finder.cc b/modules/video_coding/rtp_frame_reference_finder.cc index 6c93c84018..67414bb356 100644 --- a/modules/video_coding/rtp_frame_reference_finder.cc +++ b/modules/video_coding/rtp_frame_reference_finder.cc @@ -84,6 +84,14 @@ void RtpFrameReferenceFinder::RetryStashedFrames() { RtpFrameReferenceFinder::FrameDecision RtpFrameReferenceFinder::ManageFrameInternal(RtpFrameObject* frame) { + absl::optional video_header = frame->GetRtpVideoHeader(); + // TODO(bugs.webrtc.org/9772): Remove the spatial index check when the old + // generic format has been removed. 
+ if (video_header && video_header->generic && + video_header->generic->spatial_index != -1) { + return ManageFrameGeneric(frame, *video_header->generic); + } + switch (frame->codec_type()) { case kVideoCodecVP8: return ManageFrameVp8(frame); @@ -91,11 +99,11 @@ RtpFrameReferenceFinder::ManageFrameInternal(RtpFrameObject* frame) { return ManageFrameVp9(frame); default: { // Use 15 first bits of frame ID as picture ID if available. - absl::optional video_header = frame->GetRtpVideoHeader(); - absl::optional generic_info = - video_header ? video_header->generic : absl::nullopt; - return ManageFrameGeneric( - frame, generic_info ? generic_info->frame_id & 0x7fff : kNoPictureId); + int picture_id = kNoPictureId; + if (video_header && video_header->generic) + picture_id = video_header->generic->frame_id & 0x7fff; + + return ManageFramePidOrSeqNum(frame, picture_id); } } } @@ -161,8 +169,28 @@ void RtpFrameReferenceFinder::UpdateLastPictureIdWithPadding(uint16_t seq_num) { } RtpFrameReferenceFinder::FrameDecision -RtpFrameReferenceFinder::ManageFrameGeneric(RtpFrameObject* frame, - int picture_id) { +RtpFrameReferenceFinder::ManageFrameGeneric( + RtpFrameObject* frame, + const RTPVideoHeader::GenericDescriptorInfo& descriptor) { + if (EncodedFrame::kMaxFrameReferences < descriptor.dependencies.size()) { + RTC_LOG(LS_WARNING) << "Too many dependencies in generic descriptor."; + return kDrop; + } + + int64_t frame_id = generic_frame_id_unwrapper_.Unwrap(descriptor.frame_id); + frame->id.picture_id = frame_id; + frame->id.spatial_layer = descriptor.spatial_index; + + frame->num_references = descriptor.dependencies.size(); + for (size_t i = 0; i < descriptor.dependencies.size(); ++i) + frame->references[i] = frame_id - descriptor.dependencies[i]; + + return kHandOff; +} + +RtpFrameReferenceFinder::FrameDecision +RtpFrameReferenceFinder::ManageFramePidOrSeqNum(RtpFrameObject* frame, + int picture_id) { // If |picture_id| is specified then we use that to set the frame 
references, // otherwise we use sequence number. if (picture_id != kNoPictureId) { @@ -219,7 +247,7 @@ RtpFrameReferenceFinder::ManageFrameGeneric(RtpFrameObject* frame, // picture id according to some incrementing counter. frame->id.picture_id = frame->last_seq_num(); frame->num_references = frame->frame_type() == kVideoFrameDelta; - frame->references[0] = generic_unwrapper_.Unwrap(last_picture_id_gop); + frame->references[0] = rtp_seq_num_unwrapper_.Unwrap(last_picture_id_gop); if (AheadOf(frame->id.picture_id, last_picture_id_gop)) { seq_num_it->second.first = frame->id.picture_id; seq_num_it->second.second = frame->id.picture_id; @@ -227,7 +255,7 @@ RtpFrameReferenceFinder::ManageFrameGeneric(RtpFrameObject* frame, last_picture_id_ = frame->id.picture_id; UpdateLastPictureIdWithPadding(frame->id.picture_id); - frame->id.picture_id = generic_unwrapper_.Unwrap(frame->id.picture_id); + frame->id.picture_id = rtp_seq_num_unwrapper_.Unwrap(frame->id.picture_id); return kHandOff; } @@ -247,7 +275,7 @@ RtpFrameReferenceFinder::FrameDecision RtpFrameReferenceFinder::ManageFrameVp8( if (codec_header.pictureId == kNoPictureId || codec_header.temporalIdx == kNoTemporalIdx || codec_header.tl0PicIdx == kNoTl0PicIdx) { - return ManageFrameGeneric(std::move(frame), codec_header.pictureId); + return ManageFramePidOrSeqNum(std::move(frame), codec_header.pictureId); } frame->id.picture_id = codec_header.pictureId % kPicIdLength; @@ -396,7 +424,7 @@ RtpFrameReferenceFinder::FrameDecision RtpFrameReferenceFinder::ManageFrameVp9( if (codec_header.picture_id == kNoPictureId || codec_header.temporal_idx == kNoTemporalIdx) { - return ManageFrameGeneric(std::move(frame), codec_header.picture_id); + return ManageFramePidOrSeqNum(std::move(frame), codec_header.picture_id); } frame->id.spatial_layer = codec_header.spatial_idx; diff --git a/modules/video_coding/rtp_frame_reference_finder.h b/modules/video_coding/rtp_frame_reference_finder.h index 501fcbc81a..eae73d2fdc 100644 --- 
a/modules/video_coding/rtp_frame_reference_finder.h +++ b/modules/video_coding/rtp_frame_reference_finder.h @@ -88,10 +88,15 @@ class RtpFrameReferenceFinder { FrameDecision ManageFrameInternal(RtpFrameObject* frame) RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_); - // Find references for generic frames. If |picture_id| is unspecified - // then packet sequence numbers will be used to determine the references - // of the frames. - FrameDecision ManageFrameGeneric(RtpFrameObject* frame, int picture_id) + FrameDecision ManageFrameGeneric( + RtpFrameObject* frame, + const RTPVideoHeader::GenericDescriptorInfo& descriptor) + RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_); + + // Find references for frames with no or very limited information in the + // descriptor. If |picture_id| is unspecified then packet sequence numbers + // will be used to determine the references of the frames. + FrameDecision ManageFramePidOrSeqNum(RtpFrameObject* frame, int picture_id) RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_); // Find references for Vp8 frames @@ -192,9 +197,11 @@ class RtpFrameReferenceFinder { OnCompleteFrameCallback* frame_callback_; + SeqNumUnwrapper generic_frame_id_unwrapper_ RTC_GUARDED_BY(crit_); + // Unwrapper used to unwrap generic RTP streams. In a generic stream we derive // a picture id from the packet sequence number. - SeqNumUnwrapper generic_unwrapper_ RTC_GUARDED_BY(crit_); + SeqNumUnwrapper rtp_seq_num_unwrapper_ RTC_GUARDED_BY(crit_); // Unwrapper used to unwrap VP8/VP9 streams which have their picture id // specified. 
diff --git a/video/rtp_video_stream_receiver_unittest.cc b/video/rtp_video_stream_receiver_unittest.cc index 9891697e00..9fadae10f7 100644 --- a/video/rtp_video_stream_receiver_unittest.cc +++ b/video/rtp_video_stream_receiver_unittest.cc @@ -15,6 +15,8 @@ #include "common_video/h264/h264_common.h" #include "media/base/mediaconstants.h" #include "modules/pacing/packet_router.h" +#include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor.h" +#include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor_extension.h" #include "modules/rtp_rtcp/source/rtp_packet_received.h" #include "modules/utility/include/process_thread.h" #include "modules/video_coding/frame_object.h" @@ -28,7 +30,8 @@ #include "test/field_trial.h" #include "video/rtp_video_stream_receiver.h" -using testing::_; +using ::testing::_; +using ::testing::Invoke; namespace webrtc { @@ -498,6 +501,120 @@ TEST_F(RtpVideoStreamReceiverTest, rtp_video_stream_receiver_->RemoveSecondarySink(&secondary_sink); } +TEST_F(RtpVideoStreamReceiverTest, ParseGenericDescriptorOnePacket) { + const std::vector data = {0, 1, 2, 3, 4}; + const int kPayloadType = 123; + const int kSpatialIndex = 1; + + VideoCodec codec; + codec.plType = kPayloadType; + rtp_video_stream_receiver_->AddReceiveCodec(codec, {}); + rtp_video_stream_receiver_->StartReceive(); + + RtpHeaderExtensionMap extension_map; + extension_map.Register(5); + RtpPacketReceived rtp_packet(&extension_map); + + RtpGenericFrameDescriptor generic_descriptor; + generic_descriptor.SetFirstPacketInSubFrame(true); + generic_descriptor.SetLastPacketInSubFrame(true); + generic_descriptor.SetFirstSubFrameInFrame(true); + generic_descriptor.SetLastSubFrameInFrame(true); + generic_descriptor.SetFrameId(100); + generic_descriptor.SetSpatialLayersBitmask(1 << kSpatialIndex); + generic_descriptor.AddFrameDependencyDiff(90); + generic_descriptor.AddFrameDependencyDiff(80); + EXPECT_TRUE(rtp_packet.SetExtension( + generic_descriptor)); + + uint8_t* payload = 
rtp_packet.SetPayloadSize(data.size()); + memcpy(payload, data.data(), data.size()); + // The first payload byte is the header, so the expected bitstream skips it. + mock_on_complete_frame_callback_.AppendExpectedBitstream(data.data() + 1, + data.size() - 1); + + rtp_packet.SetMarker(true); + rtp_packet.SetPayloadType(kPayloadType); + rtp_packet.SetSequenceNumber(1); + + EXPECT_CALL(mock_on_complete_frame_callback_, DoOnCompleteFrame) + .WillOnce(Invoke([kSpatialIndex](video_coding::EncodedFrame* frame) { + EXPECT_EQ(frame->num_references, 2U); + EXPECT_EQ(frame->references[0], frame->id.picture_id - 90); + EXPECT_EQ(frame->references[1], frame->id.picture_id - 80); + EXPECT_EQ(frame->id.spatial_layer, kSpatialIndex); + })); + + rtp_video_stream_receiver_->OnRtpPacket(rtp_packet); +} + +TEST_F(RtpVideoStreamReceiverTest, ParseGenericDescriptorTwoPackets) { + const std::vector data = {0, 1, 2, 3, 4}; + const int kPayloadType = 123; + const int kSpatialIndex = 1; + + VideoCodec codec; + codec.plType = kPayloadType; + rtp_video_stream_receiver_->AddReceiveCodec(codec, {}); + rtp_video_stream_receiver_->StartReceive(); + + RtpHeaderExtensionMap extension_map; + extension_map.Register(5); + RtpPacketReceived first_packet(&extension_map); + + RtpGenericFrameDescriptor first_packet_descriptor; + first_packet_descriptor.SetFirstPacketInSubFrame(true); + first_packet_descriptor.SetLastPacketInSubFrame(false); + first_packet_descriptor.SetFirstSubFrameInFrame(true); + first_packet_descriptor.SetLastSubFrameInFrame(true); + first_packet_descriptor.SetFrameId(100); + first_packet_descriptor.SetTemporalLayer(1); + first_packet_descriptor.SetSpatialLayersBitmask(1 << kSpatialIndex); + first_packet_descriptor.AddFrameDependencyDiff(90); + first_packet_descriptor.AddFrameDependencyDiff(80); + EXPECT_TRUE(first_packet.SetExtension( + first_packet_descriptor)); + + uint8_t* first_packet_payload = first_packet.SetPayloadSize(data.size()); + memcpy(first_packet_payload, data.data(), 
data.size()); + // The first payload byte is the header, so the expected bitstream skips it. + mock_on_complete_frame_callback_.AppendExpectedBitstream(data.data() + 1, + data.size() - 1); + + first_packet.SetPayloadType(kPayloadType); + first_packet.SetSequenceNumber(1); + rtp_video_stream_receiver_->OnRtpPacket(first_packet); + + RtpPacketReceived second_packet(&extension_map); + RtpGenericFrameDescriptor second_packet_descriptor; + second_packet_descriptor.SetFirstPacketInSubFrame(false); + second_packet_descriptor.SetLastPacketInSubFrame(true); + second_packet_descriptor.SetFirstSubFrameInFrame(true); + second_packet_descriptor.SetLastSubFrameInFrame(true); + EXPECT_TRUE(second_packet.SetExtension( + second_packet_descriptor)); + + second_packet.SetMarker(true); + second_packet.SetPayloadType(kPayloadType); + second_packet.SetSequenceNumber(2); + + uint8_t* second_packet_payload = second_packet.SetPayloadSize(data.size()); + memcpy(second_packet_payload, data.data(), data.size()); + // The first payload byte is the header, so the expected bitstream skips it. + mock_on_complete_frame_callback_.AppendExpectedBitstream(data.data() + 1, + data.size() - 1); + + EXPECT_CALL(mock_on_complete_frame_callback_, DoOnCompleteFrame) + .WillOnce(Invoke([kSpatialIndex](video_coding::EncodedFrame* frame) { + EXPECT_EQ(frame->num_references, 2U); + EXPECT_EQ(frame->references[0], frame->id.picture_id - 90); + EXPECT_EQ(frame->references[1], frame->id.picture_id - 80); + EXPECT_EQ(frame->id.spatial_layer, kSpatialIndex); + })); + + rtp_video_stream_receiver_->OnRtpPacket(second_packet); +} + #if RTC_DCHECK_IS_ON && GTEST_HAS_DEATH_TEST && !defined(WEBRTC_ANDROID) TEST_F(RtpVideoStreamReceiverTest, RepeatedSecondarySinkDisallowed) { MockRtpPacketSink secondary_sink;