diff --git a/modules/rtp_rtcp/BUILD.gn b/modules/rtp_rtcp/BUILD.gn index 1cf0982b58..f2baf6ecc2 100644 --- a/modules/rtp_rtcp/BUILD.gn +++ b/modules/rtp_rtcp/BUILD.gn @@ -217,6 +217,8 @@ rtc_library("rtp_rtcp") { "source/video_rtp_depacketizer_raw.h", "source/video_rtp_depacketizer_vp8.cc", "source/video_rtp_depacketizer_vp8.h", + "source/video_rtp_depacketizer_vp9.cc", + "source/video_rtp_depacketizer_vp9.h", ] if (rtc_enable_bwe_test_logging) { @@ -483,6 +485,7 @@ if (rtc_include_tests) { "source/ulpfec_receiver_unittest.cc", "source/video_rtp_depacketizer_raw_unittest.cc", "source/video_rtp_depacketizer_vp8_unittest.cc", + "source/video_rtp_depacketizer_vp9_unittest.cc", ] deps = [ ":fec_test_helper", diff --git a/modules/rtp_rtcp/source/create_video_rtp_depacketizer.cc b/modules/rtp_rtcp/source/create_video_rtp_depacketizer.cc index 52edab0f83..604c6415e7 100644 --- a/modules/rtp_rtcp/source/create_video_rtp_depacketizer.cc +++ b/modules/rtp_rtcp/source/create_video_rtp_depacketizer.cc @@ -17,6 +17,7 @@ #include "modules/rtp_rtcp/source/rtp_format.h" #include "modules/rtp_rtcp/source/video_rtp_depacketizer.h" #include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp8.h" +#include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h" #include "rtc_base/checks.h" #include "rtc_base/copy_on_write_buffer.h" @@ -60,6 +61,8 @@ std::unique_ptr CreateVideoRtpDepacketizer( switch (codec) { case kVideoCodecVP8: return std::make_unique(); + case kVideoCodecVP9: + return std::make_unique(); default: return std::make_unique(codec); } diff --git a/modules/rtp_rtcp/source/rtp_format_vp9.cc b/modules/rtp_rtcp/source/rtp_format_vp9.cc index f83a12b0e2..0094075147 100644 --- a/modules/rtp_rtcp/source/rtp_format_vp9.cc +++ b/modules/rtp_rtcp/source/rtp_format_vp9.cc @@ -14,6 +14,7 @@ #include "api/video/video_codec_constants.h" #include "modules/rtp_rtcp/source/rtp_packet_to_send.h" +#include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h" #include "modules/video_coding/codecs/interface/common_constants.h" #include "rtc_base/bit_buffer.h" #include "rtc_base/checks.h" @@ -279,169 +280,6 @@ bool WriteSsData(const RTPVideoHeaderVP9& vp9, rtc::BitBufferWriter* writer) { } return true; } - -// Picture ID: -// -// +-+-+-+-+-+-+-+-+ -// I: |M| PICTURE ID | M:0 => picture id is 7 bits. -// +-+-+-+-+-+-+-+-+ M:1 => picture id is 15 bits. -// M: | EXTENDED PID | -// +-+-+-+-+-+-+-+-+ -// -bool ParsePictureId(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) { - uint32_t picture_id; - uint32_t m_bit; - RETURN_FALSE_ON_ERROR(parser->ReadBits(&m_bit, 1)); - if (m_bit) { - RETURN_FALSE_ON_ERROR(parser->ReadBits(&picture_id, 15)); - vp9->max_picture_id = kMaxTwoBytePictureId; - } else { - RETURN_FALSE_ON_ERROR(parser->ReadBits(&picture_id, 7)); - vp9->max_picture_id = kMaxOneBytePictureId; - } - vp9->picture_id = picture_id; - return true; -} - -// Layer indices (flexible mode): -// -// +-+-+-+-+-+-+-+-+ -// L: | T |U| S |D| -// +-+-+-+-+-+-+-+-+ -// -bool ParseLayerInfoCommon(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) { - uint32_t t, u_bit, s, d_bit; - RETURN_FALSE_ON_ERROR(parser->ReadBits(&t, 3)); - RETURN_FALSE_ON_ERROR(parser->ReadBits(&u_bit, 1)); - RETURN_FALSE_ON_ERROR(parser->ReadBits(&s, 3)); - RETURN_FALSE_ON_ERROR(parser->ReadBits(&d_bit, 1)); - vp9->temporal_idx = t; - vp9->temporal_up_switch = u_bit ? true : false; - if (s >= kMaxSpatialLayers) - return false; - vp9->spatial_idx = s; - vp9->inter_layer_predicted = d_bit ? true : false; - return true; -} - -// Layer indices (non-flexible mode): -// -// +-+-+-+-+-+-+-+-+ -// L: | T |U| S |D| -// +-+-+-+-+-+-+-+-+ -// | TL0PICIDX | -// +-+-+-+-+-+-+-+-+ -// -bool ParseLayerInfoNonFlexibleMode(rtc::BitBuffer* parser, - RTPVideoHeaderVP9* vp9) { - uint8_t tl0picidx; - RETURN_FALSE_ON_ERROR(parser->ReadUInt8(&tl0picidx)); - vp9->tl0_pic_idx = tl0picidx; - return true; -} - -bool ParseLayerInfo(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) { - if (!ParseLayerInfoCommon(parser, vp9)) - return false; - - if (vp9->flexible_mode) - return true; - - return ParseLayerInfoNonFlexibleMode(parser, vp9); -} - -// Reference indices: -// -// +-+-+-+-+-+-+-+-+ P=1,F=1: At least one reference index -// P,F: | P_DIFF |N| up to 3 times has to be specified. -// +-+-+-+-+-+-+-+-+ N=1: An additional P_DIFF follows -// current P_DIFF. -// -bool ParseRefIndices(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) { - if (vp9->picture_id == kNoPictureId) - return false; - - vp9->num_ref_pics = 0; - uint32_t n_bit; - do { - if (vp9->num_ref_pics == kMaxVp9RefPics) - return false; - - uint32_t p_diff; - RETURN_FALSE_ON_ERROR(parser->ReadBits(&p_diff, 7)); - RETURN_FALSE_ON_ERROR(parser->ReadBits(&n_bit, 1)); - - vp9->pid_diff[vp9->num_ref_pics] = p_diff; - uint32_t scaled_pid = vp9->picture_id; - if (p_diff > scaled_pid) { - // TODO(asapersson): Max should correspond to the picture id of last wrap. - scaled_pid += vp9->max_picture_id + 1; - } - vp9->ref_picture_id[vp9->num_ref_pics++] = scaled_pid - p_diff; - } while (n_bit); - - return true; -} - -// Scalability structure (SS). -// -// +-+-+-+-+-+-+-+-+ -// V: | N_S |Y|G|-|-|-| -// +-+-+-+-+-+-+-+-+ -| -// Y: | WIDTH | (OPTIONAL) . -// + + . -// | | (OPTIONAL) . -// +-+-+-+-+-+-+-+-+ . N_S + 1 times -// | HEIGHT | (OPTIONAL) . -// + + . -// | | (OPTIONAL) . -// +-+-+-+-+-+-+-+-+ -| -// G: | N_G | (OPTIONAL) -// +-+-+-+-+-+-+-+-+ -| -// N_G: | T |U| R |-|-| (OPTIONAL) . -// +-+-+-+-+-+-+-+-+ -| . N_G times -// | P_DIFF | (OPTIONAL) . R times . -// +-+-+-+-+-+-+-+-+ -| -| -// -bool ParseSsData(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) { - uint32_t n_s, y_bit, g_bit; - RETURN_FALSE_ON_ERROR(parser->ReadBits(&n_s, 3)); - RETURN_FALSE_ON_ERROR(parser->ReadBits(&y_bit, 1)); - RETURN_FALSE_ON_ERROR(parser->ReadBits(&g_bit, 1)); - RETURN_FALSE_ON_ERROR(parser->ConsumeBits(3)); - vp9->num_spatial_layers = n_s + 1; - vp9->spatial_layer_resolution_present = y_bit ? true : false; - vp9->gof.num_frames_in_gof = 0; - - if (y_bit) { - for (size_t i = 0; i < vp9->num_spatial_layers; ++i) { - RETURN_FALSE_ON_ERROR(parser->ReadUInt16(&vp9->width[i])); - RETURN_FALSE_ON_ERROR(parser->ReadUInt16(&vp9->height[i])); - } - } - if (g_bit) { - uint8_t n_g; - RETURN_FALSE_ON_ERROR(parser->ReadUInt8(&n_g)); - vp9->gof.num_frames_in_gof = n_g; - } - for (size_t i = 0; i < vp9->gof.num_frames_in_gof; ++i) { - uint32_t t, u_bit, r; - RETURN_FALSE_ON_ERROR(parser->ReadBits(&t, 3)); - RETURN_FALSE_ON_ERROR(parser->ReadBits(&u_bit, 1)); - RETURN_FALSE_ON_ERROR(parser->ReadBits(&r, 2)); - RETURN_FALSE_ON_ERROR(parser->ConsumeBits(2)); - vp9->gof.temporal_idx[i] = t; - vp9->gof.temporal_up_switch[i] = u_bit ? true : false; - vp9->gof.num_ref_pics[i] = r; - - for (uint8_t p = 0; p < vp9->gof.num_ref_pics[i]; ++p) { - uint8_t p_diff; - RETURN_FALSE_ON_ERROR(parser->ReadUInt8(&p_diff)); - vp9->gof.pid_diff[i][p] = p_diff; - } - } - return true; -} } // namespace RtpPacketizerVp9::RtpPacketizerVp9(rtc::ArrayView payload, @@ -586,80 +424,14 @@ bool RtpPacketizerVp9::WriteHeader(bool layer_begin, bool RtpDepacketizerVp9::Parse(ParsedPayload* parsed_payload, const uint8_t* payload, size_t payload_length) { - RTC_DCHECK(parsed_payload != nullptr); - if (payload_length == 0) { - RTC_LOG(LS_ERROR) << "Payload length is zero."; + RTC_DCHECK(parsed_payload); + int offset = VideoRtpDepacketizerVp9::ParseRtpPayload( + rtc::MakeArrayView(payload, payload_length), &parsed_payload->video); + if (offset == 0) return false; - } - - // Parse mandatory first byte of payload descriptor. - rtc::BitBuffer parser(payload, payload_length); - uint32_t i_bit, p_bit, l_bit, f_bit, b_bit, e_bit, v_bit, z_bit; - RETURN_FALSE_ON_ERROR(parser.ReadBits(&i_bit, 1)); - RETURN_FALSE_ON_ERROR(parser.ReadBits(&p_bit, 1)); - RETURN_FALSE_ON_ERROR(parser.ReadBits(&l_bit, 1)); - RETURN_FALSE_ON_ERROR(parser.ReadBits(&f_bit, 1)); - RETURN_FALSE_ON_ERROR(parser.ReadBits(&b_bit, 1)); - RETURN_FALSE_ON_ERROR(parser.ReadBits(&e_bit, 1)); - RETURN_FALSE_ON_ERROR(parser.ReadBits(&v_bit, 1)); - RETURN_FALSE_ON_ERROR(parser.ReadBits(&z_bit, 1)); - - // Parsed payload. - parsed_payload->video_header().width = 0; - parsed_payload->video_header().height = 0; - parsed_payload->video_header().simulcastIdx = 0; - parsed_payload->video_header().codec = kVideoCodecVP9; - - parsed_payload->video_header().frame_type = - p_bit ? VideoFrameType::kVideoFrameDelta : VideoFrameType::kVideoFrameKey; - - auto& vp9_header = parsed_payload->video_header() - .video_type_header.emplace(); - vp9_header.InitRTPVideoHeaderVP9(); - vp9_header.inter_pic_predicted = p_bit ? true : false; - vp9_header.flexible_mode = f_bit ? true : false; - vp9_header.beginning_of_frame = b_bit ? true : false; - vp9_header.end_of_frame = e_bit ? true : false; - vp9_header.ss_data_available = v_bit ? true : false; - vp9_header.non_ref_for_inter_layer_pred = z_bit ? true : false; - - // Parse fields that are present. - if (i_bit && !ParsePictureId(&parser, &vp9_header)) { - RTC_LOG(LS_ERROR) << "Failed parsing VP9 picture id."; - return false; - } - if (l_bit && !ParseLayerInfo(&parser, &vp9_header)) { - RTC_LOG(LS_ERROR) << "Failed parsing VP9 layer info."; - return false; - } - if (p_bit && f_bit && !ParseRefIndices(&parser, &vp9_header)) { - RTC_LOG(LS_ERROR) << "Failed parsing VP9 ref indices."; - return false; - } - if (v_bit) { - if (!ParseSsData(&parser, &vp9_header)) { - RTC_LOG(LS_ERROR) << "Failed parsing VP9 SS data."; - return false; - } - if (vp9_header.spatial_layer_resolution_present) { - // TODO(asapersson): Add support for spatial layers. - parsed_payload->video_header().width = vp9_header.width[0]; - parsed_payload->video_header().height = vp9_header.height[0]; - } - } - parsed_payload->video_header().is_first_packet_in_frame = - b_bit && (!l_bit || !vp9_header.inter_layer_predicted); - - uint64_t rem_bits = parser.RemainingBitCount(); - RTC_DCHECK_EQ(rem_bits % 8, 0); - parsed_payload->payload_length = rem_bits / 8; - if (parsed_payload->payload_length == 0) { - RTC_LOG(LS_ERROR) << "Failed parsing VP9 payload data."; - return false; - } - parsed_payload->payload = - payload + payload_length - parsed_payload->payload_length; + parsed_payload->payload = payload + offset; + parsed_payload->payload_length = payload_length - offset; return true; } } // namespace webrtc diff --git a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc new file mode 100644 index 0000000000..a719d7ab12 --- /dev/null +++ b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h" + +#include + +#include "api/video/video_codec_constants.h" +#include "modules/rtp_rtcp/source/rtp_packet_to_send.h" +#include "modules/video_coding/codecs/interface/common_constants.h" +#include "rtc_base/bit_buffer.h" +#include "rtc_base/checks.h" +#include "rtc_base/logging.h" + +#define RETURN_FALSE_ON_ERROR(x) \ + if (!(x)) { \ + return false; \ + } + +namespace webrtc { +namespace { + +constexpr int kFailedToParse = 0; + +// Picture ID: +// +// +-+-+-+-+-+-+-+-+ +// I: |M| PICTURE ID | M:0 => picture id is 7 bits. +// +-+-+-+-+-+-+-+-+ M:1 => picture id is 15 bits. +// M: | EXTENDED PID | +// +-+-+-+-+-+-+-+-+ +// +bool ParsePictureId(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) { + uint32_t picture_id; + uint32_t m_bit; + RETURN_FALSE_ON_ERROR(parser->ReadBits(&m_bit, 1)); + if (m_bit) { + RETURN_FALSE_ON_ERROR(parser->ReadBits(&picture_id, 15)); + vp9->max_picture_id = kMaxTwoBytePictureId; + } else { + RETURN_FALSE_ON_ERROR(parser->ReadBits(&picture_id, 7)); + vp9->max_picture_id = kMaxOneBytePictureId; + } + vp9->picture_id = picture_id; + return true; +} + +// Layer indices (flexible mode): +// +// +-+-+-+-+-+-+-+-+ +// L: | T |U| S |D| +// +-+-+-+-+-+-+-+-+ +// +bool ParseLayerInfoCommon(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) { + uint32_t t, u_bit, s, d_bit; + RETURN_FALSE_ON_ERROR(parser->ReadBits(&t, 3)); + RETURN_FALSE_ON_ERROR(parser->ReadBits(&u_bit, 1)); + RETURN_FALSE_ON_ERROR(parser->ReadBits(&s, 3)); + RETURN_FALSE_ON_ERROR(parser->ReadBits(&d_bit, 1)); + vp9->temporal_idx = t; + vp9->temporal_up_switch = u_bit ? true : false; + if (s >= kMaxSpatialLayers) + return false; + vp9->spatial_idx = s; + vp9->inter_layer_predicted = d_bit ? true : false; + return true; +} + +// Layer indices (non-flexible mode): +// +// +-+-+-+-+-+-+-+-+ +// L: | T |U| S |D| +// +-+-+-+-+-+-+-+-+ +// | TL0PICIDX | +// +-+-+-+-+-+-+-+-+ +// +bool ParseLayerInfoNonFlexibleMode(rtc::BitBuffer* parser, + RTPVideoHeaderVP9* vp9) { + uint8_t tl0picidx; + RETURN_FALSE_ON_ERROR(parser->ReadUInt8(&tl0picidx)); + vp9->tl0_pic_idx = tl0picidx; + return true; +} + +bool ParseLayerInfo(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) { + if (!ParseLayerInfoCommon(parser, vp9)) + return false; + + if (vp9->flexible_mode) + return true; + + return ParseLayerInfoNonFlexibleMode(parser, vp9); +} + +// Reference indices: +// +// +-+-+-+-+-+-+-+-+ P=1,F=1: At least one reference index +// P,F: | P_DIFF |N| up to 3 times has to be specified. +// +-+-+-+-+-+-+-+-+ N=1: An additional P_DIFF follows +// current P_DIFF. +// +bool ParseRefIndices(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) { + if (vp9->picture_id == kNoPictureId) + return false; + + vp9->num_ref_pics = 0; + uint32_t n_bit; + do { + if (vp9->num_ref_pics == kMaxVp9RefPics) + return false; + + uint32_t p_diff; + RETURN_FALSE_ON_ERROR(parser->ReadBits(&p_diff, 7)); + RETURN_FALSE_ON_ERROR(parser->ReadBits(&n_bit, 1)); + + vp9->pid_diff[vp9->num_ref_pics] = p_diff; + uint32_t scaled_pid = vp9->picture_id; + if (p_diff > scaled_pid) { + // TODO(asapersson): Max should correspond to the picture id of last wrap. + scaled_pid += vp9->max_picture_id + 1; + } + vp9->ref_picture_id[vp9->num_ref_pics++] = scaled_pid - p_diff; + } while (n_bit); + + return true; +} + +// Scalability structure (SS). +// +// +-+-+-+-+-+-+-+-+ +// V: | N_S |Y|G|-|-|-| +// +-+-+-+-+-+-+-+-+ -| +// Y: | WIDTH | (OPTIONAL) . +// + + . +// | | (OPTIONAL) . +// +-+-+-+-+-+-+-+-+ . N_S + 1 times +// | HEIGHT | (OPTIONAL) . +// + + . +// | | (OPTIONAL) . +// +-+-+-+-+-+-+-+-+ -| +// G: | N_G | (OPTIONAL) +// +-+-+-+-+-+-+-+-+ -| +// N_G: | T |U| R |-|-| (OPTIONAL) . +// +-+-+-+-+-+-+-+-+ -| . N_G times +// | P_DIFF | (OPTIONAL) . R times . +// +-+-+-+-+-+-+-+-+ -| -| +// +bool ParseSsData(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) { + uint32_t n_s, y_bit, g_bit; + RETURN_FALSE_ON_ERROR(parser->ReadBits(&n_s, 3)); + RETURN_FALSE_ON_ERROR(parser->ReadBits(&y_bit, 1)); + RETURN_FALSE_ON_ERROR(parser->ReadBits(&g_bit, 1)); + RETURN_FALSE_ON_ERROR(parser->ConsumeBits(3)); + vp9->num_spatial_layers = n_s + 1; + vp9->spatial_layer_resolution_present = y_bit ? true : false; + vp9->gof.num_frames_in_gof = 0; + + if (y_bit) { + for (size_t i = 0; i < vp9->num_spatial_layers; ++i) { + RETURN_FALSE_ON_ERROR(parser->ReadUInt16(&vp9->width[i])); + RETURN_FALSE_ON_ERROR(parser->ReadUInt16(&vp9->height[i])); + } + } + if (g_bit) { + uint8_t n_g; + RETURN_FALSE_ON_ERROR(parser->ReadUInt8(&n_g)); + vp9->gof.num_frames_in_gof = n_g; + } + for (size_t i = 0; i < vp9->gof.num_frames_in_gof; ++i) { + uint32_t t, u_bit, r; + RETURN_FALSE_ON_ERROR(parser->ReadBits(&t, 3)); + RETURN_FALSE_ON_ERROR(parser->ReadBits(&u_bit, 1)); + RETURN_FALSE_ON_ERROR(parser->ReadBits(&r, 2)); + RETURN_FALSE_ON_ERROR(parser->ConsumeBits(2)); + vp9->gof.temporal_idx[i] = t; + vp9->gof.temporal_up_switch[i] = u_bit ? true : false; + vp9->gof.num_ref_pics[i] = r; + + for (uint8_t p = 0; p < vp9->gof.num_ref_pics[i]; ++p) { + uint8_t p_diff; + RETURN_FALSE_ON_ERROR(parser->ReadUInt8(&p_diff)); + vp9->gof.pid_diff[i][p] = p_diff; + } + } + return true; +} +} // namespace + +absl::optional +VideoRtpDepacketizerVp9::Parse(rtc::CopyOnWriteBuffer rtp_payload) { + rtc::ArrayView payload(rtp_payload.cdata(), + rtp_payload.size()); + absl::optional result(absl::in_place); + int offset = ParseRtpPayload(payload, &result->video_header); + if (offset == kFailedToParse) + return absl::nullopt; + RTC_DCHECK_LT(offset, rtp_payload.size()); + result->video_payload = + rtp_payload.Slice(offset, rtp_payload.size() - offset); + return result; +} + +int VideoRtpDepacketizerVp9::ParseRtpPayload( + rtc::ArrayView rtp_payload, + RTPVideoHeader* video_header) { + RTC_DCHECK(video_header); + // Parse mandatory first byte of payload descriptor. + rtc::BitBuffer parser(rtp_payload.data(), rtp_payload.size()); + uint8_t first_byte; + if (!parser.ReadUInt8(&first_byte)) { + RTC_LOG(LS_ERROR) << "Payload length is zero."; + return kFailedToParse; + } + bool i_bit = first_byte & 0b1000'0000; // PictureId present . + bool p_bit = first_byte & 0b0100'0000; // Inter-picture predicted. + bool l_bit = first_byte & 0b0010'0000; // Layer indices present. + bool f_bit = first_byte & 0b0001'0000; // Flexible mode. + bool b_bit = first_byte & 0b0000'1000; // Begins frame flag. + bool e_bit = first_byte & 0b0000'0100; // Ends frame flag. + bool v_bit = first_byte & 0b0000'0010; // Scalability structure present. + bool z_bit = first_byte & 0b0000'0001; // Not used for inter-layer prediction + + // Parsed payload. + video_header->width = 0; + video_header->height = 0; + video_header->simulcastIdx = 0; + video_header->codec = kVideoCodecVP9; + + video_header->frame_type = + p_bit ? VideoFrameType::kVideoFrameDelta : VideoFrameType::kVideoFrameKey; + + auto& vp9_header = + video_header->video_type_header.emplace(); + vp9_header.InitRTPVideoHeaderVP9(); + vp9_header.inter_pic_predicted = p_bit; + vp9_header.flexible_mode = f_bit; + vp9_header.beginning_of_frame = b_bit; + vp9_header.end_of_frame = e_bit; + vp9_header.ss_data_available = v_bit; + vp9_header.non_ref_for_inter_layer_pred = z_bit; + + // Parse fields that are present. + if (i_bit && !ParsePictureId(&parser, &vp9_header)) { + RTC_LOG(LS_ERROR) << "Failed parsing VP9 picture id."; + return kFailedToParse; + } + if (l_bit && !ParseLayerInfo(&parser, &vp9_header)) { + RTC_LOG(LS_ERROR) << "Failed parsing VP9 layer info."; + return kFailedToParse; + } + if (p_bit && f_bit && !ParseRefIndices(&parser, &vp9_header)) { + RTC_LOG(LS_ERROR) << "Failed parsing VP9 ref indices."; + return kFailedToParse; + } + if (v_bit) { + if (!ParseSsData(&parser, &vp9_header)) { + RTC_LOG(LS_ERROR) << "Failed parsing VP9 SS data."; + return kFailedToParse; + } + if (vp9_header.spatial_layer_resolution_present) { + // TODO(asapersson): Add support for spatial layers. + video_header->width = vp9_header.width[0]; + video_header->height = vp9_header.height[0]; + } + } + video_header->is_first_packet_in_frame = + b_bit && (!l_bit || !vp9_header.inter_layer_predicted); + + size_t byte_offset; + size_t bit_offset; + parser.GetCurrentOffset(&byte_offset, &bit_offset); + RTC_DCHECK_EQ(bit_offset, 0); + if (byte_offset == rtp_payload.size()) { + // Empty vp9 payload data. + return kFailedToParse; + } + + return byte_offset; +} +} // namespace webrtc diff --git a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h new file mode 100644 index 0000000000..c622cbc75e --- /dev/null +++ b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_RTP_RTCP_SOURCE_VIDEO_RTP_DEPACKETIZER_VP9_H_ +#define MODULES_RTP_RTCP_SOURCE_VIDEO_RTP_DEPACKETIZER_VP9_H_ + +#include + +#include "absl/types/optional.h" +#include "api/array_view.h" +#include "modules/rtp_rtcp/source/rtp_video_header.h" +#include "modules/rtp_rtcp/source/video_rtp_depacketizer.h" +#include "rtc_base/copy_on_write_buffer.h" + +namespace webrtc { + +class VideoRtpDepacketizerVp9 : public VideoRtpDepacketizer { + public: + VideoRtpDepacketizerVp9() = default; + VideoRtpDepacketizerVp9(const VideoRtpDepacketizerVp9&) = delete; + VideoRtpDepacketizerVp9& operator=(VideoRtpDepacketizerVp9&) = delete; + ~VideoRtpDepacketizerVp9() override = default; + + // Parses vp9 rtp payload descriptor. + // Returns zero on error or vp9 payload header offset on success. + static int ParseRtpPayload(rtc::ArrayView rtp_payload, + RTPVideoHeader* video_header); + + absl::optional Parse( + rtc::CopyOnWriteBuffer rtp_payload) override; +}; + +} // namespace webrtc + +#endif // MODULES_RTP_RTCP_SOURCE_VIDEO_RTP_DEPACKETIZER_VP9_H_ diff --git a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc new file mode 100644 index 0000000000..937d15de23 --- /dev/null +++ b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h" + +#include +#include + +#include "api/array_view.h" +#include "test/gmock.h" +#include "test/gtest.h" + +namespace webrtc { +namespace { + +void VerifyHeader(const RTPVideoHeaderVP9& expected, + const RTPVideoHeaderVP9& actual) { + EXPECT_EQ(expected.inter_layer_predicted, actual.inter_layer_predicted); + EXPECT_EQ(expected.inter_pic_predicted, actual.inter_pic_predicted); + EXPECT_EQ(expected.flexible_mode, actual.flexible_mode); + EXPECT_EQ(expected.beginning_of_frame, actual.beginning_of_frame); + EXPECT_EQ(expected.end_of_frame, actual.end_of_frame); + EXPECT_EQ(expected.ss_data_available, actual.ss_data_available); + EXPECT_EQ(expected.non_ref_for_inter_layer_pred, + actual.non_ref_for_inter_layer_pred); + EXPECT_EQ(expected.picture_id, actual.picture_id); + EXPECT_EQ(expected.max_picture_id, actual.max_picture_id); + EXPECT_EQ(expected.temporal_idx, actual.temporal_idx); + EXPECT_EQ(expected.spatial_idx, actual.spatial_idx); + EXPECT_EQ(expected.gof_idx, actual.gof_idx); + EXPECT_EQ(expected.tl0_pic_idx, actual.tl0_pic_idx); + EXPECT_EQ(expected.temporal_up_switch, actual.temporal_up_switch); + + EXPECT_EQ(expected.num_ref_pics, actual.num_ref_pics); + for (uint8_t i = 0; i < expected.num_ref_pics; ++i) { + EXPECT_EQ(expected.pid_diff[i], actual.pid_diff[i]); + EXPECT_EQ(expected.ref_picture_id[i], actual.ref_picture_id[i]); + } + if (expected.ss_data_available) { + EXPECT_EQ(expected.spatial_layer_resolution_present, + actual.spatial_layer_resolution_present); + EXPECT_EQ(expected.num_spatial_layers, actual.num_spatial_layers); + if (expected.spatial_layer_resolution_present) { + for (size_t i = 0; i < expected.num_spatial_layers; i++) { + EXPECT_EQ(expected.width[i], actual.width[i]); + EXPECT_EQ(expected.height[i], actual.height[i]); + } + } + EXPECT_EQ(expected.gof.num_frames_in_gof, actual.gof.num_frames_in_gof); + for (size_t i = 0; i < expected.gof.num_frames_in_gof; i++) { + EXPECT_EQ(expected.gof.temporal_up_switch[i], + actual.gof.temporal_up_switch[i]); + EXPECT_EQ(expected.gof.temporal_idx[i], actual.gof.temporal_idx[i]); + EXPECT_EQ(expected.gof.num_ref_pics[i], actual.gof.num_ref_pics[i]); + for (uint8_t j = 0; j < expected.gof.num_ref_pics[i]; j++) { + EXPECT_EQ(expected.gof.pid_diff[i][j], actual.gof.pid_diff[i][j]); + } + } + } +} + +TEST(VideoRtpDepacketizerVp9Test, ParseBasicHeader) { + uint8_t packet[4] = {0}; + packet[0] = 0x0C; // I:0 P:0 L:0 F:0 B:1 E:1 V:0 Z:0 + + RTPVideoHeader video_header; + int offset = VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + EXPECT_EQ(offset, 1); + RTPVideoHeaderVP9 expected; + expected.InitRTPVideoHeaderVP9(); + expected.beginning_of_frame = true; + expected.end_of_frame = true; + VerifyHeader(expected, + absl::get(video_header.video_type_header)); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseOneBytePictureId) { + uint8_t packet[10] = {0}; + packet[0] = 0x80; // I:1 P:0 L:0 F:0 B:0 E:0 V:0 Z:0 + packet[1] = kMaxOneBytePictureId; + + RTPVideoHeader video_header; + int offset = VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + EXPECT_EQ(offset, 2); + RTPVideoHeaderVP9 expected; + expected.InitRTPVideoHeaderVP9(); + expected.picture_id = kMaxOneBytePictureId; + expected.max_picture_id = kMaxOneBytePictureId; + VerifyHeader(expected, + absl::get(video_header.video_type_header)); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseTwoBytePictureId) { + uint8_t packet[10] = {0}; + packet[0] = 0x80; // I:1 P:0 L:0 F:0 B:0 E:0 V:0 Z:0 + packet[1] = 0x80 | ((kMaxTwoBytePictureId >> 8) & 0x7F); + packet[2] = kMaxTwoBytePictureId & 0xFF; + + RTPVideoHeader video_header; + int offset = VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + EXPECT_EQ(offset, 3); + RTPVideoHeaderVP9 expected; + expected.InitRTPVideoHeaderVP9(); + expected.picture_id = kMaxTwoBytePictureId; + expected.max_picture_id = kMaxTwoBytePictureId; + VerifyHeader(expected, + absl::get(video_header.video_type_header)); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseLayerInfoWithNonFlexibleMode) { + const uint8_t kTemporalIdx = 2; + const uint8_t kUbit = 1; + const uint8_t kSpatialIdx = 1; + const uint8_t kDbit = 1; + const uint8_t kTl0PicIdx = 17; + uint8_t packet[13] = {0}; + packet[0] = 0x20; // I:0 P:0 L:1 F:0 B:0 E:0 V:0 Z:0 + packet[1] = (kTemporalIdx << 5) | (kUbit << 4) | (kSpatialIdx << 1) | kDbit; + packet[2] = kTl0PicIdx; + + RTPVideoHeader video_header; + int offset = VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + EXPECT_EQ(offset, 3); + RTPVideoHeaderVP9 expected; + expected.InitRTPVideoHeaderVP9(); + // T:2 U:1 S:1 D:1 + // TL0PICIDX:17 + expected.temporal_idx = kTemporalIdx; + expected.temporal_up_switch = kUbit ? true : false; + expected.spatial_idx = kSpatialIdx; + expected.inter_layer_predicted = kDbit ? true : false; + expected.tl0_pic_idx = kTl0PicIdx; + VerifyHeader(expected, + absl::get(video_header.video_type_header)); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseLayerInfoWithFlexibleMode) { + const uint8_t kTemporalIdx = 2; + const uint8_t kUbit = 1; + const uint8_t kSpatialIdx = 0; + const uint8_t kDbit = 0; + uint8_t packet[13] = {0}; + packet[0] = 0x38; // I:0 P:0 L:1 F:1 B:1 E:0 V:0 Z:0 + packet[1] = (kTemporalIdx << 5) | (kUbit << 4) | (kSpatialIdx << 1) | kDbit; + + RTPVideoHeader video_header; + int offset = VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + EXPECT_EQ(offset, 2); + RTPVideoHeaderVP9 expected; + expected.InitRTPVideoHeaderVP9(); + // I:0 P:0 L:1 F:1 B:1 E:0 V:0 Z:0 + // L: T:2 U:1 S:0 D:0 + expected.beginning_of_frame = true; + expected.flexible_mode = true; + expected.temporal_idx = kTemporalIdx; + expected.temporal_up_switch = kUbit ? true : false; + expected.spatial_idx = kSpatialIdx; + expected.inter_layer_predicted = kDbit ? true : false; + VerifyHeader(expected, + absl::get(video_header.video_type_header)); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseRefIdx) { + const int16_t kPictureId = 17; + const uint8_t kPdiff1 = 17; + const uint8_t kPdiff2 = 18; + const uint8_t kPdiff3 = 127; + uint8_t packet[13] = {0}; + packet[0] = 0xD8; // I:1 P:1 L:0 F:1 B:1 E:0 V:0 Z:0 + packet[1] = 0x80 | ((kPictureId >> 8) & 0x7F); // Two byte pictureID. + packet[2] = kPictureId; + packet[3] = (kPdiff1 << 1) | 1; // P_DIFF N:1 + packet[4] = (kPdiff2 << 1) | 1; // P_DIFF N:1 + packet[5] = (kPdiff3 << 1) | 0; // P_DIFF N:0 + + RTPVideoHeader video_header; + int offset = VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + EXPECT_EQ(offset, 6); + RTPVideoHeaderVP9 expected; + expected.InitRTPVideoHeaderVP9(); + // I:1 P:1 L:0 F:1 B:1 E:0 V:0 Z:0 + // I: PICTURE ID:17 + // I: + // P,F: P_DIFF:17 N:1 => refPicId = 17 - 17 = 0 + // P,F: P_DIFF:18 N:1 => refPicId = (kMaxPictureId + 1) + 17 - 18 = 0x7FFF + // P,F: P_DIFF:127 N:0 => refPicId = (kMaxPictureId + 1) + 17 - 127 = 32658 + expected.beginning_of_frame = true; + expected.inter_pic_predicted = true; + expected.flexible_mode = true; + expected.picture_id = kPictureId; + expected.num_ref_pics = 3; + expected.pid_diff[0] = kPdiff1; + expected.pid_diff[1] = kPdiff2; + expected.pid_diff[2] = kPdiff3; + expected.ref_picture_id[0] = 0; + expected.ref_picture_id[1] = 0x7FFF; + expected.ref_picture_id[2] = 32658; + VerifyHeader(expected, + absl::get(video_header.video_type_header)); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseRefIdxFailsWithNoPictureId) { + const uint8_t kPdiff = 3; + uint8_t packet[13] = {0}; + packet[0] = 0x58; // I:0 P:1 L:0 F:1 B:1 E:0 V:0 Z:0 + packet[1] = (kPdiff << 1); // P,F: P_DIFF:3 N:0 + + RTPVideoHeader video_header; + EXPECT_EQ(VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header), 0); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseRefIdxFailsWithTooManyRefPics) { + const uint8_t kPdiff = 3; + uint8_t packet[13] = {0}; + packet[0] = 0xD8; // I:1 P:1 L:0 F:1 B:1 E:0 V:0 Z:0 + packet[1] = kMaxOneBytePictureId; // I: PICTURE ID:127 + packet[2] = (kPdiff << 1) | 1; // P,F: P_DIFF:3 N:1 + packet[3] = (kPdiff << 1) | 1; // P,F: P_DIFF:3 N:1 + packet[4] = (kPdiff << 1) | 1; // P,F: P_DIFF:3 N:1 + packet[5] = (kPdiff << 1) | 0; // P,F: P_DIFF:3 N:0 + + RTPVideoHeader video_header; + EXPECT_EQ(VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header), 0); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseSsData) { + const uint8_t kYbit = 0; + const size_t kNs = 2; + const size_t kNg = 2; + uint8_t packet[23] = {0}; + packet[0] = 0x0A; // I:0 P:0 L:0 F:0 B:1 E:0 V:1 Z:0 + packet[1] = ((kNs - 1) << 5) | (kYbit << 4) | (1 << 3); // N_S Y G:1 - + packet[2] = kNg; // N_G + packet[3] = (0 << 5) | (1 << 4) | (0 << 2) | 0; // T:0 U:1 R:0 - + packet[4] = (2 << 5) | (0 << 4) | (1 << 2) | 0; // T:2 U:0 R:1 - + packet[5] = 33; + + RTPVideoHeader video_header; + int offset = VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + EXPECT_EQ(offset, 6); + RTPVideoHeaderVP9 expected; + expected.InitRTPVideoHeaderVP9(); + expected.beginning_of_frame = true; + expected.ss_data_available = true; + expected.num_spatial_layers = kNs; + expected.spatial_layer_resolution_present = kYbit ? true : false; + expected.gof.num_frames_in_gof = kNg; + expected.gof.temporal_idx[0] = 0; + expected.gof.temporal_idx[1] = 2; + expected.gof.temporal_up_switch[0] = true; + expected.gof.temporal_up_switch[1] = false; + expected.gof.num_ref_pics[0] = 0; + expected.gof.num_ref_pics[1] = 1; + expected.gof.pid_diff[1][0] = 33; + VerifyHeader(expected, + absl::get(video_header.video_type_header)); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseFirstPacketInKeyFrame) { + uint8_t packet[2] = {0}; + packet[0] = 0x08; // I:0 P:0 L:0 F:0 B:1 E:0 V:0 Z:0 + + RTPVideoHeader video_header; + VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + EXPECT_EQ(video_header.frame_type, VideoFrameType::kVideoFrameKey); + EXPECT_TRUE(video_header.is_first_packet_in_frame); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseLastPacketInDeltaFrame) { + uint8_t packet[2] = {0}; + packet[0] = 0x44; // I:0 P:1 L:0 F:0 B:0 E:1 V:0 Z:0 + + RTPVideoHeader video_header; + VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + EXPECT_EQ(video_header.frame_type, VideoFrameType::kVideoFrameDelta); + EXPECT_FALSE(video_header.is_first_packet_in_frame); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseResolution) { + const uint16_t kWidth[2] = {640, 1280}; + const uint16_t kHeight[2] = {360, 720}; + uint8_t packet[20] = {0}; + packet[0] = 0x0A; // I:0 P:0 L:0 F:0 B:1 E:0 V:1 Z:0 + packet[1] = (1 << 5) | (1 << 4) | 0; // N_S:1 Y:1 G:0 + packet[2] = kWidth[0] >> 8; + packet[3] = kWidth[0] & 0xFF; + packet[4] = kHeight[0] >> 8; + packet[5] = kHeight[0] & 0xFF; + packet[6] = kWidth[1] >> 8; + packet[7] = kWidth[1] & 0xFF; + packet[8] = kHeight[1] >> 8; + packet[9] = kHeight[1] & 0xFF; + + RTPVideoHeader video_header; + VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + EXPECT_EQ(video_header.width, kWidth[0]); + EXPECT_EQ(video_header.height, kHeight[0]); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseFailsForNoPayloadLength) { + rtc::ArrayView empty; + + RTPVideoHeader video_header; + EXPECT_EQ(VideoRtpDepacketizerVp9::ParseRtpPayload(empty, &video_header), 0); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseFailsForTooShortBufferToFitPayload) { + uint8_t packet[] = {0}; + + RTPVideoHeader video_header; + EXPECT_EQ(VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header), 0); +} + +TEST(VideoRtpDepacketizerVp9Test, ParseNonRefForInterLayerPred) { + RTPVideoHeader video_header; + RTPVideoHeaderVP9 expected; + expected.InitRTPVideoHeaderVP9(); + uint8_t packet[2] = {0}; + + packet[0] = 0x08; // I:0 P:0 L:0 F:0 B:1 E:0 V:0 Z:0 + VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + expected.beginning_of_frame = true; + expected.non_ref_for_inter_layer_pred = false; + VerifyHeader(expected, + absl::get(video_header.video_type_header)); + + packet[0] = 0x05; // I:0 P:0 L:0 F:0 B:0 E:1 V:0 Z:1 + VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header); + + expected.beginning_of_frame = false; + expected.end_of_frame = true; + expected.non_ref_for_inter_layer_pred = true; + VerifyHeader(expected, + absl::get(video_header.video_type_header)); +} + +TEST(VideoRtpDepacketizerVp9Test, ReferencesInputCopyOnWriteBuffer) { + constexpr size_t kHeaderSize = 1; + uint8_t packet[4] = {0}; + packet[0] = 0x0C; // I:0 P:0 L:0 F:0 B:1 E:1 V:0 Z:0 + + rtc::CopyOnWriteBuffer rtp_payload(packet); + VideoRtpDepacketizerVp9 depacketizer; + absl::optional parsed = + depacketizer.Parse(rtp_payload); + ASSERT_TRUE(parsed); + + EXPECT_EQ(parsed->video_payload.size(), rtp_payload.size() - kHeaderSize); + // Compare pointers to check there was no copy on write buffer unsharing. + EXPECT_EQ(parsed->video_payload.cdata(), rtp_payload.cdata() + kHeaderSize); +} +} // namespace +} // namespace webrtc