diff --git a/modules/rtp_rtcp/source/rtp_sender_video.cc b/modules/rtp_rtcp/source/rtp_sender_video.cc index 8294891e7a..7a75973fa4 100644 --- a/modules/rtp_rtcp/source/rtp_sender_video.cc +++ b/modules/rtp_rtcp/source/rtp_sender_video.cc @@ -382,10 +382,15 @@ void RTPSenderVideo::AddRtpHeaderExtensions( descriptor.active_decode_targets_bitmask = active_decode_targets_tracker_.ActiveDecodeTargetsBitmask(); } - // To avoid extra structure copy, temporary share ownership of the - // video_structure with the dependency descriptor. + // VP9 mark all layer frames of the first picture as kVideoFrameKey, + // Structure should be attached to the descriptor to lowest spatial layer + // when inter layer dependency is used, i.e. L structures; or to all + // layers when inter layer dependency is not used, i.e. S structures. + // Distinguish these two cases by checking if there are any dependencies. if (video_header.frame_type == VideoFrameType::kVideoFrameKey && - first_packet) { + video_header.generic->dependencies.empty() && first_packet) { + // To avoid extra structure copy, temporary share ownership of the + // video_structure with the dependency descriptor. descriptor.attached_structure = absl::WrapUnique(video_structure_.get()); } diff --git a/modules/video_coding/BUILD.gn b/modules/video_coding/BUILD.gn index 5dae630e70..f81387f890 100644 --- a/modules/video_coding/BUILD.gn +++ b/modules/video_coding/BUILD.gn @@ -565,6 +565,8 @@ rtc_library("webrtc_vp9") { "../../rtc_base/synchronization:mutex", "../../system_wrappers:field_trial", "../rtp_rtcp:rtp_rtcp_format", + "svc:scalability_structures", + "svc:scalable_video_controller", "//third_party/libyuv", ] absl_deps = [ diff --git a/modules/video_coding/codecs/test/encoded_video_frame_producer.cc b/modules/video_coding/codecs/test/encoded_video_frame_producer.cc index 277033fe4e..899826eee4 100644 --- a/modules/video_coding/codecs/test/encoded_video_frame_producer.cc +++ b/modules/video_coding/codecs/test/encoded_video_frame_producer.cc @@ -57,7 +57,6 @@ EncodedVideoFrameProducer::Encode() { WEBRTC_VIDEO_CODEC_OK); uint32_t rtp_tick = 90000 / framerate_fps_; - std::vector frame_types = {VideoFrameType::kVideoFrameDelta}; for (int i = 0; i < num_input_frames_; ++i) { VideoFrame frame = VideoFrame::Builder() @@ -65,7 +64,9 @@ EncodedVideoFrameProducer::Encode() { .set_timestamp_rtp(rtp_timestamp_) .build(); rtp_timestamp_ += rtp_tick; - RTC_CHECK_EQ(encoder_.Encode(frame, &frame_types), WEBRTC_VIDEO_CODEC_OK); + RTC_CHECK_EQ(encoder_.Encode(frame, &next_frame_type_), + WEBRTC_VIDEO_CODEC_OK); + next_frame_type_[0] = VideoFrameType::kVideoFrameDelta; } RTC_CHECK_EQ(encoder_.RegisterEncodeCompleteCallback(nullptr), diff --git a/modules/video_coding/codecs/test/encoded_video_frame_producer.h b/modules/video_coding/codecs/test/encoded_video_frame_producer.h index 1b1b9018f9..2216287b92 100644 --- a/modules/video_coding/codecs/test/encoded_video_frame_producer.h +++ b/modules/video_coding/codecs/test/encoded_video_frame_producer.h @@ -40,6 +40,8 @@ class EncodedVideoFrameProducer { // Number of the input frames to pass to the encoder. EncodedVideoFrameProducer& SetNumInputFrames(int value); + // Encode next frame as key frame. + EncodedVideoFrameProducer& ForceKeyFrame(); // Resolution of the input frames. EncodedVideoFrameProducer& SetResolution(RenderResolution value); @@ -57,6 +59,8 @@ class EncodedVideoFrameProducer { int num_input_frames_ = 1; int framerate_fps_ = 30; RenderResolution resolution_ = {320, 180}; + std::vector next_frame_type_ = { + VideoFrameType::kVideoFrameKey}; }; inline EncodedVideoFrameProducer& EncodedVideoFrameProducer::SetNumInputFrames( @@ -66,6 +70,11 @@ inline EncodedVideoFrameProducer& EncodedVideoFrameProducer::SetNumInputFrames( return *this; } +inline EncodedVideoFrameProducer& EncodedVideoFrameProducer::ForceKeyFrame() { + next_frame_type_ = {VideoFrameType::kVideoFrameKey}; + return *this; +} + inline EncodedVideoFrameProducer& EncodedVideoFrameProducer::SetResolution( RenderResolution value) { resolution_ = value; diff --git a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc index a55b110cd1..d3b7b94187 100644 --- a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc +++ b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc @@ -28,7 +28,9 @@ namespace webrtc { namespace { +using ::testing::ElementsAre; using ::testing::ElementsAreArray; +using ::testing::IsEmpty; using ::testing::SizeIs; using ::testing::UnorderedElementsAreArray; using EncoderInfo = webrtc::VideoEncoder::EncoderInfo; @@ -53,6 +55,21 @@ VideoCodec DefaultCodecSettings() { return codec_settings; } +void ConfigureSvc(VideoCodec& codec_settings, + int num_spatial_layers, + int num_temporal_layers = 1) { + codec_settings.VP9()->numberOfSpatialLayers = num_spatial_layers; + codec_settings.VP9()->numberOfTemporalLayers = num_temporal_layers; + codec_settings.VP9()->frameDroppingOn = false; + + std::vector layers = GetSvcConfig( + codec_settings.width, codec_settings.height, codec_settings.maxFramerate, + /*first_active_layer=*/0, num_spatial_layers, num_temporal_layers, false); + for (size_t i = 0; i < layers.size(); ++i) { + codec_settings.spatialLayers[i] = layers[i]; + } +} + } // namespace class TestVp9Impl : public VideoCodecUnitTest { @@ -72,21 +89,6 @@ class TestVp9Impl : public VideoCodecUnitTest { codec_settings->VP9()->numberOfTemporalLayers = 1; codec_settings->VP9()->numberOfSpatialLayers = 1; } - - void ConfigureSvc(size_t num_spatial_layers, size_t num_temporal_layers = 1) { - codec_settings_.VP9()->numberOfSpatialLayers = - static_cast(num_spatial_layers); - codec_settings_.VP9()->numberOfTemporalLayers = num_temporal_layers; - codec_settings_.VP9()->frameDroppingOn = false; - - std::vector layers = - GetSvcConfig(codec_settings_.width, codec_settings_.height, - codec_settings_.maxFramerate, /*first_active_layer=*/0, - num_spatial_layers, num_temporal_layers, false); - for (size_t i = 0; i < layers.size(); ++i) { - codec_settings_.spatialLayers[i] = layers[i]; - } - } }; class TestVp9ImplForPixelFormat @@ -204,6 +206,28 @@ TEST(Vp9ImplTest, ParserQpEqualsEncodedQp) { EXPECT_EQ(encoded_frame.qp_, qp); } +TEST(Vp9ImplTest, EncodeAttachesTemplateStructureWithSvcController) { + test::ScopedFieldTrials override_field_trials( + "WebRTC-Vp9DependencyDescriptor/Enabled/"); + std::unique_ptr encoder = VP9Encoder::Create(); + VideoCodec codec_settings = DefaultCodecSettings(); + EXPECT_EQ(encoder->InitEncode(&codec_settings, kSettings), + WEBRTC_VIDEO_CODEC_OK); + + std::vector frames = + EncodedVideoFrameProducer(*encoder) + .SetNumInputFrames(2) + .SetResolution({kWidth, kHeight}) + .Encode(); + + ASSERT_THAT(frames, SizeIs(2)); + EXPECT_TRUE(frames[0].codec_specific_info.template_structure); + EXPECT_TRUE(frames[0].codec_specific_info.generic_frame_info); + + EXPECT_FALSE(frames[1].codec_specific_info.template_structure); + EXPECT_TRUE(frames[1].codec_specific_info.generic_frame_info); +} + TEST(Vp9ImplTest, EncoderWith2TemporalLayers) { std::unique_ptr encoder = VP9Encoder::Create(); VideoCodec codec_settings = DefaultCodecSettings(); @@ -226,6 +250,37 @@ TEST(Vp9ImplTest, EncoderWith2TemporalLayers) { EXPECT_EQ(frames[3].codec_specific_info.codecSpecific.VP9.temporal_idx, 1); } +TEST(Vp9ImplTest, EncodeTemporalLayersWithSvcController) { + test::ScopedFieldTrials override_field_trials( + "WebRTC-Vp9DependencyDescriptor/Enabled/"); + std::unique_ptr encoder = VP9Encoder::Create(); + VideoCodec codec_settings = DefaultCodecSettings(); + codec_settings.VP9()->numberOfTemporalLayers = 2; + EXPECT_EQ(encoder->InitEncode(&codec_settings, kSettings), + WEBRTC_VIDEO_CODEC_OK); + + std::vector frames = + EncodedVideoFrameProducer(*encoder) + .SetNumInputFrames(4) + .SetResolution({kWidth, kHeight}) + .Encode(); + + ASSERT_THAT(frames, SizeIs(4)); + EXPECT_EQ(frames[0].codec_specific_info.codecSpecific.VP9.temporal_idx, 0); + EXPECT_EQ(frames[1].codec_specific_info.codecSpecific.VP9.temporal_idx, 1); + EXPECT_EQ(frames[2].codec_specific_info.codecSpecific.VP9.temporal_idx, 0); + EXPECT_EQ(frames[3].codec_specific_info.codecSpecific.VP9.temporal_idx, 1); + // Verify codec agnostic part + ASSERT_TRUE(frames[0].codec_specific_info.generic_frame_info); + ASSERT_TRUE(frames[1].codec_specific_info.generic_frame_info); + ASSERT_TRUE(frames[2].codec_specific_info.generic_frame_info); + ASSERT_TRUE(frames[3].codec_specific_info.generic_frame_info); + EXPECT_EQ(frames[0].codec_specific_info.generic_frame_info->temporal_id, 0); + EXPECT_EQ(frames[1].codec_specific_info.generic_frame_info->temporal_id, 1); + EXPECT_EQ(frames[2].codec_specific_info.generic_frame_info->temporal_id, 0); + EXPECT_EQ(frames[3].codec_specific_info.generic_frame_info->temporal_id, 1); +} + TEST(Vp9ImplTest, EncoderWith2SpatialLayers) { std::unique_ptr encoder = VP9Encoder::Create(); VideoCodec codec_settings = DefaultCodecSettings(); @@ -244,6 +299,37 @@ TEST(Vp9ImplTest, EncoderWith2SpatialLayers) { EXPECT_EQ(frames[1].encoded_image.SpatialIndex(), 1); } +TEST(Vp9ImplTest, EncodeSpatialLayersWithSvcController) { + test::ScopedFieldTrials override_field_trials( + "WebRTC-Vp9DependencyDescriptor/Enabled/"); + std::unique_ptr encoder = VP9Encoder::Create(); + VideoCodec codec_settings = DefaultCodecSettings(); + codec_settings.VP9()->numberOfSpatialLayers = 2; + EXPECT_EQ(encoder->InitEncode(&codec_settings, kSettings), + WEBRTC_VIDEO_CODEC_OK); + + std::vector frames = + EncodedVideoFrameProducer(*encoder) + .SetNumInputFrames(2) + .SetResolution({kWidth, kHeight}) + .Encode(); + + ASSERT_THAT(frames, SizeIs(4)); + EXPECT_EQ(frames[0].encoded_image.SpatialIndex(), 0); + EXPECT_EQ(frames[1].encoded_image.SpatialIndex(), 1); + EXPECT_EQ(frames[2].encoded_image.SpatialIndex(), 0); + EXPECT_EQ(frames[3].encoded_image.SpatialIndex(), 1); + // Verify codec agnostic part + ASSERT_TRUE(frames[0].codec_specific_info.generic_frame_info); + ASSERT_TRUE(frames[1].codec_specific_info.generic_frame_info); + ASSERT_TRUE(frames[2].codec_specific_info.generic_frame_info); + ASSERT_TRUE(frames[3].codec_specific_info.generic_frame_info); + EXPECT_EQ(frames[0].codec_specific_info.generic_frame_info->spatial_id, 0); + EXPECT_EQ(frames[1].codec_specific_info.generic_frame_info->spatial_id, 1); + EXPECT_EQ(frames[2].codec_specific_info.generic_frame_info->spatial_id, 0); + EXPECT_EQ(frames[3].codec_specific_info.generic_frame_info->spatial_id, 1); +} + TEST_F(TestVp9Impl, EncoderExplicitLayering) { // Override default settings. codec_settings_.VP9()->numberOfTemporalLayers = 1; @@ -304,7 +390,7 @@ TEST_F(TestVp9Impl, EnableDisableSpatialLayers) { const size_t num_spatial_layers = 3; const size_t num_frames_to_encode = 5; - ConfigureSvc(num_spatial_layers); + ConfigureSvc(codec_settings_, num_spatial_layers); codec_settings_.VP9()->frameDroppingOn = true; EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK, @@ -350,6 +436,68 @@ TEST_F(TestVp9Impl, EnableDisableSpatialLayers) { } } +TEST(Vp9ImplTest, EnableDisableSpatialLayersWithSvcController) { + test::ScopedFieldTrials override_field_trials( + "WebRTC-Vp9DependencyDescriptor/Enabled/"); + const int num_spatial_layers = 3; + // Configure encoder to produce 3 spatial layers. Encode frames of layer 0 + // then enable layer 1 and encode more frames and so on. + // Then disable layers one by one in the same way. + // Note: bit rate allocation is high to avoid frame dropping due to rate + // control, the encoder should always produce a frame. A dropped + // frame indicates a problem and the test will fail. + std::unique_ptr encoder = VP9Encoder::Create(); + VideoCodec codec_settings = DefaultCodecSettings(); + ConfigureSvc(codec_settings, num_spatial_layers); + codec_settings.VP9()->frameDroppingOn = true; + EXPECT_EQ(encoder->InitEncode(&codec_settings, kSettings), + WEBRTC_VIDEO_CODEC_OK); + + EncodedVideoFrameProducer producer(*encoder); + producer.SetResolution({kWidth, kHeight}); + + // Encode a key frame to validate all other frames are delta frames. + std::vector frames = + producer.SetNumInputFrames(1).Encode(); + ASSERT_THAT(frames, Not(IsEmpty())); + EXPECT_TRUE(frames[0].codec_specific_info.template_structure); + + const size_t num_frames_to_encode = 5; + + VideoBitrateAllocation bitrate_allocation; + for (size_t sl_idx = 0; sl_idx < num_spatial_layers; ++sl_idx) { + // Allocate high bit rate to avoid frame dropping due to rate control. + bitrate_allocation.SetBitrate( + sl_idx, 0, + codec_settings.spatialLayers[sl_idx].targetBitrate * 1000 * 2); + encoder->SetRates(VideoEncoder::RateControlParameters( + bitrate_allocation, codec_settings.maxFramerate)); + + frames = producer.SetNumInputFrames(num_frames_to_encode).Encode(); + // With (sl_idx+1) spatial layers expect (sl_idx+1) frames per input frame. + ASSERT_THAT(frames, SizeIs(num_frames_to_encode * (sl_idx + 1))); + for (size_t i = 0; i < frames.size(); ++i) { + EXPECT_TRUE(frames[i].codec_specific_info.generic_frame_info); + EXPECT_FALSE(frames[i].codec_specific_info.template_structure); + } + } + + for (int sl_idx = num_spatial_layers - 1; sl_idx > 0; --sl_idx) { + bitrate_allocation.SetBitrate(sl_idx, 0, 0); + encoder->SetRates(VideoEncoder::RateControlParameters( + bitrate_allocation, codec_settings.maxFramerate)); + + frames = producer.SetNumInputFrames(num_frames_to_encode).Encode(); + // With |sl_idx| spatial layer disabled, there are |sl_idx| spatial layers + // left. + ASSERT_THAT(frames, SizeIs(num_frames_to_encode * sl_idx)); + for (size_t i = 0; i < frames.size(); ++i) { + EXPECT_TRUE(frames[i].codec_specific_info.generic_frame_info); + EXPECT_FALSE(frames[i].codec_specific_info.template_structure); + } + } +} + TEST_F(TestVp9Impl, DisableEnableBaseLayerTriggersKeyFrame) { // Configure encoder to produce N spatial layers. Encode frames for all // layers. Then disable all but the last layer. Then reenable all back again. @@ -360,7 +508,7 @@ TEST_F(TestVp9Impl, DisableEnableBaseLayerTriggersKeyFrame) { // Must not be multiple of temporal period to exercise all code paths. const size_t num_frames_to_encode = 5; - ConfigureSvc(num_spatial_layers, num_temporal_layers); + ConfigureSvc(codec_settings_, num_spatial_layers, num_temporal_layers); codec_settings_.VP9()->frameDroppingOn = false; codec_settings_.VP9()->flexibleMode = false; codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOnKeyPic; @@ -506,13 +654,133 @@ TEST_F(TestVp9Impl, DisableEnableBaseLayerTriggersKeyFrame) { } } +TEST(Vp9ImplTest, DisableEnableBaseLayerWithSvcControllerTriggersKeyFrame) { + // Configure encoder to produce N spatial layers. Encode frames for all + // layers. Then disable all but the last layer. Then reenable all back again. + test::ScopedFieldTrials override_field_trials( + "WebRTC-Vp9DependencyDescriptor/Enabled/"); + const size_t num_spatial_layers = 3; + const size_t num_temporal_layers = 3; + // Must not be multiple of temporal period to exercise all code paths. + const size_t num_frames_to_encode = 5; + + std::unique_ptr encoder = VP9Encoder::Create(); + VideoCodec codec_settings = DefaultCodecSettings(); + ConfigureSvc(codec_settings, num_spatial_layers, num_temporal_layers); + codec_settings.VP9()->frameDroppingOn = false; + codec_settings.VP9()->flexibleMode = false; + codec_settings.mode = VideoCodecMode::kRealtimeVideo; + + EXPECT_EQ(encoder->InitEncode(&codec_settings, kSettings), + WEBRTC_VIDEO_CODEC_OK); + + VideoBitrateAllocation bitrate_allocation; + for (size_t sl_idx = 0; sl_idx < num_spatial_layers; ++sl_idx) { + for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) { + // Allocate high bit rate to avoid frame dropping due to rate control. + bitrate_allocation.SetBitrate( + sl_idx, tl_idx, + codec_settings.spatialLayers[sl_idx].targetBitrate * 1000 * 2); + } + } + encoder->SetRates(VideoEncoder::RateControlParameters( + bitrate_allocation, codec_settings.maxFramerate)); + + EncodedVideoFrameProducer producer(*encoder); + producer.SetResolution({kWidth, kHeight}); + + std::vector frames = + producer.SetNumInputFrames(num_frames_to_encode).Encode(); + ASSERT_THAT(frames, SizeIs(num_frames_to_encode * num_spatial_layers)); + + // Disable all but top spatial layer. + for (size_t sl_idx = 0; sl_idx < num_spatial_layers - 1; ++sl_idx) { + for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) { + bitrate_allocation.SetBitrate(sl_idx, tl_idx, 0); + } + } + encoder->SetRates(VideoEncoder::RateControlParameters( + bitrate_allocation, codec_settings.maxFramerate)); + + frames = producer.SetNumInputFrames(num_frames_to_encode).Encode(); + EXPECT_THAT(frames, SizeIs(num_frames_to_encode)); + for (const auto& frame : frames) { + // Expect no key-frames generated. + EXPECT_FALSE(frame.codec_specific_info.template_structure); + ASSERT_TRUE(frame.codec_specific_info.generic_frame_info); + EXPECT_EQ(frame.codec_specific_info.generic_frame_info->spatial_id, 2); + } + + frames = producer.ForceKeyFrame().SetNumInputFrames(1).Encode(); + ASSERT_THAT(frames, SizeIs(1)); + // Key-frame should be produced. + EXPECT_EQ(frames[0].encoded_image._frameType, VideoFrameType::kVideoFrameKey); + ASSERT_TRUE(frames[0].codec_specific_info.template_structure); + ASSERT_TRUE(frames[0].codec_specific_info.generic_frame_info); + EXPECT_EQ(frames[0].codec_specific_info.generic_frame_info->spatial_id, 2); + + frames = producer.SetNumInputFrames(num_frames_to_encode).Encode(); + ASSERT_THAT(frames, SizeIs(num_frames_to_encode)); + for (const auto& frame : frames) { + EXPECT_EQ(frame.encoded_image._frameType, VideoFrameType::kVideoFrameDelta); + EXPECT_FALSE(frame.codec_specific_info.template_structure); + ASSERT_TRUE(frame.codec_specific_info.generic_frame_info); + EXPECT_EQ(frame.codec_specific_info.generic_frame_info->spatial_id, 2); + } + + // Enable the second layer back. + // Allocate high bit rate to avoid frame dropping due to rate control. + for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) { + bitrate_allocation.SetBitrate( + 1, tl_idx, codec_settings.spatialLayers[0].targetBitrate * 1000 * 2); + } + encoder->SetRates(VideoEncoder::RateControlParameters( + bitrate_allocation, codec_settings.maxFramerate)); + + frames = producer.SetNumInputFrames(num_frames_to_encode).Encode(); + ASSERT_THAT(frames, SizeIs(num_frames_to_encode * 2)); + EXPECT_EQ(frames[0].encoded_image._frameType, VideoFrameType::kVideoFrameKey); + EXPECT_TRUE(frames[0].codec_specific_info.template_structure); + ASSERT_TRUE(frames[0].codec_specific_info.generic_frame_info); + EXPECT_EQ(frames[0].codec_specific_info.generic_frame_info->spatial_id, 1); + for (size_t i = 1; i < frames.size(); ++i) { + EXPECT_EQ(frames[i].encoded_image._frameType, + VideoFrameType::kVideoFrameDelta); + EXPECT_FALSE(frames[i].codec_specific_info.template_structure); + ASSERT_TRUE(frames[i].codec_specific_info.generic_frame_info); + EXPECT_EQ(frames[i].codec_specific_info.generic_frame_info->spatial_id, + 1 + static_cast(i % 2)); + } + + // Enable the first layer back. + // Allocate high bit rate to avoid frame dropping due to rate control. + for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) { + bitrate_allocation.SetBitrate( + 0, tl_idx, codec_settings.spatialLayers[1].targetBitrate * 1000 * 2); + } + encoder->SetRates(VideoEncoder::RateControlParameters( + bitrate_allocation, codec_settings.maxFramerate)); + + frames = producer.SetNumInputFrames(num_frames_to_encode).Encode(); + ASSERT_THAT(frames, SizeIs(num_frames_to_encode * 3)); + EXPECT_TRUE(frames[0].codec_specific_info.template_structure); + ASSERT_TRUE(frames[0].codec_specific_info.generic_frame_info); + EXPECT_EQ(frames[0].codec_specific_info.generic_frame_info->spatial_id, 0); + for (size_t i = 1; i < frames.size(); ++i) { + EXPECT_FALSE(frames[i].codec_specific_info.template_structure); + ASSERT_TRUE(frames[i].codec_specific_info.generic_frame_info); + EXPECT_EQ(frames[i].codec_specific_info.generic_frame_info->spatial_id, + static_cast(i % 3)); + } +} + TEST_F(TestVp9Impl, DisableEnableBaseLayerTriggersKeyFrameForScreenshare) { // Configure encoder to produce N spatial layers. Encode frames for all // layers. Then disable all but the last layer. Then reenable all back again. const size_t num_spatial_layers = 3; const size_t num_frames_to_encode = 5; - ConfigureSvc(num_spatial_layers); + ConfigureSvc(codec_settings_, num_spatial_layers); codec_settings_.VP9()->frameDroppingOn = false; codec_settings_.mode = VideoCodecMode::kScreensharing; codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOn; @@ -630,7 +898,7 @@ TEST_F(TestVp9Impl, DisableEnableBaseLayerTriggersKeyFrameForScreenshare) { TEST_F(TestVp9Impl, EndOfPicture) { const size_t num_spatial_layers = 2; - ConfigureSvc(num_spatial_layers); + ConfigureSvc(codec_settings_, num_spatial_layers); EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK, encoder_->InitEncode(&codec_settings_, kSettings)); @@ -671,7 +939,7 @@ TEST_F(TestVp9Impl, EndOfPicture) { TEST_F(TestVp9Impl, InterLayerPred) { const size_t num_spatial_layers = 2; - ConfigureSvc(num_spatial_layers); + ConfigureSvc(codec_settings_, num_spatial_layers); codec_settings_.VP9()->frameDroppingOn = false; VideoBitrateAllocation bitrate_allocation; @@ -746,7 +1014,7 @@ TEST_F(TestVp9Impl, const size_t num_spatial_layers = 3; const size_t num_frames_to_encode = 2; - ConfigureSvc(num_spatial_layers); + ConfigureSvc(codec_settings_, num_spatial_layers); codec_settings_.VP9()->frameDroppingOn = false; const std::vector inter_layer_pred_modes = { @@ -803,7 +1071,7 @@ TEST_F(TestVp9Impl, const size_t num_spatial_layers = 3; const size_t num_frames_to_encode = 2; - ConfigureSvc(num_spatial_layers); + ConfigureSvc(codec_settings_, num_spatial_layers); codec_settings_.VP9()->frameDroppingOn = false; codec_settings_.VP9()->flexibleMode = false; @@ -858,7 +1126,7 @@ TEST_F(TestVp9Impl, EnablingDisablingUpperLayerInTheSameGof) { const size_t num_spatial_layers = 2; const size_t num_temporal_layers = 2; - ConfigureSvc(num_spatial_layers, num_temporal_layers); + ConfigureSvc(codec_settings_, num_spatial_layers, num_temporal_layers); codec_settings_.VP9()->frameDroppingOn = false; codec_settings_.VP9()->flexibleMode = false; @@ -930,7 +1198,7 @@ TEST_F(TestVp9Impl, EnablingDisablingUpperLayerAccrossGof) { const size_t num_spatial_layers = 2; const size_t num_temporal_layers = 2; - ConfigureSvc(num_spatial_layers, num_temporal_layers); + ConfigureSvc(codec_settings_, num_spatial_layers, num_temporal_layers); codec_settings_.VP9()->frameDroppingOn = false; codec_settings_.VP9()->flexibleMode = false; @@ -1010,7 +1278,7 @@ TEST_F(TestVp9Impl, EnablingNewLayerInScreenshareForcesAllLayersWithSS) { const size_t num_frames_to_encode_before_drop = 1; codec_settings_.maxFramerate = 30; - ConfigureSvc(num_spatial_layers); + ConfigureSvc(codec_settings_, num_spatial_layers); codec_settings_.spatialLayers[0].maxFramerate = 5.0; // use 30 for the SL 1 instead of 10, so even if SL 0 frame is dropped due to // framerate capping we would still get back at least a middle layer. It @@ -1069,7 +1337,7 @@ TEST_F(TestVp9Impl, ScreenshareFrameDropping) { const int num_frames_to_detect_drops = 2; codec_settings_.maxFramerate = 30; - ConfigureSvc(num_spatial_layers); + ConfigureSvc(codec_settings_, num_spatial_layers); // use 30 for the SL0 and SL1 because it simplifies the test. codec_settings_.spatialLayers[0].maxFramerate = 30.0; codec_settings_.spatialLayers[1].maxFramerate = 30.0; @@ -1159,7 +1427,7 @@ TEST_F(TestVp9Impl, RemovingLayerIsNotDelayedInScreenshareAndAddsSsInfo) { const size_t num_dropped_frames = 5; codec_settings_.maxFramerate = 30; - ConfigureSvc(num_spatial_layers); + ConfigureSvc(codec_settings_, num_spatial_layers); codec_settings_.spatialLayers[0].maxFramerate = 5.0; // use 30 for the SL 1 instead of 5, so even if SL 0 frame is dropped due to // framerate capping we would still get back at least a middle layer. It @@ -1246,7 +1514,7 @@ TEST_F(TestVp9Impl, DisableNewLayerInVideoDelaysSsInfoTillTL0) { const size_t num_temporal_layers = 2; // Chosen by hand, the 2nd frame is dropped with configured per-layer max // framerate. - ConfigureSvc(num_spatial_layers, num_temporal_layers); + ConfigureSvc(codec_settings_, num_spatial_layers, num_temporal_layers); codec_settings_.VP9()->frameDroppingOn = false; codec_settings_.mode = VideoCodecMode::kRealtimeVideo; codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOnKeyPic; @@ -1305,7 +1573,7 @@ TEST_F(TestVp9Impl, DisableNewLayerInVideoDelaysSsInfoTillTL0) { TEST_F(TestVp9Impl, LowLayerMarkedAsRefIfHighLayerNotEncodedAndInterLayerPredIsEnabled) { - ConfigureSvc(3); + ConfigureSvc(codec_settings_, 3); codec_settings_.VP9()->frameDroppingOn = false; codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOn; @@ -1710,7 +1978,7 @@ TEST_F(TestVp9Impl, ReenablingUpperLayerAfterKFWithInterlayerPredIsEnabled) { // Force low frame-rate, so all layers are present for all frames. codec_settings_.maxFramerate = 5; - ConfigureSvc(num_spatial_layers); + ConfigureSvc(codec_settings_, num_spatial_layers); EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK, encoder_->InitEncode(&codec_settings_, kSettings)); diff --git a/modules/video_coding/codecs/vp9/vp9_impl.cc b/modules/video_coding/codecs/vp9/vp9_impl.cc index 49fa92fd8d..599a4d5ac1 100644 --- a/modules/video_coding/codecs/vp9/vp9_impl.cc +++ b/modules/video_coding/codecs/vp9/vp9_impl.cc @@ -27,12 +27,16 @@ #include "common_video/libyuv/include/webrtc_libyuv.h" #include "modules/rtp_rtcp/include/rtp_rtcp_defines.h" #include "modules/video_coding/codecs/vp9/svc_rate_allocator.h" +#include "modules/video_coding/svc/create_scalability_structure.h" +#include "modules/video_coding/svc/scalable_video_controller.h" +#include "modules/video_coding/svc/scalable_video_controller_no_layering.h" #include "modules/video_coding/utility/vp9_uncompressed_header_parser.h" #include "rtc_base/checks.h" #include "rtc_base/experiments/field_trial_parser.h" #include "rtc_base/experiments/rate_control_settings.h" #include "rtc_base/keep_ref_until_done.h" #include "rtc_base/logging.h" +#include "rtc_base/strings/string_builder.h" #include "rtc_base/time_utils.h" #include "rtc_base/trace_event.h" #include "third_party/libyuv/include/libyuv/convert.h" @@ -214,6 +218,107 @@ void UpdateRateSettings(vpx_codec_enc_cfg_t* config, config->rc_dropframe_thresh = new_settings.rc_dropframe_thresh; } +std::unique_ptr CreateVp9ScalabilityStructure( + const VideoCodec& codec) { + int num_spatial_layers = codec.VP9().numberOfSpatialLayers; + int num_temporal_layers = + std::max(1, int{codec.VP9().numberOfTemporalLayers}); + if (num_spatial_layers == 1 && num_temporal_layers == 1) { + return std::make_unique(); + } + + if (codec.VP9().interLayerPred != InterLayerPredMode::kOn || + codec.mode == VideoCodecMode::kScreensharing) { + // TODO(bugs.webrtc.org/11999): Return names of the structure when they are + // implemented and support frame skipping. + return nullptr; + } + + char name[20]; + rtc::SimpleStringBuilder ss(name); + ss << "L" << num_spatial_layers << "T" << num_temporal_layers; + + // Check spatial ratio. + if (num_spatial_layers > 1 && codec.spatialLayers[0].targetBitrate > 0) { + if (codec.width != codec.spatialLayers[num_spatial_layers - 1].width || + codec.height != codec.spatialLayers[num_spatial_layers - 1].height) { + RTC_LOG(LS_WARNING) + << "Top layer resolution expected to match overall resolution"; + return nullptr; + } + // Check if the ratio is one of the supported. + int numerator; + int denominator; + if (codec.spatialLayers[1].width == 2 * codec.spatialLayers[0].width) { + numerator = 1; + denominator = 2; + // no suffix for 1:2 ratio. + } else if (2 * codec.spatialLayers[1].width == + 3 * codec.spatialLayers[0].width) { + numerator = 2; + denominator = 3; + ss << "h"; + } else { + RTC_LOG(LS_WARNING) << "Unsupported scalability ratio " + << codec.spatialLayers[0].width << ":" + << codec.spatialLayers[1].width; + return nullptr; + } + // Validate ratio is consistent for all spatial layer transitions. + for (int sid = 1; sid < num_spatial_layers; ++sid) { + if (codec.spatialLayers[sid].width * numerator != + codec.spatialLayers[sid - 1].width * denominator || + codec.spatialLayers[sid].height * numerator != + codec.spatialLayers[sid - 1].height * denominator) { + RTC_LOG(LS_WARNING) << "Inconsistent scalability ratio " << numerator + << ":" << denominator; + return nullptr; + } + } + } + + auto scalability_structure_controller = CreateScalabilityStructure(name); + if (scalability_structure_controller == nullptr) { + RTC_LOG(LS_WARNING) << "Unsupported scalability structure " << name; + } else { + RTC_LOG(LS_INFO) << "Created scalability structure " << name; + } + return scalability_structure_controller; +} + +vpx_svc_ref_frame_config_t Vp9References( + rtc::ArrayView layers) { + vpx_svc_ref_frame_config_t ref_config = {}; + for (const ScalableVideoController::LayerFrameConfig& layer_frame : layers) { + const auto& buffers = layer_frame.Buffers(); + RTC_DCHECK_LE(buffers.size(), 3); + int sid = layer_frame.SpatialId(); + if (!buffers.empty()) { + ref_config.lst_fb_idx[sid] = buffers[0].id; + ref_config.reference_last[sid] = buffers[0].referenced; + if (buffers[0].updated) { + ref_config.update_buffer_slot[sid] |= (1 << buffers[0].id); + } + } + if (buffers.size() > 1) { + ref_config.gld_fb_idx[sid] = buffers[1].id; + ref_config.reference_golden[sid] = buffers[1].referenced; + if (buffers[1].updated) { + ref_config.update_buffer_slot[sid] |= (1 << buffers[1].id); + } + } + if (buffers.size() > 2) { + ref_config.alt_fb_idx[sid] = buffers[2].id; + ref_config.reference_alt_ref[sid] = buffers[2].referenced; + if (buffers[2].updated) { + ref_config.update_buffer_slot[sid] |= (1 << buffers[2].id); + } + } + } + // TODO(bugs.webrtc.org/11999): Fill ref_config.duration + return ref_config; +} + } // namespace void VP9EncoderImpl::EncoderOutputCodedPacketCallback(vpx_codec_cx_pkt* pkt, @@ -262,6 +367,9 @@ VP9EncoderImpl::VP9EncoderImpl(const cricket::VideoCodec& codec, first_frame_in_picture_(true), ss_info_needed_(false), force_all_active_layers_(false), + use_svc_controller_( + absl::StartsWith(trials.Lookup("WebRTC-Vp9DependencyDescriptor"), + "Enabled")), is_flexible_mode_(false), variable_framerate_experiment_(ParseVariableFramerateConfig(trials)), variable_framerate_controller_( @@ -438,6 +546,18 @@ bool VP9EncoderImpl::SetSvcRates( force_all_active_layers_ = true; } + if (svc_controller_) { + VideoBitrateAllocation allocation; + for (int sid = 0; sid < num_spatial_layers_; ++sid) { + for (int tid = 0; tid < num_temporal_layers_; ++tid) { + allocation.SetBitrate( + sid, tid, + config_->layer_target_bitrate[sid * num_temporal_layers_ + tid] * + 1000); + } + } + svc_controller_->OnRatesUpdated(allocation); + } current_bitrate_allocation_ = bitrate_allocation; cpu_speed_ = GetCpuSpeed(highest_active_width, highest_active_height); config_changed_ = true; @@ -528,6 +648,9 @@ int VP9EncoderImpl::InitEncode(const VideoCodec* inst, num_temporal_layers_ = 1; } + if (use_svc_controller_) { + svc_controller_ = CreateVp9ScalabilityStructure(*inst); + } framerate_controller_ = std::vector( num_spatial_layers_, FramerateController(codec_.maxFramerate)); @@ -706,7 +829,13 @@ int VP9EncoderImpl::InitAndSetControlSettings(const VideoCodec* inst) { svc_params_.min_quantizers[i] = config_->rc_min_quantizer; } config_->ss_number_layers = num_spatial_layers_; - if (ExplicitlyConfiguredSpatialLayers()) { + if (svc_controller_) { + auto stream_config = svc_controller_->StreamConfig(); + for (int i = 0; i < stream_config.num_spatial_layers; ++i) { + svc_params_.scaling_factor_num[i] = stream_config.scaling_factor_num[i]; + svc_params_.scaling_factor_den[i] = stream_config.scaling_factor_den[i]; + } + } else if (ExplicitlyConfiguredSpatialLayers()) { for (int i = 0; i < num_spatial_layers_; ++i) { const auto& layer = codec_.spatialLayers[i]; RTC_CHECK_GT(layer.width, 0); @@ -920,6 +1049,13 @@ int VP9EncoderImpl::Encode(const VideoFrame& input_image, force_key_frame_ = true; } + if (svc_controller_) { + layer_frames_ = svc_controller_->NextFrameConfig(force_key_frame_); + if (layer_frames_.empty()) { + return WEBRTC_VIDEO_CODEC_ERROR; + } + } + vpx_svc_layer_id_t layer_id = {0}; if (!force_key_frame_) { const size_t gof_idx = (pics_since_key_ + 1) % gof_.num_frames_in_gof; @@ -991,6 +1127,15 @@ int VP9EncoderImpl::Encode(const VideoFrame& input_image, layer_id.spatial_layer_id = first_active_layer_; } + if (svc_controller_) { + layer_id.spatial_layer_id = layer_frames_.front().SpatialId(); + layer_id.temporal_layer_id = layer_frames_.front().TemporalId(); + for (const auto& layer : layer_frames_) { + layer_id.temporal_layer_id_per_spatial[layer.SpatialId()] = + layer.TemporalId(); + } + } + vpx_codec_control(encoder_, VP9E_SET_SVC_LAYER_ID, &layer_id); if (num_spatial_layers_ > 1) { @@ -1086,7 +1231,10 @@ int VP9EncoderImpl::Encode(const VideoFrame& input_image, flags = VPX_EFLAG_FORCE_KF; } - if (external_ref_control_) { + if (svc_controller_) { + vpx_svc_ref_frame_config_t ref_config = Vp9References(layer_frames_); + vpx_codec_control(encoder_, VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_config); + } else if (external_ref_control_) { vpx_svc_ref_frame_config_t ref_config = SetReferences(force_key_frame_, layer_id.spatial_layer_id); @@ -1252,6 +1400,31 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific, } first_frame_in_picture_ = false; + + // Populate codec-agnostic section in the codec specific structure. + if (svc_controller_) { + auto it = absl::c_find_if( + layer_frames_, + [&](const ScalableVideoController::LayerFrameConfig& config) { + return config.SpatialId() == spatial_idx->value_or(0); + }); + RTC_CHECK(it != layer_frames_.end()) + << "Failed to find spatial id " << spatial_idx->value_or(0); + codec_specific->generic_frame_info = svc_controller_->OnEncodeDone(*it); + if (is_key_frame) { + codec_specific->template_structure = + svc_controller_->DependencyStructure(); + auto& resolutions = codec_specific->template_structure->resolutions; + resolutions.resize(num_spatial_layers_); + for (int sid = 0; sid < num_spatial_layers_; ++sid) { + resolutions[sid] = RenderResolution( + /*width=*/codec_.width * svc_params_.scaling_factor_num[sid] / + svc_params_.scaling_factor_den[sid], + /*height=*/codec_.height * svc_params_.scaling_factor_num[sid] / + svc_params_.scaling_factor_den[sid]); + } + } + } } void VP9EncoderImpl::FillReferenceIndices(const vpx_codec_cx_pkt& pkt, diff --git a/modules/video_coding/codecs/vp9/vp9_impl.h b/modules/video_coding/codecs/vp9/vp9_impl.h index 6e23dc6d79..7ba6a1baba 100644 --- a/modules/video_coding/codecs/vp9/vp9_impl.h +++ b/modules/video_coding/codecs/vp9/vp9_impl.h @@ -26,6 +26,7 @@ #include "media/base/vp9_profile.h" #include "modules/video_coding/codecs/vp9/include/vp9.h" #include "modules/video_coding/codecs/vp9/vp9_frame_buffer_pool.h" +#include "modules/video_coding/svc/scalable_video_controller.h" #include "modules/video_coding/utility/framerate_controller.h" #include "vpx/vp8cx.h" #include "vpx/vpx_decoder.h" @@ -139,7 +140,9 @@ class VP9EncoderImpl : public VP9Encoder { VideoBitrateAllocation current_bitrate_allocation_; bool ss_info_needed_; bool force_all_active_layers_; + const bool use_svc_controller_; + std::unique_ptr svc_controller_; std::vector framerate_controller_; // Used for flexible mode. @@ -163,6 +166,7 @@ class VP9EncoderImpl : public VP9Encoder { size_t temporal_layer_id = 0; }; std::map ref_buf_; + std::vector layer_frames_; // Variable frame-rate related fields and methods. const struct VariableFramerateExperiment {