From 8da67f61652f7745dc9f0639f1b647c1e77b22bd Mon Sep 17 00:00:00 2001 From: Danil Chapovalov Date: Fri, 12 Mar 2021 12:56:55 +0100 Subject: [PATCH] In ksvc controller reuse unused frame configuration vp9 encoder wrapper relies on that behaviour to generate vp9-specific temporal references Bug: webrtc:11999 Change-Id: I35536af4eca76450e2f72777e06ad3af872a5800 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/211340 Commit-Queue: Danil Chapovalov Reviewed-by: Philip Eliasson Cr-Commit-Position: refs/heads/master@{#33445} --- .../svc/scalability_structure_key_svc.cc | 123 ++++++++++-------- .../svc/scalability_structure_key_svc.h | 11 +- .../scalability_structure_key_svc_unittest.cc | 102 ++++++++++++++- 3 files changed, 176 insertions(+), 60 deletions(-) diff --git a/modules/video_coding/svc/scalability_structure_key_svc.cc b/modules/video_coding/svc/scalability_structure_key_svc.cc index c430aa42a9..1cee80e84b 100644 --- a/modules/video_coding/svc/scalability_structure_key_svc.cc +++ b/modules/video_coding/svc/scalability_structure_key_svc.cc @@ -22,28 +22,6 @@ #include "rtc_base/logging.h" namespace webrtc { -namespace { -// Values to use as LayerFrameConfig::Id -enum : int { kKey, kDelta }; - -DecodeTargetIndication -Dti(int sid, int tid, const ScalableVideoController::LayerFrameConfig& config) { - if (config.IsKeyframe() || config.Id() == kKey) { - RTC_DCHECK_EQ(config.TemporalId(), 0); - return sid < config.SpatialId() ? 
DecodeTargetIndication::kNotPresent - : DecodeTargetIndication::kSwitch; - } - - if (sid != config.SpatialId() || tid < config.TemporalId()) { - return DecodeTargetIndication::kNotPresent; - } - if (tid == config.TemporalId() && tid > 0) { - return DecodeTargetIndication::kDiscardable; - } - return DecodeTargetIndication::kSwitch; -} - -} // namespace constexpr int ScalabilityStructureKeySvc::kMaxNumSpatialLayers; constexpr int ScalabilityStructureKeySvc::kMaxNumTemporalLayers; @@ -88,6 +66,25 @@ bool ScalabilityStructureKeySvc::TemporalLayerIsActive(int tid) const { return false; } +DecodeTargetIndication ScalabilityStructureKeySvc::Dti( + int sid, + int tid, + const LayerFrameConfig& config) { + if (config.IsKeyframe() || config.Id() == kKey) { + RTC_DCHECK_EQ(config.TemporalId(), 0); + return sid < config.SpatialId() ? DecodeTargetIndication::kNotPresent + : DecodeTargetIndication::kSwitch; + } + + if (sid != config.SpatialId() || tid < config.TemporalId()) { + return DecodeTargetIndication::kNotPresent; + } + if (tid == config.TemporalId() && tid > 0) { + return DecodeTargetIndication::kDiscardable; + } + return DecodeTargetIndication::kSwitch; +} + std::vector ScalabilityStructureKeySvc::KeyframeConfig() { std::vector configs; @@ -129,7 +126,7 @@ ScalabilityStructureKeySvc::T0Config() { continue; } configs.emplace_back(); - configs.back().Id(kDelta).S(sid).T(0).ReferenceAndUpdate( + configs.back().Id(kDeltaT0).S(sid).T(0).ReferenceAndUpdate( BufferIndex(sid, /*tid=*/0)); } return configs; @@ -145,7 +142,7 @@ ScalabilityStructureKeySvc::T1Config() { } configs.emplace_back(); ScalableVideoController::LayerFrameConfig& config = configs.back(); - config.Id(kDelta).S(sid).T(1).Reference(BufferIndex(sid, /*tid=*/0)); + config.Id(kDeltaT1).S(sid).T(1).Reference(BufferIndex(sid, /*tid=*/0)); if (num_temporal_layers_ > 2) { config.Update(BufferIndex(sid, /*tid=*/1)); } @@ -154,7 +151,7 @@ ScalabilityStructureKeySvc::T1Config() { } std::vector 
-ScalabilityStructureKeySvc::T2Config() { +ScalabilityStructureKeySvc::T2Config(FramePattern pattern) { std::vector configs; configs.reserve(num_spatial_layers_); for (int sid = 0; sid < num_spatial_layers_; ++sid) { @@ -163,7 +160,7 @@ ScalabilityStructureKeySvc::T2Config() { } configs.emplace_back(); ScalableVideoController::LayerFrameConfig& config = configs.back(); - config.Id(kDelta).S(sid).T(2); + config.Id(pattern).S(sid).T(2); if (can_reference_t1_frame_for_spatial_id_[sid]) { config.Reference(BufferIndex(sid, /*tid=*/1)); } else { @@ -173,6 +170,37 @@ ScalabilityStructureKeySvc::T2Config() { return configs; } +ScalabilityStructureKeySvc::FramePattern +ScalabilityStructureKeySvc::NextPattern(FramePattern last_pattern) const { + switch (last_pattern) { + case kNone: + return kKey; + case kDeltaT2B: + return kDeltaT0; + case kDeltaT2A: + if (TemporalLayerIsActive(1)) { + return kDeltaT1; + } + return kDeltaT0; + case kDeltaT1: + if (TemporalLayerIsActive(2)) { + return kDeltaT2B; + } + return kDeltaT0; + case kDeltaT0: + case kKey: + if (TemporalLayerIsActive(2)) { + return kDeltaT2A; + } + if (TemporalLayerIsActive(1)) { + return kDeltaT1; + } + return kDeltaT0; + } + RTC_NOTREACHED(); + return kNone; +} + std::vector ScalabilityStructureKeySvc::NextFrameConfig(bool restart) { if (active_decode_targets_.none()) { @@ -184,37 +212,19 @@ ScalabilityStructureKeySvc::NextFrameConfig(bool restart) { last_pattern_ = kNone; } - switch (last_pattern_) { - case kNone: - last_pattern_ = kDeltaT0; + FramePattern current_pattern = NextPattern(last_pattern_); + switch (current_pattern) { + case kKey: return KeyframeConfig(); - case kDeltaT2B: - last_pattern_ = kDeltaT0; - return T0Config(); - case kDeltaT2A: - if (TemporalLayerIsActive(1)) { - last_pattern_ = kDeltaT1; - return T1Config(); - } - last_pattern_ = kDeltaT0; + case kDeltaT0: return T0Config(); case kDeltaT1: - if (TemporalLayerIsActive(2)) { - last_pattern_ = kDeltaT2B; - return T2Config(); - } - 
last_pattern_ = kDeltaT0; - return T0Config(); - case kDeltaT0: - if (TemporalLayerIsActive(2)) { - last_pattern_ = kDeltaT2A; - return T2Config(); - } else if (TemporalLayerIsActive(1)) { - last_pattern_ = kDeltaT1; - return T1Config(); - } - last_pattern_ = kDeltaT0; - return T0Config(); + return T1Config(); + case kDeltaT2A: + case kDeltaT2B: + return T2Config(current_pattern); + case kNone: + break; } RTC_NOTREACHED(); return {}; @@ -222,6 +232,11 @@ ScalabilityStructureKeySvc::NextFrameConfig(bool restart) { GenericFrameInfo ScalabilityStructureKeySvc::OnEncodeDone( const LayerFrameConfig& config) { + // When the encoder drops all frames for a temporal unit, it is better to reuse + // the old temporal pattern rather than switch to the next one, thus the switch + // to the next pattern is deferred here from `NextFrameConfig`. + // In particular, creating VP9 references relies on this behavior. + last_pattern_ = static_cast(config.Id()); if (config.TemporalId() == 1) { can_reference_t1_frame_for_spatial_id_.set(config.SpatialId()); } diff --git a/modules/video_coding/svc/scalability_structure_key_svc.h b/modules/video_coding/svc/scalability_structure_key_svc.h index 110c2a83cb..b66f6f83e4 100644 --- a/modules/video_coding/svc/scalability_structure_key_svc.h +++ b/modules/video_coding/svc/scalability_structure_key_svc.h @@ -32,8 +32,9 @@ class ScalabilityStructureKeySvc : public ScalableVideoController { void OnRatesUpdated(const VideoBitrateAllocation& bitrates) override; private: - enum FramePattern { + enum FramePattern : int { kNone, + kKey, kDeltaT0, kDeltaT2A, kDeltaT1, @@ -53,10 +54,16 @@ class ScalabilityStructureKeySvc : public ScalableVideoController { active_decode_targets_.set(sid * num_temporal_layers_ + tid, value); } bool TemporalLayerIsActive(int tid) const; + static DecodeTargetIndication Dti(int sid, + int tid, + const LayerFrameConfig& config); + std::vector KeyframeConfig(); std::vector T0Config(); std::vector T1Config(); - std::vector T2Config(); + std::vector 
T2Config(FramePattern pattern); + + FramePattern NextPattern(FramePattern last_pattern) const; const int num_spatial_layers_; const int num_temporal_layers_; diff --git a/modules/video_coding/svc/scalability_structure_key_svc_unittest.cc b/modules/video_coding/svc/scalability_structure_key_svc_unittest.cc index 34ec74726d..5f923bb487 100644 --- a/modules/video_coding/svc/scalability_structure_key_svc_unittest.cc +++ b/modules/video_coding/svc/scalability_structure_key_svc_unittest.cc @@ -62,14 +62,108 @@ TEST(ScalabilityStructureL3T3KeyTest, // Simulate T1 frame dropped by the encoder, // i.e. retrieve config, but skip calling OnEncodeDone. structure.NextFrameConfig(/*restart=*/false); - // one more temporal units (T2) + // one more temporal unit. wrapper.GenerateFrames(/*num_temporal_units=*/1, frames); - ASSERT_THAT(frames, SizeIs(9)); + EXPECT_THAT(frames, SizeIs(9)); + EXPECT_TRUE(wrapper.FrameReferencesAreValid(frames)); +} + +TEST(ScalabilityStructureL3T3KeyTest, + SkippingFrameReusePreviousFrameConfiguration) { + std::vector frames; + ScalabilityStructureL3T3Key structure; + ScalabilityStructureWrapper wrapper(structure); + + // 1st 2 temporal units (T0 and T2) + wrapper.GenerateFrames(/*num_temporal_units=*/2, frames); + ASSERT_THAT(frames, SizeIs(6)); + ASSERT_EQ(frames[0].temporal_id, 0); + ASSERT_EQ(frames[3].temporal_id, 2); + + // Simulate a frame dropped by the encoder, + // i.e. retrieve config, but skip calling OnEncodeDone. + structure.NextFrameConfig(/*restart=*/false); + // Two more temporal units; expect the temporal pattern to continue. + wrapper.GenerateFrames(/*num_temporal_units=*/2, frames); + ASSERT_THAT(frames, SizeIs(12)); + // Expect temporal pattern continues as if there were no dropped frames. 
+ EXPECT_EQ(frames[6].temporal_id, 1); + EXPECT_EQ(frames[9].temporal_id, 2); +} + +TEST(ScalabilityStructureL3T3KeyTest, SkippingKeyFrameTriggersNewKeyFrame) { + std::vector frames; + ScalabilityStructureL3T3Key structure; + ScalabilityStructureWrapper wrapper(structure); + + // Ask for a key frame config, but do not return any frames + structure.NextFrameConfig(/*restart=*/false); + + // Ask for more frames, expect they start with a key frame. + wrapper.GenerateFrames(/*num_temporal_units=*/2, frames); + ASSERT_THAT(frames, SizeIs(6)); + ASSERT_EQ(frames[0].temporal_id, 0); + ASSERT_EQ(frames[3].temporal_id, 2); + EXPECT_TRUE(wrapper.FrameReferencesAreValid(frames)); +} + +TEST(ScalabilityStructureL3T3KeyTest, + SkippingT2FrameAndDisablingT2LayerProduceT1AsNextFrame) { + std::vector frames; + ScalabilityStructureL3T3Key structure; + ScalabilityStructureWrapper wrapper(structure); + + wrapper.GenerateFrames(/*num_temporal_units=*/1, frames); + // Ask for next (T2) frame config, but do not return any frames + auto config = structure.NextFrameConfig(/*restart=*/false); + ASSERT_THAT(config, Not(IsEmpty())); + ASSERT_EQ(config.front().TemporalId(), 2); + + // Disable T2 layer, + structure.OnRatesUpdated(EnableTemporalLayers(/*s0=*/2, /*s1=*/2, /*s2=*/2)); + // Expect instead of reusing unused config, T1 config is generated. + config = structure.NextFrameConfig(/*restart=*/false); + ASSERT_THAT(config, Not(IsEmpty())); + EXPECT_EQ(config.front().TemporalId(), 1); +} + +TEST(ScalabilityStructureL3T3KeyTest, EnableT2LayerWhileProducingT1Frame) { + std::vector frames; + ScalabilityStructureL3T3Key structure; + ScalabilityStructureWrapper wrapper(structure); + + // Disable T2 layer, + structure.OnRatesUpdated(EnableTemporalLayers(/*s0=*/2, /*s1=*/2, /*s2=*/2)); + + // Generate the key frame. 
+ wrapper.GenerateFrames(/*num_temporal_units=*/1, frames); + ASSERT_THAT(frames, SizeIs(3)); EXPECT_EQ(frames[0].temporal_id, 0); - EXPECT_EQ(frames[3].temporal_id, 2); - // T1 frames were dropped by the encoder. + + // Ask for next (T1) frame config, but do not return any frames yet. + auto config = structure.NextFrameConfig(/*restart=*/false); + ASSERT_THAT(config, Not(IsEmpty())); + ASSERT_EQ(config.front().TemporalId(), 1); + + // Reenable T2 layer. + structure.OnRatesUpdated(EnableTemporalLayers(/*s0=*/3, /*s1=*/3, /*s2=*/3)); + + // Finish encoding previously requested config. + for (auto layer_config : config) { + GenericFrameInfo info = structure.OnEncodeDone(layer_config); + EXPECT_EQ(info.temporal_id, 1); + frames.push_back(info); + } + ASSERT_THAT(frames, SizeIs(6)); + + // Generate more frames, expect T2 pattern resumes. + wrapper.GenerateFrames(/*num_temporal_units=*/4, frames); + ASSERT_THAT(frames, SizeIs(18)); EXPECT_EQ(frames[6].temporal_id, 2); + EXPECT_EQ(frames[9].temporal_id, 0); + EXPECT_EQ(frames[12].temporal_id, 2); + EXPECT_EQ(frames[15].temporal_id, 1); EXPECT_TRUE(wrapper.FrameReferencesAreValid(frames)); }