From 8da67f61652f7745dc9f0639f1b647c1e77b22bd Mon Sep 17 00:00:00 2001
From: Danil Chapovalov <danilchap@webrtc.org>
Date: Fri, 12 Mar 2021 12:56:55 +0100
Subject: [PATCH] In ksvc controller reuse unused frame configuration

The VP9 encoder wrapper relies on that behaviour
to generate VP9-specific temporal references.

Bug: webrtc:11999
Change-Id: I35536af4eca76450e2f72777e06ad3af872a5800
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/211340
Commit-Queue: Danil Chapovalov <danilchap@webrtc.org>
Reviewed-by: Philip Eliasson <philipel@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#33445}
---
 .../svc/scalability_structure_key_svc.cc      | 123 ++++++++++--------
 .../svc/scalability_structure_key_svc.h       |  11 +-
 .../scalability_structure_key_svc_unittest.cc | 102 ++++++++++++++-
 3 files changed, 176 insertions(+), 60 deletions(-)
diff --git a/modules/video_coding/svc/scalability_structure_key_svc.cc b/modules/video_coding/svc/scalability_structure_key_svc.cc
index c430aa42a9..1cee80e84b 100644
--- a/modules/video_coding/svc/scalability_structure_key_svc.cc
+++ b/modules/video_coding/svc/scalability_structure_key_svc.cc
@@ -22,28 +22,6 @@
 #include "rtc_base/logging.h"
 
 namespace webrtc {
-namespace {
-// Values to use as LayerFrameConfig::Id
-enum : int { kKey, kDelta };
-
-DecodeTargetIndication
-Dti(int sid, int tid, const ScalableVideoController::LayerFrameConfig& config) {
-  if (config.IsKeyframe() || config.Id() == kKey) {
-    RTC_DCHECK_EQ(config.TemporalId(), 0);
-    return sid < config.SpatialId() ? DecodeTargetIndication::kNotPresent
-                                    : DecodeTargetIndication::kSwitch;
-  }
-
-  if (sid != config.SpatialId() || tid < config.TemporalId()) {
-    return DecodeTargetIndication::kNotPresent;
-  }
-  if (tid == config.TemporalId() && tid > 0) {
-    return DecodeTargetIndication::kDiscardable;
-  }
-  return DecodeTargetIndication::kSwitch;
-}
-
-}  // namespace
 
 constexpr int ScalabilityStructureKeySvc::kMaxNumSpatialLayers;
 constexpr int ScalabilityStructureKeySvc::kMaxNumTemporalLayers;
@@ -88,6 +66,25 @@ bool ScalabilityStructureKeySvc::TemporalLayerIsActive(int tid) const {
   return false;
 }
 
+DecodeTargetIndication ScalabilityStructureKeySvc::Dti(
+    int sid,
+    int tid,
+    const LayerFrameConfig& config) {
+  if (config.IsKeyframe() || config.Id() == kKey) {
+    RTC_DCHECK_EQ(config.TemporalId(), 0);
+    return sid < config.SpatialId() ? DecodeTargetIndication::kNotPresent
+                                    : DecodeTargetIndication::kSwitch;
+  }
+
+  if (sid != config.SpatialId() || tid < config.TemporalId()) {
+    return DecodeTargetIndication::kNotPresent;
+  }
+  if (tid == config.TemporalId() && tid > 0) {
+    return DecodeTargetIndication::kDiscardable;
+  }
+  return DecodeTargetIndication::kSwitch;
+}
+
 std::vector<ScalableVideoController::LayerFrameConfig>
 ScalabilityStructureKeySvc::KeyframeConfig() {
   std::vector<LayerFrameConfig> configs;
@@ -129,7 +126,7 @@ ScalabilityStructureKeySvc::T0Config() {
       continue;
     }
     configs.emplace_back();
-    configs.back().Id(kDelta).S(sid).T(0).ReferenceAndUpdate(
+    configs.back().Id(kDeltaT0).S(sid).T(0).ReferenceAndUpdate(
         BufferIndex(sid, /*tid=*/0));
   }
   return configs;
@@ -145,7 +142,7 @@ ScalabilityStructureKeySvc::T1Config() {
     }
     configs.emplace_back();
     ScalableVideoController::LayerFrameConfig& config = configs.back();
-    config.Id(kDelta).S(sid).T(1).Reference(BufferIndex(sid, /*tid=*/0));
+    config.Id(kDeltaT1).S(sid).T(1).Reference(BufferIndex(sid, /*tid=*/0));
     if (num_temporal_layers_ > 2) {
       config.Update(BufferIndex(sid, /*tid=*/1));
     }
@@ -154,7 +151,7 @@ ScalabilityStructureKeySvc::T1Config() {
 }
 
 std::vector<ScalableVideoController::LayerFrameConfig>
-ScalabilityStructureKeySvc::T2Config() {
+ScalabilityStructureKeySvc::T2Config(FramePattern pattern) {
   std::vector<LayerFrameConfig> configs;
   configs.reserve(num_spatial_layers_);
   for (int sid = 0; sid < num_spatial_layers_; ++sid) {
@@ -163,7 +160,7 @@ ScalabilityStructureKeySvc::T2Config() {
     }
     configs.emplace_back();
     ScalableVideoController::LayerFrameConfig& config = configs.back();
-    config.Id(kDelta).S(sid).T(2);
+    config.Id(pattern).S(sid).T(2);
     if (can_reference_t1_frame_for_spatial_id_[sid]) {
       config.Reference(BufferIndex(sid, /*tid=*/1));
     } else {
@@ -173,6 +170,37 @@ ScalabilityStructureKeySvc::T2Config() {
   return configs;
 }
 
+ScalabilityStructureKeySvc::FramePattern
+ScalabilityStructureKeySvc::NextPattern(FramePattern last_pattern) const {
+  switch (last_pattern) {
+    case kNone:
+      return kKey;
+    case kDeltaT2B:
+      return kDeltaT0;
+    case kDeltaT2A:
+      if (TemporalLayerIsActive(1)) {
+        return kDeltaT1;
+      }
+      return kDeltaT0;
+    case kDeltaT1:
+      if (TemporalLayerIsActive(2)) {
+        return kDeltaT2B;
+      }
+      return kDeltaT0;
+    case kDeltaT0:
+    case kKey:
+      if (TemporalLayerIsActive(2)) {
+        return kDeltaT2A;
+      }
+      if (TemporalLayerIsActive(1)) {
+        return kDeltaT1;
+      }
+      return kDeltaT0;
+  }
+  RTC_NOTREACHED();
+  return kNone;
+}
+
 std::vector<ScalableVideoController::LayerFrameConfig>
 ScalabilityStructureKeySvc::NextFrameConfig(bool restart) {
   if (active_decode_targets_.none()) {
@@ -184,37 +212,19 @@ ScalabilityStructureKeySvc::NextFrameConfig(bool restart) {
     last_pattern_ = kNone;
   }
 
-  switch (last_pattern_) {
-    case kNone:
-      last_pattern_ = kDeltaT0;
+  FramePattern current_pattern = NextPattern(last_pattern_);
+  switch (current_pattern) {
+    case kKey:
       return KeyframeConfig();
-    case kDeltaT2B:
-      last_pattern_ = kDeltaT0;
-      return T0Config();
-    case kDeltaT2A:
-      if (TemporalLayerIsActive(1)) {
-        last_pattern_ = kDeltaT1;
-        return T1Config();
-      }
-      last_pattern_ = kDeltaT0;
+    case kDeltaT0:
       return T0Config();
     case kDeltaT1:
-      if (TemporalLayerIsActive(2)) {
-        last_pattern_ = kDeltaT2B;
-        return T2Config();
-      }
-      last_pattern_ = kDeltaT0;
-      return T0Config();
-    case kDeltaT0:
-      if (TemporalLayerIsActive(2)) {
-        last_pattern_ = kDeltaT2A;
-        return T2Config();
-      } else if (TemporalLayerIsActive(1)) {
-        last_pattern_ = kDeltaT1;
-        return T1Config();
-      }
-      last_pattern_ = kDeltaT0;
-      return T0Config();
+      return T1Config();
+    case kDeltaT2A:
+    case kDeltaT2B:
+      return T2Config(current_pattern);
+    case kNone:
+      break;
   }
   RTC_NOTREACHED();
   return {};
@@ -222,6 +232,11 @@ ScalabilityStructureKeySvc::NextFrameConfig(bool restart) {
 
 GenericFrameInfo ScalabilityStructureKeySvc::OnEncodeDone(
     const LayerFrameConfig& config) {
+  // When encoder drops all frames for a temporal unit, it is better to reuse
+  // old temporal pattern rather than switch to next one, thus switch to next
+  // pattern is deferred here from the `NextFrameConfig`.
+  // In particular creating VP9 references relies on this behavior.
+  last_pattern_ = static_cast<FramePattern>(config.Id());
   if (config.TemporalId() == 1) {
     can_reference_t1_frame_for_spatial_id_.set(config.SpatialId());
   }
diff --git a/modules/video_coding/svc/scalability_structure_key_svc.h b/modules/video_coding/svc/scalability_structure_key_svc.h
index 110c2a83cb..b66f6f83e4 100644
--- a/modules/video_coding/svc/scalability_structure_key_svc.h
+++ b/modules/video_coding/svc/scalability_structure_key_svc.h
@@ -32,8 +32,9 @@ class ScalabilityStructureKeySvc : public ScalableVideoController {
   void OnRatesUpdated(const VideoBitrateAllocation& bitrates) override;
 
  private:
-  enum FramePattern {
+  enum FramePattern : int {
     kNone,
+    kKey,
     kDeltaT0,
     kDeltaT2A,
     kDeltaT1,
@@ -53,10 +54,16 @@ class ScalabilityStructureKeySvc : public ScalableVideoController {
     active_decode_targets_.set(sid * num_temporal_layers_ + tid, value);
   }
   bool TemporalLayerIsActive(int tid) const;
+  static DecodeTargetIndication Dti(int sid,
+                                    int tid,
+                                    const LayerFrameConfig& config);
+
   std::vector<LayerFrameConfig> KeyframeConfig();
   std::vector<LayerFrameConfig> T0Config();
   std::vector<LayerFrameConfig> T1Config();
-  std::vector<LayerFrameConfig> T2Config();
+  std::vector<LayerFrameConfig> T2Config(FramePattern pattern);
+
+  FramePattern NextPattern(FramePattern last_pattern) const;
 
   const int num_spatial_layers_;
   const int num_temporal_layers_;
diff --git a/modules/video_coding/svc/scalability_structure_key_svc_unittest.cc b/modules/video_coding/svc/scalability_structure_key_svc_unittest.cc
index 34ec74726d..5f923bb487 100644
--- a/modules/video_coding/svc/scalability_structure_key_svc_unittest.cc
+++ b/modules/video_coding/svc/scalability_structure_key_svc_unittest.cc
@@ -62,14 +62,108 @@ TEST(ScalabilityStructureL3T3KeyTest,
   // Simulate T1 frame dropped by the encoder,
   // i.e. retrieve config, but skip calling OnEncodeDone.
   structure.NextFrameConfig(/*restart=*/false);
-  // one more temporal units (T2)
+  // one more temporal unit.
   wrapper.GenerateFrames(/*num_temporal_units=*/1, frames);
 
-  ASSERT_THAT(frames, SizeIs(9));
+  EXPECT_THAT(frames, SizeIs(9));
+  EXPECT_TRUE(wrapper.FrameReferencesAreValid(frames));
+}
+
+TEST(ScalabilityStructureL3T3KeyTest,
+     SkippingFrameReusePreviousFrameConfiguration) {
+  std::vector<GenericFrameInfo> frames;
+  ScalabilityStructureL3T3Key structure;
+  ScalabilityStructureWrapper wrapper(structure);
+
+  // 1st 2 temporal units (T0 and T2)
+  wrapper.GenerateFrames(/*num_temporal_units=*/2, frames);
+  ASSERT_THAT(frames, SizeIs(6));
+  ASSERT_EQ(frames[0].temporal_id, 0);
+  ASSERT_EQ(frames[3].temporal_id, 2);
+
+  // Simulate a frame dropped by the encoder,
+  // i.e. retrieve config, but skip calling OnEncodeDone.
+  structure.NextFrameConfig(/*restart=*/false);
+  // two more temporal units, expect temporal pattern continues
+  wrapper.GenerateFrames(/*num_temporal_units=*/2, frames);
+  ASSERT_THAT(frames, SizeIs(12));
+  // Expect temporal pattern continues as if there were no dropped frames.
+  EXPECT_EQ(frames[6].temporal_id, 1);
+  EXPECT_EQ(frames[9].temporal_id, 2);
+}
+
+TEST(ScalabilityStructureL3T3KeyTest, SkippingKeyFrameTriggersNewKeyFrame) {
+  std::vector<GenericFrameInfo> frames;
+  ScalabilityStructureL3T3Key structure;
+  ScalabilityStructureWrapper wrapper(structure);
+
+  // Ask for a key frame config, but do not return any frames
+  structure.NextFrameConfig(/*restart=*/false);
+
+  // Ask for more frames, expect they start with a key frame.
+  wrapper.GenerateFrames(/*num_temporal_units=*/2, frames);
+  ASSERT_THAT(frames, SizeIs(6));
+  ASSERT_EQ(frames[0].temporal_id, 0);
+  ASSERT_EQ(frames[3].temporal_id, 2);
+  EXPECT_TRUE(wrapper.FrameReferencesAreValid(frames));
+}
+
+TEST(ScalabilityStructureL3T3KeyTest,
+     SkippingT2FrameAndDisablingT2LayerProduceT1AsNextFrame) {
+  std::vector<GenericFrameInfo> frames;
+  ScalabilityStructureL3T3Key structure;
+  ScalabilityStructureWrapper wrapper(structure);
+
+  wrapper.GenerateFrames(/*num_temporal_units=*/1, frames);
+  // Ask for next (T2) frame config, but do not return any frames
+  auto config = structure.NextFrameConfig(/*restart=*/false);
+  ASSERT_THAT(config, Not(IsEmpty()));
+  ASSERT_EQ(config.front().TemporalId(), 2);
+
+  // Disable T2 layer,
+  structure.OnRatesUpdated(EnableTemporalLayers(/*s0=*/2, /*s1=*/2, /*s2=*/2));
+  // Expect instead of reusing unused config, T1 config is generated.
+  config = structure.NextFrameConfig(/*restart=*/false);
+  ASSERT_THAT(config, Not(IsEmpty()));
+  EXPECT_EQ(config.front().TemporalId(), 1);
+}
+
+TEST(ScalabilityStructureL3T3KeyTest, EnableT2LayerWhileProducingT1Frame) {
+  std::vector<GenericFrameInfo> frames;
+  ScalabilityStructureL3T3Key structure;
+  ScalabilityStructureWrapper wrapper(structure);
+
+  // Disable T2 layer,
+  structure.OnRatesUpdated(EnableTemporalLayers(/*s0=*/2, /*s1=*/2, /*s2=*/2));
+
+  // Generate the key frame.
+  wrapper.GenerateFrames(/*num_temporal_units=*/1, frames);
+  ASSERT_THAT(frames, SizeIs(3));
   EXPECT_EQ(frames[0].temporal_id, 0);
-  EXPECT_EQ(frames[3].temporal_id, 2);
-  // T1 frames were dropped by the encoder.
+
+  // Ask for next (T1) frame config, but do not return any frames yet.
+  auto config = structure.NextFrameConfig(/*restart=*/false);
+  ASSERT_THAT(config, Not(IsEmpty()));
+  ASSERT_EQ(config.front().TemporalId(), 1);
+
+  // Reenable T2 layer.
+  structure.OnRatesUpdated(EnableTemporalLayers(/*s0=*/3, /*s1=*/3, /*s2=*/3));
+
+  // Finish encoding previously requested config.
+  for (auto layer_config : config) {
+    GenericFrameInfo info = structure.OnEncodeDone(layer_config);
+    EXPECT_EQ(info.temporal_id, 1);
+    frames.push_back(info);
+  }
+  ASSERT_THAT(frames, SizeIs(6));
+
+  // Generate more frames, expect T2 pattern resumes.
+  wrapper.GenerateFrames(/*num_temporal_units=*/4, frames);
+  ASSERT_THAT(frames, SizeIs(18));
   EXPECT_EQ(frames[6].temporal_id, 2);
+  EXPECT_EQ(frames[9].temporal_id, 0);
+  EXPECT_EQ(frames[12].temporal_id, 2);
+  EXPECT_EQ(frames[15].temporal_id, 1);
 
   EXPECT_TRUE(wrapper.FrameReferencesAreValid(frames));
 }