From 5546aef6820908010e6cc6a3cb89216ebf894d93 Mon Sep 17 00:00:00 2001
From: Ilya Nikolaevskiy <ilnik@webrtc.org>
Date: Tue, 4 Dec 2018 15:54:52 +0100
Subject: [PATCH] Vp9 flexible mode fixes

- Enable vp9 flexible mode in VideoEngine if 3 spatial layers are set.
- Enable flexible mode in loopback tools and quality tests.
- Reset first active spatial layer on keyframe in encoder.
- Ensure duplicate references are not set by the sender in video header.
- Set references manually for flexible mode in vp9 encoder.
- Delay new activated layers until next base layer frame.
- On receive side put each spatial layer as a separate frame to FrameBuffer
  and return several frames combined from FrameBuffer.

Bug: webrtc:10049,webrtc:9794,webrtc:9784
Change-Id: I01e69f134cc145deba666ccc92deb1d37a324ede
Reviewed-on: https://webrtc-review.googlesource.com/c/112289
Commit-Queue: Ilya Nikolaevskiy <ilnik@webrtc.org>
Reviewed-by: Sergey Silkin <ssilkin@webrtc.org>
Reviewed-by: Philip Eliasson <philipel@webrtc.org>
Reviewed-by: Niels Moller <nisse@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#25895}
---
 api/video/encoded_frame.h                     |   3 +
 api/video/encoded_image.h                     |   4 +
 media/engine/webrtcvideoengine.cc             |   3 +
 .../codecs/test/videocodec_test_libvpx.cc     |   6 +-
 modules/video_coding/codecs/vp9/svc_config.cc |   6 +-
 .../codecs/vp9/svc_config_unittest.cc         |   7 +-
 .../codecs/vp9/svc_rate_allocator_unittest.cc |   2 +-
 .../codecs/vp9/test/vp9_impl_unittest.cc      | 258 +++++++++++++++++-
 modules/video_coding/codecs/vp9/vp9_impl.cc   | 211 ++++++++++----
 modules/video_coding/codecs/vp9/vp9_impl.h    |   8 +-
 modules/video_coding/encoded_frame.h          |  10 +-
 modules/video_coding/frame_buffer2.cc         | 107 +++++++-
 modules/video_coding/frame_buffer2.h          |  10 +-
 .../video_coding/frame_buffer2_unittest.cc    | 224 ++++++++-------
 modules/video_coding/frame_object.cc          |   1 +
 .../rtp_frame_reference_finder.cc             |  16 +-
 video/rtp_video_stream_receiver.cc            |   8 +
 video/video_quality_test.cc                   |   4 +
 video/video_receive_stream.cc                 |   4 -
 19 files changed, 694 insertions(+), 198 deletions(-)

diff --git a/api/video/encoded_frame.h b/api/video/encoded_frame.h
index afef0391d0..fa06568aa9 100644
--- a/api/video/encoded_frame.h
+++ b/api/video/encoded_frame.h
@@ -79,6 +79,9 @@ class EncodedFrame : public webrtc::VCMEncodedFrame {
   size_t num_references = 0;
   int64_t references[kMaxFrameReferences];
   bool inter_layer_predicted = false;
+  // Is this subframe the last one in the superframe (in an RTP stream that
+  // would mean that the last packet has the marker bit set).
+  bool is_last_spatial_layer = true;
 };
 
 }  // namespace video_coding
diff --git a/api/video/encoded_image.h b/api/video/encoded_image.h
index b909c47ab4..d7919ff850 100644
--- a/api/video/encoded_image.h
+++ b/api/video/encoded_image.h
@@ -68,6 +68,10 @@ class RTC_EXPORT EncodedImage {
   }
 
   size_t size() const { return _length; }
+  void set_size(size_t new_size) {
+    RTC_DCHECK_LE(new_size, _size);
+    _length = new_size;
+  }
   size_t capacity() const { return _size; }
 
   void set_buffer(uint8_t* buffer, size_t capacity) {
diff --git a/media/engine/webrtcvideoengine.cc b/media/engine/webrtcvideoengine.cc
index b96f2ea3d5..48639c48dc 100644
--- a/media/engine/webrtcvideoengine.cc
+++ b/media/engine/webrtcvideoengine.cc
@@ -383,6 +383,9 @@ WebRtcVideoChannel::WebRtcVideoSendStream::ConfigureVideoEncoderSettings(
     if (!is_screencast) {
       // Limit inter-layer prediction to key pictures.
       vp9_settings.interLayerPred = webrtc::InterLayerPredMode::kOnKeyPic;
+    } else {
+      // 3 spatial layers vp9 screenshare needs flexible mode.
+      vp9_settings.flexibleMode = vp9_settings.numberOfSpatialLayers > 2;
     }
     return new rtc::RefCountedObject<
         webrtc::VideoEncoderConfig::Vp9EncoderSpecificSettings>(vp9_settings);
diff --git a/modules/video_coding/codecs/test/videocodec_test_libvpx.cc b/modules/video_coding/codecs/test/videocodec_test_libvpx.cc
index f69fde6884..1e365de1d1 100644
--- a/modules/video_coding/codecs/test/videocodec_test_libvpx.cc
+++ b/modules/video_coding/codecs/test/videocodec_test_libvpx.cc
@@ -124,9 +124,9 @@ TEST(VideoCodecTestLibvpx, ChangeBitrateVP9) {
       {500, 30, kNumFramesLong}};
 
   std::vector<RateControlThresholds> rc_thresholds = {
-      {5, 1, 0, 1, 0.5, 0.1, 0, 1},
-      {15, 2, 0, 1, 0.5, 0.1, 0, 0},
-      {10, 1, 0, 1, 0.5, 0.1, 0, 0}};
+      {5, 2, 0, 1, 0.5, 0.1, 0, 1},
+      {15, 3, 0, 1, 0.5, 0.1, 0, 0},
+      {10, 2, 0, 1, 0.5, 0.1, 0, 0}};
 
   std::vector<QualityThresholds> quality_thresholds = {
       {34, 33, 0.90, 0.88}, {38, 35, 0.95, 0.91}, {35, 34, 0.93, 0.90}};
diff --git a/modules/video_coding/codecs/vp9/svc_config.cc b/modules/video_coding/codecs/vp9/svc_config.cc
index 7a79a420a5..0e76b09d6b 100644
--- a/modules/video_coding/codecs/vp9/svc_config.cc
+++ b/modules/video_coding/codecs/vp9/svc_config.cc
@@ -23,9 +23,9 @@ namespace webrtc {
 namespace {
 const size_t kMinVp9SvcBitrateKbps = 30;
 
-const size_t kMaxNumLayersForScreenSharing = 2;
-const float kMaxScreenSharingLayerFramerateFps[] = {5.0, 5.0};
-const size_t kMaxScreenSharingLayerBitrateKbps[] = {200, 500};
+const size_t kMaxNumLayersForScreenSharing = 3;
+const float kMaxScreenSharingLayerFramerateFps[] = {5.0, 5.0, 30.0};
+const size_t kMaxScreenSharingLayerBitrateKbps[] = {200, 500, 1250};
 }  // namespace
 
 std::vector<SpatialLayer> ConfigureSvcScreenSharing(size_t input_width,
diff --git a/modules/video_coding/codecs/vp9/svc_config_unittest.cc b/modules/video_coding/codecs/vp9/svc_config_unittest.cc
index 683c2d47a0..05802eb5b0 100644
--- a/modules/video_coding/codecs/vp9/svc_config_unittest.cc
+++ b/modules/video_coding/codecs/vp9/svc_config_unittest.cc
@@ -48,12 +48,13 @@ TEST(SvcConfig, ScreenSharing) {
   std::vector<SpatialLayer> spatial_layers =
       GetSvcConfig(1920, 1080, 30, 3, 3, true);
 
-  EXPECT_EQ(spatial_layers.size(), 2UL);
+  EXPECT_EQ(spatial_layers.size(), 3UL);
 
-  for (const SpatialLayer& layer : spatial_layers) {
+  for (size_t i = 0; i < 3; ++i) {
+    const SpatialLayer& layer = spatial_layers[i];
     EXPECT_EQ(layer.width, 1920);
     EXPECT_EQ(layer.height, 1080);
-    EXPECT_EQ(layer.maxFramerate, 5);
+    EXPECT_EQ(layer.maxFramerate, (i < 2) ? 5 : 30);
     EXPECT_EQ(layer.numberOfTemporalLayers, 1);
     EXPECT_LE(layer.minBitrate, layer.maxBitrate);
     EXPECT_LE(layer.minBitrate, layer.targetBitrate);
diff --git a/modules/video_coding/codecs/vp9/svc_rate_allocator_unittest.cc b/modules/video_coding/codecs/vp9/svc_rate_allocator_unittest.cc
index ac225553ea..e430123909 100644
--- a/modules/video_coding/codecs/vp9/svc_rate_allocator_unittest.cc
+++ b/modules/video_coding/codecs/vp9/svc_rate_allocator_unittest.cc
@@ -151,7 +151,7 @@ TEST(SvcRateAllocatorTest, MinBitrateToGetQualityLayer) {
 
   const SpatialLayer* layers = codec.spatialLayers;
 
-  EXPECT_LE(codec.VP9()->numberOfSpatialLayers, 2U);
+  EXPECT_LE(codec.VP9()->numberOfSpatialLayers, 3U);
 
   VideoBitrateAllocation allocation =
       allocator.GetAllocation(layers[0].minBitrate * 1000, 30);
diff --git a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
index f5a0fd1e6a..e081d7649d 100644
--- a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
+++ b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
@@ -91,15 +91,16 @@ class TestVp9Impl : public VideoCodecUnitTest {
     }
   }
 
-  void ConfigureSvc(size_t num_spatial_layers) {
+  void ConfigureSvc(size_t num_spatial_layers, size_t num_temporal_layers = 1) {
     codec_settings_.VP9()->numberOfSpatialLayers =
         static_cast<unsigned char>(num_spatial_layers);
-    codec_settings_.VP9()->numberOfTemporalLayers = 1;
+    codec_settings_.VP9()->numberOfTemporalLayers = num_temporal_layers;
     codec_settings_.VP9()->frameDroppingOn = false;
 
-    std::vector<SpatialLayer> layers = GetSvcConfig(
-        codec_settings_.width, codec_settings_.height,
-        codec_settings_.maxFramerate, num_spatial_layers, 1, false);
+    std::vector<SpatialLayer> layers =
+        GetSvcConfig(codec_settings_.width, codec_settings_.height,
+                     codec_settings_.maxFramerate, num_spatial_layers,
+                     num_temporal_layers, false);
     for (size_t i = 0; i < layers.size(); ++i) {
       codec_settings_.spatialLayers[i] = layers[i];
     }
@@ -401,6 +402,8 @@ TEST_F(TestVp9Impl, EnableDisableSpatialLayers) {
       std::vector<EncodedImage> encoded_frame;
       std::vector<CodecSpecificInfo> codec_specific_info;
       ASSERT_TRUE(WaitForEncodedFrames(&encoded_frame, &codec_specific_info));
+      EXPECT_EQ(codec_specific_info[0].codecSpecific.VP9.ss_data_available,
+                frame_num == 0);
     }
   }
 
@@ -418,6 +421,8 @@ TEST_F(TestVp9Impl, EnableDisableSpatialLayers) {
       std::vector<EncodedImage> encoded_frame;
       std::vector<CodecSpecificInfo> codec_specific_info;
       ASSERT_TRUE(WaitForEncodedFrames(&encoded_frame, &codec_specific_info));
+      EXPECT_EQ(codec_specific_info[0].codecSpecific.VP9.ss_data_available,
+                frame_num == 0);
     }
   }
 }
@@ -581,6 +586,248 @@ TEST_F(TestVp9Impl,
   }
 }
 
+TEST_F(TestVp9Impl, EnablingNewLayerIsDelayedInScreenshareAndAddsSsInfo) {
+  const size_t num_spatial_layers = 3;
+  // Chosen by hand, the 2nd frame is dropped with configured per-layer max
+  // framerate.
+  const size_t num_frames_to_encode_before_drop = 1;
+  // Chosen by hand, exactly 5 frames are dropped for input fps=30 and max
+  // framerate = 5.
+  const size_t num_dropped_frames = 5;
+
+  codec_settings_.maxFramerate = 30;
+  ConfigureSvc(num_spatial_layers);
+  codec_settings_.spatialLayers[0].maxFramerate = 5.0;
+  // Use 30 for SL 1 instead of 5, so even if SL 0 frame is dropped due to
+  // framerate capping we would still get back at least a middle layer. It
+  // simplifies the test.
+  codec_settings_.spatialLayers[1].maxFramerate = 30.0;
+  codec_settings_.spatialLayers[2].maxFramerate = 30.0;
+  codec_settings_.VP9()->frameDroppingOn = false;
+  codec_settings_.mode = VideoCodecMode::kScreensharing;
+  codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOn;
+  codec_settings_.VP9()->flexibleMode = true;
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->InitEncode(&codec_settings_, 1 /* number of cores */,
+                                 0 /* max payload size (unused) */));
+
+  // Enable all but the last layer.
+  VideoBitrateAllocation bitrate_allocation;
+  for (size_t sl_idx = 0; sl_idx < num_spatial_layers - 1; ++sl_idx) {
+    bitrate_allocation.SetBitrate(
+        sl_idx, 0, codec_settings_.spatialLayers[sl_idx].targetBitrate * 1000);
+  }
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  // Encode enough frames to force drop due to framerate capping.
+  for (size_t frame_num = 0; frame_num < num_frames_to_encode_before_drop;
+       ++frame_num) {
+    SetWaitForEncodedFramesThreshold(num_spatial_layers - 1);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    std::vector<EncodedImage> encoded_frames;
+    std::vector<CodecSpecificInfo> codec_specific_info;
+    ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  }
+
+  // Enable the last layer.
+  bitrate_allocation.SetBitrate(
+      num_spatial_layers - 1, 0,
+      codec_settings_.spatialLayers[num_spatial_layers - 1].targetBitrate *
+          1000);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  for (size_t frame_num = 0; frame_num < num_dropped_frames; ++frame_num) {
+    SetWaitForEncodedFramesThreshold(1);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    // First layer is dropped due to frame rate cap. The last layer should not
+    // be enabled yet.
+    std::vector<EncodedImage> encoded_frames;
+    std::vector<CodecSpecificInfo> codec_specific_info;
+    ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  }
+
+  SetWaitForEncodedFramesThreshold(2);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+  // Now all 3 layers should be encoded.
+  std::vector<EncodedImage> encoded_frames;
+  std::vector<CodecSpecificInfo> codec_specific_info;
+  ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  EXPECT_EQ(encoded_frames.size(), 3u);
+  // Scalability structure has to be triggered.
+  EXPECT_TRUE(codec_specific_info[0].codecSpecific.VP9.ss_data_available);
+}
+
+TEST_F(TestVp9Impl, RemovingLayerIsNotDelayedInScreenshareAndAddsSsInfo) {
+  const size_t num_spatial_layers = 3;
+  // Chosen by hand, the 2nd frame is dropped with configured per-layer max
+  // framerate.
+  const size_t num_frames_to_encode_before_drop = 1;
+  // Chosen by hand, exactly 5 frames are dropped for input fps=30 and max
+  // framerate = 5.
+  const size_t num_dropped_frames = 5;
+
+  codec_settings_.maxFramerate = 30;
+  ConfigureSvc(num_spatial_layers);
+  codec_settings_.spatialLayers[0].maxFramerate = 5.0;
+  // Use 30 for SL 1 instead of 5, so even if SL 0 frame is dropped due to
+  // framerate capping we would still get back at least a middle layer. It
+  // simplifies the test.
+  codec_settings_.spatialLayers[1].maxFramerate = 30.0;
+  codec_settings_.spatialLayers[2].maxFramerate = 30.0;
+  codec_settings_.VP9()->frameDroppingOn = false;
+  codec_settings_.mode = VideoCodecMode::kScreensharing;
+  codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOn;
+  codec_settings_.VP9()->flexibleMode = true;
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->InitEncode(&codec_settings_, 1 /* number of cores */,
+                                 0 /* max payload size (unused) */));
+
+  // All layers are enabled from the start.
+  VideoBitrateAllocation bitrate_allocation;
+  for (size_t sl_idx = 0; sl_idx < num_spatial_layers; ++sl_idx) {
+    bitrate_allocation.SetBitrate(
+        sl_idx, 0, codec_settings_.spatialLayers[sl_idx].targetBitrate * 1000);
+  }
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  // Encode enough frames to force drop due to framerate capping.
+  for (size_t frame_num = 0; frame_num < num_frames_to_encode_before_drop;
+       ++frame_num) {
+    SetWaitForEncodedFramesThreshold(num_spatial_layers);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    std::vector<EncodedImage> encoded_frames;
+    std::vector<CodecSpecificInfo> codec_specific_info;
+    ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  }
+
+  // Now the first layer should not have frames in it.
+  for (size_t frame_num = 0; frame_num < num_dropped_frames - 2; ++frame_num) {
+    SetWaitForEncodedFramesThreshold(2);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    // First layer is dropped due to frame rate cap. The two upper layers are
+    // still encoded.
+    std::vector<EncodedImage> encoded_frames;
+    std::vector<CodecSpecificInfo> codec_specific_info;
+    ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+    // First layer is skipped.
+    EXPECT_EQ(encoded_frames[0].SpatialIndex().value_or(-1), 1);
+  }
+
+  // Disable the last layer.
+  bitrate_allocation.SetBitrate(num_spatial_layers - 1, 0, 0);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  // Still expected to drop first layer. Last layer has to be disabled also.
+  for (size_t frame_num = num_dropped_frames - 2;
+       frame_num < num_dropped_frames; ++frame_num) {
+    // Expect back one frame.
+    SetWaitForEncodedFramesThreshold(1);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    // First layer is dropped due to frame rate cap. The last layer is
+    // disabled now, so only the middle layer is encoded.
+    std::vector<EncodedImage> encoded_frames;
+    std::vector<CodecSpecificInfo> codec_specific_info;
+    ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+    // First layer is skipped.
+    EXPECT_EQ(encoded_frames[0].SpatialIndex().value_or(-1), 1);
+    // No SS data on non-base spatial layer.
+    EXPECT_FALSE(codec_specific_info[0].codecSpecific.VP9.ss_data_available);
+  }
+
+  SetWaitForEncodedFramesThreshold(2);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+  std::vector<EncodedImage> encoded_frames;
+  std::vector<CodecSpecificInfo> codec_specific_info;
+  ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  // First layer is not skipped now.
+  EXPECT_EQ(encoded_frames[0].SpatialIndex().value_or(-1), 0);
+  // SS data should be present.
+  EXPECT_TRUE(codec_specific_info[0].codecSpecific.VP9.ss_data_available);
+}
+
+TEST_F(TestVp9Impl, DisableNewLayerInVideoDelaysSsInfoTillTL0) {
+  const size_t num_spatial_layers = 3;
+  const size_t num_temporal_layers = 2;
+  // Use two temporal layers so that the encoded frames alternate between TL0
+  // and TL1.
+  ConfigureSvc(num_spatial_layers, num_temporal_layers);
+  codec_settings_.VP9()->frameDroppingOn = false;
+  codec_settings_.mode = VideoCodecMode::kRealtimeVideo;
+  codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOnKeyPic;
+  codec_settings_.VP9()->flexibleMode = false;
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->InitEncode(&codec_settings_, 1 /* number of cores */,
+                                 0 /* max payload size (unused) */));
+
+  // Enable all the layers.
+  VideoBitrateAllocation bitrate_allocation;
+  for (size_t sl_idx = 0; sl_idx < num_spatial_layers; ++sl_idx) {
+    for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) {
+      bitrate_allocation.SetBitrate(
+          sl_idx, tl_idx,
+          codec_settings_.spatialLayers[sl_idx].targetBitrate * 1000 /
+              num_temporal_layers);
+    }
+  }
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  std::vector<EncodedImage> encoded_frames;
+  std::vector<CodecSpecificInfo> codec_specific_info;
+
+  // Encode one TL0 frame
+  SetWaitForEncodedFramesThreshold(num_spatial_layers);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+  ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  EXPECT_EQ(codec_specific_info[0].codecSpecific.VP9.temporal_idx, 0u);
+
+  // Disable the last layer.
+  for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) {
+    bitrate_allocation.SetBitrate(num_spatial_layers - 1, tl_idx, 0);
+  }
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  // Next is TL1 frame. The last layer is disabled immediately, but SS structure
+  // is not provided here.
+  SetWaitForEncodedFramesThreshold(num_spatial_layers - 1);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+  ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  EXPECT_EQ(codec_specific_info[0].codecSpecific.VP9.temporal_idx, 1u);
+
+  // Next is TL0 frame, which should have delayed SS structure.
+  SetWaitForEncodedFramesThreshold(num_spatial_layers - 1);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+  ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  EXPECT_EQ(codec_specific_info[0].codecSpecific.VP9.temporal_idx, 0u);
+  EXPECT_TRUE(codec_specific_info[0].codecSpecific.VP9.ss_data_available);
+  EXPECT_TRUE(codec_specific_info[0]
+                  .codecSpecific.VP9.spatial_layer_resolution_present);
+  EXPECT_EQ(
+      codec_specific_info[0].codecSpecific.VP9.width[num_spatial_layers - 1],
+      0u);
+}
+
 TEST_F(TestVp9Impl,
        LowLayerMarkedAsRefIfHighLayerNotEncodedAndInterLayerPredIsEnabled) {
   ConfigureSvc(3);
@@ -766,6 +1013,7 @@ TEST_F(TestVp9ImplFrameDropping, DifferentFrameratePerSpatialLayer) {
 
   codec_settings_.VP9()->numberOfSpatialLayers = num_spatial_layers;
   codec_settings_.VP9()->frameDroppingOn = false;
+  codec_settings_.VP9()->flexibleMode = true;
 
   VideoBitrateAllocation bitrate_allocation;
   for (uint8_t sl_idx = 0; sl_idx < num_spatial_layers; ++sl_idx) {
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.cc b/modules/video_coding/codecs/vp9/vp9_impl.cc
index faeeebb0a6..7bb2ea27d0 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.cc
+++ b/modules/video_coding/codecs/vp9/vp9_impl.cc
@@ -49,6 +49,9 @@ uint8_t kUpdBufIdx[4] = {0, 0, 1, 0};
 
 int kMaxNumTiles4kVideo = 8;
 
+// Maximum allowed PID difference for variable frame-rate mode.
+const int kMaxAllowedPidDIff = 8;
+
 // Only positive speeds, range for real-time coding currently is: 5 - 8.
 // Lower means slower/better quality, higher means fastest/lower quality.
 int GetCpuSpeed(int width, int height) {
@@ -124,6 +127,18 @@ ColorSpace ExtractVP9ColorSpace(vpx_color_space_t space_t,
   }
   return ColorSpace(primaries, transfer, matrix, range);
 }
+
+bool MoreLayersEnabled(const VideoBitrateAllocation& first,
+                       const VideoBitrateAllocation& second) {
+  for (size_t sl_idx = 0; sl_idx < kMaxSpatialLayers; ++sl_idx) {
+    if (first.GetSpatialLayerSum(sl_idx) > 0 &&
+        second.GetSpatialLayerSum(sl_idx) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace
 
 void VP9EncoderImpl::EncoderOutputCodedPacketCallback(vpx_codec_cx_pkt* pkt,
@@ -154,12 +169,12 @@ VP9EncoderImpl::VP9EncoderImpl(const cricket::VideoCodec& codec)
           field_trial::IsEnabled("WebRTC-Vp9IssueKeyFrameOnLayerDeactivation")),
       is_svc_(false),
       inter_layer_pred_(InterLayerPredMode::kOn),
-      external_ref_control_(
-          field_trial::IsEnabled("WebRTC-Vp9ExternalRefCtrl")),
+      external_ref_control_(false),  // Set in InitEncode because of tests.
       trusted_rate_controller_(
           field_trial::IsEnabled(kVp9TrustedRateControllerFieldTrial)),
       full_superframe_drop_(true),
       first_frame_in_picture_(true),
+      ss_info_needed_(false),
       is_flexible_mode_(false) {
   memset(&codec_, 0, sizeof(codec_));
   memset(&svc_params_, 0, sizeof(vpx_svc_extra_cfg_t));
@@ -314,14 +329,8 @@ int VP9EncoderImpl::SetRateAllocation(
 
   codec_.maxFramerate = frame_rate;
 
-  if (!SetSvcRates(bitrate_allocation)) {
-    return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
-  }
+  requested_bitrate_allocation_ = bitrate_allocation;
 
-  // Update encoder context
-  if (vpx_codec_enc_config_set(encoder_, config_)) {
-    return WEBRTC_VIDEO_CODEC_ERROR;
-  }
   return WEBRTC_VIDEO_CODEC_OK;
 }
 
@@ -461,6 +470,27 @@ int VP9EncoderImpl::InitEncode(const VideoCodec* inst,
 
   is_flexible_mode_ = inst->VP9().flexibleMode;
 
+  inter_layer_pred_ = inst->VP9().interLayerPred;
+
+  different_framerates_used_ = false;
+  for (size_t sl_idx = 1; sl_idx < num_spatial_layers_; ++sl_idx) {
+    if (std::abs(codec_.spatialLayers[sl_idx].maxFramerate -
+                 codec_.spatialLayers[0].maxFramerate) > 1e-9) {
+      different_framerates_used_ = true;
+    }
+  }
+
+  if (different_framerates_used_ && !is_flexible_mode_) {
+    RTC_LOG(LS_ERROR) << "Flexible mode required for different framerates on "
+                         "different spatial layers";
+    return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
+  }
+
+  // External reference control is required for different frame rate on spatial
+  // layers because libvpx generates rtp incompatible references in this case.
+  external_ref_control_ = field_trial::IsEnabled("WebRTC-Vp9ExternalRefCtrl") ||
+                          different_framerates_used_;
+
   if (num_temporal_layers_ == 1) {
     gof_.SetGofInfoVP9(kTemporalStructureMode1);
     config_->temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING;
@@ -493,8 +523,14 @@ int VP9EncoderImpl::InitEncode(const VideoCodec* inst,
     return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
   }
 
-  inter_layer_pred_ = inst->VP9().interLayerPred;
-
+  if (external_ref_control_) {
+    config_->temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+    if (num_temporal_layers_ > 1 && different_framerates_used_) {
+      // External reference control for several temporal layers with different
+      // frame rates on spatial layers is not implemented yet.
+      return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
+    }
+  }
   ref_buf_.clear();
 
   return InitAndSetControlSettings(inst);
@@ -575,9 +611,9 @@ int VP9EncoderImpl::InitAndSetControlSettings(const VideoCodec* inst) {
   }
 
   SvcRateAllocator init_allocator(codec_);
-  VideoBitrateAllocation allocation = init_allocator.GetAllocation(
+  current_bitrate_allocation_ = init_allocator.GetAllocation(
       inst->startBitrate * 1000, inst->maxFramerate);
-  if (!SetSvcRates(allocation)) {
+  if (!SetSvcRates(current_bitrate_allocation_)) {
     return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
   }
 
@@ -595,6 +631,7 @@ int VP9EncoderImpl::InitAndSetControlSettings(const VideoCodec* inst) {
                     inst->VP9().adaptiveQpMode ? 3 : 0);
 
   vpx_codec_control(encoder_, VP9E_SET_FRAME_PARALLEL_DECODING, 0);
+  vpx_codec_control(encoder_, VP9E_SET_SVC_GF_TEMPORAL_REF, 0);
 
   if (is_svc_) {
     vpx_codec_control(encoder_, VP9E_SET_SVC, 1);
@@ -696,21 +733,21 @@ int VP9EncoderImpl::Encode(const VideoFrame& input_image,
     }
   }
 
-  if (VideoCodecMode::kScreensharing == codec_.mode && !force_key_frame_) {
-    // Skip encoding spatial layer frames if their target frame rate is lower
-    // than actual input frame rate.
-    vpx_svc_layer_id_t layer_id = {0};
+  vpx_svc_layer_id_t layer_id = {0};
+  if (!force_key_frame_) {
     const size_t gof_idx = (pics_since_key_ + 1) % gof_.num_frames_in_gof;
     layer_id.temporal_layer_id = gof_.temporal_idx[gof_idx];
 
-    const uint32_t frame_timestamp_ms =
-        1000 * input_image.timestamp() / kVideoPayloadTypeFrequency;
+    if (VideoCodecMode::kScreensharing == codec_.mode) {
+      const uint32_t frame_timestamp_ms =
+          1000 * input_image.timestamp() / kVideoPayloadTypeFrequency;
 
-    for (uint8_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
-      if (framerate_controller_[sl_idx].DropFrame(frame_timestamp_ms)) {
-        ++layer_id.spatial_layer_id;
-      } else {
-        break;
+      for (uint8_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
+        if (framerate_controller_[sl_idx].DropFrame(frame_timestamp_ms)) {
+          ++layer_id.spatial_layer_id;
+        } else {
+          break;
+        }
       }
     }
 
@@ -719,8 +756,42 @@ int VP9EncoderImpl::Encode(const VideoFrame& input_image,
       // Drop entire picture.
       return WEBRTC_VIDEO_CODEC_OK;
     }
+  }
 
-    vpx_codec_control(encoder_, VP9E_SET_SVC_LAYER_ID, &layer_id);
+  for (int sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
+    layer_id.temporal_layer_id_per_spatial[sl_idx] = layer_id.temporal_layer_id;
+  }
+
+  vpx_codec_control(encoder_, VP9E_SET_SVC_LAYER_ID, &layer_id);
+
+  if (requested_bitrate_allocation_) {
+    bool more_layers_requested = MoreLayersEnabled(
+        *requested_bitrate_allocation_, current_bitrate_allocation_);
+    bool less_layers_requested = MoreLayersEnabled(
+        current_bitrate_allocation_, *requested_bitrate_allocation_);
+    // In SVC, new layers can be enabled only if all lower layers are encoded
+    // and the frame is at the base temporal layer.
+    // This will delay rate allocation change until the next frame on the base
+    // spatial layer.
+    // In KSVC or simulcast modes KF will be generated for a new layer, so can
+    // update allocation any time.
+    bool can_upswitch =
+        inter_layer_pred_ != InterLayerPredMode::kOn ||
+        (layer_id.spatial_layer_id == 0 && layer_id.temporal_layer_id == 0);
+    if (!more_layers_requested || can_upswitch) {
+      current_bitrate_allocation_ = *requested_bitrate_allocation_;
+      requested_bitrate_allocation_ = absl::nullopt;
+      if (!SetSvcRates(current_bitrate_allocation_)) {
+        return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
+      }
+      if (less_layers_requested || more_layers_requested) {
+        ss_info_needed_ = true;
+      }
+    }
+  }
+
+  if (vpx_codec_enc_config_set(encoder_, config_)) {
+    return WEBRTC_VIDEO_CODEC_ERROR;
   }
 
   RTC_DCHECK_EQ(input_image.width(), raw_->d_w);
@@ -780,7 +851,8 @@ int VP9EncoderImpl::Encode(const VideoFrame& input_image,
   }
 
   if (external_ref_control_) {
-    vpx_svc_ref_frame_config_t ref_config = SetReferences(force_key_frame_);
+    vpx_svc_ref_frame_config_t ref_config =
+        SetReferences(force_key_frame_, layer_id.spatial_layer_id);
 
     if (VideoCodecMode::kScreensharing == codec_.mode) {
       for (uint8_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
@@ -840,9 +912,22 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
   vp9_info->ss_data_available =
       (pkt.data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
 
+  if (pkt.data.frame.flags & VPX_FRAME_IS_KEY) {
+    pics_since_key_ = 0;
+  } else if (first_frame_in_picture_) {
+    ++pics_since_key_;
+  }
+
   vpx_svc_layer_id_t layer_id = {0};
   vpx_codec_control(encoder_, VP9E_GET_SVC_LAYER_ID, &layer_id);
 
+  if (ss_info_needed_ && layer_id.temporal_layer_id == 0 &&
+      layer_id.spatial_layer_id == 0) {
+    // Force SS info after the layers configuration has changed.
+    vp9_info->ss_data_available = true;
+    ss_info_needed_ = false;
+  }
+
   RTC_CHECK_GT(num_temporal_layers_, 0);
   RTC_CHECK_GT(num_active_spatial_layers_, 0);
   if (num_temporal_layers_ == 1) {
@@ -864,12 +949,6 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
   // TODO(asapersson): this info has to be obtained from the encoder.
   vp9_info->temporal_up_switch = false;
 
-  if (pkt.data.frame.flags & VPX_FRAME_IS_KEY) {
-    pics_since_key_ = 0;
-  } else if (first_frame_in_picture_) {
-    ++pics_since_key_;
-  }
-
   const bool is_key_pic = (pics_since_key_ == 0);
   const bool is_inter_layer_pred_allowed =
       (inter_layer_pred_ == InterLayerPredMode::kOn ||
@@ -901,8 +980,6 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
     vp9_info->gof_idx = kNoGofIdx;
     FillReferenceIndices(pkt, pics_since_key_, vp9_info->inter_layer_predicted,
                          vp9_info);
-    // TODO(webrtc:9794): Add fake reference to empty reference list to
-    // workaround the frame buffer issue on receiver.
   } else {
     vp9_info->gof_idx =
         static_cast<uint8_t>(pics_since_key_ % gof_.num_frames_in_gof);
@@ -985,6 +1062,8 @@ void VP9EncoderImpl::FillReferenceIndices(const vpx_codec_cx_pkt& pkt,
 
   size_t max_ref_temporal_layer_id = 0;
 
+  std::vector<size_t> ref_pid_list;
+
   vp9_info->num_ref_pics = 0;
   for (const RefFrameBuffer& ref_buf : ref_buf_list) {
     RTC_DCHECK_LE(ref_buf.pic_num, pic_num);
@@ -997,6 +1076,16 @@ void VP9EncoderImpl::FillReferenceIndices(const vpx_codec_cx_pkt& pkt,
       }
       RTC_DCHECK_LE(ref_buf.temporal_layer_id, layer_id.temporal_layer_id);
 
+      // Encoder may reference several spatial layers on the same previous
+      // frame in case some spatial layers are skipped on the current frame.
+      // We shouldn't put duplicate references as it may break some old
+      // clients and isn't RTP compatible.
+      if (std::find(ref_pid_list.begin(), ref_pid_list.end(),
+                    ref_buf.pic_num) != ref_pid_list.end()) {
+        continue;
+      }
+      ref_pid_list.push_back(ref_buf.pic_num);
+
       const size_t p_diff = pic_num - ref_buf.pic_num;
       RTC_DCHECK_LE(p_diff, 127UL);
 
@@ -1038,20 +1127,13 @@ void VP9EncoderImpl::UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
     vpx_svc_ref_frame_config_t enc_layer_conf = {{0}};
     vpx_codec_control(encoder_, VP9E_GET_SVC_REF_FRAME_CONFIG, &enc_layer_conf);
 
-    if (enc_layer_conf.update_last[layer_id.spatial_layer_id]) {
-      ref_buf_[enc_layer_conf.lst_fb_idx[layer_id.spatial_layer_id]] =
-          frame_buf;
+    for (size_t i = 0; i < kNumVp9Buffers; ++i) {
+      if (enc_layer_conf.update_buffer_slot[layer_id.spatial_layer_id] &
+          (1 << i)) {
+        ref_buf_[i] = frame_buf;
+      }
     }
 
-    if (enc_layer_conf.update_alt_ref[layer_id.spatial_layer_id]) {
-      ref_buf_[enc_layer_conf.alt_fb_idx[layer_id.spatial_layer_id]] =
-          frame_buf;
-    }
-
-    if (enc_layer_conf.update_golden[layer_id.spatial_layer_id]) {
-      ref_buf_[enc_layer_conf.gld_fb_idx[layer_id.spatial_layer_id]] =
-          frame_buf;
-    }
   } else {
     RTC_DCHECK_EQ(num_spatial_layers_, 1);
     RTC_DCHECK_EQ(num_temporal_layers_, 1);
@@ -1061,7 +1143,9 @@ void VP9EncoderImpl::UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
   }
 }
 
-vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(bool is_key_pic) {
+vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(
+    bool is_key_pic,
+    size_t first_active_spatial_layer_id) {
   // kRefBufIdx, kUpdBufIdx need to be updated to support longer GOFs.
   RTC_DCHECK_LE(gof_.num_frames_in_gof, 4);
 
@@ -1083,8 +1167,10 @@ vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(bool is_key_pic) {
   // for temporal references plus 1 buffer for spatial reference. 7 buffers
   // in total.
 
-  for (size_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
-    const size_t gof_idx = pics_since_key_ % gof_.num_frames_in_gof;
+  for (size_t sl_idx = first_active_spatial_layer_id;
+       sl_idx < num_active_spatial_layers_; ++sl_idx) {
+    const size_t curr_pic_num = is_key_pic ? 0 : pics_since_key_ + 1;
+    const size_t gof_idx = curr_pic_num % gof_.num_frames_in_gof;
 
     if (!is_key_pic) {
       // Set up temporal reference.
@@ -1096,36 +1182,47 @@ vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(bool is_key_pic) {
 
       // Sanity check that reference picture number is smaller than current
       // picture number.
-      const size_t curr_pic_num = pics_since_key_ + 1;
       RTC_DCHECK_LT(ref_buf_[buf_idx].pic_num, curr_pic_num);
       const size_t pid_diff = curr_pic_num - ref_buf_[buf_idx].pic_num;
+      // Incorrect spatial layer may be in the buffer due to a key-frame.
+      const bool same_spatial_layer =
+          ref_buf_[buf_idx].spatial_layer_id == sl_idx;
+      bool correct_pid = false;
+      if (different_framerates_used_) {
+        correct_pid = pid_diff < kMaxAllowedPidDIff;
+      } else {
+        // The code below assumes a single temporal reference.
+        RTC_DCHECK_EQ(gof_.num_ref_pics[gof_idx], 1);
+        correct_pid = pid_diff == gof_.pid_diff[gof_idx][0];
+      }
 
-      // Below code assumes single temporal referecence.
-      RTC_DCHECK_EQ(gof_.num_ref_pics[gof_idx], 1);
-      if (pid_diff == gof_.pid_diff[gof_idx][0]) {
+      if (same_spatial_layer && correct_pid) {
         ref_config.lst_fb_idx[sl_idx] = buf_idx;
         ref_config.reference_last[sl_idx] = 1;
       } else {
         // This reference doesn't match with one specified by GOF. This can
         // only happen if spatial layer is enabled dynamically without key
         // frame. Spatial prediction is supposed to be enabled in this case.
-        RTC_DCHECK(is_inter_layer_pred_allowed);
+        RTC_DCHECK(is_inter_layer_pred_allowed &&
+                   sl_idx > first_active_spatial_layer_id);
       }
     }
 
-    if (is_inter_layer_pred_allowed && sl_idx > 0) {
+    if (is_inter_layer_pred_allowed && sl_idx > first_active_spatial_layer_id) {
       // Set up spatial reference.
       RTC_DCHECK(last_updated_buf_idx);
       ref_config.gld_fb_idx[sl_idx] = *last_updated_buf_idx;
       ref_config.reference_golden[sl_idx] = 1;
     } else {
-      RTC_DCHECK(ref_config.reference_last[sl_idx] != 0 || sl_idx == 0 ||
+      RTC_DCHECK(ref_config.reference_last[sl_idx] != 0 ||
+                 sl_idx == first_active_spatial_layer_id ||
                  inter_layer_pred_ == InterLayerPredMode::kOff);
     }
 
     last_updated_buf_idx.reset();
 
-    if (gof_.temporal_idx[gof_idx] <= num_temporal_layers_ - 1) {
+    if (gof_.temporal_idx[gof_idx] < num_temporal_layers_ - 1 ||
+        num_temporal_layers_ == 1) {
       last_updated_buf_idx = sl_idx * num_temporal_refs + kUpdBufIdx[gof_idx];
 
       // Ensure last frame buffer is not used for temporal prediction (it is
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.h b/modules/video_coding/codecs/vp9/vp9_impl.h
index 33f41fd7d4..a2dab26010 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.h
+++ b/modules/video_coding/codecs/vp9/vp9_impl.h
@@ -70,7 +70,9 @@ class VP9EncoderImpl : public VP9Encoder {
                             CodecSpecificInfoVP9* vp9_info);
   void UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
                               const size_t pic_num);
-  vpx_svc_ref_frame_config_t SetReferences(bool is_key_pic);
+  vpx_svc_ref_frame_config_t SetReferences(
+      bool is_key_pic,
+      size_t first_active_spatial_layer_id);
 
   bool ExplicitlyConfiguredSpatialLayers() const;
   bool SetSvcRates(const VideoBitrateAllocation& bitrate_allocation);
@@ -110,6 +112,7 @@ class VP9EncoderImpl : public VP9Encoder {
   GofInfoVP9 gof_;  // Contains each frame's temporal information for
                     // non-flexible mode.
   bool force_key_frame_;
+  bool different_framerates_used_;
   size_t pics_since_key_;
   uint8_t num_temporal_layers_;
   uint8_t num_spatial_layers_;         // Number of configured SLs
@@ -121,6 +124,9 @@ class VP9EncoderImpl : public VP9Encoder {
   const bool trusted_rate_controller_;
   const bool full_superframe_drop_;
   bool first_frame_in_picture_;
+  VideoBitrateAllocation current_bitrate_allocation_;
+  absl::optional<VideoBitrateAllocation> requested_bitrate_allocation_;
+  bool ss_info_needed_;
 
   std::vector<FramerateController> framerate_controller_;
 
diff --git a/modules/video_coding/encoded_frame.h b/modules/video_coding/encoded_frame.h
index 124ed4427b..fcc3b5dc81 100644
--- a/modules/video_coding/encoded_frame.h
+++ b/modules/video_coding/encoded_frame.h
@@ -67,9 +67,11 @@ class VCMEncodedFrame : protected EncodedImage {
   /**
    *   Frame RTP timestamp (90kHz)
    */
-  using EncodedImage::Timestamp;
+  using EncodedImage::set_size;
   using EncodedImage::SetTimestamp;
   using EncodedImage::size;
+  using EncodedImage::Timestamp;
+
   /**
    *   Get render time in milliseconds
    */
@@ -90,6 +92,7 @@ class VCMEncodedFrame : protected EncodedImage {
    * Get video timing
    */
   EncodedImage::Timing video_timing() const { return timing_; }
+  EncodedImage::Timing* video_timing_mutable() { return &timing_; }
   /**
    *   True if this frame is complete, false otherwise
    */
@@ -109,8 +112,10 @@ class VCMEncodedFrame : protected EncodedImage {
    *   the object.
    */
   const CodecSpecificInfo* CodecSpecific() const { return &_codecSpecificInfo; }
+  void SetCodecSpecific(const CodecSpecificInfo* codec_specific) {
+    _codecSpecificInfo = *codec_specific;  // Copies; pointer must be non-null.
+  }
 
- protected:
   /**
    * Verifies that current allocated buffer size is larger than or equal to the
    * input size.
@@ -121,6 +126,7 @@ class VCMEncodedFrame : protected EncodedImage {
    */
   void VerifyAndAllocate(size_t minimumSize);
 
+ protected:
   void Reset();
 
   void CopyCodecSpecific(const RTPVideoHeader* header);
diff --git a/modules/video_coding/frame_buffer2.cc b/modules/video_coding/frame_buffer2.cc
index 0385460310..52ec2da527 100644
--- a/modules/video_coding/frame_buffer2.cc
+++ b/modules/video_coding/frame_buffer2.cc
@@ -87,10 +87,10 @@ FrameBuffer::ReturnReason FrameBuffer::NextFrame(
 
       wait_ms = max_wait_time_ms;
 
-      // Need to hold |crit_| in order to use |frames_|, therefore we
+      // Need to hold |crit_| in order to access frames_to_decode_, therefore we
       // set it here in the loop instead of outside the loop in order to not
-      // acquire the lock unnecesserily.
-      next_frame_it_ = frames_.end();
+      // acquire the lock unnecessarily.
+      frames_to_decode_.clear();
 
       // |frame_it| points to the first frame after the
       // |last_decoded_frame_it_|.
@@ -128,7 +128,53 @@ FrameBuffer::ReturnReason FrameBuffer::NextFrame(
           continue;
         }
 
-        next_frame_it_ = frame_it;
+        // Only ever return all parts of a superframe. Therefore skip this
+        // frame if it's not a beginning of a superframe.
+        if (frame->inter_layer_predicted) {
+          continue;
+        }
+
+        // Gather all remaining frames for the same superframe.
+        std::vector<FrameMap::iterator> current_superframe;
+        current_superframe.push_back(frame_it);
+        bool last_layer_completed =
+            frame_it->second.frame->is_last_spatial_layer;
+        FrameMap::iterator next_frame_it = frame_it;
+        while (true) {
+          ++next_frame_it;
+          if (next_frame_it == frames_.end() ||
+              next_frame_it->first.picture_id != frame->id.picture_id ||
+              !next_frame_it->second.continuous) {
+            break;
+          }
+          // Check if the next frame has some undecoded references other than
+          // the previous frame in the same superframe.
+          size_t num_allowed_undecoded_refs =
+              (next_frame_it->second.frame->inter_layer_predicted) ? 1 : 0;
+          if (next_frame_it->second.num_missing_decodable >
+              num_allowed_undecoded_refs) {
+            break;
+          }
+          // All frames in the superframe should have the same timestamp.
+          if (frame->Timestamp() != next_frame_it->second.frame->Timestamp()) {
+            RTC_LOG(LS_WARNING)
+                << "Frames in a single superframe have different"
+                   " timestamps. Skipping undecodable superframe.";
+            break;
+          }
+          current_superframe.push_back(next_frame_it);
+          last_layer_completed =
+              next_frame_it->second.frame->is_last_spatial_layer;
+        }
+        // Check if the current superframe is complete.
+        // TODO(bugs.webrtc.org/10064): consider returning all available to
+        // decode frames even if the superframe is not complete yet.
+        if (!last_layer_completed) {
+          continue;
+        }
+
+        frames_to_decode_ = std::move(current_superframe);
+
         if (frame->RenderTime() == -1) {
           frame->SetRenderTime(
               timing_->RenderTimeMs(frame->Timestamp(), now_ms));
@@ -154,9 +200,10 @@ FrameBuffer::ReturnReason FrameBuffer::NextFrame(
   {
     rtc::CritScope lock(&crit_);
     now_ms = clock_->TimeInMilliseconds();
-    if (next_frame_it_ != frames_.end()) {
-      std::unique_ptr<EncodedFrame> frame =
-          std::move(next_frame_it_->second.frame);
+    std::vector<EncodedFrame*> frames_out;
+    for (const FrameMap::iterator& frame_it : frames_to_decode_) {
+      RTC_DCHECK(frame_it != frames_.end());
+      EncodedFrame* frame = frame_it->second.frame.release();
 
       if (!frame->delayed_by_retransmission()) {
         int64_t frame_delay;
@@ -187,14 +234,22 @@ FrameBuffer::ReturnReason FrameBuffer::NextFrame(
 
       UpdateJitterDelay();
       UpdateTimingFrameInfo();
-      PropagateDecodability(next_frame_it_->second);
+      PropagateDecodability(frame_it->second);
 
-      AdvanceLastDecodedFrame(next_frame_it_);
+      AdvanceLastDecodedFrame(frame_it);
       last_decoded_frame_timestamp_ = frame->Timestamp();
-      *frame_out = std::move(frame);
+      frames_out.push_back(frame);
+    }
+
+    if (!frames_out.empty()) {
+      if (frames_out.size() == 1) {
+        frame_out->reset(frames_out[0]);
+      } else {
+        frame_out->reset(CombineAndDeleteFrames(frames_out));
+      }
       return kFrameFound;
     }
-  }
+  }  // rtc::CritScope lock(&crit_)
 
   if (latest_return_time_ms - now_ms > 0) {
     // If |next_frame_it_ == frames_.end()| and there is still time left, it
@@ -203,7 +258,6 @@ FrameBuffer::ReturnReason FrameBuffer::NextFrame(
     // remaining time and then return.
     return NextFrame(latest_return_time_ms - now_ms, frame_out);
   }
-
   return kTimeout;
 }
 
@@ -606,11 +660,38 @@ void FrameBuffer::ClearFramesAndHistory() {
   frames_.clear();
   last_decoded_frame_it_ = frames_.end();
   last_continuous_frame_it_ = frames_.end();
-  next_frame_it_ = frames_.end();
+  frames_to_decode_.clear();
   num_frames_history_ = 0;
   num_frames_buffered_ = 0;
 }
 
+EncodedFrame* FrameBuffer::CombineAndDeleteFrames(
+    const std::vector<EncodedFrame*>& frames) const {
+  RTC_DCHECK(!frames.empty());
+  EncodedFrame* frame = frames[0];  // Reused as the combined result.
+  size_t total_length = 0;
+  for (size_t i = 0; i < frames.size(); ++i) {
+    total_length += frames[i]->size();
+  }
+  frame->VerifyAndAllocate(total_length);  // Ensure capacity for all data.
+  uint8_t* buffer = frame->MutableBuffer();
+  // Append the data of frames[1..] after frames[0]'s own data, in order.
+  size_t used_buffer_bytes = frame->size();
+  for (size_t i = 1; i < frames.size(); ++i) {
+    EncodedFrame* frame_to_append = frames[i];
+    memcpy(buffer + used_buffer_bytes, frame_to_append->Buffer(),
+           frame_to_append->size());
+    used_buffer_bytes += frame_to_append->size();
+    frame->video_timing_mutable()->network2_timestamp_ms =
+        frame_to_append->video_timing().network2_timestamp_ms;  // Last wins.
+    frame->video_timing_mutable()->receive_finish_ms =
+        frame_to_append->video_timing().receive_finish_ms;  // Last wins.
+    delete frame_to_append;  // Appended frames are destroyed here.
+  }
+  frame->set_size(total_length);
+  return frame;  // Same object as frames[0].
+}
+
 FrameBuffer::FrameInfo::FrameInfo() = default;
 FrameBuffer::FrameInfo::FrameInfo(FrameInfo&&) = default;
 FrameBuffer::FrameInfo::~FrameInfo() = default;
diff --git a/modules/video_coding/frame_buffer2.h b/modules/video_coding/frame_buffer2.h
index dc5e5a2e37..c311bc8f2f 100644
--- a/modules/video_coding/frame_buffer2.h
+++ b/modules/video_coding/frame_buffer2.h
@@ -15,6 +15,7 @@
 #include <map>
 #include <memory>
 #include <utility>
+#include <vector>
 
 #include "api/video/encoded_frame.h"
 #include "modules/video_coding/include/video_coding_defines.h"
@@ -156,6 +157,13 @@ class FrameBuffer {
   bool HasBadRenderTiming(const EncodedFrame& frame, int64_t now_ms)
       RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_);
 
+  // The cleaner solution would be to have the NextFrame function return a
+  // vector of frames, but until the decoding pipeline can support decoding
+  // multiple frames at the same time we combine all frames to one frame and
+  // return it. See bugs.webrtc.org/10064
+  EncodedFrame* CombineAndDeleteFrames(
+      const std::vector<EncodedFrame*>& frames) const;
+
   FrameMap frames_ RTC_GUARDED_BY(crit_);
 
   rtc::CriticalSection crit_;
@@ -167,7 +175,7 @@ class FrameBuffer {
   absl::optional<uint32_t> last_decoded_frame_timestamp_ RTC_GUARDED_BY(crit_);
   FrameMap::iterator last_decoded_frame_it_ RTC_GUARDED_BY(crit_);
   FrameMap::iterator last_continuous_frame_it_ RTC_GUARDED_BY(crit_);
-  FrameMap::iterator next_frame_it_ RTC_GUARDED_BY(crit_);
+  std::vector<FrameMap::iterator> frames_to_decode_ RTC_GUARDED_BY(crit_);
   int num_frames_history_ RTC_GUARDED_BY(crit_);
   int num_frames_buffered_ RTC_GUARDED_BY(crit_);
   bool stopped_ RTC_GUARDED_BY(crit_);
diff --git a/modules/video_coding/frame_buffer2_unittest.cc b/modules/video_coding/frame_buffer2_unittest.cc
index ca7af09d5f..578734c615 100644
--- a/modules/video_coding/frame_buffer2_unittest.cc
+++ b/modules/video_coding/frame_buffer2_unittest.cc
@@ -124,6 +124,7 @@ class TestFrameBuffer2 : public ::testing::Test {
   static constexpr int kFps1 = 1000;
   static constexpr int kFps10 = kFps1 / 10;
   static constexpr int kFps20 = kFps1 / 20;
+  static constexpr size_t kFrameSize = 10;
 
   TestFrameBuffer2()
       : clock_(0),
@@ -150,6 +151,7 @@ class TestFrameBuffer2 : public ::testing::Test {
                   uint8_t spatial_layer,
                   int64_t ts_ms,
                   bool inter_layer_predicted,
+                  bool last_spatial_layer,
                   T... refs) {
     static_assert(sizeof...(refs) <= kMaxReferences,
                   "To many references specified for EncodedFrame.");
@@ -162,6 +164,10 @@ class TestFrameBuffer2 : public ::testing::Test {
     frame->SetTimestamp(ts_ms * 90);
     frame->num_references = references.size();
     frame->inter_layer_predicted = inter_layer_predicted;
+    frame->is_last_spatial_layer = last_spatial_layer;
+    // Add some data to buffer.
+    frame->VerifyAndAllocate(kFrameSize);
+    frame->SetSize(kFrameSize);
     for (size_t r = 0; r < references.size(); ++r)
       frame->references[r] = references[r];
 
@@ -194,6 +200,13 @@ class TestFrameBuffer2 : public ::testing::Test {
     ASSERT_EQ(spatial_layer, frames_[index]->id.spatial_layer);
   }
 
+  void CheckFrameSize(size_t index, size_t size) {
+    rtc::CritScope lock(&crit_);
+    ASSERT_LT(index, frames_.size());  // |index| must be in range.
+    ASSERT_TRUE(frames_[index]);  // Slot must hold a frame.
+    ASSERT_EQ(frames_[index]->size(), size);  // Reported size must match.
+  }
+
   void CheckNoFrame(size_t index) {
     rtc::CritScope lock(&crit_);
     ASSERT_LT(index, frames_.size());
@@ -246,7 +259,7 @@ TEST_F(TestFrameBuffer2, WaitForFrame) {
   uint32_t ts = Rand();
 
   ExtractFrame(50);
-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 0, ts, false, true);
   CheckFrame(0, pid, 0);
 }
 
@@ -254,13 +267,11 @@ TEST_F(TestFrameBuffer2, OneSuperFrame) {
   uint16_t pid = Rand();
   uint32_t ts = Rand();
 
-  InsertFrame(pid, 0, ts, false);
-  ExtractFrame();
-  InsertFrame(pid, 1, ts, true);
+  InsertFrame(pid, 0, ts, false, false);
+  InsertFrame(pid, 1, ts, true, true);
   ExtractFrame();
 
   CheckFrame(0, pid, 0);
-  CheckFrame(1, pid, 1);
 }
 
 TEST_F(TestFrameBuffer2, SetPlayoutDelay) {
@@ -293,8 +304,8 @@ TEST_F(TestFrameBuffer2, DISABLED_OneUnorderedSuperFrame) {
   uint32_t ts = Rand();
 
   ExtractFrame(50);
-  InsertFrame(pid, 1, ts, true);
-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 1, ts, true, true);
+  InsertFrame(pid, 0, ts, false, false);
   ExtractFrame();
 
   CheckFrame(0, pid, 0);
@@ -305,14 +316,14 @@ TEST_F(TestFrameBuffer2, DISABLED_OneLayerStreamReordered) {
   uint16_t pid = Rand();
   uint32_t ts = Rand();
 
-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 0, ts, false, true);
   ExtractFrame();
   CheckFrame(0, pid, 0);
   for (int i = 1; i < 10; i += 2) {
     ExtractFrame(50);
-    InsertFrame(pid + i + 1, 0, ts + (i + 1) * kFps10, false, pid + i);
+    InsertFrame(pid + i + 1, 0, ts + (i + 1) * kFps10, false, true, pid + i);
     clock_.AdvanceTimeMilliseconds(kFps10);
-    InsertFrame(pid + i, 0, ts + i * kFps10, false, pid + i - 1);
+    InsertFrame(pid + i, 0, ts + i * kFps10, false, true, pid + i - 1);
     clock_.AdvanceTimeMilliseconds(kFps10);
     ExtractFrame();
     CheckFrame(i, pid + i, 0);
@@ -330,9 +341,9 @@ TEST_F(TestFrameBuffer2, MissingFrame) {
   uint16_t pid = Rand();
   uint32_t ts = Rand();
 
-  InsertFrame(pid, 0, ts, false);
-  InsertFrame(pid + 2, 0, ts, false, pid);
-  InsertFrame(pid + 3, 0, ts, false, pid + 1, pid + 2);
+  InsertFrame(pid, 0, ts, false, true);
+  InsertFrame(pid + 2, 0, ts, false, true, pid);
+  InsertFrame(pid + 3, 0, ts, false, true, pid + 1, pid + 2);
   ExtractFrame();
   ExtractFrame();
   ExtractFrame();
@@ -346,11 +357,11 @@ TEST_F(TestFrameBuffer2, OneLayerStream) {
   uint16_t pid = Rand();
   uint32_t ts = Rand();
 
-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 0, ts, false, true);
   ExtractFrame();
   CheckFrame(0, pid, 0);
   for (int i = 1; i < 10; ++i) {
-    InsertFrame(pid + i, 0, ts + i * kFps10, false, pid + i - 1);
+    InsertFrame(pid + i, 0, ts + i * kFps10, false, true, pid + i - 1);
     ExtractFrame();
     clock_.AdvanceTimeMilliseconds(kFps10);
     CheckFrame(i, pid + i, 0);
@@ -361,12 +372,13 @@ TEST_F(TestFrameBuffer2, DropTemporalLayerSlowDecoder) {
   uint16_t pid = Rand();
   uint32_t ts = Rand();
 
-  InsertFrame(pid, 0, ts, false);
-  InsertFrame(pid + 1, 0, ts + kFps20, false, pid);
+  InsertFrame(pid, 0, ts, false, true);
+  InsertFrame(pid + 1, 0, ts + kFps20, false, true, pid);
   for (int i = 2; i < 10; i += 2) {
     uint32_t ts_tl0 = ts + i / 2 * kFps10;
-    InsertFrame(pid + i, 0, ts_tl0, false, pid + i - 2);
-    InsertFrame(pid + i + 1, 0, ts_tl0 + kFps20, false, pid + i, pid + i - 1);
+    InsertFrame(pid + i, 0, ts_tl0, false, true, pid + i - 2);
+    InsertFrame(pid + i + 1, 0, ts_tl0 + kFps20, false, true, pid + i,
+                pid + i - 1);
   }
 
   for (int i = 0; i < 10; ++i) {
@@ -386,49 +398,15 @@ TEST_F(TestFrameBuffer2, DropTemporalLayerSlowDecoder) {
   CheckNoFrame(9);
 }
 
-TEST_F(TestFrameBuffer2, DropSpatialLayerSlowDecoder) {
-  uint16_t pid = Rand();
-  uint32_t ts = Rand();
-
-  InsertFrame(pid, 0, ts, false);
-  InsertFrame(pid, 1, ts, false);
-  for (int i = 1; i < 6; ++i) {
-    uint32_t ts_tl0 = ts + i * kFps10;
-    InsertFrame(pid + i, 0, ts_tl0, false, pid + i - 1);
-    InsertFrame(pid + i, 1, ts_tl0, false, pid + i - 1);
-  }
-
-  ExtractFrame();
-  ExtractFrame();
-  clock_.AdvanceTimeMilliseconds(57);
-  for (int i = 2; i < 12; ++i) {
-    ExtractFrame();
-    clock_.AdvanceTimeMilliseconds(57);
-  }
-
-  CheckFrame(0, pid, 0);
-  CheckFrame(1, pid, 1);
-  CheckFrame(2, pid + 1, 0);
-  CheckFrame(3, pid + 1, 1);
-  CheckFrame(4, pid + 2, 0);
-  CheckFrame(5, pid + 2, 1);
-  CheckFrame(6, pid + 3, 0);
-  CheckFrame(7, pid + 4, 0);
-  CheckFrame(8, pid + 5, 0);
-  CheckNoFrame(9);
-  CheckNoFrame(10);
-  CheckNoFrame(11);
-}
-
 TEST_F(TestFrameBuffer2, InsertLateFrame) {
   uint16_t pid = Rand();
   uint32_t ts = Rand();
 
-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 0, ts, false, true);
   ExtractFrame();
-  InsertFrame(pid + 2, 0, ts, false);
+  InsertFrame(pid + 2, 0, ts, false, true);
   ExtractFrame();
-  InsertFrame(pid + 1, 0, ts, false, pid);
+  InsertFrame(pid + 1, 0, ts, false, true, pid);
   ExtractFrame();
 
   CheckFrame(0, pid, 0);
@@ -441,12 +419,12 @@ TEST_F(TestFrameBuffer2, ProtectionMode) {
   uint32_t ts = Rand();
 
   EXPECT_CALL(jitter_estimator_, GetJitterEstimate(1.0));
-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 0, ts, false, true);
   ExtractFrame();
 
   buffer_->SetProtectionMode(kProtectionNackFEC);
   EXPECT_CALL(jitter_estimator_, GetJitterEstimate(0.0));
-  InsertFrame(pid + 1, 0, ts, false);
+  InsertFrame(pid + 1, 0, ts, false, true);
   ExtractFrame();
 }
 
@@ -454,45 +432,45 @@ TEST_F(TestFrameBuffer2, NoContinuousFrame) {
   uint16_t pid = Rand();
   uint32_t ts = Rand();
 
-  EXPECT_EQ(-1, InsertFrame(pid + 1, 0, ts, false, pid));
+  EXPECT_EQ(-1, InsertFrame(pid + 1, 0, ts, false, true, pid));
 }
 
 TEST_F(TestFrameBuffer2, LastContinuousFrameSingleLayer) {
   uint16_t pid = Rand();
   uint32_t ts = Rand();
 
-  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false));
-  EXPECT_EQ(pid, InsertFrame(pid + 2, 0, ts, false, pid + 1));
-  EXPECT_EQ(pid + 2, InsertFrame(pid + 1, 0, ts, false, pid));
-  EXPECT_EQ(pid + 2, InsertFrame(pid + 4, 0, ts, false, pid + 3));
-  EXPECT_EQ(pid + 5, InsertFrame(pid + 5, 0, ts, false));
+  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false, true));
+  EXPECT_EQ(pid, InsertFrame(pid + 2, 0, ts, false, true, pid + 1));
+  EXPECT_EQ(pid + 2, InsertFrame(pid + 1, 0, ts, false, true, pid));
+  EXPECT_EQ(pid + 2, InsertFrame(pid + 4, 0, ts, false, true, pid + 3));
+  EXPECT_EQ(pid + 5, InsertFrame(pid + 5, 0, ts, false, true));
 }
 
 TEST_F(TestFrameBuffer2, LastContinuousFrameTwoLayers) {
   uint16_t pid = Rand();
   uint32_t ts = Rand();
 
-  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false));
-  EXPECT_EQ(pid, InsertFrame(pid, 1, ts, true));
-  EXPECT_EQ(pid, InsertFrame(pid + 1, 1, ts, true, pid));
-  EXPECT_EQ(pid, InsertFrame(pid + 2, 0, ts, false, pid + 1));
-  EXPECT_EQ(pid, InsertFrame(pid + 2, 1, ts, true, pid + 1));
-  EXPECT_EQ(pid, InsertFrame(pid + 3, 0, ts, false, pid + 2));
-  EXPECT_EQ(pid + 3, InsertFrame(pid + 1, 0, ts, false, pid));
-  EXPECT_EQ(pid + 3, InsertFrame(pid + 3, 1, ts, true, pid + 2));
+  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false, false));
+  EXPECT_EQ(pid, InsertFrame(pid, 1, ts, true, true));
+  EXPECT_EQ(pid, InsertFrame(pid + 1, 1, ts, true, true, pid));
+  EXPECT_EQ(pid, InsertFrame(pid + 2, 0, ts, false, false, pid + 1));
+  EXPECT_EQ(pid, InsertFrame(pid + 2, 1, ts, true, true, pid + 1));
+  EXPECT_EQ(pid, InsertFrame(pid + 3, 0, ts, false, false, pid + 2));
+  EXPECT_EQ(pid + 3, InsertFrame(pid + 1, 0, ts, false, false, pid));
+  EXPECT_EQ(pid + 3, InsertFrame(pid + 3, 1, ts, true, true, pid + 2));
 }
 
 TEST_F(TestFrameBuffer2, PictureIdJumpBack) {
   uint16_t pid = Rand();
   uint32_t ts = Rand();
 
-  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false));
-  EXPECT_EQ(pid + 1, InsertFrame(pid + 1, 0, ts + 1, false, pid));
+  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false, true));
+  EXPECT_EQ(pid + 1, InsertFrame(pid + 1, 0, ts + 1, false, true, pid));
   ExtractFrame();
   CheckFrame(0, pid, 0);
 
   // Jump back in pid but increase ts.
-  EXPECT_EQ(pid - 1, InsertFrame(pid - 1, 0, ts + 2, false));
+  EXPECT_EQ(pid - 1, InsertFrame(pid - 1, 0, ts + 2, false, true));
   ExtractFrame();
   ExtractFrame();
   CheckFrame(1, pid - 1, 0);
@@ -511,6 +489,7 @@ TEST_F(TestFrameBuffer2, StatsCallback) {
 
   {
     std::unique_ptr<FrameObjectFake> frame(new FrameObjectFake());
+    frame->VerifyAndAllocate(kFrameSize);
     frame->SetSize(kFrameSize);
     frame->id.picture_id = pid;
     frame->id.spatial_layer = 0;
@@ -526,42 +505,42 @@ TEST_F(TestFrameBuffer2, StatsCallback) {
 }
 
 TEST_F(TestFrameBuffer2, ForwardJumps) {
-  EXPECT_EQ(5453, InsertFrame(5453, 0, 1, false));
+  EXPECT_EQ(5453, InsertFrame(5453, 0, 1, false, true));
   ExtractFrame();
-  EXPECT_EQ(5454, InsertFrame(5454, 0, 1, false, 5453));
+  EXPECT_EQ(5454, InsertFrame(5454, 0, 1, false, true, 5453));
   ExtractFrame();
-  EXPECT_EQ(15670, InsertFrame(15670, 0, 1, false));
+  EXPECT_EQ(15670, InsertFrame(15670, 0, 1, false, true));
   ExtractFrame();
-  EXPECT_EQ(29804, InsertFrame(29804, 0, 1, false));
+  EXPECT_EQ(29804, InsertFrame(29804, 0, 1, false, true));
   ExtractFrame();
-  EXPECT_EQ(29805, InsertFrame(29805, 0, 1, false, 29804));
+  EXPECT_EQ(29805, InsertFrame(29805, 0, 1, false, true, 29804));
   ExtractFrame();
-  EXPECT_EQ(29806, InsertFrame(29806, 0, 1, false, 29805));
+  EXPECT_EQ(29806, InsertFrame(29806, 0, 1, false, true, 29805));
   ExtractFrame();
-  EXPECT_EQ(33819, InsertFrame(33819, 0, 1, false));
+  EXPECT_EQ(33819, InsertFrame(33819, 0, 1, false, true));
   ExtractFrame();
-  EXPECT_EQ(41248, InsertFrame(41248, 0, 1, false));
+  EXPECT_EQ(41248, InsertFrame(41248, 0, 1, false, true));
   ExtractFrame();
 }
 
 TEST_F(TestFrameBuffer2, DuplicateFrames) {
-  EXPECT_EQ(22256, InsertFrame(22256, 0, 1, false));
+  EXPECT_EQ(22256, InsertFrame(22256, 0, 1, false, true));
   ExtractFrame();
-  EXPECT_EQ(22256, InsertFrame(22256, 0, 1, false));
+  EXPECT_EQ(22256, InsertFrame(22256, 0, 1, false, true));
 }
 
 // TODO(philipel): implement more unittests related to invalid references.
 TEST_F(TestFrameBuffer2, InvalidReferences) {
-  EXPECT_EQ(-1, InsertFrame(0, 0, 1000, false, 2));
-  EXPECT_EQ(1, InsertFrame(1, 0, 2000, false));
+  EXPECT_EQ(-1, InsertFrame(0, 0, 1000, false, true, 2));
+  EXPECT_EQ(1, InsertFrame(1, 0, 2000, false, true));
   ExtractFrame();
-  EXPECT_EQ(2, InsertFrame(2, 0, 3000, false, 1));
+  EXPECT_EQ(2, InsertFrame(2, 0, 3000, false, true, 1));
 }
 
 TEST_F(TestFrameBuffer2, KeyframeRequired) {
-  EXPECT_EQ(1, InsertFrame(1, 0, 1000, false));
-  EXPECT_EQ(2, InsertFrame(2, 0, 2000, false, 1));
-  EXPECT_EQ(3, InsertFrame(3, 0, 3000, false));
+  EXPECT_EQ(1, InsertFrame(1, 0, 1000, false, true));
+  EXPECT_EQ(2, InsertFrame(2, 0, 2000, false, true, 1));
+  EXPECT_EQ(3, InsertFrame(3, 0, 3000, false, true));
   ExtractFrame();
   ExtractFrame(0, true);
   ExtractFrame();
@@ -575,42 +554,81 @@ TEST_F(TestFrameBuffer2, KeyframeClearsFullBuffer) {
   const int kMaxBufferSize = 600;
 
   for (int i = 1; i <= kMaxBufferSize; ++i)
-    EXPECT_EQ(-1, InsertFrame(i, 0, i * 1000, false, i - 1));
+    EXPECT_EQ(-1, InsertFrame(i, 0, i * 1000, false, true, i - 1));
   ExtractFrame();
   CheckNoFrame(0);
 
-  EXPECT_EQ(
-      kMaxBufferSize + 1,
-      InsertFrame(kMaxBufferSize + 1, 0, (kMaxBufferSize + 1) * 1000, false));
+  EXPECT_EQ(kMaxBufferSize + 1,
+            InsertFrame(kMaxBufferSize + 1, 0, (kMaxBufferSize + 1) * 1000,
+                        false, true));
   ExtractFrame();
   CheckFrame(1, kMaxBufferSize + 1, 0);
 }
 
 TEST_F(TestFrameBuffer2, DontUpdateOnUndecodableFrame) {
-  InsertFrame(1, 0, 0, false);
+  InsertFrame(1, 0, 0, false, true);
   ExtractFrame(0, true);
-  InsertFrame(3, 0, 0, false, 2, 0);
-  InsertFrame(3, 0, 0, false, 0);
-  InsertFrame(2, 0, 0, false);
+  InsertFrame(3, 0, 0, false, true, 2, 0);
+  InsertFrame(3, 0, 0, false, true, 0);
+  InsertFrame(2, 0, 0, false, true);
   ExtractFrame(0, true);
   ExtractFrame(0, true);
 }
 
 TEST_F(TestFrameBuffer2, DontDecodeOlderTimestamp) {
-  InsertFrame(2, 0, 1, false);
-  InsertFrame(1, 0, 2, false);  // Older picture id but newer timestamp.
+  InsertFrame(2, 0, 1, false, true);
+  InsertFrame(1, 0, 2, false, true);  // Older picture id but newer timestamp.
   ExtractFrame(0);
   ExtractFrame(0);
   CheckFrame(0, 1, 0);
   CheckNoFrame(1);
 
-  InsertFrame(3, 0, 4, false);
-  InsertFrame(4, 0, 3, false);  // Newer picture id but older timestamp.
+  InsertFrame(3, 0, 4, false, true);
+  InsertFrame(4, 0, 3, false, true);  // Newer picture id but older timestamp.
   ExtractFrame(0);
   ExtractFrame(0);
   CheckFrame(2, 3, 0);
   CheckNoFrame(3);
 }
 
+TEST_F(TestFrameBuffer2, CombineFramesToSuperframe) {
+  uint16_t pid = Rand();
+  uint32_t ts = Rand();
+
+  InsertFrame(pid, 0, ts, false, false);  // Layer 0; not the last layer.
+  InsertFrame(pid, 1, ts, true, true);  // Layer 1, inter-layer predicted, last.
+  ExtractFrame(0);
+  ExtractFrame(0);
+  CheckFrame(0, pid, 0);
+  CheckNoFrame(1);
+  // The two layers are returned as a single frame of combined size.
+  CheckFrameSize(0, kFrameSize * 2);
+}
+
+TEST_F(TestFrameBuffer2, HigherSpatialLayerNonDecodable) {
+  uint16_t pid = Rand();
+  uint32_t ts = Rand();
+
+  InsertFrame(pid, 0, ts, false, false);  // Layer 0; not the last layer.
+  InsertFrame(pid, 1, ts, true, true);  // Layer 1, inter-layer predicted, last.
+
+  ExtractFrame(0);
+  CheckFrame(0, pid, 0);
+
+  InsertFrame(pid + 1, 1, ts + kFps20, false, true, pid);
+  InsertFrame(pid + 2, 0, ts + kFps10, false, false, pid);
+  InsertFrame(pid + 2, 1, ts + kFps10, true, true, pid + 1);
+
+  clock_.AdvanceTimeMilliseconds(1000);
+  // Frame pid+1 is decodable but too late.
+  // In superframe pid+2, frame sid=0 is decodable but frame sid=1 is not.
+  // An incorrect implementation might skip frame pid+1 and output the
+  // undecodable superframe pid+2 instead.
+  ExtractFrame();
+  ExtractFrame();
+  CheckFrame(1, pid + 1, 1);
+  CheckFrame(2, pid + 2, 0);
+}
+
 }  // namespace video_coding
 }  // namespace webrtc
diff --git a/modules/video_coding/frame_object.cc b/modules/video_coding/frame_object.cc
index 37fcef2a46..925f1a191f 100644
--- a/modules/video_coding/frame_object.cc
+++ b/modules/video_coding/frame_object.cc
@@ -104,6 +104,7 @@ RtpFrameObject::RtpFrameObject(PacketBuffer* packet_buffer,
     timing_.receive_finish_ms = last_packet->receive_time_ms;
   }
   timing_.flags = last_packet->video_header.video_timing.flags;
+  is_last_spatial_layer = last_packet->markerBit;
 }
 
 RtpFrameObject::~RtpFrameObject() {
diff --git a/modules/video_coding/rtp_frame_reference_finder.cc b/modules/video_coding/rtp_frame_reference_finder.cc
index 40b16f4156..f6fce17215 100644
--- a/modules/video_coding/rtp_frame_reference_finder.cc
+++ b/modules/video_coding/rtp_frame_reference_finder.cc
@@ -489,12 +489,24 @@ RtpFrameReferenceFinder::FrameDecision RtpFrameReferenceFinder::ManageFrameVp9(
       UnwrapPictureIds(frame);
       return kHandOff;
     }
-  } else {
-    if (frame->frame_type() == kVideoFrameKey) {
+  } else if (frame->frame_type() == kVideoFrameKey) {
+    if (frame->id.spatial_layer == 0) {
       RTC_LOG(LS_WARNING) << "Received keyframe without scalability structure";
       return kDrop;
     }
+    const auto gof_info_it = gof_info_.find(unwrapped_tl0);
+    if (gof_info_it == gof_info_.end())
+      return kStash;
 
+    info = &gof_info_it->second;
+
+    // Only key frames reach this branch (see the enclosing `else if`), so the
+    // frame is handed off unconditionally with no references.
+    frame->num_references = 0;
+    FrameReceivedVp9(frame->id.picture_id, info);
+    UnwrapPictureIds(frame);
+    return kHandOff;
+  } else {
     auto gof_info_it = gof_info_.find(
         (codec_header.temporal_idx == 0) ? unwrapped_tl0 - 1 : unwrapped_tl0);
 
diff --git a/video/rtp_video_stream_receiver.cc b/video/rtp_video_stream_receiver.cc
index f872860255..0cd3b7ebc0 100644
--- a/video/rtp_video_stream_receiver.cc
+++ b/video/rtp_video_stream_receiver.cc
@@ -496,6 +496,14 @@ void RtpVideoStreamReceiver::ReceivePacket(const RtpPacketReceived& packet) {
       VideoSendTiming::kInvalid;
   webrtc_rtp_header.video_header().is_last_packet_in_frame =
       webrtc_rtp_header.header.markerBit;
+  if (parsed_payload.video_header().codec == kVideoCodecVP9) {
+    const RTPVideoHeaderVP9& codec_header = absl::get<RTPVideoHeaderVP9>(
+        parsed_payload.video_header().video_type_header);
+    webrtc_rtp_header.video_header().is_last_packet_in_frame |=
+        codec_header.end_of_frame;
+    webrtc_rtp_header.video_header().is_first_packet_in_frame |=
+        codec_header.beginning_of_frame;
+  }
 
   packet.GetExtension<VideoOrientation>(
       &webrtc_rtp_header.video_header().rotation);
diff --git a/video/video_quality_test.cc b/video/video_quality_test.cc
index 3261d41c7a..d6ccb6522e 100644
--- a/video/video_quality_test.cc
+++ b/video/video_quality_test.cc
@@ -670,6 +670,10 @@ void VideoQualityTest::SetupVideo(Transport* send_transport,
         vp9_settings.numberOfSpatialLayers = static_cast<unsigned char>(
             params_.ss[video_idx].num_spatial_layers);
         vp9_settings.interLayerPred = params_.ss[video_idx].inter_layer_pred;
+        // High FPS vp9 screenshare requires flexible mode.
+        if (params_.video[video_idx].fps > 5) {
+          vp9_settings.flexibleMode = true;
+        }
         video_encoder_configs_[video_idx].encoder_specific_settings =
             new rtc::RefCountedObject<
                 VideoEncoderConfig::Vp9EncoderSpecificSettings>(vp9_settings);
diff --git a/video/video_receive_stream.cc b/video/video_receive_stream.cc
index 31757112c5..2043f7d203 100644
--- a/video/video_receive_stream.cc
+++ b/video/video_receive_stream.cc
@@ -381,10 +381,6 @@ void VideoReceiveStream::RequestKeyFrame() {
 
 void VideoReceiveStream::OnCompleteFrame(
     std::unique_ptr<video_coding::EncodedFrame> frame) {
-  // TODO(webrtc:9249): Workaround to allow decoding of VP9 SVC stream with
-  // partially enabled inter-layer prediction.
-  frame->id.spatial_layer = 0;
-
   // TODO(https://bugs.webrtc.org/9974): Consider removing this workaround.
   int64_t time_now_ms = rtc::TimeMillis();
   if (last_complete_frame_time_ms_ > 0 &&