Vp9 flexible mode fixes

- Enable vp9 flexible mode in VideoEngine if 3 spatial layers are set.
- Enable flexible mode in loopback tools and quality tests.
- Reset first active spatial layer on keyframe in encoder.
- Ensure duplicate references are not set by the sender in video header.
- Set references manually for flexible mode in vp9 encoder.
- Delay new activated layers until next base layer frame.
- On receive side put each spatial layer as a separate frame to FrameBuffer and return several frames combined from FrameBuffer.

Bug: webrtc:10049,webrtc:9794,webrtc:9784
Change-Id: I01e69f134cc145deba666ccc92deb1d37a324ede
Reviewed-on: https://webrtc-review.googlesource.com/c/112289
Commit-Queue: Ilya Nikolaevskiy <ilnik@webrtc.org>
Reviewed-by: Sergey Silkin <ssilkin@webrtc.org>
Reviewed-by: Philip Eliasson <philipel@webrtc.org>
Reviewed-by: Niels Moller <nisse@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#25895}
2018-12-04 15:54:52 +01:00 · 2018-12-04 15:54:52 +01:00 · 5546aef682
commit 5546aef682
parent 77894ccb5d
19 changed files with 694 additions and 198 deletions
--- a/api/video/encoded_frame.h
+++ b/api/video/encoded_frame.h
@ -79,6 +79,9 @@ class EncodedFrame : public webrtc::VCMEncodedFrame {
  size_t num_references = 0;
  int64_t references[kMaxFrameReferences];
  bool inter_layer_predicted = false;
+  // Whether this subframe is the last one in the superframe (in an RTP stream
+  // that would mean that the last packet has the marker bit set).
+  bool is_last_spatial_layer = true;
 };

 }  // namespace video_coding
--- a/api/video/encoded_image.h
+++ b/api/video/encoded_image.h
@ -68,6 +68,10 @@ class RTC_EXPORT EncodedImage {
  }

  size_t size() const { return _length; }
+  void set_size(size_t new_size) {
+    RTC_DCHECK_LE(new_size, _size);
+    _length = new_size;
+  }
  size_t capacity() const { return _size; }

  void set_buffer(uint8_t* buffer, size_t capacity) {
--- a/media/engine/webrtcvideoengine.cc
+++ b/media/engine/webrtcvideoengine.cc
@ -383,6 +383,9 @@ WebRtcVideoChannel::WebRtcVideoSendStream::ConfigureVideoEncoderSettings(
    if (!is_screencast) {
      // Limit inter-layer prediction to key pictures.
      vp9_settings.interLayerPred = webrtc::InterLayerPredMode::kOnKeyPic;
+    } else {
+      // 3 spatial layers vp9 screenshare needs flexible mode.
+      vp9_settings.flexibleMode = vp9_settings.numberOfSpatialLayers > 2;
    }
    return new rtc::RefCountedObject<
        webrtc::VideoEncoderConfig::Vp9EncoderSpecificSettings>(vp9_settings);
--- a/modules/video_coding/codecs/test/videocodec_test_libvpx.cc
+++ b/modules/video_coding/codecs/test/videocodec_test_libvpx.cc
@ -124,9 +124,9 @@ TEST(VideoCodecTestLibvpx, ChangeBitrateVP9) {
      {500, 30, kNumFramesLong}};

  std::vector<RateControlThresholds> rc_thresholds = {
-      {5, 1, 0, 1, 0.5, 0.1, 0, 1},
-      {15, 2, 0, 1, 0.5, 0.1, 0, 0},
-      {10, 1, 0, 1, 0.5, 0.1, 0, 0}};
+      {5, 2, 0, 1, 0.5, 0.1, 0, 1},
+      {15, 3, 0, 1, 0.5, 0.1, 0, 0},
+      {10, 2, 0, 1, 0.5, 0.1, 0, 0}};

  std::vector<QualityThresholds> quality_thresholds = {
      {34, 33, 0.90, 0.88}, {38, 35, 0.95, 0.91}, {35, 34, 0.93, 0.90}};
--- a/modules/video_coding/codecs/vp9/svc_config.cc
+++ b/modules/video_coding/codecs/vp9/svc_config.cc
@ -23,9 +23,9 @@ namespace webrtc {
 namespace {
 const size_t kMinVp9SvcBitrateKbps = 30;

-const size_t kMaxNumLayersForScreenSharing = 2;
-const float kMaxScreenSharingLayerFramerateFps[] = {5.0, 5.0};
-const size_t kMaxScreenSharingLayerBitrateKbps[] = {200, 500};
+const size_t kMaxNumLayersForScreenSharing = 3;
+const float kMaxScreenSharingLayerFramerateFps[] = {5.0, 5.0, 30.0};
+const size_t kMaxScreenSharingLayerBitrateKbps[] = {200, 500, 1250};
 }  // namespace

 std::vector<SpatialLayer> ConfigureSvcScreenSharing(size_t input_width,
--- a/modules/video_coding/codecs/vp9/svc_config_unittest.cc
+++ b/modules/video_coding/codecs/vp9/svc_config_unittest.cc
@ -48,12 +48,13 @@ TEST(SvcConfig, ScreenSharing) {
  std::vector<SpatialLayer> spatial_layers =
      GetSvcConfig(1920, 1080, 30, 3, 3, true);

-  EXPECT_EQ(spatial_layers.size(), 2UL);
+  EXPECT_EQ(spatial_layers.size(), 3UL);

-  for (const SpatialLayer& layer : spatial_layers) {
+  for (size_t i = 0; i < 3; ++i) {
+    const SpatialLayer& layer = spatial_layers[i];
    EXPECT_EQ(layer.width, 1920);
    EXPECT_EQ(layer.height, 1080);
-    EXPECT_EQ(layer.maxFramerate, 5);
+    EXPECT_EQ(layer.maxFramerate, (i < 2) ? 5 : 30);
    EXPECT_EQ(layer.numberOfTemporalLayers, 1);
    EXPECT_LE(layer.minBitrate, layer.maxBitrate);
    EXPECT_LE(layer.minBitrate, layer.targetBitrate);
--- a/modules/video_coding/codecs/vp9/svc_rate_allocator_unittest.cc
+++ b/modules/video_coding/codecs/vp9/svc_rate_allocator_unittest.cc
@ -151,7 +151,7 @@ TEST(SvcRateAllocatorTest, MinBitrateToGetQualityLayer) {

  const SpatialLayer* layers = codec.spatialLayers;

-  EXPECT_LE(codec.VP9()->numberOfSpatialLayers, 2U);
+  EXPECT_LE(codec.VP9()->numberOfSpatialLayers, 3U);

  VideoBitrateAllocation allocation =
      allocator.GetAllocation(layers[0].minBitrate * 1000, 30);
--- a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
+++ b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
@ -91,15 +91,16 @@ class TestVp9Impl : public VideoCodecUnitTest {
    }
  }

-  void ConfigureSvc(size_t num_spatial_layers) {
+  void ConfigureSvc(size_t num_spatial_layers, size_t num_temporal_layers = 1) {
    codec_settings_.VP9()->numberOfSpatialLayers =
        static_cast<unsigned char>(num_spatial_layers);
-    codec_settings_.VP9()->numberOfTemporalLayers = 1;
+    codec_settings_.VP9()->numberOfTemporalLayers = num_temporal_layers;
    codec_settings_.VP9()->frameDroppingOn = false;

-    std::vector<SpatialLayer> layers = GetSvcConfig(
-        codec_settings_.width, codec_settings_.height,
-        codec_settings_.maxFramerate, num_spatial_layers, 1, false);
+    std::vector<SpatialLayer> layers =
+        GetSvcConfig(codec_settings_.width, codec_settings_.height,
+                     codec_settings_.maxFramerate, num_spatial_layers,
+                     num_temporal_layers, false);
    for (size_t i = 0; i < layers.size(); ++i) {
      codec_settings_.spatialLayers[i] = layers[i];
    }
@ -401,6 +402,8 @@ TEST_F(TestVp9Impl, EnableDisableSpatialLayers) {
      std::vector<EncodedImage> encoded_frame;
      std::vector<CodecSpecificInfo> codec_specific_info;
      ASSERT_TRUE(WaitForEncodedFrames(&encoded_frame, &codec_specific_info));
+      EXPECT_EQ(codec_specific_info[0].codecSpecific.VP9.ss_data_available,
+                frame_num == 0);
    }
  }

@ -418,6 +421,8 @@ TEST_F(TestVp9Impl, EnableDisableSpatialLayers) {
      std::vector<EncodedImage> encoded_frame;
      std::vector<CodecSpecificInfo> codec_specific_info;
      ASSERT_TRUE(WaitForEncodedFrames(&encoded_frame, &codec_specific_info));
+      EXPECT_EQ(codec_specific_info[0].codecSpecific.VP9.ss_data_available,
+                frame_num == 0);
    }
  }
 }
@ -581,6 +586,248 @@ TEST_F(TestVp9Impl,
  }
 }

+TEST_F(TestVp9Impl, EnablingNewLayerIsDelayedInScreenshareAndAddsSsInfo) {
+  const size_t num_spatial_layers = 3;
+  // Chosen by hand, the 2nd frame is dropped with configured per-layer max
+  // framerate.
+  const size_t num_frames_to_encode_before_drop = 1;
+  // Chosen by hand, exactly 5 frames are dropped for input fps=30 and max
+  // framerate = 5.
+  const size_t num_dropped_frames = 5;
+
+  codec_settings_.maxFramerate = 30;
+  ConfigureSvc(num_spatial_layers);
+  codec_settings_.spatialLayers[0].maxFramerate = 5.0;
+  // Use 30 fps for SL 1 instead of 5, so that even if the SL 0 frame is
+  // dropped due to framerate capping we still get back at least the middle
+  // layer. This simplifies the test.
+  codec_settings_.spatialLayers[1].maxFramerate = 30.0;
+  codec_settings_.spatialLayers[2].maxFramerate = 30.0;
+  codec_settings_.VP9()->frameDroppingOn = false;
+  codec_settings_.mode = VideoCodecMode::kScreensharing;
+  codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOn;
+  codec_settings_.VP9()->flexibleMode = true;
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->InitEncode(&codec_settings_, 1 /* number of cores */,
+                                 0 /* max payload size (unused) */));
+
+  // Enable all but the last layer.
+  VideoBitrateAllocation bitrate_allocation;
+  for (size_t sl_idx = 0; sl_idx < num_spatial_layers - 1; ++sl_idx) {
+    bitrate_allocation.SetBitrate(
+        sl_idx, 0, codec_settings_.spatialLayers[sl_idx].targetBitrate * 1000);
+  }
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  // Encode enough frames to force drop due to framerate capping.
+  for (size_t frame_num = 0; frame_num < num_frames_to_encode_before_drop;
+       ++frame_num) {
+    SetWaitForEncodedFramesThreshold(num_spatial_layers - 1);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    std::vector<EncodedImage> encoded_frames;
+    std::vector<CodecSpecificInfo> codec_specific_info;
+    ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  }
+
+  // Enable the last layer.
+  bitrate_allocation.SetBitrate(
+      num_spatial_layers - 1, 0,
+      codec_settings_.spatialLayers[num_spatial_layers - 1].targetBitrate *
+          1000);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  for (size_t frame_num = 0; frame_num < num_dropped_frames; ++frame_num) {
+    SetWaitForEncodedFramesThreshold(1);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    // First layer is dropped due to frame rate cap. The last layer should not
+    // be enabled yet.
+    std::vector<EncodedImage> encoded_frames;
+    std::vector<CodecSpecificInfo> codec_specific_info;
+    ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  }
+
+  SetWaitForEncodedFramesThreshold(2);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+  // Now all 3 layers should be encoded.
+  std::vector<EncodedImage> encoded_frames;
+  std::vector<CodecSpecificInfo> codec_specific_info;
+  ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  EXPECT_EQ(encoded_frames.size(), 3u);
+  // Scalability structure has to be triggered.
+  EXPECT_TRUE(codec_specific_info[0].codecSpecific.VP9.ss_data_available);
+}
+
+TEST_F(TestVp9Impl, RemovingLayerIsNotDelayedInScreenshareAndAddsSsInfo) {
+  const size_t num_spatial_layers = 3;
+  // Chosen by hand, the 2nd frame is dropped with configured per-layer max
+  // framerate.
+  const size_t num_frames_to_encode_before_drop = 1;
+  // Chosen by hand, exactly 5 frames are dropped for input fps=30 and max
+  // framerate = 5.
+  const size_t num_dropped_frames = 5;
+
+  codec_settings_.maxFramerate = 30;
+  ConfigureSvc(num_spatial_layers);
+  codec_settings_.spatialLayers[0].maxFramerate = 5.0;
+  // Use 30 fps for SL 1 instead of 5, so that even if the SL 0 frame is
+  // dropped due to framerate capping we still get back at least the middle
+  // layer. This simplifies the test.
+  codec_settings_.spatialLayers[1].maxFramerate = 30.0;
+  codec_settings_.spatialLayers[2].maxFramerate = 30.0;
+  codec_settings_.VP9()->frameDroppingOn = false;
+  codec_settings_.mode = VideoCodecMode::kScreensharing;
+  codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOn;
+  codec_settings_.VP9()->flexibleMode = true;
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->InitEncode(&codec_settings_, 1 /* number of cores */,
+                                 0 /* max payload size (unused) */));
+
+  // All layers are enabled from the start.
+  VideoBitrateAllocation bitrate_allocation;
+  for (size_t sl_idx = 0; sl_idx < num_spatial_layers; ++sl_idx) {
+    bitrate_allocation.SetBitrate(
+        sl_idx, 0, codec_settings_.spatialLayers[sl_idx].targetBitrate * 1000);
+  }
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  // Encode enough frames to force drop due to framerate capping.
+  for (size_t frame_num = 0; frame_num < num_frames_to_encode_before_drop;
+       ++frame_num) {
+    SetWaitForEncodedFramesThreshold(num_spatial_layers);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    std::vector<EncodedImage> encoded_frames;
+    std::vector<CodecSpecificInfo> codec_specific_info;
+    ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  }
+
+  // Now the first layer should not have frames in it.
+  for (size_t frame_num = 0; frame_num < num_dropped_frames - 2; ++frame_num) {
+    SetWaitForEncodedFramesThreshold(2);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    // First layer is dropped due to frame rate cap. The two upper layers are
+    // still enabled, so two frames are expected back.
+    std::vector<EncodedImage> encoded_frames;
+    std::vector<CodecSpecificInfo> codec_specific_info;
+    ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+    // First layer is skipped.
+    EXPECT_EQ(encoded_frames[0].SpatialIndex().value_or(-1), 1);
+  }
+
+  // Disable the last layer.
+  bitrate_allocation.SetBitrate(num_spatial_layers - 1, 0, 0);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  // Still expected to drop first layer. Last layer has to be disabled as well.
+  for (size_t frame_num = num_dropped_frames - 2;
+       frame_num < num_dropped_frames; ++frame_num) {
+    // Expect back one frame.
+    SetWaitForEncodedFramesThreshold(1);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    // First layer is dropped due to frame rate cap. The last layer has just
+    // been disabled, so only the middle layer is expected.
+    std::vector<EncodedImage> encoded_frames;
+    std::vector<CodecSpecificInfo> codec_specific_info;
+    ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+    // First layer is skipped.
+    EXPECT_EQ(encoded_frames[0].SpatialIndex().value_or(-1), 1);
+    // No SS data on non-base spatial layer.
+    EXPECT_FALSE(codec_specific_info[0].codecSpecific.VP9.ss_data_available);
+  }
+
+  SetWaitForEncodedFramesThreshold(2);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+  std::vector<EncodedImage> encoded_frames;
+  std::vector<CodecSpecificInfo> codec_specific_info;
+  ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  // First layer is not skipped now.
+  EXPECT_EQ(encoded_frames[0].SpatialIndex().value_or(-1), 0);
+  // SS data should be present.
+  EXPECT_TRUE(codec_specific_info[0].codecSpecific.VP9.ss_data_available);
+}
+
+TEST_F(TestVp9Impl, DisableNewLayerInVideoDelaysSsInfoTillTL0) {
+  const size_t num_spatial_layers = 3;
+  const size_t num_temporal_layers = 2;
+  // Real-time video with several temporal layers. Unlike the screenshare
+  // tests, every Encode() call here is expected to produce frames.
+  ConfigureSvc(num_spatial_layers, num_temporal_layers);
+  codec_settings_.VP9()->frameDroppingOn = false;
+  codec_settings_.mode = VideoCodecMode::kRealtimeVideo;
+  codec_settings_.VP9()->interLayerPred = InterLayerPredMode::kOnKeyPic;
+  codec_settings_.VP9()->flexibleMode = false;
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->InitEncode(&codec_settings_, 1 /* number of cores */,
+                                 0 /* max payload size (unused) */));
+
+  // Enable all the layers.
+  VideoBitrateAllocation bitrate_allocation;
+  for (size_t sl_idx = 0; sl_idx < num_spatial_layers; ++sl_idx) {
+    for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) {
+      bitrate_allocation.SetBitrate(
+          sl_idx, tl_idx,
+          codec_settings_.spatialLayers[sl_idx].targetBitrate * 1000 /
+              num_temporal_layers);
+    }
+  }
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  std::vector<EncodedImage> encoded_frames;
+  std::vector<CodecSpecificInfo> codec_specific_info;
+
+  // Encode one TL0 frame
+  SetWaitForEncodedFramesThreshold(num_spatial_layers);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+  ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  EXPECT_EQ(codec_specific_info[0].codecSpecific.VP9.temporal_idx, 0u);
+
+  // Disable the last layer.
+  for (size_t tl_idx = 0; tl_idx < num_temporal_layers; ++tl_idx) {
+    bitrate_allocation.SetBitrate(num_spatial_layers - 1, tl_idx, 0);
+  }
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->SetRateAllocation(bitrate_allocation,
+                                        codec_settings_.maxFramerate));
+
+  // Next is TL1 frame. The last layer is disabled immediately, but SS structure
+  // is not provided here.
+  SetWaitForEncodedFramesThreshold(num_spatial_layers - 1);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+  ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  EXPECT_EQ(codec_specific_info[0].codecSpecific.VP9.temporal_idx, 1u);
+
+  // Next is TL0 frame, which should have delayed SS structure.
+  SetWaitForEncodedFramesThreshold(num_spatial_layers - 1);
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+  ASSERT_TRUE(WaitForEncodedFrames(&encoded_frames, &codec_specific_info));
+  EXPECT_EQ(codec_specific_info[0].codecSpecific.VP9.temporal_idx, 0u);
+  EXPECT_TRUE(codec_specific_info[0].codecSpecific.VP9.ss_data_available);
+  EXPECT_TRUE(codec_specific_info[0]
+                  .codecSpecific.VP9.spatial_layer_resolution_present);
+  EXPECT_EQ(
+      codec_specific_info[0].codecSpecific.VP9.width[num_spatial_layers - 1],
+      0u);
+}
+
 TEST_F(TestVp9Impl,
       LowLayerMarkedAsRefIfHighLayerNotEncodedAndInterLayerPredIsEnabled) {
  ConfigureSvc(3);
@ -766,6 +1013,7 @@ TEST_F(TestVp9ImplFrameDropping, DifferentFrameratePerSpatialLayer) {

  codec_settings_.VP9()->numberOfSpatialLayers = num_spatial_layers;
  codec_settings_.VP9()->frameDroppingOn = false;
+  codec_settings_.VP9()->flexibleMode = true;

  VideoBitrateAllocation bitrate_allocation;
  for (uint8_t sl_idx = 0; sl_idx < num_spatial_layers; ++sl_idx) {
--- a/modules/video_coding/codecs/vp9/vp9_impl.cc
+++ b/modules/video_coding/codecs/vp9/vp9_impl.cc
@ -49,6 +49,9 @@ uint8_t kUpdBufIdx[4] = {0, 0, 1, 0};

 int kMaxNumTiles4kVideo = 8;

+// Maximum allowed PID difference for variable frame-rate mode.
+const int kMaxAllowedPidDIff = 8;
+
 // Only positive speeds, range for real-time coding currently is: 5 - 8.
 // Lower means slower/better quality, higher means fastest/lower quality.
 int GetCpuSpeed(int width, int height) {
@ -124,6 +127,18 @@ ColorSpace ExtractVP9ColorSpace(vpx_color_space_t space_t,
  }
  return ColorSpace(primaries, transfer, matrix, range);
 }
+
+bool MoreLayersEnabled(const VideoBitrateAllocation& first,
+                       const VideoBitrateAllocation& second) {
+  for (size_t sl_idx = 0; sl_idx < kMaxSpatialLayers; ++sl_idx) {
+    if (first.GetSpatialLayerSum(sl_idx) > 0 &&
+        second.GetSpatialLayerSum(sl_idx) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace

 void VP9EncoderImpl::EncoderOutputCodedPacketCallback(vpx_codec_cx_pkt* pkt,
@ -154,12 +169,12 @@ VP9EncoderImpl::VP9EncoderImpl(const cricket::VideoCodec& codec)
          field_trial::IsEnabled("WebRTC-Vp9IssueKeyFrameOnLayerDeactivation")),
      is_svc_(false),
      inter_layer_pred_(InterLayerPredMode::kOn),
-      external_ref_control_(
-          field_trial::IsEnabled("WebRTC-Vp9ExternalRefCtrl")),
+      external_ref_control_(false),  // Set in InitEncode because of tests.
      trusted_rate_controller_(
          field_trial::IsEnabled(kVp9TrustedRateControllerFieldTrial)),
      full_superframe_drop_(true),
      first_frame_in_picture_(true),
+      ss_info_needed_(false),
      is_flexible_mode_(false) {
  memset(&codec_, 0, sizeof(codec_));
  memset(&svc_params_, 0, sizeof(vpx_svc_extra_cfg_t));
@ -314,14 +329,8 @@ int VP9EncoderImpl::SetRateAllocation(

  codec_.maxFramerate = frame_rate;

-  if (!SetSvcRates(bitrate_allocation)) {
-    return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
-  }
+  requested_bitrate_allocation_ = bitrate_allocation;

-  // Update encoder context
-  if (vpx_codec_enc_config_set(encoder_, config_)) {
-    return WEBRTC_VIDEO_CODEC_ERROR;
-  }
  return WEBRTC_VIDEO_CODEC_OK;
 }

@ -461,6 +470,27 @@ int VP9EncoderImpl::InitEncode(const VideoCodec* inst,

  is_flexible_mode_ = inst->VP9().flexibleMode;

+  inter_layer_pred_ = inst->VP9().interLayerPred;
+
+  different_framerates_used_ = false;
+  for (size_t sl_idx = 1; sl_idx < num_spatial_layers_; ++sl_idx) {
+    if (std::abs(codec_.spatialLayers[sl_idx].maxFramerate -
+                 codec_.spatialLayers[0].maxFramerate) > 1e-9) {
+      different_framerates_used_ = true;
+    }
+  }
+
+  if (different_framerates_used_ && !is_flexible_mode_) {
+    RTC_LOG(LS_ERROR) << "Flexible mode required for different framerates on "
+                         "different spatial layers";
+    return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
+  }
+
+  // External reference control is required when spatial layers use different
+  // frame rates, because libvpx generates RTP-incompatible references in that
+  // case.
+  external_ref_control_ = field_trial::IsEnabled("WebRTC-Vp9ExternalRefCtrl") ||
+                          different_framerates_used_;
+
  if (num_temporal_layers_ == 1) {
    gof_.SetGofInfoVP9(kTemporalStructureMode1);
    config_->temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING;
@ -493,8 +523,14 @@ int VP9EncoderImpl::InitEncode(const VideoCodec* inst,
    return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
  }

-  inter_layer_pred_ = inst->VP9().interLayerPred;
-
+  if (external_ref_control_) {
+    config_->temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+    if (num_temporal_layers_ > 1 && different_framerates_used_) {
+      // External reference control for several temporal layers with different
+      // frame rates on spatial layers is not implemented yet.
+      return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
+    }
+  }
  ref_buf_.clear();

  return InitAndSetControlSettings(inst);
@ -575,9 +611,9 @@ int VP9EncoderImpl::InitAndSetControlSettings(const VideoCodec* inst) {
  }

  SvcRateAllocator init_allocator(codec_);
-  VideoBitrateAllocation allocation = init_allocator.GetAllocation(
+  current_bitrate_allocation_ = init_allocator.GetAllocation(
      inst->startBitrate * 1000, inst->maxFramerate);
-  if (!SetSvcRates(allocation)) {
+  if (!SetSvcRates(current_bitrate_allocation_)) {
    return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
  }

@ -595,6 +631,7 @@ int VP9EncoderImpl::InitAndSetControlSettings(const VideoCodec* inst) {
                    inst->VP9().adaptiveQpMode ? 3 : 0);

  vpx_codec_control(encoder_, VP9E_SET_FRAME_PARALLEL_DECODING, 0);
+  vpx_codec_control(encoder_, VP9E_SET_SVC_GF_TEMPORAL_REF, 0);

  if (is_svc_) {
    vpx_codec_control(encoder_, VP9E_SET_SVC, 1);
@ -696,21 +733,21 @@ int VP9EncoderImpl::Encode(const VideoFrame& input_image,
    }
  }

-  if (VideoCodecMode::kScreensharing == codec_.mode && !force_key_frame_) {
-    // Skip encoding spatial layer frames if their target frame rate is lower
-    // than actual input frame rate.
-    vpx_svc_layer_id_t layer_id = {0};
+  vpx_svc_layer_id_t layer_id = {0};
+  if (!force_key_frame_) {
    const size_t gof_idx = (pics_since_key_ + 1) % gof_.num_frames_in_gof;
    layer_id.temporal_layer_id = gof_.temporal_idx[gof_idx];

-    const uint32_t frame_timestamp_ms =
-        1000 * input_image.timestamp() / kVideoPayloadTypeFrequency;
+    if (VideoCodecMode::kScreensharing == codec_.mode) {
+      const uint32_t frame_timestamp_ms =
+          1000 * input_image.timestamp() / kVideoPayloadTypeFrequency;

-    for (uint8_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
-      if (framerate_controller_[sl_idx].DropFrame(frame_timestamp_ms)) {
-        ++layer_id.spatial_layer_id;
-      } else {
-        break;
+      for (uint8_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
+        if (framerate_controller_[sl_idx].DropFrame(frame_timestamp_ms)) {
+          ++layer_id.spatial_layer_id;
+        } else {
+          break;
+        }
      }
    }

@ -719,8 +756,42 @@ int VP9EncoderImpl::Encode(const VideoFrame& input_image,
      // Drop entire picture.
      return WEBRTC_VIDEO_CODEC_OK;
    }
+  }

-    vpx_codec_control(encoder_, VP9E_SET_SVC_LAYER_ID, &layer_id);
+  for (int sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
+    layer_id.temporal_layer_id_per_spatial[sl_idx] = layer_id.temporal_layer_id;
+  }
+
+  vpx_codec_control(encoder_, VP9E_SET_SVC_LAYER_ID, &layer_id);
+
+  if (requested_bitrate_allocation_) {
+    bool more_layers_requested = MoreLayersEnabled(
+        *requested_bitrate_allocation_, current_bitrate_allocation_);
+    bool less_layers_requested = MoreLayersEnabled(
+        current_bitrate_allocation_, *requested_bitrate_allocation_);
+    // In SVC, new layers can be enabled only if all lower layers are encoded
+    // and the frame is at the base temporal layer.
+    // This delays the rate allocation change until the next frame on the base
+    // spatial layer.
+    // In KSVC or simulcast modes a key frame will be generated for a new
+    // layer, so the allocation can be updated at any time.
+    bool can_upswitch =
+        inter_layer_pred_ != InterLayerPredMode::kOn ||
+        (layer_id.spatial_layer_id == 0 && layer_id.temporal_layer_id == 0);
+    if (!more_layers_requested || can_upswitch) {
+      current_bitrate_allocation_ = *requested_bitrate_allocation_;
+      requested_bitrate_allocation_ = absl::nullopt;
+      if (!SetSvcRates(current_bitrate_allocation_)) {
+        return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
+      }
+      if (less_layers_requested || more_layers_requested) {
+        ss_info_needed_ = true;
+      }
+    }
+  }
+
+  if (vpx_codec_enc_config_set(encoder_, config_)) {
+    return WEBRTC_VIDEO_CODEC_ERROR;
  }

  RTC_DCHECK_EQ(input_image.width(), raw_->d_w);
@ -780,7 +851,8 @@ int VP9EncoderImpl::Encode(const VideoFrame& input_image,
  }

  if (external_ref_control_) {
-    vpx_svc_ref_frame_config_t ref_config = SetReferences(force_key_frame_);
+    vpx_svc_ref_frame_config_t ref_config =
+        SetReferences(force_key_frame_, layer_id.spatial_layer_id);

    if (VideoCodecMode::kScreensharing == codec_.mode) {
      for (uint8_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
@ -840,9 +912,22 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
  vp9_info->ss_data_available =
      (pkt.data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;

+  if (pkt.data.frame.flags & VPX_FRAME_IS_KEY) {
+    pics_since_key_ = 0;
+  } else if (first_frame_in_picture_) {
+    ++pics_since_key_;
+  }
+
  vpx_svc_layer_id_t layer_id = {0};
  vpx_codec_control(encoder_, VP9E_GET_SVC_LAYER_ID, &layer_id);

+  if (ss_info_needed_ && layer_id.temporal_layer_id == 0 &&
+      layer_id.spatial_layer_id == 0) {
+    // Force SS info after the layers configuration has changed.
+    vp9_info->ss_data_available = true;
+    ss_info_needed_ = false;
+  }
+
  RTC_CHECK_GT(num_temporal_layers_, 0);
  RTC_CHECK_GT(num_active_spatial_layers_, 0);
  if (num_temporal_layers_ == 1) {
@ -864,12 +949,6 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
  // TODO(asapersson): this info has to be obtained from the encoder.
  vp9_info->temporal_up_switch = false;

-  if (pkt.data.frame.flags & VPX_FRAME_IS_KEY) {
-    pics_since_key_ = 0;
-  } else if (first_frame_in_picture_) {
-    ++pics_since_key_;
-  }
-
  const bool is_key_pic = (pics_since_key_ == 0);
  const bool is_inter_layer_pred_allowed =
      (inter_layer_pred_ == InterLayerPredMode::kOn ||
@ -901,8 +980,6 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
    vp9_info->gof_idx = kNoGofIdx;
    FillReferenceIndices(pkt, pics_since_key_, vp9_info->inter_layer_predicted,
                         vp9_info);
-    // TODO(webrtc:9794): Add fake reference to empty reference list to
-    // workaround the frame buffer issue on receiver.
  } else {
    vp9_info->gof_idx =
        static_cast<uint8_t>(pics_since_key_ % gof_.num_frames_in_gof);
@ -985,6 +1062,8 @@ void VP9EncoderImpl::FillReferenceIndices(const vpx_codec_cx_pkt& pkt,

  size_t max_ref_temporal_layer_id = 0;

+  std::vector<size_t> ref_pid_list;
+
  vp9_info->num_ref_pics = 0;
  for (const RefFrameBuffer& ref_buf : ref_buf_list) {
    RTC_DCHECK_LE(ref_buf.pic_num, pic_num);
@ -997,6 +1076,16 @@ void VP9EncoderImpl::FillReferenceIndices(const vpx_codec_cx_pkt& pkt,
      }
      RTC_DCHECK_LE(ref_buf.temporal_layer_id, layer_id.temporal_layer_id);

+      // The encoder may reference several spatial layers of the same previous
+      // frame if some spatial layers are skipped on the current frame.
+      // We shouldn't emit duplicate references, as that may break some old
+      // clients and isn't RTP compatible.
+      if (std::find(ref_pid_list.begin(), ref_pid_list.end(),
+                    ref_buf.pic_num) != ref_pid_list.end()) {
+        continue;
+      }
+      ref_pid_list.push_back(ref_buf.pic_num);
+
      const size_t p_diff = pic_num - ref_buf.pic_num;
      RTC_DCHECK_LE(p_diff, 127UL);

@ -1038,20 +1127,13 @@ void VP9EncoderImpl::UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
    vpx_svc_ref_frame_config_t enc_layer_conf = {{0}};
    vpx_codec_control(encoder_, VP9E_GET_SVC_REF_FRAME_CONFIG, &enc_layer_conf);

-    if (enc_layer_conf.update_last[layer_id.spatial_layer_id]) {
-      ref_buf_[enc_layer_conf.lst_fb_idx[layer_id.spatial_layer_id]] =
-          frame_buf;
+    for (size_t i = 0; i < kNumVp9Buffers; ++i) {
+      if (enc_layer_conf.update_buffer_slot[layer_id.spatial_layer_id] &
+          (1 << i)) {
+        ref_buf_[i] = frame_buf;
+      }
    }

-    if (enc_layer_conf.update_alt_ref[layer_id.spatial_layer_id]) {
-      ref_buf_[enc_layer_conf.alt_fb_idx[layer_id.spatial_layer_id]] =
-          frame_buf;
-    }
-
-    if (enc_layer_conf.update_golden[layer_id.spatial_layer_id]) {
-      ref_buf_[enc_layer_conf.gld_fb_idx[layer_id.spatial_layer_id]] =
-          frame_buf;
-    }
  } else {
    RTC_DCHECK_EQ(num_spatial_layers_, 1);
    RTC_DCHECK_EQ(num_temporal_layers_, 1);
@ -1061,7 +1143,9 @@ void VP9EncoderImpl::UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
  }
 }

-vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(bool is_key_pic) {
+vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(
+    bool is_key_pic,
+    size_t first_active_spatial_layer_id) {
  // kRefBufIdx, kUpdBufIdx need to be updated to support longer GOFs.
  RTC_DCHECK_LE(gof_.num_frames_in_gof, 4);

@ -1083,8 +1167,10 @@ vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(bool is_key_pic) {
  // for temporal references plus 1 buffer for spatial reference. 7 buffers
  // in total.

-  for (size_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
-    const size_t gof_idx = pics_since_key_ % gof_.num_frames_in_gof;
+  for (size_t sl_idx = first_active_spatial_layer_id;
+       sl_idx < num_active_spatial_layers_; ++sl_idx) {
+    const size_t curr_pic_num = is_key_pic ? 0 : pics_since_key_ + 1;
+    const size_t gof_idx = curr_pic_num % gof_.num_frames_in_gof;

    if (!is_key_pic) {
      // Set up temporal reference.
@ -1096,36 +1182,47 @@ vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(bool is_key_pic) {

      // Sanity check that reference picture number is smaller than current
      // picture number.
-      const size_t curr_pic_num = pics_since_key_ + 1;
      RTC_DCHECK_LT(ref_buf_[buf_idx].pic_num, curr_pic_num);
      const size_t pid_diff = curr_pic_num - ref_buf_[buf_idx].pic_num;
+      // Incorrect spatial layer may be in the buffer due to a key-frame.
+      const bool same_spatial_layer =
+          ref_buf_[buf_idx].spatial_layer_id == sl_idx;
+      bool correct_pid = false;
+      if (different_framerates_used_) {
+        correct_pid = pid_diff < kMaxAllowedPidDIff;
+      } else {
+        // Below code assumes single temporal reference.
+        RTC_DCHECK_EQ(gof_.num_ref_pics[gof_idx], 1);
+        correct_pid = pid_diff == gof_.pid_diff[gof_idx][0];
+      }

-      // Below code assumes single temporal referecence.
-      RTC_DCHECK_EQ(gof_.num_ref_pics[gof_idx], 1);
-      if (pid_diff == gof_.pid_diff[gof_idx][0]) {
+      if (same_spatial_layer && correct_pid) {
        ref_config.lst_fb_idx[sl_idx] = buf_idx;
        ref_config.reference_last[sl_idx] = 1;
      } else {
        // This reference doesn't match with one specified by GOF. This can
        // only happen if spatial layer is enabled dynamically without key
        // frame. Spatial prediction is supposed to be enabled in this case.
-        RTC_DCHECK(is_inter_layer_pred_allowed);
+        RTC_DCHECK(is_inter_layer_pred_allowed &&
+                   sl_idx > first_active_spatial_layer_id);
      }
    }

-    if (is_inter_layer_pred_allowed && sl_idx > 0) {
+    if (is_inter_layer_pred_allowed && sl_idx > first_active_spatial_layer_id) {
      // Set up spatial reference.
      RTC_DCHECK(last_updated_buf_idx);
      ref_config.gld_fb_idx[sl_idx] = *last_updated_buf_idx;
      ref_config.reference_golden[sl_idx] = 1;
    } else {
-      RTC_DCHECK(ref_config.reference_last[sl_idx] != 0 || sl_idx == 0 ||
+      RTC_DCHECK(ref_config.reference_last[sl_idx] != 0 ||
+                 sl_idx == first_active_spatial_layer_id ||
                 inter_layer_pred_ == InterLayerPredMode::kOff);
    }

    last_updated_buf_idx.reset();

-    if (gof_.temporal_idx[gof_idx] <= num_temporal_layers_ - 1) {
+    if (gof_.temporal_idx[gof_idx] < num_temporal_layers_ - 1 ||
+        num_temporal_layers_ == 1) {
      last_updated_buf_idx = sl_idx * num_temporal_refs + kUpdBufIdx[gof_idx];

      // Ensure last frame buffer is not used for temporal prediction (it is
--- a/modules/video_coding/codecs/vp9/vp9_impl.h
+++ b/modules/video_coding/codecs/vp9/vp9_impl.h
@ -70,7 +70,9 @@ class VP9EncoderImpl : public VP9Encoder {
                            CodecSpecificInfoVP9* vp9_info);
  void UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
                              const size_t pic_num);
-  vpx_svc_ref_frame_config_t SetReferences(bool is_key_pic);
+  vpx_svc_ref_frame_config_t SetReferences(
+      bool is_key_pic,
+      size_t first_active_spatial_layer_id);

  bool ExplicitlyConfiguredSpatialLayers() const;
  bool SetSvcRates(const VideoBitrateAllocation& bitrate_allocation);
@ -110,6 +112,7 @@ class VP9EncoderImpl : public VP9Encoder {
  GofInfoVP9 gof_;  // Contains each frame's temporal information for
                    // non-flexible mode.
  bool force_key_frame_;
+  bool different_framerates_used_;
  size_t pics_since_key_;
  uint8_t num_temporal_layers_;
  uint8_t num_spatial_layers_;         // Number of configured SLs
@ -121,6 +124,9 @@ class VP9EncoderImpl : public VP9Encoder {
  const bool trusted_rate_controller_;
  const bool full_superframe_drop_;
  bool first_frame_in_picture_;
+  VideoBitrateAllocation current_bitrate_allocation_;
+  absl::optional<VideoBitrateAllocation> requested_bitrate_allocation_;
+  bool ss_info_needed_;

  std::vector<FramerateController> framerate_controller_;

--- a/modules/video_coding/encoded_frame.h
+++ b/modules/video_coding/encoded_frame.h
@ -67,9 +67,11 @@ class VCMEncodedFrame : protected EncodedImage {
  /**
   *   Frame RTP timestamp (90kHz)
   */
-  using EncodedImage::Timestamp;
+  using EncodedImage::set_size;
  using EncodedImage::SetTimestamp;
  using EncodedImage::size;
+  using EncodedImage::Timestamp;
+
  /**
   *   Get render time in milliseconds
   */
@ -90,6 +92,7 @@ class VCMEncodedFrame : protected EncodedImage {
   * Get video timing
   */
  EncodedImage::Timing video_timing() const { return timing_; }
+  EncodedImage::Timing* video_timing_mutable() { return &timing_; }
  /**
   *   True if this frame is complete, false otherwise
   */
@ -109,8 +112,10 @@ class VCMEncodedFrame : protected EncodedImage {
   *   the object.
   */
  const CodecSpecificInfo* CodecSpecific() const { return &_codecSpecificInfo; }
+  void SetCodecSpecific(const CodecSpecificInfo* codec_specific) {
+    _codecSpecificInfo = *codec_specific;
+  }

- protected:
  /**
   * Verifies that current allocated buffer size is larger than or equal to the
   * input size.
@ -121,6 +126,7 @@ class VCMEncodedFrame : protected EncodedImage {
   */
  void VerifyAndAllocate(size_t minimumSize);

+ protected:
  void Reset();

  void CopyCodecSpecific(const RTPVideoHeader* header);
--- a/modules/video_coding/frame_buffer2.cc
+++ b/modules/video_coding/frame_buffer2.cc
@ -87,10 +87,10 @@ FrameBuffer::ReturnReason FrameBuffer::NextFrame(

      wait_ms = max_wait_time_ms;

-      // Need to hold |crit_| in order to use |frames_|, therefore we
+      // Need to hold |crit_| in order to access frames_to_decode_, therefore we
      // set it here in the loop instead of outside the loop in order to not
-      // acquire the lock unnecesserily.
-      next_frame_it_ = frames_.end();
+      // acquire the lock unnecessarily.
+      frames_to_decode_.clear();

      // |frame_it| points to the first frame after the
      // |last_decoded_frame_it_|.
@ -128,7 +128,53 @@ FrameBuffer::ReturnReason FrameBuffer::NextFrame(
          continue;
        }

-        next_frame_it_ = frame_it;
+        // Only ever return all parts of a superframe. Therefore skip this
+        // frame if it's not a beginning of a superframe.
+        if (frame->inter_layer_predicted) {
+          continue;
+        }
+
+        // Gather all remaining frames for the same superframe.
+        std::vector<FrameMap::iterator> current_superframe;
+        current_superframe.push_back(frame_it);
+        bool last_layer_completed =
+            frame_it->second.frame->is_last_spatial_layer;
+        FrameMap::iterator next_frame_it = frame_it;
+        while (true) {
+          ++next_frame_it;
+          if (next_frame_it == frames_.end() ||
+              next_frame_it->first.picture_id != frame->id.picture_id ||
+              !next_frame_it->second.continuous) {
+            break;
+          }
+          // Check if the next frame has some undecoded references other than
+          // the previous frame in the same superframe.
+          size_t num_allowed_undecoded_refs =
+              (next_frame_it->second.frame->inter_layer_predicted) ? 1 : 0;
+          if (next_frame_it->second.num_missing_decodable >
+              num_allowed_undecoded_refs) {
+            break;
+          }
+          // All frames in the superframe should have the same timestamp.
+          if (frame->Timestamp() != next_frame_it->second.frame->Timestamp()) {
+            RTC_LOG(LS_WARNING)
+                << "Frames in a single superframe have different"
+                   " timestamps. Skipping undecodable superframe.";
+            break;
+          }
+          current_superframe.push_back(next_frame_it);
+          last_layer_completed =
+              next_frame_it->second.frame->is_last_spatial_layer;
+        }
+        // Check if the current superframe is complete.
+        // TODO(bugs.webrtc.org/10064): consider returning all decodable
+        // frames even if the superframe is not complete yet.
+        if (!last_layer_completed) {
+          continue;
+        }
+
+        frames_to_decode_ = std::move(current_superframe);
+
        if (frame->RenderTime() == -1) {
          frame->SetRenderTime(
              timing_->RenderTimeMs(frame->Timestamp(), now_ms));
@ -154,9 +200,10 @@ FrameBuffer::ReturnReason FrameBuffer::NextFrame(
  {
    rtc::CritScope lock(&crit_);
    now_ms = clock_->TimeInMilliseconds();
-    if (next_frame_it_ != frames_.end()) {
-      std::unique_ptr<EncodedFrame> frame =
-          std::move(next_frame_it_->second.frame);
+    std::vector<EncodedFrame*> frames_out;
+    for (const FrameMap::iterator& frame_it : frames_to_decode_) {
+      RTC_DCHECK(frame_it != frames_.end());
+      EncodedFrame* frame = frame_it->second.frame.release();

      if (!frame->delayed_by_retransmission()) {
        int64_t frame_delay;
@ -187,14 +234,22 @@ FrameBuffer::ReturnReason FrameBuffer::NextFrame(

      UpdateJitterDelay();
      UpdateTimingFrameInfo();
-      PropagateDecodability(next_frame_it_->second);
+      PropagateDecodability(frame_it->second);

-      AdvanceLastDecodedFrame(next_frame_it_);
+      AdvanceLastDecodedFrame(frame_it);
      last_decoded_frame_timestamp_ = frame->Timestamp();
-      *frame_out = std::move(frame);
+      frames_out.push_back(frame);
+    }
+
+    if (!frames_out.empty()) {
+      if (frames_out.size() == 1) {
+        frame_out->reset(frames_out[0]);
+      } else {
+        frame_out->reset(CombineAndDeleteFrames(frames_out));
+      }
      return kFrameFound;
    }
-  }
+  }  // rtc::CritScope lock(&crit_)

  if (latest_return_time_ms - now_ms > 0) {
    // If |next_frame_it_ == frames_.end()| and there is still time left, it
@ -203,7 +258,6 @@ FrameBuffer::ReturnReason FrameBuffer::NextFrame(
    // remaining time and then return.
    return NextFrame(latest_return_time_ms - now_ms, frame_out);
  }
-
  return kTimeout;
 }

@ -606,11 +660,38 @@ void FrameBuffer::ClearFramesAndHistory() {
  frames_.clear();
  last_decoded_frame_it_ = frames_.end();
  last_continuous_frame_it_ = frames_.end();
-  next_frame_it_ = frames_.end();
+  frames_to_decode_.clear();
  num_frames_history_ = 0;
  num_frames_buffered_ = 0;
 }

+EncodedFrame* FrameBuffer::CombineAndDeleteFrames(
+    const std::vector<EncodedFrame*>& frames) const {
+  RTC_DCHECK(!frames.empty());
+  EncodedFrame* frame = frames[0];
+  size_t total_length = 0;
+  for (size_t i = 0; i < frames.size(); ++i) {
+    total_length += frames[i]->size();
+  }
+  frame->VerifyAndAllocate(total_length);
+  uint8_t* buffer = frame->MutableBuffer();
+  // Append all remaining frames to the first one.
+  size_t used_buffer_bytes = frame->size();
+  for (size_t i = 1; i < frames.size(); ++i) {
+    EncodedFrame* frame_to_append = frames[i];
+    memcpy(buffer + used_buffer_bytes, frame_to_append->Buffer(),
+           frame_to_append->size());
+    used_buffer_bytes += frame_to_append->size();
+    frame->video_timing_mutable()->network2_timestamp_ms =
+        frame_to_append->video_timing().network2_timestamp_ms;
+    frame->video_timing_mutable()->receive_finish_ms =
+        frame_to_append->video_timing().receive_finish_ms;
+    delete frame_to_append;
+  }
+  frame->set_size(total_length);
+  return frame;
+}
+
 FrameBuffer::FrameInfo::FrameInfo() = default;
 FrameBuffer::FrameInfo::FrameInfo(FrameInfo&&) = default;
 FrameBuffer::FrameInfo::~FrameInfo() = default;
--- a/modules/video_coding/frame_buffer2.h
+++ b/modules/video_coding/frame_buffer2.h
@ -15,6 +15,7 @@
 #include <map>
 #include <memory>
 #include <utility>
+#include <vector>

 #include "api/video/encoded_frame.h"
 #include "modules/video_coding/include/video_coding_defines.h"
@ -156,6 +157,13 @@ class FrameBuffer {
  bool HasBadRenderTiming(const EncodedFrame& frame, int64_t now_ms)
      RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_);

+  // The cleaner solution would be to have the NextFrame function return a
+  // vector of frames, but until the decoding pipeline can support decoding
+  // multiple frames at the same time we combine all frames to one frame and
+  // return it. See bugs.webrtc.org/10064
+  EncodedFrame* CombineAndDeleteFrames(
+      const std::vector<EncodedFrame*>& frames) const;
+
  FrameMap frames_ RTC_GUARDED_BY(crit_);

  rtc::CriticalSection crit_;
@ -167,7 +175,7 @@ class FrameBuffer {
  absl::optional<uint32_t> last_decoded_frame_timestamp_ RTC_GUARDED_BY(crit_);
  FrameMap::iterator last_decoded_frame_it_ RTC_GUARDED_BY(crit_);
  FrameMap::iterator last_continuous_frame_it_ RTC_GUARDED_BY(crit_);
-  FrameMap::iterator next_frame_it_ RTC_GUARDED_BY(crit_);
+  std::vector<FrameMap::iterator> frames_to_decode_ RTC_GUARDED_BY(crit_);
  int num_frames_history_ RTC_GUARDED_BY(crit_);
  int num_frames_buffered_ RTC_GUARDED_BY(crit_);
  bool stopped_ RTC_GUARDED_BY(crit_);
--- a/modules/video_coding/frame_buffer2_unittest.cc
+++ b/modules/video_coding/frame_buffer2_unittest.cc
@ -124,6 +124,7 @@ class TestFrameBuffer2 : public ::testing::Test {
  static constexpr int kFps1 = 1000;
  static constexpr int kFps10 = kFps1 / 10;
  static constexpr int kFps20 = kFps1 / 20;
+  static constexpr size_t kFrameSize = 10;

  TestFrameBuffer2()
      : clock_(0),
@ -150,6 +151,7 @@ class TestFrameBuffer2 : public ::testing::Test {
                  uint8_t spatial_layer,
                  int64_t ts_ms,
                  bool inter_layer_predicted,
+                  bool last_spatial_layer,
                  T... refs) {
    static_assert(sizeof...(refs) <= kMaxReferences,
                  "To many references specified for EncodedFrame.");
@ -162,6 +164,10 @@ class TestFrameBuffer2 : public ::testing::Test {
    frame->SetTimestamp(ts_ms * 90);
    frame->num_references = references.size();
    frame->inter_layer_predicted = inter_layer_predicted;
+    frame->is_last_spatial_layer = last_spatial_layer;
+    // Add some data to buffer.
+    frame->VerifyAndAllocate(kFrameSize);
+    frame->SetSize(kFrameSize);
    for (size_t r = 0; r < references.size(); ++r)
      frame->references[r] = references[r];

@ -194,6 +200,13 @@ class TestFrameBuffer2 : public ::testing::Test {
    ASSERT_EQ(spatial_layer, frames_[index]->id.spatial_layer);
  }

+  void CheckFrameSize(size_t index, size_t size) {
+    rtc::CritScope lock(&crit_);
+    ASSERT_LT(index, frames_.size());
+    ASSERT_TRUE(frames_[index]);
+    ASSERT_EQ(frames_[index]->size(), size);
+  }
+
  void CheckNoFrame(size_t index) {
    rtc::CritScope lock(&crit_);
    ASSERT_LT(index, frames_.size());
@ -246,7 +259,7 @@ TEST_F(TestFrameBuffer2, WaitForFrame) {
  uint32_t ts = Rand();

  ExtractFrame(50);
-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 0, ts, false, true);
  CheckFrame(0, pid, 0);
 }

@ -254,13 +267,11 @@ TEST_F(TestFrameBuffer2, OneSuperFrame) {
  uint16_t pid = Rand();
  uint32_t ts = Rand();

-  InsertFrame(pid, 0, ts, false);
-  ExtractFrame();
-  InsertFrame(pid, 1, ts, true);
+  InsertFrame(pid, 0, ts, false, false);
+  InsertFrame(pid, 1, ts, true, true);
  ExtractFrame();

  CheckFrame(0, pid, 0);
-  CheckFrame(1, pid, 1);
 }

 TEST_F(TestFrameBuffer2, SetPlayoutDelay) {
@ -293,8 +304,8 @@ TEST_F(TestFrameBuffer2, DISABLED_OneUnorderedSuperFrame) {
  uint32_t ts = Rand();

  ExtractFrame(50);
-  InsertFrame(pid, 1, ts, true);
-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 1, ts, true, true);
+  InsertFrame(pid, 0, ts, false, false);
  ExtractFrame();

  CheckFrame(0, pid, 0);
@ -305,14 +316,14 @@ TEST_F(TestFrameBuffer2, DISABLED_OneLayerStreamReordered) {
  uint16_t pid = Rand();
  uint32_t ts = Rand();

-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 0, ts, false, true);
  ExtractFrame();
  CheckFrame(0, pid, 0);
  for (int i = 1; i < 10; i += 2) {
    ExtractFrame(50);
-    InsertFrame(pid + i + 1, 0, ts + (i + 1) * kFps10, false, pid + i);
+    InsertFrame(pid + i + 1, 0, ts + (i + 1) * kFps10, false, true, pid + i);
    clock_.AdvanceTimeMilliseconds(kFps10);
-    InsertFrame(pid + i, 0, ts + i * kFps10, false, pid + i - 1);
+    InsertFrame(pid + i, 0, ts + i * kFps10, false, true, pid + i - 1);
    clock_.AdvanceTimeMilliseconds(kFps10);
    ExtractFrame();
    CheckFrame(i, pid + i, 0);
@ -330,9 +341,9 @@ TEST_F(TestFrameBuffer2, MissingFrame) {
  uint16_t pid = Rand();
  uint32_t ts = Rand();

-  InsertFrame(pid, 0, ts, false);
-  InsertFrame(pid + 2, 0, ts, false, pid);
-  InsertFrame(pid + 3, 0, ts, false, pid + 1, pid + 2);
+  InsertFrame(pid, 0, ts, false, true);
+  InsertFrame(pid + 2, 0, ts, false, true, pid);
+  InsertFrame(pid + 3, 0, ts, false, true, pid + 1, pid + 2);
  ExtractFrame();
  ExtractFrame();
  ExtractFrame();
@ -346,11 +357,11 @@ TEST_F(TestFrameBuffer2, OneLayerStream) {
  uint16_t pid = Rand();
  uint32_t ts = Rand();

-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 0, ts, false, true);
  ExtractFrame();
  CheckFrame(0, pid, 0);
  for (int i = 1; i < 10; ++i) {
-    InsertFrame(pid + i, 0, ts + i * kFps10, false, pid + i - 1);
+    InsertFrame(pid + i, 0, ts + i * kFps10, false, true, pid + i - 1);
    ExtractFrame();
    clock_.AdvanceTimeMilliseconds(kFps10);
    CheckFrame(i, pid + i, 0);
@ -361,12 +372,13 @@ TEST_F(TestFrameBuffer2, DropTemporalLayerSlowDecoder) {
  uint16_t pid = Rand();
  uint32_t ts = Rand();

-  InsertFrame(pid, 0, ts, false);
-  InsertFrame(pid + 1, 0, ts + kFps20, false, pid);
+  InsertFrame(pid, 0, ts, false, true);
+  InsertFrame(pid + 1, 0, ts + kFps20, false, true, pid);
  for (int i = 2; i < 10; i += 2) {
    uint32_t ts_tl0 = ts + i / 2 * kFps10;
-    InsertFrame(pid + i, 0, ts_tl0, false, pid + i - 2);
-    InsertFrame(pid + i + 1, 0, ts_tl0 + kFps20, false, pid + i, pid + i - 1);
+    InsertFrame(pid + i, 0, ts_tl0, false, true, pid + i - 2);
+    InsertFrame(pid + i + 1, 0, ts_tl0 + kFps20, false, true, pid + i,
+                pid + i - 1);
  }

  for (int i = 0; i < 10; ++i) {
@ -386,49 +398,15 @@ TEST_F(TestFrameBuffer2, DropTemporalLayerSlowDecoder) {
  CheckNoFrame(9);
 }

-TEST_F(TestFrameBuffer2, DropSpatialLayerSlowDecoder) {
-  uint16_t pid = Rand();
-  uint32_t ts = Rand();
-
-  InsertFrame(pid, 0, ts, false);
-  InsertFrame(pid, 1, ts, false);
-  for (int i = 1; i < 6; ++i) {
-    uint32_t ts_tl0 = ts + i * kFps10;
-    InsertFrame(pid + i, 0, ts_tl0, false, pid + i - 1);
-    InsertFrame(pid + i, 1, ts_tl0, false, pid + i - 1);
-  }
-
-  ExtractFrame();
-  ExtractFrame();
-  clock_.AdvanceTimeMilliseconds(57);
-  for (int i = 2; i < 12; ++i) {
-    ExtractFrame();
-    clock_.AdvanceTimeMilliseconds(57);
-  }
-
-  CheckFrame(0, pid, 0);
-  CheckFrame(1, pid, 1);
-  CheckFrame(2, pid + 1, 0);
-  CheckFrame(3, pid + 1, 1);
-  CheckFrame(4, pid + 2, 0);
-  CheckFrame(5, pid + 2, 1);
-  CheckFrame(6, pid + 3, 0);
-  CheckFrame(7, pid + 4, 0);
-  CheckFrame(8, pid + 5, 0);
-  CheckNoFrame(9);
-  CheckNoFrame(10);
-  CheckNoFrame(11);
-}
-
 TEST_F(TestFrameBuffer2, InsertLateFrame) {
  uint16_t pid = Rand();
  uint32_t ts = Rand();

-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 0, ts, false, true);
  ExtractFrame();
-  InsertFrame(pid + 2, 0, ts, false);
+  InsertFrame(pid + 2, 0, ts, false, true);
  ExtractFrame();
-  InsertFrame(pid + 1, 0, ts, false, pid);
+  InsertFrame(pid + 1, 0, ts, false, true, pid);
  ExtractFrame();

  CheckFrame(0, pid, 0);
@ -441,12 +419,12 @@ TEST_F(TestFrameBuffer2, ProtectionMode) {
  uint32_t ts = Rand();

  EXPECT_CALL(jitter_estimator_, GetJitterEstimate(1.0));
-  InsertFrame(pid, 0, ts, false);
+  InsertFrame(pid, 0, ts, false, true);
  ExtractFrame();

  buffer_->SetProtectionMode(kProtectionNackFEC);
  EXPECT_CALL(jitter_estimator_, GetJitterEstimate(0.0));
-  InsertFrame(pid + 1, 0, ts, false);
+  InsertFrame(pid + 1, 0, ts, false, true);
  ExtractFrame();
 }

@ -454,45 +432,45 @@ TEST_F(TestFrameBuffer2, NoContinuousFrame) {
  uint16_t pid = Rand();
  uint32_t ts = Rand();

-  EXPECT_EQ(-1, InsertFrame(pid + 1, 0, ts, false, pid));
+  EXPECT_EQ(-1, InsertFrame(pid + 1, 0, ts, false, true, pid));
 }

 TEST_F(TestFrameBuffer2, LastContinuousFrameSingleLayer) {
  uint16_t pid = Rand();
  uint32_t ts = Rand();

-  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false));
-  EXPECT_EQ(pid, InsertFrame(pid + 2, 0, ts, false, pid + 1));
-  EXPECT_EQ(pid + 2, InsertFrame(pid + 1, 0, ts, false, pid));
-  EXPECT_EQ(pid + 2, InsertFrame(pid + 4, 0, ts, false, pid + 3));
-  EXPECT_EQ(pid + 5, InsertFrame(pid + 5, 0, ts, false));
+  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false, true));
+  EXPECT_EQ(pid, InsertFrame(pid + 2, 0, ts, false, true, pid + 1));
+  EXPECT_EQ(pid + 2, InsertFrame(pid + 1, 0, ts, false, true, pid));
+  EXPECT_EQ(pid + 2, InsertFrame(pid + 4, 0, ts, false, true, pid + 3));
+  EXPECT_EQ(pid + 5, InsertFrame(pid + 5, 0, ts, false, true));
 }

 TEST_F(TestFrameBuffer2, LastContinuousFrameTwoLayers) {
  uint16_t pid = Rand();
  uint32_t ts = Rand();

-  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false));
-  EXPECT_EQ(pid, InsertFrame(pid, 1, ts, true));
-  EXPECT_EQ(pid, InsertFrame(pid + 1, 1, ts, true, pid));
-  EXPECT_EQ(pid, InsertFrame(pid + 2, 0, ts, false, pid + 1));
-  EXPECT_EQ(pid, InsertFrame(pid + 2, 1, ts, true, pid + 1));
-  EXPECT_EQ(pid, InsertFrame(pid + 3, 0, ts, false, pid + 2));
-  EXPECT_EQ(pid + 3, InsertFrame(pid + 1, 0, ts, false, pid));
-  EXPECT_EQ(pid + 3, InsertFrame(pid + 3, 1, ts, true, pid + 2));
+  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false, false));
+  EXPECT_EQ(pid, InsertFrame(pid, 1, ts, true, true));
+  EXPECT_EQ(pid, InsertFrame(pid + 1, 1, ts, true, true, pid));
+  EXPECT_EQ(pid, InsertFrame(pid + 2, 0, ts, false, false, pid + 1));
+  EXPECT_EQ(pid, InsertFrame(pid + 2, 1, ts, true, true, pid + 1));
+  EXPECT_EQ(pid, InsertFrame(pid + 3, 0, ts, false, false, pid + 2));
+  EXPECT_EQ(pid + 3, InsertFrame(pid + 1, 0, ts, false, false, pid));
+  EXPECT_EQ(pid + 3, InsertFrame(pid + 3, 1, ts, true, true, pid + 2));
 }

 TEST_F(TestFrameBuffer2, PictureIdJumpBack) {
  uint16_t pid = Rand();
  uint32_t ts = Rand();

-  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false));
-  EXPECT_EQ(pid + 1, InsertFrame(pid + 1, 0, ts + 1, false, pid));
+  EXPECT_EQ(pid, InsertFrame(pid, 0, ts, false, true));
+  EXPECT_EQ(pid + 1, InsertFrame(pid + 1, 0, ts + 1, false, true, pid));
  ExtractFrame();
  CheckFrame(0, pid, 0);

  // Jump back in pid but increase ts.
-  EXPECT_EQ(pid - 1, InsertFrame(pid - 1, 0, ts + 2, false));
+  EXPECT_EQ(pid - 1, InsertFrame(pid - 1, 0, ts + 2, false, true));
  ExtractFrame();
  ExtractFrame();
  CheckFrame(1, pid - 1, 0);
@ -511,6 +489,7 @@ TEST_F(TestFrameBuffer2, StatsCallback) {

  {
    std::unique_ptr<FrameObjectFake> frame(new FrameObjectFake());
+    frame->VerifyAndAllocate(kFrameSize);
    frame->SetSize(kFrameSize);
    frame->id.picture_id = pid;
    frame->id.spatial_layer = 0;
@ -526,42 +505,42 @@ TEST_F(TestFrameBuffer2, StatsCallback) {
 }

 TEST_F(TestFrameBuffer2, ForwardJumps) {
-  EXPECT_EQ(5453, InsertFrame(5453, 0, 1, false));
+  EXPECT_EQ(5453, InsertFrame(5453, 0, 1, false, true));
  ExtractFrame();
-  EXPECT_EQ(5454, InsertFrame(5454, 0, 1, false, 5453));
+  EXPECT_EQ(5454, InsertFrame(5454, 0, 1, false, true, 5453));
  ExtractFrame();
-  EXPECT_EQ(15670, InsertFrame(15670, 0, 1, false));
+  EXPECT_EQ(15670, InsertFrame(15670, 0, 1, false, true));
  ExtractFrame();
-  EXPECT_EQ(29804, InsertFrame(29804, 0, 1, false));
+  EXPECT_EQ(29804, InsertFrame(29804, 0, 1, false, true));
  ExtractFrame();
-  EXPECT_EQ(29805, InsertFrame(29805, 0, 1, false, 29804));
+  EXPECT_EQ(29805, InsertFrame(29805, 0, 1, false, true, 29804));
  ExtractFrame();
-  EXPECT_EQ(29806, InsertFrame(29806, 0, 1, false, 29805));
+  EXPECT_EQ(29806, InsertFrame(29806, 0, 1, false, true, 29805));
  ExtractFrame();
-  EXPECT_EQ(33819, InsertFrame(33819, 0, 1, false));
+  EXPECT_EQ(33819, InsertFrame(33819, 0, 1, false, true));
  ExtractFrame();
-  EXPECT_EQ(41248, InsertFrame(41248, 0, 1, false));
+  EXPECT_EQ(41248, InsertFrame(41248, 0, 1, false, true));
  ExtractFrame();
 }

 TEST_F(TestFrameBuffer2, DuplicateFrames) {
-  EXPECT_EQ(22256, InsertFrame(22256, 0, 1, false));
+  EXPECT_EQ(22256, InsertFrame(22256, 0, 1, false, true));
  ExtractFrame();
-  EXPECT_EQ(22256, InsertFrame(22256, 0, 1, false));
+  EXPECT_EQ(22256, InsertFrame(22256, 0, 1, false, true));
 }

 // TODO(philipel): implement more unittests related to invalid references.
 TEST_F(TestFrameBuffer2, InvalidReferences) {
-  EXPECT_EQ(-1, InsertFrame(0, 0, 1000, false, 2));
-  EXPECT_EQ(1, InsertFrame(1, 0, 2000, false));
+  EXPECT_EQ(-1, InsertFrame(0, 0, 1000, false, true, 2));
+  EXPECT_EQ(1, InsertFrame(1, 0, 2000, false, true));
  ExtractFrame();
-  EXPECT_EQ(2, InsertFrame(2, 0, 3000, false, 1));
+  EXPECT_EQ(2, InsertFrame(2, 0, 3000, false, true, 1));
 }

 TEST_F(TestFrameBuffer2, KeyframeRequired) {
-  EXPECT_EQ(1, InsertFrame(1, 0, 1000, false));
-  EXPECT_EQ(2, InsertFrame(2, 0, 2000, false, 1));
-  EXPECT_EQ(3, InsertFrame(3, 0, 3000, false));
+  EXPECT_EQ(1, InsertFrame(1, 0, 1000, false, true));
+  EXPECT_EQ(2, InsertFrame(2, 0, 2000, false, true, 1));
+  EXPECT_EQ(3, InsertFrame(3, 0, 3000, false, true));
  ExtractFrame();
  ExtractFrame(0, true);
  ExtractFrame();
@ -575,42 +554,81 @@ TEST_F(TestFrameBuffer2, KeyframeClearsFullBuffer) {
  const int kMaxBufferSize = 600;

  for (int i = 1; i <= kMaxBufferSize; ++i)
-    EXPECT_EQ(-1, InsertFrame(i, 0, i * 1000, false, i - 1));
+    EXPECT_EQ(-1, InsertFrame(i, 0, i * 1000, false, true, i - 1));
  ExtractFrame();
  CheckNoFrame(0);

-  EXPECT_EQ(
-      kMaxBufferSize + 1,
-      InsertFrame(kMaxBufferSize + 1, 0, (kMaxBufferSize + 1) * 1000, false));
+  EXPECT_EQ(kMaxBufferSize + 1,
+            InsertFrame(kMaxBufferSize + 1, 0, (kMaxBufferSize + 1) * 1000,
+                        false, true));
  ExtractFrame();
  CheckFrame(1, kMaxBufferSize + 1, 0);
 }

 TEST_F(TestFrameBuffer2, DontUpdateOnUndecodableFrame) {
-  InsertFrame(1, 0, 0, false);
+  InsertFrame(1, 0, 0, false, true);
  ExtractFrame(0, true);
-  InsertFrame(3, 0, 0, false, 2, 0);
-  InsertFrame(3, 0, 0, false, 0);
-  InsertFrame(2, 0, 0, false);
+  InsertFrame(3, 0, 0, false, true, 2, 0);
+  InsertFrame(3, 0, 0, false, true, 0);
+  InsertFrame(2, 0, 0, false, true);
  ExtractFrame(0, true);
  ExtractFrame(0, true);
 }

 TEST_F(TestFrameBuffer2, DontDecodeOlderTimestamp) {
-  InsertFrame(2, 0, 1, false);
-  InsertFrame(1, 0, 2, false);  // Older picture id but newer timestamp.
+  InsertFrame(2, 0, 1, false, true);
+  InsertFrame(1, 0, 2, false, true);  // Older picture id but newer timestamp.
  ExtractFrame(0);
  ExtractFrame(0);
  CheckFrame(0, 1, 0);
  CheckNoFrame(1);

-  InsertFrame(3, 0, 4, false);
-  InsertFrame(4, 0, 3, false);  // Newer picture id but older timestamp.
+  InsertFrame(3, 0, 4, false, true);
+  InsertFrame(4, 0, 3, false, true);  // Newer picture id but older timestamp.
  ExtractFrame(0);
  ExtractFrame(0);
  CheckFrame(2, 3, 0);
  CheckNoFrame(3);
 }

+TEST_F(TestFrameBuffer2, CombineFramesToSuperframe) {
+  uint16_t pid = Rand();
+  uint32_t ts = Rand();
+
+  InsertFrame(pid, 0, ts, false, false);
+  InsertFrame(pid, 1, ts, true, true);
+  ExtractFrame(0);
+  ExtractFrame(0);
+  CheckFrame(0, pid, 0);
+  CheckNoFrame(1);
+  // Two frames should be combined and returned together.
+  CheckFrameSize(0, kFrameSize * 2);
+}
+
+TEST_F(TestFrameBuffer2, HigherSpatialLayerNonDecodable) {
+  uint16_t pid = Rand();
+  uint32_t ts = Rand();
+
+  InsertFrame(pid, 0, ts, false, false);
+  InsertFrame(pid, 1, ts, true, true);
+
+  ExtractFrame(0);
+  CheckFrame(0, pid, 0);
+
+  InsertFrame(pid + 1, 1, ts + kFps20, false, true, pid);
+  InsertFrame(pid + 2, 0, ts + kFps10, false, false, pid);
+  InsertFrame(pid + 2, 1, ts + kFps10, true, true, pid + 1);
+
+  clock_.AdvanceTimeMilliseconds(1000);
+  // Frame pid+1 is decodable but too late.
+  // In superframe pid+2 frame sid=0 is decodable, but frame sid=1 is not.
+  // Incorrect implementation might skip pid+1 frame and output undecodable
+  // pid+2 instead.
+  ExtractFrame();
+  ExtractFrame();
+  CheckFrame(1, pid + 1, 1);
+  CheckFrame(2, pid + 2, 0);
+}
+
 }  // namespace video_coding
 }  // namespace webrtc
--- a/modules/video_coding/frame_object.cc
+++ b/modules/video_coding/frame_object.cc
@ -104,6 +104,7 @@ RtpFrameObject::RtpFrameObject(PacketBuffer* packet_buffer,
    timing_.receive_finish_ms = last_packet->receive_time_ms;
  }
  timing_.flags = last_packet->video_header.video_timing.flags;
+  is_last_spatial_layer = last_packet->markerBit;
 }

 RtpFrameObject::~RtpFrameObject() {
--- a/modules/video_coding/rtp_frame_reference_finder.cc
+++ b/modules/video_coding/rtp_frame_reference_finder.cc
@ -489,12 +489,24 @@ RtpFrameReferenceFinder::FrameDecision RtpFrameReferenceFinder::ManageFrameVp9(
      UnwrapPictureIds(frame);
      return kHandOff;
    }
-  } else {
-    if (frame->frame_type() == kVideoFrameKey) {
+  } else if (frame->frame_type() == kVideoFrameKey) {
+    if (frame->id.spatial_layer == 0) {
      RTC_LOG(LS_WARNING) << "Received keyframe without scalability structure";
      return kDrop;
    }
+    const auto gof_info_it = gof_info_.find(unwrapped_tl0);
+    if (gof_info_it == gof_info_.end())
+      return kStash;

+    info = &gof_info_it->second;
+
+    if (frame->frame_type() == kVideoFrameKey) {
+      frame->num_references = 0;
+      FrameReceivedVp9(frame->id.picture_id, info);
+      UnwrapPictureIds(frame);
+      return kHandOff;
+    }
+  } else {
    auto gof_info_it = gof_info_.find(
        (codec_header.temporal_idx == 0) ? unwrapped_tl0 - 1 : unwrapped_tl0);

--- a/video/rtp_video_stream_receiver.cc
+++ b/video/rtp_video_stream_receiver.cc
@ -496,6 +496,14 @@ void RtpVideoStreamReceiver::ReceivePacket(const RtpPacketReceived& packet) {
      VideoSendTiming::kInvalid;
  webrtc_rtp_header.video_header().is_last_packet_in_frame =
      webrtc_rtp_header.header.markerBit;
+  if (parsed_payload.video_header().codec == kVideoCodecVP9) {
+    const RTPVideoHeaderVP9& codec_header = absl::get<RTPVideoHeaderVP9>(
+        parsed_payload.video_header().video_type_header);
+    webrtc_rtp_header.video_header().is_last_packet_in_frame |=
+        codec_header.end_of_frame;
+    webrtc_rtp_header.video_header().is_first_packet_in_frame |=
+        codec_header.beginning_of_frame;
+  }

  packet.GetExtension<VideoOrientation>(
      &webrtc_rtp_header.video_header().rotation);
--- a/video/video_quality_test.cc
+++ b/video/video_quality_test.cc
@ -670,6 +670,10 @@ void VideoQualityTest::SetupVideo(Transport* send_transport,
        vp9_settings.numberOfSpatialLayers = static_cast<unsigned char>(
            params_.ss[video_idx].num_spatial_layers);
        vp9_settings.interLayerPred = params_.ss[video_idx].inter_layer_pred;
+        // High FPS vp9 screenshare requires flexible mode.
+        if (params_.video[video_idx].fps > 5) {
+          vp9_settings.flexibleMode = true;
+        }
        video_encoder_configs_[video_idx].encoder_specific_settings =
            new rtc::RefCountedObject<
                VideoEncoderConfig::Vp9EncoderSpecificSettings>(vp9_settings);
--- a/video/video_receive_stream.cc
+++ b/video/video_receive_stream.cc
@ -381,10 +381,6 @@ void VideoReceiveStream::RequestKeyFrame() {

 void VideoReceiveStream::OnCompleteFrame(
    std::unique_ptr<video_coding::EncodedFrame> frame) {
-  // TODO(webrtc:9249): Workaround to allow decoding of VP9 SVC stream with
-  // partially enabled inter-layer prediction.
-  frame->id.spatial_layer = 0;
-
  // TODO(https://bugs.webrtc.org/9974): Consider removing this workaround.
  int64_t time_now_ms = rtc::TimeMillis();
  if (last_complete_frame_time_ms_ > 0 &&