From 6a8f30e5a36f052da19182df10dd972b57f5fd1b Mon Sep 17 00:00:00 2001
From: Sergey Silkin <ssilkin@webrtc.org>
Date: Thu, 26 Apr 2018 11:03:49 +0200
Subject: [PATCH] Add control for inter-layer prediction mode.

This allows controlling inter-layer prediction when encoding VP9 SVC.
There are three options:
1. Disabled.
2. Enabled for all pictures.
3. Enabled for key pictures, disabled for others.

Inter-layer prediction is enabled for all pictures by default.

Bug: none
Change-Id: I49fe43d8744c92bec349d815100ba158519f0664
Reviewed-on: https://webrtc-review.googlesource.com/71500
Reviewed-by: Karl Wiberg <kwiberg@webrtc.org>
Reviewed-by: Rasmus Brandt <brandtr@webrtc.org>
Commit-Queue: Sergey Silkin <ssilkin@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#23049}
---
 api/video_codecs/video_encoder.cc             |  1 +
 common_types.h                                | 10 +++
 .../codecs/vp9/test/vp9_impl_unittest.cc      | 65 +++++++++++++++++++
 modules/video_coding/codecs/vp9/vp9_impl.cc   | 58 +++++++++++++----
 modules/video_coding/codecs/vp9/vp9_impl.h    |  3 +-
 5 files changed, 123 insertions(+), 14 deletions(-)
diff --git a/api/video_codecs/video_encoder.cc b/api/video_codecs/video_encoder.cc
index fd8f1425f4..008780e38c 100644
--- a/api/video_codecs/video_encoder.cc
+++ b/api/video_codecs/video_encoder.cc
@@ -38,6 +38,7 @@ VideoCodecVP9 VideoEncoder::GetDefaultVp9Settings() {
   vp9_settings.automaticResizeOn = true;
   vp9_settings.numberOfSpatialLayers = 1;
   vp9_settings.flexibleMode = false;
+  vp9_settings.interLayerPred = InterLayerPredMode::kOn;
 
   return vp9_settings;
 }
diff --git a/common_types.h b/common_types.h
index 30e91633c2..af2171722f 100644
--- a/common_types.h
+++ b/common_types.h
@@ -361,6 +361,15 @@ struct VideoCodecVP8 {
   int keyFrameInterval;
 };
 
+enum class InterLayerPredMode {
+  kOn,       // Allow inter-layer prediction for all frames.
+             // A frame of a lower spatial layer can be used for
+             // prediction of the next spatial layer's frame.
+  kOff,      // Encoder produces independent spatial layers.
+  kOnKeyPic  // Allow inter-layer prediction only for frames
+             // within a key picture.
+};
+
 // VP9 specific.
 struct VideoCodecVP9 {
   bool operator==(const VideoCodecVP9& other) const;
@@ -376,6 +385,7 @@ struct VideoCodecVP9 {
   bool automaticResizeOn;
   unsigned char numberOfSpatialLayers;
   bool flexibleMode;
+  InterLayerPredMode interLayerPred;
 };
 
 // TODO(magjed): Move this and other H264 related classes out to their own file.
diff --git a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
index 49d4c58c71..cfcf0a8ea7 100644
--- a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
+++ b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
@@ -318,4 +318,69 @@ TEST_F(TestVp9Impl, EndOfPicture) {
   EXPECT_TRUE(codec_specific[0].codecSpecific.VP9.end_of_picture);
 }
 
+TEST_F(TestVp9Impl, InterLayerPred) {
+  const size_t num_spatial_layers = 2;
+  const size_t num_temporal_layers = 1;
+  codec_settings_.VP9()->numberOfSpatialLayers =
+      static_cast<unsigned char>(num_spatial_layers);
+  codec_settings_.VP9()->numberOfTemporalLayers =
+      static_cast<unsigned char>(num_temporal_layers);
+  codec_settings_.VP9()->frameDroppingOn = false;
+
+  std::vector<SpatialLayer> layers =
+      GetSvcConfig(codec_settings_.width, codec_settings_.height,
+                   num_spatial_layers, num_temporal_layers);
+
+  BitrateAllocation bitrate_allocation;
+  for (size_t i = 0; i < layers.size(); ++i) {
+    codec_settings_.spatialLayers[i] = layers[i];
+    bitrate_allocation.SetBitrate(i, 0, layers[i].targetBitrate * 1000);
+  }
+
+  const std::vector<InterLayerPredMode> inter_layer_pred_modes = {
+      InterLayerPredMode::kOff, InterLayerPredMode::kOn,
+      InterLayerPredMode::kOnKeyPic};
+
+  for (const InterLayerPredMode inter_layer_pred : inter_layer_pred_modes) {
+    codec_settings_.VP9()->interLayerPred = inter_layer_pred;
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->InitEncode(&codec_settings_, 1 /* number of cores */,
+                                   0 /* max payload size (unused) */));
+
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->SetRateAllocation(bitrate_allocation,
+                                          codec_settings_.maxFramerate));
+
+    SetWaitForEncodedFramesThreshold(2);  // Matches num_spatial_layers.
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+
+    std::vector<EncodedImage> frames;
+    std::vector<CodecSpecificInfo> codec_specific;
+    ASSERT_TRUE(WaitForEncodedFrames(&frames, &codec_specific));
+
+    // Key picture: the base layer must not be inter-picture predicted.
+    EXPECT_FALSE(codec_specific[0].codecSpecific.VP9.inter_pic_predicted);
+    EXPECT_EQ(codec_specific[0].codecSpecific.VP9.spatial_idx, 0);
+    EXPECT_EQ(codec_specific[0].codecSpecific.VP9.non_ref_for_inter_layer_pred,
+              inter_layer_pred == InterLayerPredMode::kOff);
+    EXPECT_TRUE(
+        codec_specific[1].codecSpecific.VP9.non_ref_for_inter_layer_pred);
+
+    SetWaitForEncodedFramesThreshold(2);  // Matches num_spatial_layers.
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+    ASSERT_TRUE(WaitForEncodedFrames(&frames, &codec_specific));
+
+    // Delta picture: the base layer must be inter-picture predicted.
+    EXPECT_TRUE(codec_specific[0].codecSpecific.VP9.inter_pic_predicted);
+    EXPECT_EQ(codec_specific[0].codecSpecific.VP9.spatial_idx, 0);
+    EXPECT_EQ(codec_specific[0].codecSpecific.VP9.non_ref_for_inter_layer_pred,
+              inter_layer_pred == InterLayerPredMode::kOff ||
+                  inter_layer_pred == InterLayerPredMode::kOnKeyPic);
+    EXPECT_TRUE(
+        codec_specific[1].codecSpecific.VP9.non_ref_for_inter_layer_pred);
+  }
+}
+
 }  // namespace webrtc
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.cc b/modules/video_coding/codecs/vp9/vp9_impl.cc
index 0611f415fd..89b5197593 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.cc
+++ b/modules/video_coding/codecs/vp9/vp9_impl.cc
@@ -74,9 +74,10 @@ VP9EncoderImpl::VP9EncoderImpl()
       config_(nullptr),
       raw_(nullptr),
       input_image_(nullptr),
-      frames_since_kf_(0),
+      pics_since_key_(0),
       num_temporal_layers_(0),
       num_spatial_layers_(0),
+      inter_layer_pred_(InterLayerPredMode::kOn),
       is_flexible_mode_(false),
       frames_encoded_(0),
       // Use two spatial when screensharing with flexible mode.
@@ -367,6 +368,8 @@ int VP9EncoderImpl::InitEncode(const VideoCodec* inst,
     return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
   }
 
+  inter_layer_pred_ = inst->VP9().interLayerPred;
+
   return InitAndSetControlSettings(inst);
 }
 
@@ -456,10 +459,28 @@ int VP9EncoderImpl::InitAndSetControlSettings(const VideoCodec* inst) {
   vpx_codec_control(
       encoder_, VP9E_SET_SVC,
       (num_temporal_layers_ > 1 || num_spatial_layers_ > 1) ? 1 : 0);
+
   if (num_temporal_layers_ > 1 || num_spatial_layers_ > 1) {
     vpx_codec_control(encoder_, VP9E_SET_SVC_PARAMETERS,
                       &svc_params_);
   }
+
+  if (num_spatial_layers_ > 1) {
+    switch (inter_layer_pred_) {
+      case InterLayerPredMode::kOn:
+        vpx_codec_control(encoder_, VP9E_SET_SVC_INTER_LAYER_PRED, 0);
+        break;
+      case InterLayerPredMode::kOff:
+        vpx_codec_control(encoder_, VP9E_SET_SVC_INTER_LAYER_PRED, 1);
+        break;
+      case InterLayerPredMode::kOnKeyPic:
+        vpx_codec_control(encoder_, VP9E_SET_SVC_INTER_LAYER_PRED, 2);
+        break;
+      default:
+        RTC_NOTREACHED();
+    }
+  }
+
   // Register callback for getting each spatial layer.
   vpx_codec_priv_output_cx_pkt_cb_pair_t cbp = {
       VP9EncoderImpl::EncoderOutputCodedPacketCallback,
@@ -604,7 +625,6 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
       ((pkt.data.frame.flags & VPX_FRAME_IS_KEY) && !codec_.VP9()->flexibleMode)
           ? true
           : false;
-  vp9_info->non_ref_for_inter_layer_pred = false;
 
   vpx_svc_layer_id_t layer_id = {0};
   vpx_codec_control(encoder_, VP9E_GET_SVC_LAYER_ID, &layer_id);
@@ -630,18 +650,30 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
   // TODO(asapersson): this info has to be obtained from the encoder.
   vp9_info->temporal_up_switch = false;
 
-  if (first_frame_in_picture) {
-    // TODO(asapersson): this info has to be obtained from the encoder.
-    vp9_info->inter_layer_predicted = false;
-    ++frames_since_kf_;
-  } else {
-    // TODO(asapersson): this info has to be obtained from the encoder.
-    vp9_info->inter_layer_predicted = true;
+  if (pkt.data.frame.flags & VPX_FRAME_IS_KEY) {
+    pics_since_key_ = 0;
+  } else if (first_frame_in_picture) {
+    ++pics_since_key_;
   }
 
-  if (pkt.data.frame.flags & VPX_FRAME_IS_KEY) {
-    frames_since_kf_ = 0;
-  }
+  const bool is_key_pic = (pics_since_key_ == 0);
+  const bool is_inter_layer_pred_allowed =
+      (inter_layer_pred_ == InterLayerPredMode::kOn ||
+       (inter_layer_pred_ == InterLayerPredMode::kOnKeyPic && is_key_pic));
+
+  // Always set inter_layer_predicted to true on a high layer frame if
+  // inter-layer prediction (ILP) is allowed, even if the encoder didn't
+  // actually use it. Setting inter_layer_predicted to false would let the
+  // receiver decode the high layer frame without decoding the low layer frame.
+  // If that happened (e.g. because the low layer frame was lost), the receiver
+  // could not decode the next high layer frame that uses ILP.
+  vp9_info->inter_layer_predicted =
+      first_frame_in_picture ? false : is_inter_layer_pred_allowed;
+
+  const bool is_last_layer =
+      (layer_id.spatial_layer_id + 1 == num_spatial_layers_);
+  vp9_info->non_ref_for_inter_layer_pred =
+      is_last_layer ? true : !is_inter_layer_pred_allowed;
 
   // Always populate this, so that the packetizer can properly set the marker
   // bit.
@@ -656,7 +688,7 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
     }
   } else {
     vp9_info->gof_idx =
-        static_cast<uint8_t>(frames_since_kf_ % gof_.num_frames_in_gof);
+        static_cast<uint8_t>(pics_since_key_ % gof_.num_frames_in_gof);
     vp9_info->temporal_up_switch = gof_.temporal_up_switch[vp9_info->gof_idx];
   }
 
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.h b/modules/video_coding/codecs/vp9/vp9_impl.h
index 981912f961..45c47bd0f5 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.h
+++ b/modules/video_coding/codecs/vp9/vp9_impl.h
@@ -120,9 +120,10 @@ class VP9EncoderImpl : public VP9Encoder {
   const VideoFrame* input_image_;
   GofInfoVP9 gof_;       // Contains each frame's temporal information for
                          // non-flexible mode.
-  size_t frames_since_kf_;
+  size_t pics_since_key_;
   uint8_t num_temporal_layers_;
   uint8_t num_spatial_layers_;
+  InterLayerPredMode inter_layer_pred_;
 
   // Used for flexible mode.
   bool is_flexible_mode_;