From db36884e76b8802055cb875bbd8f7413a186b475 Mon Sep 17 00:00:00 2001
From: Sergey Silkin <ssilkin@webrtc.org>
Date: Tue, 2 Apr 2024 13:25:56 +0200
Subject: [PATCH] Reland "Mark frames with inter_layer_predicted=true as delta
 frames"

This is a reland of commit 7ae48c452abf8694a1b0a7a9a2aef13a9d10298a with  updated RtpVp9RefFinder

RtpVp9RefFinder relied on the fact that frames with (inter_pic_predicted=true && inter_layer_predicted=true) were marked as keyframes. Since this is not the case anymore, the related code paths in RtpVp9RefFinder have been deleted.

Calculation of gof_info_[] index for non-keyframes has been updated to account for that fact it is now possible to received multiple T0 frames belonging to the same temporal unit (we don't need to do "unwrapped_tl0 - 1" in this case).

Original change's description:
> Mark frames with inter_layer_predicted=true as delta frames
>
> As it is currently implemented, the VP9 depacketizer decides packet's frame type based on p_bit ("Inter-picture predicted layer frame"). p_bit is set to 0 for upper spatial layer frames of keyframe since they do not have temporal refs. This results in marking packets of upper spatial layer frames, and, eventually these frames, of SVC keyframes as "keyframe" while they are in fact delta frames.
>
> Normally spatial layer frames are merged into a superframe and the superframe is passed to decoder. But passing individual layers to a single decoder instance is a valid scenario too and is used in downstream projects. In this case, an upper layer frame marked as keyframe may cause decoder reset [2] and break decoding.
>
> This CL changes frame type decision logic in the VP9 depacketizer such that only packets with both P and D (inter-layer predicted) bits unset are considered as keyframe packets.
>
> When spatial layer frames are merged into a superframe in CombineAndDeleteFrames [1], frame type of the superframe is inferred from the lowest spatial layer frame.
>
> [1] https://source.chromium.org/chromium/chromium/src/+/main:third_party/webrtc/modules/video_coding/frame_helpers.cc;l=53
>
> [2] https://source.corp.google.com/piper///depot/google3/third_party/webrtc/files/stable/webrtc/modules/video_coding/codecs/vp9/libvpx_vp9_decoder.cc;l=209
>
> Bug: webrtc:15827
> Change-Id: Idc3445636f0eae0192dac998876fedec48628560
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/343342
> Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
> Commit-Queue: Sergey Silkin <ssilkin@webrtc.org>
> Cr-Commit-Position: refs/heads/main@{#41939}

Bug: webrtc:15827
Change-Id: Ic69b94989919cf6d353bceea85d0eba63bc500ee
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/344144
Reviewed-by: Philip Eliasson <philipel@webrtc.org>
Commit-Queue: Sergey Silkin <ssilkin@webrtc.org>
Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#41985}
---
 .../source/video_rtp_depacketizer_vp9.cc      |  6 +-
 .../video_rtp_depacketizer_vp9_unittest.cc    | 13 ++++
 modules/video_coding/rtp_vp9_ref_finder.cc    | 64 +++++++++----------
 .../rtp_vp9_ref_finder_unittest.cc            | 12 ++++
 4 files changed, 58 insertions(+), 37 deletions(-)

diff --git a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc
index 41f363d221..a8b04db78d 100644
--- a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc
+++ b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.cc
@@ -180,9 +180,6 @@ int VideoRtpDepacketizerVp9::ParseRtpPayload(
   video_header->simulcastIdx = 0;
   video_header->codec = kVideoCodecVP9;
 
-  video_header->frame_type =
-      p_bit ? VideoFrameType::kVideoFrameDelta : VideoFrameType::kVideoFrameKey;
-
   auto& vp9_header =
       video_header->video_type_header.emplace<RTPVideoHeaderVP9>();
   vp9_header.InitRTPVideoHeaderVP9();
@@ -211,6 +208,9 @@ int VideoRtpDepacketizerVp9::ParseRtpPayload(
       video_header->height = vp9_header.height[0];
     }
   }
+  video_header->frame_type = p_bit || vp9_header.inter_layer_predicted
+                                 ? VideoFrameType::kVideoFrameDelta
+                                 : VideoFrameType::kVideoFrameKey;
   video_header->is_first_packet_in_frame = b_bit;
   video_header->is_last_packet_in_frame = e_bit;
 
diff --git a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc
index 36af59a779..7f89812133 100644
--- a/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc
+++ b/modules/rtp_rtcp/source/video_rtp_depacketizer_vp9_unittest.cc
@@ -369,5 +369,18 @@ TEST(VideoRtpDepacketizerVp9Test, ReferencesInputCopyOnWriteBuffer) {
   // Compare pointers to check there was no copy on write buffer unsharing.
   EXPECT_EQ(parsed->video_payload.cdata(), rtp_payload.cdata() + kHeaderSize);
 }
+
+TEST(VideoRtpDepacketizerVp9Test, InterLayerPredOnlyFrameMerkedAsDelta) {
+  // Set P=0 and D=1 and vefify that the depaketizers marks this packet as a
+  // part of a delta frame (not a keyframe).
+  uint8_t packet[13] = {0};
+  packet[0] = 0b0010'0000;  // I:0 P:0 L:1 F:0 B:0 E:0 V:0 Z:0
+  packet[1] = 0b0000'0001;  // T:000 U:0 S:000 D:1
+  packet[2] = 0;            // TL0PICIDX
+
+  RTPVideoHeader video_header;
+  VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header);
+  EXPECT_EQ(video_header.frame_type, VideoFrameType::kVideoFrameDelta);
+}
 }  // namespace
 }  // namespace webrtc
diff --git a/modules/video_coding/rtp_vp9_ref_finder.cc b/modules/video_coding/rtp_vp9_ref_finder.cc
index 175ed3464b..f0d51adad8 100644
--- a/modules/video_coding/rtp_vp9_ref_finder.cc
+++ b/modules/video_coding/rtp_vp9_ref_finder.cc
@@ -132,24 +132,20 @@ RtpVp9RefFinder::FrameDecision RtpVp9RefFinder::ManageFrameGof(
       FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted);
       return kHandOff;
     }
-  } else if (frame->frame_type() == VideoFrameType::kVideoFrameKey) {
-    if (frame->SpatialIndex() == 0) {
+  } else {
+    if (frame->frame_type() == VideoFrameType::kVideoFrameKey) {
       RTC_LOG(LS_WARNING) << "Received keyframe without scalability structure";
       return kDrop;
     }
-    const auto gof_info_it = gof_info_.find(unwrapped_tl0);
-    if (gof_info_it == gof_info_.end())
-      return kStash;
 
-    info = &gof_info_it->second;
-
-    frame->num_references = 0;
-    FrameReceivedVp9(frame->Id(), info);
-    FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted);
-    return kHandOff;
-  } else {
-    auto gof_info_it = gof_info_.find(
-        (codec_header.temporal_idx == 0) ? unwrapped_tl0 - 1 : unwrapped_tl0);
+    // tl0_idx is incremented on temporal_idx=0 frames of the lowest spatial
+    // layer (which spatial_idx is not necessarily zero). Upper spatial layer
+    // frames with inter-layer prediction use GOF info of their base spatial
+    // layer frames.
+    const bool use_prev_gof =
+        codec_header.temporal_idx == 0 && !codec_header.inter_layer_predicted;
+    auto gof_info_it =
+        gof_info_.find(use_prev_gof ? unwrapped_tl0 - 1 : unwrapped_tl0);
 
     // Gof info for this frame is not available yet, stash this frame.
     if (gof_info_it == gof_info_.end())
@@ -185,29 +181,29 @@ RtpVp9RefFinder::FrameDecision RtpVp9RefFinder::ManageFrameGof(
   auto up_switch_erase_to = up_switch_.lower_bound(old_picture_id);
   up_switch_.erase(up_switch_.begin(), up_switch_erase_to);
 
-  size_t diff =
-      ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start, frame->Id());
-  size_t gof_idx = diff % info->gof->num_frames_in_gof;
+  if (codec_header.inter_pic_predicted) {
+    size_t diff = ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start,
+                                                        frame->Id());
+    size_t gof_idx = diff % info->gof->num_frames_in_gof;
 
-  if (info->gof->num_ref_pics[gof_idx] > EncodedFrame::kMaxFrameReferences) {
-    return kDrop;
-  }
-  // Populate references according to the scalability structure.
-  frame->num_references = info->gof->num_ref_pics[gof_idx];
-  for (size_t i = 0; i < frame->num_references; ++i) {
-    frame->references[i] =
-        Subtract<kFrameIdLength>(frame->Id(), info->gof->pid_diff[gof_idx][i]);
-
-    // If this is a reference to a frame earlier than the last up switch point,
-    // then ignore this reference.
-    if (UpSwitchInIntervalVp9(frame->Id(), codec_header.temporal_idx,
-                              frame->references[i])) {
-      --frame->num_references;
+    if (info->gof->num_ref_pics[gof_idx] > EncodedFrame::kMaxFrameReferences) {
+      return kDrop;
     }
-  }
 
-  // Override GOF references.
-  if (!codec_header.inter_pic_predicted) {
+    // Populate references according to the scalability structure.
+    frame->num_references = info->gof->num_ref_pics[gof_idx];
+    for (size_t i = 0; i < frame->num_references; ++i) {
+      frame->references[i] = Subtract<kFrameIdLength>(
+          frame->Id(), info->gof->pid_diff[gof_idx][i]);
+
+      // If this is a reference to a frame earlier than the last up switch
+      // point, then ignore this reference.
+      if (UpSwitchInIntervalVp9(frame->Id(), codec_header.temporal_idx,
+                                frame->references[i])) {
+        --frame->num_references;
+      }
+    }
+  } else {
     frame->num_references = 0;
   }
 
diff --git a/modules/video_coding/rtp_vp9_ref_finder_unittest.cc b/modules/video_coding/rtp_vp9_ref_finder_unittest.cc
index a3cb31ade5..23b6cfca9a 100644
--- a/modules/video_coding/rtp_vp9_ref_finder_unittest.cc
+++ b/modules/video_coding/rtp_vp9_ref_finder_unittest.cc
@@ -360,6 +360,18 @@ TEST_F(RtpVp9RefFinderTest, GofSkipFramesTemporalLayers_0212) {
   EXPECT_THAT(frames_, HasFrameWithIdAndRefs(35, {30}));
 }
 
+TEST_F(RtpVp9RefFinderTest, GofInterLayerPredS0KeyS1Delta) {
+  GofInfoVP9 ss;
+  ss.SetGofInfoVP9(kTemporalStructureMode1);
+
+  Insert(Frame().Pid(1).SidAndTid(0, 0).Tl0(0).AsKeyFrame().Gof(&ss));
+  Insert(Frame().Pid(1).SidAndTid(1, 0).Tl0(0).AsInterLayer().NotAsInterPic());
+
+  ASSERT_EQ(2UL, frames_.size());
+  EXPECT_THAT(frames_, HasFrameWithIdAndRefs(5, {}));
+  EXPECT_THAT(frames_, HasFrameWithIdAndRefs(6, {5}));
+}
+
 TEST_F(RtpVp9RefFinderTest, GofTemporalLayers_01) {
   GofInfoVP9 ss;
   ss.SetGofInfoVP9(kTemporalStructureMode2);  // 0101 pattern