From c99753ac8f051e379ae68e281aaef04b0a5ca8f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Bostr=C3=B6m?= <hbos@webrtc.org>
Date: Mon, 27 Mar 2023 20:23:30 +0200
Subject: [PATCH] =?UTF-8?q?Fix=20VP9=20{active,inactive,inactive}=C2=A0bit?=
 =?UTF-8?q?rate=20issue=20causing=20spatial=20drop.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The EncoderStreamFactory triggers different code paths depending on
`number_of_streams`: one for simulcast and one for non-simulcast.
The non-simulcast path is desired for both normal streams and SVC
streams.

The simulcast path gives sensible max bitrates for 4:2:1 scenarios, but
when encodings like {active,inactive,inactive} is specified in order to
do standard SVC, the max bps of the first encoding is so low that an
SVC stream will never send more than its first spatial layer (even when
scaleResolutionDownBy is 1).

Because of this, standard SVC is broken. This CL fixes this problem by
using the CreateDefaultVideoStreams() code path instead, which is the
same one that legacy SVC uses. With this fix, legacy and standard SVC
produce the same behavior regarding bitrate.

An added benefit of this is that numberOfSimulcastStreams == 1 in the
standard SVC path as well.

{active,inactive,inactive} tests are updated to verify the full
resolution is achieved after ramp-up. I've also confirmed that this
fixes the bug in Canary, see https://crbug.com/1428098#c2.

Bug: chromium:1428098, webrtc:15041, webrtc:15034
Change-Id: Ia1eb4ff59c4e2a56af833f7ac907a66bca8ea054
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/299147
Reviewed-by: Ilya Nikolaevskiy <ilnik@webrtc.org>
Commit-Queue: Henrik Boström <hbos@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#39697}
---
 pc/peer_connection_simulcast_unittest.cc | 107 ++++++++++++++++++-----
 video/config/encoder_stream_factory.cc   |  24 ++++-
 2 files changed, 106 insertions(+), 25 deletions(-)
diff --git a/pc/peer_connection_simulcast_unittest.cc b/pc/peer_connection_simulcast_unittest.cc
index 23d598cc0a..05bf78f3cc 100644
--- a/pc/peer_connection_simulcast_unittest.cc
+++ b/pc/peer_connection_simulcast_unittest.cc
@@ -1007,15 +1007,31 @@ class PeerConnectionSimulcastWithMediaFlowTests
   bool HasOutboundRtpWithRidAndScalabilityMode(
       rtc::scoped_refptr<PeerConnectionTestWrapper> pc_wrapper,
       absl::string_view rid,
-      absl::string_view expected_scalability_mode) {
+      absl::string_view expected_scalability_mode,
+      uint32_t frame_height) {
     rtc::scoped_refptr<const RTCStatsReport> report = GetStats(pc_wrapper);
     std::vector<const RTCOutboundRtpStreamStats*> outbound_rtps =
         report->GetStatsOfType<RTCOutboundRtpStreamStats>();
     auto* outbound_rtp = FindOutboundRtpByRid(outbound_rtps, rid);
-    if (!outbound_rtp || !outbound_rtp->scalability_mode.is_defined()) {
+    if (!outbound_rtp || !outbound_rtp->scalability_mode.is_defined() ||
+        *outbound_rtp->scalability_mode != expected_scalability_mode) {
       return false;
     }
-    return *outbound_rtp->scalability_mode == expected_scalability_mode;
+    if (outbound_rtp->frame_height.is_defined()) {
+      RTC_LOG(LS_INFO) << "Waiting for target resolution (" << frame_height
+                       << "p). Currently at " << *outbound_rtp->frame_height
+                       << "p...";
+    } else {
+      RTC_LOG(LS_INFO)
+          << "Waiting for target resolution. No frames encoded yet...";
+    }
+    if (!outbound_rtp->frame_height.is_defined() ||
+        *outbound_rtp->frame_height != frame_height) {
+      // Sleep to avoid log spam when this is used in EXPECT_TRUE_WAIT().
+      rtc::Thread::Current()->SleepMs(1000);
+      return false;
+    }
+    return true;
   }
 
   bool OutboundRtpResolutionsAreLessThanOrEqualToExpectations(
@@ -1366,16 +1382,11 @@ TEST_F(PeerConnectionSimulcastWithMediaFlowTests,
   // We expect to see bytes flowing almost immediately on the lowest layer.
   EXPECT_TRUE_WAIT(HasOutboundRtpBytesSent(local_pc_wrapper, 1u),
                    kDefaultTimeout.ms());
-  EXPECT_TRUE(OutboundRtpResolutionsAreLessThanOrEqualToExpectations(
-      local_pc_wrapper, {{"f", 1280, 720}}));
-  // Verify codec and scalability mode.
-  rtc::scoped_refptr<const RTCStatsReport> report = GetStats(local_pc_wrapper);
-  std::vector<const RTCOutboundRtpStreamStats*> outbound_rtps =
-      report->GetStatsOfType<RTCOutboundRtpStreamStats>();
-  ASSERT_THAT(outbound_rtps, SizeIs(1u));
-  EXPECT_THAT(GetCurrentCodecMimeType(report, *outbound_rtps[0]),
-              StrCaseEq("video/VP9"));
-  EXPECT_THAT(*outbound_rtps[0]->scalability_mode, StrEq("L3T3_KEY"));
+  // Wait until scalability mode is reported and expected resolution reached.
+  // Ramp up time may be significant.
+  EXPECT_TRUE_WAIT(HasOutboundRtpWithRidAndScalabilityMode(
+                       local_pc_wrapper, "f", "L3T3_KEY", 720),
+                   (2 * kLongTimeoutForRampingUp).ms());
 
   // Despite SVC being used on a single RTP stream, GetParameters() returns the
   // three encodings that we configured earlier (this is not spec-compliant but
@@ -1390,8 +1401,9 @@ TEST_F(PeerConnectionSimulcastWithMediaFlowTests,
   EXPECT_FALSE(encodings[2].scalability_mode.has_value());
 }
 
-// The spec-compliant way to configure SVC. The expected outcome is the same as
-// for the legacy SVC case except that we only have one encoding.
+// The spec-compliant way to configure SVC for a single stream. The expected
+// outcome is the same as for the legacy SVC case except that we only have one
+// encoding in GetParameters().
 TEST_F(PeerConnectionSimulcastWithMediaFlowTests,
        SendingOneEncoding_VP9_StandardSVC) {
   rtc::scoped_refptr<PeerConnectionTestWrapper> local_pc_wrapper = CreatePc();
@@ -1439,6 +1451,58 @@ TEST_F(PeerConnectionSimulcastWithMediaFlowTests,
               Optional(std::string("L3T3_KEY")));
 }
 
+// The {active,inactive,inactive} case is technically simulcast but since we
+// only have one active stream, we're able to do SVC (multiple spatial layers
+// is not supported if multiple encodings are active). The expected outcome is
+// the same as above except we end up with two inactive RTP streams which are
+// observable in GetStats().
+TEST_F(PeerConnectionSimulcastWithMediaFlowTests,
+       SendingThreeEncodings_VP9_StandardSVC) {
+  rtc::scoped_refptr<PeerConnectionTestWrapper> local_pc_wrapper = CreatePc();
+  rtc::scoped_refptr<PeerConnectionTestWrapper> remote_pc_wrapper = CreatePc();
+  ExchangeIceCandidates(local_pc_wrapper, remote_pc_wrapper);
+
+  std::vector<SimulcastLayer> layers =
+      CreateLayers({"f", "h", "q"}, /*active=*/true);
+  rtc::scoped_refptr<RtpTransceiverInterface> transceiver =
+      AddTransceiverWithSimulcastLayers(local_pc_wrapper, remote_pc_wrapper,
+                                        layers);
+  std::vector<RtpCodecCapability> codecs =
+      GetCapabilitiesAndRestrictToCodec(local_pc_wrapper, "VP9");
+  transceiver->SetCodecPreferences(codecs);
+  // Configure SVC, a.k.a. "L3T3_KEY".
+  rtc::scoped_refptr<RtpSenderInterface> sender = transceiver->sender();
+  RtpParameters parameters = sender->GetParameters();
+  ASSERT_EQ(parameters.encodings.size(), 3u);
+  parameters.encodings[0].scalability_mode = "L3T3_KEY";
+  parameters.encodings[0].scale_resolution_down_by = 1;
+  parameters.encodings[1].active = false;
+  parameters.encodings[2].active = false;
+  EXPECT_TRUE(sender->SetParameters(parameters).ok());
+
+  NegotiateWithSimulcastTweaks(local_pc_wrapper, remote_pc_wrapper, layers);
+  local_pc_wrapper->WaitForConnection();
+  remote_pc_wrapper->WaitForConnection();
+
+  // Since the standard API is configuring simulcast we get three outbound-rtps,
+  // but only one is active.
+  EXPECT_TRUE_WAIT(HasOutboundRtpBytesSent(local_pc_wrapper, 3u, 1u),
+                   kDefaultTimeout.ms());
+  // Wait until scalability mode is reported and expected resolution reached.
+  // Ramp up time is significant.
+  EXPECT_TRUE_WAIT(HasOutboundRtpWithRidAndScalabilityMode(
+                       local_pc_wrapper, "f", "L3T3_KEY", 720),
+                   (2 * kLongTimeoutForRampingUp).ms());
+
+  // GetParameters() is consistent with what we asked for and got.
+  parameters = sender->GetParameters();
+  ASSERT_EQ(parameters.encodings.size(), 3u);
+  EXPECT_THAT(parameters.encodings[0].scalability_mode,
+              Optional(std::string("L3T3_KEY")));
+  EXPECT_FALSE(parameters.encodings[1].scalability_mode.has_value());
+  EXPECT_FALSE(parameters.encodings[2].scalability_mode.has_value());
+}
+
 TEST_F(PeerConnectionSimulcastWithMediaFlowTests,
        SendingThreeEncodings_VP9_Simulcast) {
   rtc::scoped_refptr<PeerConnectionTestWrapper> local_pc_wrapper = CreatePc();
@@ -1535,7 +1599,7 @@ TEST_F(PeerConnectionSimulcastWithMediaFlowTests,
   ASSERT_EQ(parameters.encodings.size(), 3u);
   parameters.encodings[0].active = true;
   parameters.encodings[0].scalability_mode = "L2T2_KEY";
-  parameters.encodings[0].scale_resolution_down_by = 4.0;
+  parameters.encodings[0].scale_resolution_down_by = 2.0;
   parameters.encodings[1].active = false;
   parameters.encodings[1].scalability_mode = absl::nullopt;
   parameters.encodings[2].active = false;
@@ -1545,11 +1609,12 @@ TEST_F(PeerConnectionSimulcastWithMediaFlowTests,
   // Since the standard API is configuring simulcast we get three outbound-rtps,
   // but only one is active.
   EXPECT_TRUE_WAIT(HasOutboundRtpBytesSent(local_pc_wrapper, 3u, 1u),
-                   kLongTimeoutForRampingUp.ms());
-  // Reduce risk of flakiness by waiting until the scalability mode is reported.
-  EXPECT_TRUE_WAIT(HasOutboundRtpWithRidAndScalabilityMode(local_pc_wrapper,
-                                                           "f", "L2T2_KEY"),
-                   kLongTimeoutForRampingUp.ms());
+                   kDefaultTimeout.ms());
+  // Wait until scalability mode is reported and expected resolution reached.
+  // Ramp up time may be significant.
+  EXPECT_TRUE_WAIT(HasOutboundRtpWithRidAndScalabilityMode(
+                       local_pc_wrapper, "f", "L2T2_KEY", 720 / 2),
+                   (2 * kLongTimeoutForRampingUp).ms());
 
   // GetParameters() does not report any fallback.
   parameters = sender->GetParameters();
diff --git a/video/config/encoder_stream_factory.cc b/video/config/encoder_stream_factory.cc
index fceadf09b4..de475b90eb 100644
--- a/video/config/encoder_stream_factory.cc
+++ b/video/config/encoder_stream_factory.cc
@@ -138,10 +138,26 @@ std::vector<webrtc::VideoStream> EncoderStreamFactory::CreateEncoderStreams(
   const absl::optional<webrtc::DataRate> experimental_min_bitrate =
       GetExperimentalMinVideoBitrate(encoder_config.codec_type);
 
-  if (encoder_config.number_of_streams > 1 ||
-      ((absl::EqualsIgnoreCase(codec_name_, kVp8CodecName) ||
-        absl::EqualsIgnoreCase(codec_name_, kH264CodecName)) &&
-       is_screenshare_ && conference_mode_)) {
+  bool is_simulcast = (encoder_config.number_of_streams > 1);
+  // If scalability mode was specified, don't treat {active,inactive,inactive}
+  // as simulcast since the simulcast configuration assumes very low bitrates
+  // on the first layer. This would prevent rampup of multiple spatial layers.
+  // See https://crbug.com/webrtc/15041.
+  if (is_simulcast &&
+      encoder_config.simulcast_layers[0].scalability_mode.has_value()) {
+    // Require at least one non-first layer to be active for is_simulcast=true.
+    is_simulcast = false;
+    for (size_t i = 1; i < encoder_config.simulcast_layers.size(); ++i) {
+      if (encoder_config.simulcast_layers[i].active) {
+        is_simulcast = true;
+        break;
+      }
+    }
+  }
+
+  if (is_simulcast || ((absl::EqualsIgnoreCase(codec_name_, kVp8CodecName) ||
+                        absl::EqualsIgnoreCase(codec_name_, kH264CodecName)) &&
+                       is_screenshare_ && conference_mode_)) {
     return CreateSimulcastOrConferenceModeScreenshareStreams(
         frame_width, frame_height, encoder_config, experimental_min_bitrate);
   }