From 602b13ac87cd78021fd04d5a1fc8de43e376ce31 Mon Sep 17 00:00:00 2001
From: Evan Shrubsole <eshr@google.com>
Date: Mon, 12 Oct 2020 14:25:41 +0200
Subject: [PATCH] Reland "NV12 support for VP8 simulcast"

This is a reland of 76d3e7a8d1539483a8465d6502cc8259e74ccebf

I have run the WPT tests and verified that they now pass with this
change. Compared to the original CL, the following has changed:

- The old CL assumed that frames returned by ToI420() had type I420,
  but they could be I420A, which caused a crash.
- I fixed a copy-paste error in the offset of the V stride.

Original change's description:
> NV12 support for VP8 simulcast
>
> Tested using video_loopback with generated NV12 frames.
>
> Bug: webrtc:11635, webrtc:11975
> Change-Id: I14b2d663c55a83d80e48e226fcf706cb18903193
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/186722
> Commit-Queue: Evan Shrubsole <eshr@google.com>
> Reviewed-by: Ilya Nikolaevskiy <ilnik@webrtc.org>
> Cr-Commit-Position: refs/heads/master@{#32325}

Bug: webrtc:11635
Bug: webrtc:11975
Change-Id: Ifa790af97cd7ab30c6cb4648ebd140abc1593b0b
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/187490
Reviewed-by: Ilya Nikolaevskiy <ilnik@webrtc.org>
Commit-Queue: Evan Shrubsole <eshr@google.com>
Cr-Commit-Position: refs/heads/master@{#32381}
---
 .../codecs/vp8/libvpx_vp8_encoder.cc          | 150 +++++++++++++-----
 .../codecs/vp8/libvpx_vp8_encoder.h           |   4 +
 .../codecs/vp8/test/vp8_impl_unittest.cc      |  38 +++++
 3 files changed, 154 insertions(+), 38 deletions(-)
diff --git a/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.cc b/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.cc
index 67712de9ec..d52d779712 100644
--- a/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.cc
+++ b/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.cc
@@ -497,6 +497,10 @@ int LibvpxVp8Encoder::InitEncode(const VideoCodec* inst,
     return WEBRTC_VIDEO_CODEC_ERR_PARAMETER;
   }
 
+  // Use the previous pixel format to avoid extra image allocations.
+  vpx_img_fmt_t pixel_format =
+      raw_images_.empty() ? VPX_IMG_FMT_I420 : raw_images_[0].fmt;
+
   int retVal = Release();
   if (retVal < 0) {
     return retVal;
@@ -650,8 +654,8 @@ int LibvpxVp8Encoder::InitEncode(const VideoCodec* inst,
   // Creating a wrapper to the image - setting image data to NULL.
   // Actual pointer will be set in encode. Setting align to 1, as it
   // is meaningless (no memory allocation is done here).
-  libvpx_->img_wrap(&raw_images_[0], VPX_IMG_FMT_I420, inst->width,
-                    inst->height, 1, NULL);
+  libvpx_->img_wrap(&raw_images_[0], pixel_format, inst->width, inst->height, 1,
+                    NULL);
 
   // Note the order we use is different from webm, we have lowest resolution
   // at position 0 and they have highest resolution at position 0.
@@ -699,10 +703,9 @@ int LibvpxVp8Encoder::InitEncode(const VideoCodec* inst,
     // Setting alignment to 32 - as that ensures at least 16 for all
     // planes (32 for Y, 16 for U,V). Libvpx sets the requested stride for
     // the y plane, but only half of it to the u and v planes.
-    libvpx_->img_alloc(&raw_images_[i], VPX_IMG_FMT_I420,
-                       inst->simulcastStream[stream_idx].width,
-                       inst->simulcastStream[stream_idx].height,
-                       kVp832ByteAlign);
+    libvpx_->img_alloc(
+        &raw_images_[i], pixel_format, inst->simulcastStream[stream_idx].width,
+        inst->simulcastStream[stream_idx].height, kVp832ByteAlign);
     SetStreamState(stream_bitrates[stream_idx] > 0, stream_idx);
     vpx_configs_[i].rc_target_bitrate = stream_bitrates[stream_idx];
     if (stream_bitrates[stream_idx] > 0) {
@@ -1014,26 +1017,31 @@ int LibvpxVp8Encoder::Encode(const VideoFrame& frame,
     flags[i] = send_key_frame ? VPX_EFLAG_FORCE_KF : EncodeFlags(tl_configs[i]);
   }
 
-  rtc::scoped_refptr<I420BufferInterface> input_image =
-      frame.video_frame_buffer()->ToI420();
+  rtc::scoped_refptr<VideoFrameBuffer> input_image = frame.video_frame_buffer();
   // Since we are extracting raw pointers from |input_image| to
   // |raw_images_[0]|, the resolution of these frames must match.
   RTC_DCHECK_EQ(input_image->width(), raw_images_[0].d_w);
   RTC_DCHECK_EQ(input_image->height(), raw_images_[0].d_h);
-
-  // Image in vpx_image_t format.
-  // Input image is const. VP8's raw image is not defined as const.
-  raw_images_[0].planes[VPX_PLANE_Y] =
-      const_cast<uint8_t*>(input_image->DataY());
-  raw_images_[0].planes[VPX_PLANE_U] =
-      const_cast<uint8_t*>(input_image->DataU());
-  raw_images_[0].planes[VPX_PLANE_V] =
-      const_cast<uint8_t*>(input_image->DataV());
-
-  raw_images_[0].stride[VPX_PLANE_Y] = input_image->StrideY();
-  raw_images_[0].stride[VPX_PLANE_U] = input_image->StrideU();
-  raw_images_[0].stride[VPX_PLANE_V] = input_image->StrideV();
-
+  switch (input_image->type()) {
+    case VideoFrameBuffer::Type::kI420:
+      PrepareI420Image(input_image->GetI420());
+      break;
+    case VideoFrameBuffer::Type::kNV12:
+      PrepareNV12Image(input_image->GetNV12());
+      break;
+    default: {
+      rtc::scoped_refptr<I420BufferInterface> i420_image =
+          input_image->ToI420();
+      if (!i420_image) {
+        RTC_LOG(LS_ERROR) << "Failed to convert "
+                          << VideoFrameBufferTypeToString(input_image->type())
+                          << " image to I420. Can't encode frame.";
+        return WEBRTC_VIDEO_CODEC_ERROR;
+      }
+      input_image = i420_image;
+      PrepareI420Image(i420_image);
+    }
+  }
   struct CleanUpOnExit {
     explicit CleanUpOnExit(vpx_image_t& raw_image) : raw_image_(raw_image) {}
     ~CleanUpOnExit() {
@@ -1044,22 +1052,6 @@ int LibvpxVp8Encoder::Encode(const VideoFrame& frame,
     vpx_image_t& raw_image_;
   } clean_up_on_exit(raw_images_[0]);
 
-  for (size_t i = 1; i < encoders_.size(); ++i) {
-    // Scale the image down a number of times by downsampling factor
-    libyuv::I420Scale(
-        raw_images_[i - 1].planes[VPX_PLANE_Y],
-        raw_images_[i - 1].stride[VPX_PLANE_Y],
-        raw_images_[i - 1].planes[VPX_PLANE_U],
-        raw_images_[i - 1].stride[VPX_PLANE_U],
-        raw_images_[i - 1].planes[VPX_PLANE_V],
-        raw_images_[i - 1].stride[VPX_PLANE_V], raw_images_[i - 1].d_w,
-        raw_images_[i - 1].d_h, raw_images_[i].planes[VPX_PLANE_Y],
-        raw_images_[i].stride[VPX_PLANE_Y], raw_images_[i].planes[VPX_PLANE_U],
-        raw_images_[i].stride[VPX_PLANE_U], raw_images_[i].planes[VPX_PLANE_V],
-        raw_images_[i].stride[VPX_PLANE_V], raw_images_[i].d_w,
-        raw_images_[i].d_h, libyuv::kFilterBilinear);
-  }
-
   if (send_key_frame) {
     // Adapt the size of the key frame when in screenshare with 1 temporal
     // layer.
@@ -1311,6 +1303,88 @@ int LibvpxVp8Encoder::RegisterEncodeCompleteCallback(
   return WEBRTC_VIDEO_CODEC_OK;
 }
 
+// Re-creates every image in |raw_images_| with |fmt| unless already in use.
+void LibvpxVp8Encoder::MaybeUpdatePixelFormat(vpx_img_fmt fmt) {
+  RTC_DCHECK(!raw_images_.empty());
+  if (raw_images_[0].fmt == fmt) {
+    RTC_DCHECK(std::all_of(
+        std::next(raw_images_.begin()), raw_images_.end(),
+        [fmt](const vpx_image_t& image) { return image.fmt == fmt; }))
+        << "Not all raw images had the right format!";
+    return;
+  }
+  RTC_LOG(INFO) << "Updating vp8 encoder pixel format to "
+                << (fmt == VPX_IMG_FMT_NV12 ? "NV12" : "I420");
+  for (vpx_image_t& image : raw_images_) {
+    const auto width = image.d_w;
+    const auto height = image.d_h;
+    libvpx_->img_free(&image);
+    // Only the first image wraps the input frame; the rest own their buffers.
+    if (&image == &raw_images_.front()) {
+      libvpx_->img_wrap(&image, fmt, width, height, 1, NULL);
+    } else {
+      libvpx_->img_alloc(&image, fmt, width, height, kVp832ByteAlign);
+    }
+  }
+}
+
+// Wraps |frame|'s planes in |raw_images_[0]| and fills each further image
+// with a scaled copy of the image before it.
+void LibvpxVp8Encoder::PrepareI420Image(const I420BufferInterface* frame) {
+  RTC_DCHECK(!raw_images_.empty());
+  MaybeUpdatePixelFormat(VPX_IMG_FMT_I420);
+  // The input image is const, but vpx_image_t's plane pointers are not.
+  vpx_image_t& input = raw_images_[0];
+  input.planes[VPX_PLANE_Y] = const_cast<uint8_t*>(frame->DataY());
+  input.planes[VPX_PLANE_U] = const_cast<uint8_t*>(frame->DataU());
+  input.planes[VPX_PLANE_V] = const_cast<uint8_t*>(frame->DataV());
+
+  input.stride[VPX_PLANE_Y] = frame->StrideY();
+  input.stride[VPX_PLANE_U] = frame->StrideU();
+  input.stride[VPX_PLANE_V] = frame->StrideV();
+
+  for (size_t i = 1; i < encoders_.size(); ++i) {
+    // Scale the previous image down into this one.
+    const vpx_image_t& src = raw_images_[i - 1];
+    vpx_image_t& dst = raw_images_[i];
+    libyuv::I420Scale(
+        src.planes[VPX_PLANE_Y], src.stride[VPX_PLANE_Y],
+        src.planes[VPX_PLANE_U], src.stride[VPX_PLANE_U],
+        src.planes[VPX_PLANE_V], src.stride[VPX_PLANE_V], src.d_w, src.d_h,
+        dst.planes[VPX_PLANE_Y], dst.stride[VPX_PLANE_Y],
+        dst.planes[VPX_PLANE_U], dst.stride[VPX_PLANE_U],
+        dst.planes[VPX_PLANE_V], dst.stride[VPX_PLANE_V], dst.d_w, dst.d_h,
+        libyuv::kFilterBilinear);
+  }
+}
+
+// Wraps |frame|'s planes in |raw_images_[0]| and scales into the other images.
+void LibvpxVp8Encoder::PrepareNV12Image(const NV12BufferInterface* frame) {
+  RTC_DCHECK(!raw_images_.empty());
+  MaybeUpdatePixelFormat(VPX_IMG_FMT_NV12);
+  // The input image is const, but vpx_image_t's plane pointers are not.
+  vpx_image_t& input = raw_images_[0];
+  input.planes[VPX_PLANE_Y] = const_cast<uint8_t*>(frame->DataY());
+  input.planes[VPX_PLANE_U] = const_cast<uint8_t*>(frame->DataUV());
+  // V is addressed one byte past the UV plane start and shares its stride.
+  input.planes[VPX_PLANE_V] = input.planes[VPX_PLANE_U] + 1;
+  input.stride[VPX_PLANE_Y] = frame->StrideY();
+  input.stride[VPX_PLANE_U] = frame->StrideUV();
+  input.stride[VPX_PLANE_V] = frame->StrideUV();
+
+  for (size_t i = 1; i < encoders_.size(); ++i) {
+    const vpx_image_t& src = raw_images_[i - 1];
+    vpx_image_t& dst = raw_images_[i];
+    libyuv::NV12Scale(
+        src.planes[VPX_PLANE_Y], src.stride[VPX_PLANE_Y],
+        src.planes[VPX_PLANE_U], src.stride[VPX_PLANE_U], src.d_w, src.d_h,
+        dst.planes[VPX_PLANE_Y], dst.stride[VPX_PLANE_Y],
+        dst.planes[VPX_PLANE_U], dst.stride[VPX_PLANE_U], dst.d_w, dst.d_h,
+        libyuv::kFilterBilinear);
+    dst.planes[VPX_PLANE_V] = dst.planes[VPX_PLANE_U] + 1;
+  }
+}
+
 // static
 LibvpxVp8Encoder::VariableFramerateExperiment
 LibvpxVp8Encoder::ParseVariableFramerateConfig(std::string group_name) {
diff --git a/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.h b/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.h
index 731a9a08df..c08b9b0883 100644
--- a/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.h
+++ b/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.h
@@ -93,6 +93,10 @@ class LibvpxVp8Encoder : public VideoEncoder {
 
   bool UpdateVpxConfiguration(size_t stream_index);
 
+  void MaybeUpdatePixelFormat(vpx_img_fmt fmt);
+  void PrepareI420Image(const I420BufferInterface* frame);
+  void PrepareNV12Image(const NV12BufferInterface* frame);
+
   const std::unique_ptr<LibvpxInterface> libvpx_;
 
   const CpuSpeedExperiment experimental_cpu_speed_config_arm_;
diff --git a/modules/video_coding/codecs/vp8/test/vp8_impl_unittest.cc b/modules/video_coding/codecs/vp8/test/vp8_impl_unittest.cc
index 18181fdc8e..cc6189701b 100644
--- a/modules/video_coding/codecs/vp8/test/vp8_impl_unittest.cc
+++ b/modules/video_coding/codecs/vp8/test/vp8_impl_unittest.cc
@@ -266,6 +266,44 @@ TEST_F(TestVp8Impl, EncodeFrameAndRelease) {
             encoder_->Encode(NextInputFrame(), nullptr));
 }
 
+TEST_F(TestVp8Impl, EncodeNv12FrameSimulcast) {
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK, encoder_->Release());
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->InitEncode(&codec_settings_, kSettings));
+  // Encode one NV12 frame. NOTE(review): no simulcast setup is visible here.
+  EncodedImage encoded_frame;
+  CodecSpecificInfo codec_specific_info;
+  input_frame_generator_ = test::CreateSquareFrameGenerator(
+      kWidth, kHeight, test::FrameGeneratorInterface::OutputType::kNV12,
+      absl::nullopt);
+  EncodeAndWaitForFrame(NextInputFrame(), &encoded_frame, &codec_specific_info);
+  // After Release(), Encode() is expected to report UNINITIALIZED.
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK, encoder_->Release());
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_UNINITIALIZED,
+            encoder_->Encode(NextInputFrame(), nullptr));
+}
+
+TEST_F(TestVp8Impl, EncodeI420FrameAfterNv12Frame) {
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK, encoder_->Release());
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->InitEncode(&codec_settings_, kSettings));
+  // Encode an NV12 frame first, then an I420 frame, on the same encoder.
+  EncodedImage encoded_frame;
+  CodecSpecificInfo codec_specific_info;
+  input_frame_generator_ = test::CreateSquareFrameGenerator(
+      kWidth, kHeight, test::FrameGeneratorInterface::OutputType::kNV12,
+      absl::nullopt);
+  EncodeAndWaitForFrame(NextInputFrame(), &encoded_frame, &codec_specific_info);
+  input_frame_generator_ = test::CreateSquareFrameGenerator(
+      kWidth, kHeight, test::FrameGeneratorInterface::OutputType::kI420,
+      absl::nullopt);
+  EncodeAndWaitForFrame(NextInputFrame(), &encoded_frame, &codec_specific_info);
+  // After Release(), Encode() is expected to report UNINITIALIZED.
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK, encoder_->Release());
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_UNINITIALIZED,
+            encoder_->Encode(NextInputFrame(), nullptr));
+}
+
 TEST_F(TestVp8Impl, InitDecode) {
   EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK, decoder_->Release());
   EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,