Support IVF source in video codec tester

and move frame buffering from video source to decoder.

Bug: webrtc:42225151, b/337757868
Change-Id: I577031da02065ff4a2d0bce4ac0f8ee411823d4f
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/353341
Commit-Queue: Sergey Silkin <ssilkin@webrtc.org>
Reviewed-by: Mirko Bonadei <mbonadei@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#42428}
2024-06-03 16:16:01 +02:00 · 2024-06-03 16:16:01 +02:00 · b792d60929
commit b792d60929
parent ed3040193c
3 changed files with 161 additions and 56 deletions
--- a/test/BUILD.gn
+++ b/test/BUILD.gn
@@ -722,6 +722,8 @@ if (rtc_include_tests) {
        "../api/units:time_delta",
        "../api/video:encoded_image",
        "../api/video:video_frame",
+        "../api/video_codecs:builtin_video_decoder_factory",
+        "../api/video_codecs:builtin_video_encoder_factory",
        "../api/video_codecs:scalability_mode",
        "../api/video_codecs:video_codecs_api",
        "../call:video_stream_api",
@@ -1338,7 +1340,10 @@ rtc_library("video_codec_tester") {
  deps = [
    ":scoped_key_value_config",
    "../api:array_view",
+    "../api:create_frame_generator",
+    "../api:frame_generator_api",
    "../api/environment",
+    "../api/environment:environment_factory",
    "../api/numerics:numerics",
    "../api/test/metrics:metric",
    "../api/test/metrics:metrics_logger",
--- a/test/video_codec_tester.cc
+++ b/test/video_codec_tester.cc
@@ -18,6 +18,9 @@
 #include "absl/strings/match.h"
 #include "api/array_view.h"
 #include "api/environment/environment.h"
+#include "api/environment/environment_factory.h"
+#include "api/test/create_frame_generator.h"
+#include "api/test/frame_generator_interface.h"
 #include "api/units/time_delta.h"
 #include "api/units/timestamp.h"
 #include "api/video/builtin_video_bitrate_allocator_factory.h"
@@ -83,36 +86,66 @@ const std::set<ScalabilityMode> kKeySvcScalabilityModes{
    ScalabilityMode::kL3T1_KEY,       ScalabilityMode::kL3T2_KEY,
    ScalabilityMode::kL3T3_KEY};

-// A thread-safe raw video frame reader.
+rtc::scoped_refptr<VideoFrameBuffer> ScaleFrame(
+    rtc::scoped_refptr<VideoFrameBuffer> buffer,
+    int scaled_width,
+    int scaled_height) {
+  if (buffer->width() == scaled_width && buffer->height() == scaled_height) {
+    return buffer;
+  }
+  return buffer->Scale(scaled_width, scaled_height);
+}
+
+// A video source that reads frames from YUV, Y4M or IVF (compressed with VPx,
+// AV1 or H264) files.
 class VideoSource {
 public:
  explicit VideoSource(VideoSourceSettings source_settings)
      : source_settings_(source_settings) {
-    MutexLock lock(&mutex_);
-    if (absl::EndsWith(source_settings.file_path, "y4m")) {
-      frame_reader_ =
+    if (absl::EndsWith(source_settings.file_path, "ivf")) {
+      ivf_reader_ = CreateFromIvfFileFrameGenerator(CreateEnvironment(),
+                                                    source_settings.file_path);
+    } else if (absl::EndsWith(source_settings.file_path, "y4m")) {
+      yuv_reader_ =
          CreateY4mFrameReader(source_settings_.file_path,
                               YuvFrameReaderImpl::RepeatMode::kPingPong);
    } else {
-      frame_reader_ = CreateYuvFrameReader(
+      yuv_reader_ = CreateYuvFrameReader(
          source_settings_.file_path, source_settings_.resolution,
          YuvFrameReaderImpl::RepeatMode::kPingPong);
    }
-    RTC_CHECK(frame_reader_);
+    RTC_CHECK(ivf_reader_ || yuv_reader_);
  }

-  // Pulls next frame.
  VideoFrame PullFrame(uint32_t timestamp_rtp,
-                       Resolution resolution,
-                       Frequency framerate) {
-    MutexLock lock(&mutex_);
-    int frame_num;
-    auto buffer = frame_reader_->PullFrame(
-        &frame_num, resolution,
-        {.num = framerate.millihertz<int>(),
-         .den = source_settings_.framerate.millihertz<int>()});
-    RTC_CHECK(buffer) << "Can not pull frame. RTP timestamp " << timestamp_rtp;
-    frame_num_[timestamp_rtp] = frame_num;
+                       Resolution output_resolution,
+                       Frequency output_framerate) {
+    // If the source and output frame rates differ, resampling is performed by
+    // skipping or repeating source frames.
+    time_delta_ = time_delta_.value_or(1 / source_settings_.framerate);
+    int seek = 0;
+    while (time_delta_->us() <= 0) {
+      *time_delta_ += 1 / source_settings_.framerate;
+      ++seek;
+    }
+    *time_delta_ -= 1 / output_framerate;
+
+    if (seek > 0 || last_frame_ == nullptr) {
+      rtc::scoped_refptr<VideoFrameBuffer> buffer;
+      do {
+        if (yuv_reader_) {
+          buffer = yuv_reader_->PullFrame();
+        } else {
+          buffer = ivf_reader_->NextFrame().buffer;
+        }
+      } while (--seek > 0);
+      RTC_CHECK(buffer) << "Could not read frame. timestamp_rtp "
+                        << timestamp_rtp;
+      last_frame_ = buffer;
+    }
+
+    rtc::scoped_refptr<VideoFrameBuffer> buffer = ScaleFrame(
+        last_frame_, output_resolution.width, output_resolution.height);
    return VideoFrame::Builder()
        .set_video_frame_buffer(buffer)
        .set_rtp_timestamp(timestamp_rtp)
@@ -120,27 +153,16 @@ class VideoSource {
        .build();
  }

-  // Reads frame specified by `timestamp_rtp`, scales it to `resolution` and
-  // returns. Frame with the given `timestamp_rtp` is expected to be pulled
-  // before.
-  VideoFrame ReadFrame(uint32_t timestamp_rtp, Resolution resolution) {
-    MutexLock lock(&mutex_);
-    RTC_CHECK(frame_num_.find(timestamp_rtp) != frame_num_.end())
-        << "Frame with RTP timestamp " << timestamp_rtp
-        << " was not pulled before";
-    auto buffer =
-        frame_reader_->ReadFrame(frame_num_.at(timestamp_rtp), resolution);
-    return VideoFrame::Builder()
-        .set_video_frame_buffer(buffer)
-        .set_rtp_timestamp(timestamp_rtp)
-        .build();
-  }
-
 private:
  VideoSourceSettings source_settings_;
-  std::unique_ptr<FrameReader> frame_reader_ RTC_GUARDED_BY(mutex_);
-  std::map<uint32_t, int> frame_num_ RTC_GUARDED_BY(mutex_);
-  Mutex mutex_;
+  std::unique_ptr<FrameReader> yuv_reader_;
+  std::unique_ptr<FrameGeneratorInterface> ivf_reader_;
+  rtc::scoped_refptr<VideoFrameBuffer> last_frame_;
+  // Time delta between the source and output video. Used for frame rate
+  // scaling. This value increases by the source frame duration each time a
+  // frame is read from the source, and decreases by the output frame duration
+  // each time an output frame is delivered.
+  absl::optional<TimeDelta> time_delta_;
 };

 // Pacer calculates delay necessary to keep frame encode or decode call spaced
@@ -345,9 +367,6 @@ class LeakyBucket {

 class VideoCodecAnalyzer : public VideoCodecTester::VideoCodecStats {
 public:
-  explicit VideoCodecAnalyzer(VideoSource* video_source)
-      : video_source_(video_source) {}
-
  void StartEncode(const VideoFrame& video_frame,
                   const EncodingSettings& encoding_settings) {
    int64_t encode_start_us = rtc::TimeMicros();
@@ -436,7 +455,9 @@ class VideoCodecAnalyzer : public VideoCodecTester::VideoCodecStats {
        });
  }

-  void FinishDecode(const VideoFrame& decoded_frame, int spatial_idx) {
+  void FinishDecode(const VideoFrame& decoded_frame,
+                    int spatial_idx,
+                    absl::optional<VideoFrame> ref_frame = absl::nullopt) {
    int64_t decode_finished_us = rtc::TimeMicros();
    task_queue_.PostTask([this, timestamp_rtp = decoded_frame.rtp_timestamp(),
                          spatial_idx, width = decoded_frame.width(),
@@ -452,20 +473,19 @@ class VideoCodecAnalyzer : public VideoCodecTester::VideoCodecStats {
      frame.decoded = true;
    });

-    if (video_source_ != nullptr) {
+    if (ref_frame.has_value()) {
      // Copy hardware-backed frame into main memory to release output buffers
      // which number may be limited in hardware decoders.
      rtc::scoped_refptr<I420BufferInterface> decoded_buffer =
          decoded_frame.video_frame_buffer()->ToI420();

-      task_queue_.PostTask([this, decoded_buffer,
+      task_queue_.PostTask([this, decoded_buffer, ref_frame,
                            timestamp_rtp = decoded_frame.rtp_timestamp(),
                            spatial_idx]() {
-        VideoFrame ref_frame = video_source_->ReadFrame(
-            timestamp_rtp, {.width = decoded_buffer->width(),
-                            .height = decoded_buffer->height()});
        rtc::scoped_refptr<I420BufferInterface> ref_buffer =
-            ref_frame.video_frame_buffer()->ToI420();
+            ScaleFrame(ref_frame->video_frame_buffer(), decoded_buffer->width(),
+                       decoded_buffer->height())
+                ->ToI420();
        Frame& frame = frames_.at(timestamp_rtp).at(spatial_idx);
        frame.psnr = CalcPsnr(*decoded_buffer, *ref_buffer);
      });
@@ -788,7 +808,6 @@ class VideoCodecAnalyzer : public VideoCodecTester::VideoCodecStats {
    return SamplesStatsCounter::StatsSample{value, time};
  }

-  VideoSource* const video_source_;
  LimitedTaskQueue task_queue_;
  // RTP timestamp -> spatial layer -> Frame
  std::map<uint32_t, std::map<int, Frame>> frames_;
@@ -837,7 +856,8 @@ class Decoder : public DecodedImageCallback {
    });
  }

-  void Decode(const EncodedImage& encoded_frame) {
+  void Decode(const EncodedImage& encoded_frame,
+              absl::optional<VideoFrame> ref_frame = absl::nullopt) {
    int spatial_idx = encoded_frame.SpatialIndex().value_or(
        encoded_frame.SimulcastIndex().value_or(0));
    {
@@ -846,6 +866,10 @@ class Decoder : public DecodedImageCallback {
          << "Spatial index changed from " << *spatial_idx_ << " to "
          << spatial_idx;
      spatial_idx_ = spatial_idx;
+
+      if (ref_frame.has_value()) {
+        ref_frames_.insert({encoded_frame.RtpTimestamp(), *ref_frame});
+      }
    }

    Timestamp pts =
@@ -876,12 +900,20 @@ class Decoder : public DecodedImageCallback {
 private:
  int Decoded(VideoFrame& decoded_frame) override {
    int spatial_idx;
+    absl::optional<VideoFrame> ref_frame;
    {
      MutexLock lock(&mutex_);
      spatial_idx = *spatial_idx_;
+
+      if (ref_frames_.size() > 0) {
+        auto it = ref_frames_.find(decoded_frame.rtp_timestamp());
+        RTC_CHECK(it != ref_frames_.end());
+        ref_frame = it->second;
+        ref_frames_.erase(ref_frames_.begin(), std::next(it));
+      }
    }

-    analyzer_->FinishDecode(decoded_frame, spatial_idx);
+    analyzer_->FinishDecode(decoded_frame, spatial_idx, ref_frame);

    if (y4m_writer_) {
      y4m_writer_->Write(decoded_frame, spatial_idx);
@@ -900,6 +932,7 @@ class Decoder : public DecodedImageCallback {
  std::unique_ptr<TesterY4mWriter> y4m_writer_;
  absl::optional<VideoCodecType> codec_type_;
  absl::optional<int> spatial_idx_ RTC_GUARDED_BY(mutex_);
+  std::map<uint32_t, VideoFrame> ref_frames_ RTC_GUARDED_BY(mutex_);
  Mutex mutex_;
 };

@@ -1573,7 +1606,7 @@ VideoCodecTester::RunDecodeTest(const Environment& env,
                                const DecoderSettings& decoder_settings,
                                const SdpVideoFormat& sdp_video_format) {
  std::unique_ptr<VideoCodecAnalyzer> analyzer =
-      std::make_unique<VideoCodecAnalyzer>(/*video_source=*/nullptr);
+      std::make_unique<VideoCodecAnalyzer>();
  Decoder decoder(env, decoder_factory, decoder_settings, analyzer.get());
  decoder.Initialize(sdp_video_format);

@@ -1595,7 +1628,7 @@ VideoCodecTester::RunEncodeTest(
    const std::map<uint32_t, EncodingSettings>& encoding_settings) {
  VideoSource video_source(source_settings);
  std::unique_ptr<VideoCodecAnalyzer> analyzer =
-      std::make_unique<VideoCodecAnalyzer>(/*video_source=*/nullptr);
+      std::make_unique<VideoCodecAnalyzer>();
  Encoder encoder(env, encoder_factory, encoder_settings, analyzer.get());
  encoder.Initialize(encoding_settings.begin()->second);

@@ -1624,7 +1657,7 @@ VideoCodecTester::RunEncodeDecodeTest(
    const std::map<uint32_t, EncodingSettings>& encoding_settings) {
  VideoSource video_source(source_settings);
  std::unique_ptr<VideoCodecAnalyzer> analyzer =
-      std::make_unique<VideoCodecAnalyzer>(&video_source);
+      std::make_unique<VideoCodecAnalyzer>();
  const EncodingSettings& frame_settings = encoding_settings.begin()->second;
  Encoder encoder(env, encoder_factory, encoder_settings, analyzer.get());
  encoder.Initialize(frame_settings);
@@ -1645,10 +1678,11 @@ VideoCodecTester::RunEncodeDecodeTest(
    VideoFrame source_frame = video_source.PullFrame(
        timestamp_rtp, top_layer.resolution, top_layer.framerate);
    encoder.Encode(source_frame, frame_settings,
-                   [&decoders](const EncodedImage& encoded_frame) {
+                   [&decoders,
+                    source_frame](const EncodedImage& encoded_frame) {
                     int sidx = encoded_frame.SpatialIndex().value_or(
                         encoded_frame.SimulcastIndex().value_or(0));
-                     decoders.at(sidx)->Decode(encoded_frame);
+                     decoders.at(sidx)->Decode(encoded_frame, source_frame);
                   });
  }

--- a/test/video_codec_tester_unittest.cc
+++ b/test/video_codec_tester_unittest.cc
@@ -29,6 +29,8 @@
 #include "api/units/time_delta.h"
 #include "api/video/i420_buffer.h"
 #include "api/video/video_frame.h"
+#include "api/video_codecs/builtin_video_decoder_factory.h"
+#include "api/video_codecs/builtin_video_encoder_factory.h"
 #include "api/video_codecs/scalability_mode.h"
 #include "api/video_codecs/video_decoder.h"
 #include "api/video_codecs/video_encoder.h"
@@ -185,9 +187,11 @@ class VideoCodecTesterTest : public ::testing::Test {
  std::unique_ptr<VideoCodecStats> RunEncodeDecodeTest(
      std::string codec_type,
      ScalabilityMode scalability_mode,
-      std::vector<std::vector<Frame>> encoded_frames) {
+      std::vector<std::vector<Frame>> encoded_frames,
+      absl::optional<int> num_source_frames = absl::nullopt) {
    int num_frames = encoded_frames.size();
-    std::string yuv_path = CreateYuvFile(kWidth, kHeight, num_frames);
+    std::string yuv_path =
+        CreateYuvFile(kWidth, kHeight, num_source_frames.value_or(num_frames));
    VideoSourceSettings video_source_settings{
        .file_path = yuv_path,
        .resolution = {.width = kWidth, .height = kHeight},
@@ -486,6 +490,33 @@ TEST_F(VideoCodecTesterTest, Psnr) {
  EXPECT_NEAR(slice[1].psnr->v, 34, 1);
 }

+TEST_F(VideoCodecTesterTest, ReversePlayback) {
+  std::unique_ptr<VideoCodecStats> stats = RunEncodeDecodeTest(
+      "VP8", ScalabilityMode::kL1T1,
+      {{{.timestamp_rtp = 0, .frame_size = DataSize::Bytes(1)}},
+       {{.timestamp_rtp = 1, .frame_size = DataSize::Bytes(1)}},
+       {{.timestamp_rtp = 2, .frame_size = DataSize::Bytes(1)}},
+       {{.timestamp_rtp = 3, .frame_size = DataSize::Bytes(1)}},
+       {{.timestamp_rtp = 4, .frame_size = DataSize::Bytes(1)}},
+       {{.timestamp_rtp = 5, .frame_size = DataSize::Bytes(1)}}},
+      /*num_source_frames=*/3);
+
+  std::vector<Frame> slice = stats->Slice(Filter{}, /*merge=*/false);
+  ASSERT_THAT(slice, SizeIs(6));
+  ASSERT_TRUE(slice[0].psnr.has_value());
+  ASSERT_TRUE(slice[1].psnr.has_value());
+  ASSERT_TRUE(slice[2].psnr.has_value());
+  ASSERT_TRUE(slice[3].psnr.has_value());
+  ASSERT_TRUE(slice[4].psnr.has_value());
+  ASSERT_TRUE(slice[5].psnr.has_value());
+  EXPECT_NEAR(slice[0].psnr->y, 48, 1);
+  EXPECT_NEAR(slice[1].psnr->y, 42, 1);
+  EXPECT_NEAR(slice[2].psnr->y, 34, 1);
+  EXPECT_NEAR(slice[3].psnr->y, 42, 1);
+  EXPECT_NEAR(slice[4].psnr->y, 48, 1);
+  EXPECT_NEAR(slice[5].psnr->y, 42, 1);
+}
+
 struct ScalabilityTestParameters {
  std::string codec_type;
  ScalabilityMode scalability_mode;
@@ -871,5 +902,40 @@ INSTANTIATE_TEST_SUITE_P(
                DataRate::KilobitsPerSec(700), DataRate::KilobitsPerSec(800),
                DataRate::KilobitsPerSec(900)}}));

+// TODO(webrtc:42225151): Add an IVF test stream and enable the test.
+TEST(VideoCodecTester, DISABLED_CompressedVideoSource) {
+  const Environment env = CreateEnvironment();
+  std::unique_ptr<VideoEncoderFactory> encoder_factory =
+      CreateBuiltinVideoEncoderFactory();
+  std::unique_ptr<VideoDecoderFactory> decoder_factory =
+      CreateBuiltinVideoDecoderFactory();
+
+  VideoSourceSettings source_settings{
+      .file_path = ".ivf",
+      .resolution = {.width = 320, .height = 180},
+      .framerate = Frequency::Hertz(30)};
+
+  EncodingSettings encoding_settings = VideoCodecTester::CreateEncodingSettings(
+      env, "AV1", "L1T1", 320, 180, {DataRate::KilobitsPerSec(128)},
+      Frequency::Hertz(30));
+
+  std::map<uint32_t, EncodingSettings> frame_settings =
+      VideoCodecTester::CreateFrameSettings(encoding_settings, 3);
+
+  std::unique_ptr<VideoCodecStats> stats =
+      VideoCodecTester::RunEncodeDecodeTest(
+          env, source_settings, encoder_factory.get(), decoder_factory.get(),
+          EncoderSettings{}, DecoderSettings{}, frame_settings);
+
+  std::vector<Frame> slice = stats->Slice(Filter{}, /*merge=*/false);
+  ASSERT_THAT(slice, SizeIs(3));
+  ASSERT_TRUE(slice[0].psnr.has_value());
+  ASSERT_TRUE(slice[1].psnr.has_value());
+  ASSERT_TRUE(slice[2].psnr.has_value());
+  EXPECT_NEAR(slice[0].psnr->y, 42, 1);
+  EXPECT_NEAR(slice[1].psnr->y, 38, 1);
+  EXPECT_NEAR(slice[1].psnr->v, 38, 1);
+}
+
 }  // namespace test
 }  // namespace webrtc