diff --git a/modules/video_coding/codecs/vp9/include/vp9_globals.h b/modules/video_coding/codecs/vp9/include/vp9_globals.h index 87dafe4cdf..e6f644ec11 100644 --- a/modules/video_coding/codecs/vp9/include/vp9_globals.h +++ b/modules/video_coding/codecs/vp9/include/vp9_globals.h @@ -46,14 +46,14 @@ struct GofInfoVP9 { case kTemporalStructureMode1: num_frames_in_gof = 1; temporal_idx[0] = 0; - temporal_up_switch[0] = false; + temporal_up_switch[0] = true; num_ref_pics[0] = 1; pid_diff[0][0] = 1; break; case kTemporalStructureMode2: num_frames_in_gof = 2; temporal_idx[0] = 0; - temporal_up_switch[0] = false; + temporal_up_switch[0] = true; num_ref_pics[0] = 1; pid_diff[0][0] = 2; @@ -65,7 +65,7 @@ struct GofInfoVP9 { case kTemporalStructureMode3: num_frames_in_gof = 4; temporal_idx[0] = 0; - temporal_up_switch[0] = false; + temporal_up_switch[0] = true; num_ref_pics[0] = 1; pid_diff[0][0] = 4; @@ -87,7 +87,7 @@ struct GofInfoVP9 { case kTemporalStructureMode4: num_frames_in_gof = 8; temporal_idx[0] = 0; - temporal_up_switch[0] = false; + temporal_up_switch[0] = true; num_ref_pics[0] = 1; pid_diff[0][0] = 4; @@ -97,12 +97,12 @@ struct GofInfoVP9 { pid_diff[1][0] = 1; temporal_idx[2] = 1; - temporal_up_switch[2] = true; + temporal_up_switch[2] = false; num_ref_pics[2] = 1; pid_diff[2][0] = 2; temporal_idx[3] = 2; - temporal_up_switch[3] = false; + temporal_up_switch[3] = true; num_ref_pics[3] = 2; pid_diff[3][0] = 1; pid_diff[3][1] = 2; @@ -113,7 +113,7 @@ struct GofInfoVP9 { pid_diff[4][0] = 4; temporal_idx[5] = 2; - temporal_up_switch[5] = false; + temporal_up_switch[5] = true; num_ref_pics[5] = 2; pid_diff[5][0] = 1; pid_diff[5][1] = 2; @@ -125,7 +125,7 @@ struct GofInfoVP9 { pid_diff[6][1] = 4; temporal_idx[7] = 2; - temporal_up_switch[7] = false; + temporal_up_switch[7] = true; num_ref_pics[7] = 2; pid_diff[7][0] = 1; pid_diff[7][1] = 2; @@ -195,7 +195,10 @@ struct RTPVideoHeaderVP9 { uint8_t temporal_idx; // Temporal layer index, or kNoTemporalIdx. 
uint8_t spatial_idx; // Spatial layer index, or kNoSpatialIdx. bool temporal_up_switch; // True if upswitch to higher frame rate is possible - // starting from this frame. + // meaning subsequent higher temporal layer pictures + // will not depend on any picture before the current + // picture (in coding order) with temporal layer ID + // greater than `temporal_idx` of this frame. bool inter_layer_predicted; // Frame is dependent on directly lower spatial // layer frame. diff --git a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc index 0c3196c854..99680cbe79 100644 --- a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc +++ b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc @@ -959,7 +959,7 @@ int LibvpxVp9Encoder::Encode(const VideoFrame& input_image, const size_t gof_idx = (pics_since_key_ + 1) % gof_.num_frames_in_gof; layer_id.temporal_layer_id = gof_.temporal_idx[gof_idx]; - if (VideoCodecMode::kScreensharing == codec_.mode) { + if (codec_.mode == VideoCodecMode::kScreensharing) { const uint32_t frame_timestamp_ms = 1000 * input_image.timestamp() / kVideoPayloadTypeFrequency; @@ -1212,8 +1212,7 @@ int LibvpxVp9Encoder::Encode(const VideoFrame& input_image, bool LibvpxVp9Encoder::PopulateCodecSpecific(CodecSpecificInfo* codec_specific, absl::optional* spatial_idx, - const vpx_codec_cx_pkt& pkt, - uint32_t timestamp) { + const vpx_codec_cx_pkt& pkt) { RTC_CHECK(codec_specific != nullptr); codec_specific->codecType = kVideoCodecVP9; CodecSpecificInfoVP9* vp9_info = &(codec_specific->codecSpecific.VP9); @@ -1248,9 +1247,6 @@ bool LibvpxVp9Encoder::PopulateCodecSpecific(CodecSpecificInfo* codec_specific, *spatial_idx = layer_id.spatial_layer_id; } - // TODO(asapersson): this info has to be obtained from the encoder. 
- vp9_info->temporal_up_switch = false; - const bool is_key_pic = (pics_since_key_ == 0); const bool is_inter_layer_pred_allowed = (inter_layer_pred_ == InterLayerPredMode::kOn || @@ -1283,6 +1279,20 @@ bool LibvpxVp9Encoder::PopulateCodecSpecific(CodecSpecificInfo* codec_specific, vp9_info); if (vp9_info->flexible_mode) { vp9_info->gof_idx = kNoGofIdx; + if (!svc_controller_) { + if (num_temporal_layers_ == 1) { + vp9_info->temporal_up_switch = true; + } else { + // In flexible mode with > 1 temporal layer but no SVC controller we + // can't technically determine if a frame is an upswitch point, use + // gof-based data as proxy for now. + // TODO(sprang): Remove once SVC controller is the only choice. + vp9_info->gof_idx = + static_cast(pics_since_key_ % gof_.num_frames_in_gof); + vp9_info->temporal_up_switch = + gof_.temporal_up_switch[vp9_info->gof_idx]; + } + } } else { vp9_info->gof_idx = static_cast(pics_since_key_ % gof_.num_frames_in_gof); @@ -1353,6 +1363,23 @@ bool LibvpxVp9Encoder::PopulateCodecSpecific(CodecSpecificInfo* codec_specific, svc_params_.scaling_factor_den[sid]); } } + if (is_flexible_mode_) { + // Populate data for legacy temporal-upswitch state. + // We can switch up to a higher temporal layer only if all temporal layers + // higher than this (within the current spatial layer) are switch points. + vp9_info->temporal_up_switch = true; + for (int i = layer_id.temporal_layer_id + 1; i < num_temporal_layers_; + ++i) { + // Assumes decode targets are always ordered first by spatial then by + // temporal id. 
+ size_t dti_index = + (layer_id.spatial_layer_id * num_temporal_layers_) + i; + vp9_info->temporal_up_switch &= + (codec_specific->generic_frame_info + ->decode_target_indications[dti_index] == + DecodeTargetIndication::kSwitch); + } + } } return true; } @@ -1428,8 +1455,6 @@ void LibvpxVp9Encoder::FillReferenceIndices(const vpx_codec_cx_pkt& pkt, ref_buf_list.push_back(ref_buf_.at(0)); } - size_t max_ref_temporal_layer_id = 0; - std::vector ref_pid_list; vp9_info->num_ref_pics = 0; @@ -1461,9 +1486,6 @@ void LibvpxVp9Encoder::FillReferenceIndices(const vpx_codec_cx_pkt& pkt, vp9_info->p_diff[vp9_info->num_ref_pics] = static_cast(p_diff); ++vp9_info->num_ref_pics; - - max_ref_temporal_layer_id = - std::max(max_ref_temporal_layer_id, ref_buf.temporal_layer_id); } else { RTC_DCHECK(inter_layer_predicted); // RTP spec only allows to use previous spatial layer for inter-layer @@ -1471,10 +1493,6 @@ void LibvpxVp9Encoder::FillReferenceIndices(const vpx_codec_cx_pkt& pkt, RTC_DCHECK_EQ(ref_buf.spatial_layer_id + 1, layer_id.spatial_layer_id); } } - - vp9_info->temporal_up_switch = - (max_ref_temporal_layer_id < - static_cast(layer_id.temporal_layer_id)); } void LibvpxVp9Encoder::UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt, @@ -1636,8 +1654,7 @@ void LibvpxVp9Encoder::GetEncodedLayerFrame(const vpx_codec_cx_pkt* pkt) { codec_specific_ = {}; absl::optional spatial_index; - if (!PopulateCodecSpecific(&codec_specific_, &spatial_index, *pkt, - input_image_->timestamp())) { + if (!PopulateCodecSpecific(&codec_specific_, &spatial_index, *pkt)) { // Drop the frame. 
encoded_image_.set_size(0); return; diff --git a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h index d2f98c1ea4..93b2a59139 100644 --- a/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h +++ b/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.h @@ -67,8 +67,7 @@ class LibvpxVp9Encoder : public VP9Encoder { bool PopulateCodecSpecific(CodecSpecificInfo* codec_specific, absl::optional* spatial_idx, - const vpx_codec_cx_pkt& pkt, - uint32_t timestamp); + const vpx_codec_cx_pkt& pkt); void FillReferenceIndices(const vpx_codec_cx_pkt& pkt, const size_t pic_num, const bool inter_layer_predicted, diff --git a/video/video_send_stream_tests.cc b/video/video_send_stream_tests.cc index f948b03655..a7ba243ed2 100644 --- a/video/video_send_stream_tests.cc +++ b/video/video_send_stream_tests.cc @@ -3089,20 +3089,20 @@ class Vp9HeaderObserver : public test::SendTest { void VerifyTemporalLayerStructure0(const RTPVideoHeaderVP9& vp9) const { EXPECT_EQ(kNoTl0PicIdx, vp9.tl0_pic_idx); EXPECT_EQ(kNoTemporalIdx, vp9.temporal_idx); // no tid + // Technically true, but layer indices not available. EXPECT_FALSE(vp9.temporal_up_switch); } void VerifyTemporalLayerStructure1(const RTPVideoHeaderVP9& vp9) const { EXPECT_NE(kNoTl0PicIdx, vp9.tl0_pic_idx); EXPECT_EQ(0, vp9.temporal_idx); // 0,0,0,... - EXPECT_FALSE(vp9.temporal_up_switch); } void VerifyTemporalLayerStructure2(const RTPVideoHeaderVP9& vp9) const { EXPECT_NE(kNoTl0PicIdx, vp9.tl0_pic_idx); EXPECT_GE(vp9.temporal_idx, 0); // 0,1,0,1,... (tid reset on I-frames). EXPECT_LE(vp9.temporal_idx, 1); - EXPECT_EQ(vp9.temporal_idx > 0, vp9.temporal_up_switch); + EXPECT_TRUE(vp9.temporal_up_switch); if (IsNewPictureId(vp9)) { uint8_t expected_tid = (!vp9.inter_pic_predicted || last_vp9_.temporal_idx == 1) ? 
0 : 1; @@ -3116,18 +3116,16 @@ class Vp9HeaderObserver : public test::SendTest { EXPECT_LE(vp9.temporal_idx, 2); if (IsNewPictureId(vp9) && vp9.inter_pic_predicted) { EXPECT_NE(vp9.temporal_idx, last_vp9_.temporal_idx); + EXPECT_TRUE(vp9.temporal_up_switch); switch (vp9.temporal_idx) { case 0: - EXPECT_EQ(2, last_vp9_.temporal_idx); - EXPECT_FALSE(vp9.temporal_up_switch); + EXPECT_EQ(last_vp9_.temporal_idx, 2); break; case 1: - EXPECT_EQ(2, last_vp9_.temporal_idx); - EXPECT_TRUE(vp9.temporal_up_switch); + EXPECT_EQ(last_vp9_.temporal_idx, 2); break; case 2: EXPECT_LT(last_vp9_.temporal_idx, 2); - EXPECT_TRUE(vp9.temporal_up_switch); break; } } @@ -3192,8 +3190,12 @@ class Vp9HeaderObserver : public test::SendTest { EXPECT_FALSE(vp9.inter_pic_predicted); // P if (!vp9.inter_pic_predicted) { - EXPECT_TRUE(vp9.temporal_idx == 0 || vp9.temporal_idx == kNoTemporalIdx); - EXPECT_FALSE(vp9.temporal_up_switch); + if (vp9.temporal_idx == kNoTemporalIdx) { + EXPECT_FALSE(vp9.temporal_up_switch); + } else { + EXPECT_EQ(vp9.temporal_idx, 0); + EXPECT_TRUE(vp9.temporal_up_switch); + } } }