From 4e6cd5eaeb9e64d213c5436678ac143589fbadd1 Mon Sep 17 00:00:00 2001 From: Sergey Silkin Date: Mon, 28 May 2018 12:26:36 +0200 Subject: [PATCH] Get actual list of references from encoder in flexible mode. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In flexible mode, use VP9E_GET_SVC_REF_FRAME_CONFIG to get indices of reference frame buffers and buffers update by encoded frame. Set inter_pic_predicted to true only if encoder actually used temporal prediction. Bug: webrtc:9244, webrtc:9270 Change-Id: I4e439abeab9e063d50abdcefc59bf58d6596ea6c Reviewed-on: https://webrtc-review.googlesource.com/74780 Commit-Queue: Sergey Silkin Reviewed-by: Åsa Persson Reviewed-by: Marco Paniconi Cr-Commit-Position: refs/heads/master@{#23410} --- modules/video_coding/codecs/vp9/vp9_impl.cc | 153 +++++++++++++++++--- modules/video_coding/codecs/vp9/vp9_impl.h | 25 +++- 2 files changed, 157 insertions(+), 21 deletions(-) diff --git a/modules/video_coding/codecs/vp9/vp9_impl.cc b/modules/video_coding/codecs/vp9/vp9_impl.cc index 366038b52f..c7c52bb809 100644 --- a/modules/video_coding/codecs/vp9/vp9_impl.cc +++ b/modules/video_coding/codecs/vp9/vp9_impl.cc @@ -11,9 +11,7 @@ #include "modules/video_coding/codecs/vp9/vp9_impl.h" -#include -#include -#include +#include #include #include "vpx/vpx_encoder.h" @@ -82,6 +80,7 @@ VP9EncoderImpl::VP9EncoderImpl() pics_since_key_(0), num_temporal_layers_(0), num_spatial_layers_(0), + is_svc_(false), inter_layer_pred_(InterLayerPredMode::kOn), output_framerate_(1000.0, 1000.0), last_encoded_frame_rtp_timestamp_(0), @@ -293,6 +292,11 @@ int VP9EncoderImpl::InitEncode(const VideoCodec* inst, target_framerate_fps_.reset(); } + is_svc_ = (num_spatial_layers_ > 1 || num_temporal_layers_ > 1); + // Flexible mode requires SVC to be enabled since libvpx API only allows + // to get reference list in SVC mode. + RTC_DCHECK(!inst->VP9().flexibleMode || is_svc_); + // Allocate memory for encoded image if (encoded_image_._buffer != nullptr) { delete[] encoded_image_._buffer; @@ -313,8 +317,7 @@ int VP9EncoderImpl::InitEncode(const VideoCodec* inst, config_->g_w = codec_.width; config_->g_h = codec_.height; config_->rc_target_bitrate = inst->startBitrate; // in kbit/s - config_->g_error_resilient = - (num_spatial_layers_ > 1 || num_temporal_layers_ > 1) ? 1 : 0; + config_->g_error_resilient = is_svc_ ? VPX_ERROR_RESILIENT_DEFAULT : 0; // Setting the time base of the codec. config_->g_timebase.num = 1; config_->g_timebase.den = 90000; @@ -390,6 +393,8 @@ int VP9EncoderImpl::InitEncode(const VideoCodec* inst, inter_layer_pred_ = inst->VP9().interLayerPred; + ref_buf_.clear(); + return InitAndSetControlSettings(inst); } @@ -474,13 +479,10 @@ int VP9EncoderImpl::InitAndSetControlSettings(const VideoCodec* inst) { inst->VP9().adaptiveQpMode ? 3 : 0); vpx_codec_control(encoder_, VP9E_SET_FRAME_PARALLEL_DECODING, 0); - vpx_codec_control( - encoder_, VP9E_SET_SVC, - (num_temporal_layers_ > 1 || num_spatial_layers_ > 1) ? 1 : 0); - if (num_temporal_layers_ > 1 || num_spatial_layers_ > 1) { - vpx_codec_control(encoder_, VP9E_SET_SVC_PARAMETERS, - &svc_params_); + if (is_svc_) { + vpx_codec_control(encoder_, VP9E_SET_SVC, 1); + vpx_codec_control(encoder_, VP9E_SET_SVC_PARAMETERS, &svc_params_); } if (num_spatial_layers_ > 1) { @@ -638,9 +640,6 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific, CodecSpecificInfoVP9* vp9_info = &(codec_specific->codecSpecific.VP9); vp9_info->first_frame_in_picture = first_frame_in_picture; - // TODO(asapersson): Set correct value. - vp9_info->inter_pic_predicted = - (pkt.data.frame.flags & VPX_FRAME_IS_KEY) ? false : true; vp9_info->flexible_mode = codec_.VP9()->flexibleMode; vp9_info->ss_data_available = ((pkt.data.frame.flags & VPX_FRAME_IS_KEY) && !codec_.VP9()->flexibleMode) @@ -701,11 +700,22 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific, vp9_info->num_spatial_layers = num_spatial_layers_; RTC_DCHECK(!vp9_info->flexible_mode); - vp9_info->gof_idx = - static_cast(pics_since_key_ % gof_.num_frames_in_gof); - vp9_info->temporal_up_switch = gof_.temporal_up_switch[vp9_info->gof_idx]; - vp9_info->num_ref_pics = - is_key_pic ? 0 : gof_.num_ref_pics[vp9_info->gof_idx]; + + vp9_info->num_ref_pics = 0; + if (vp9_info->flexible_mode) { + vp9_info->gof_idx = kNoGofIdx; + FillReferenceIndices(pkt, pics_since_key_, vp9_info->inter_layer_predicted, + vp9_info); + } else { + vp9_info->gof_idx = + static_cast(pics_since_key_ % gof_.num_frames_in_gof); + vp9_info->temporal_up_switch = gof_.temporal_up_switch[vp9_info->gof_idx]; + vp9_info->num_ref_pics = gof_.num_ref_pics[vp9_info->gof_idx]; + } + + vp9_info->inter_pic_predicted = (!is_key_pic && vp9_info->num_ref_pics > 0); + + vp9_info->num_spatial_layers = num_spatial_layers_; if (vp9_info->ss_data_available) { vp9_info->spatial_layer_resolution_present = true; @@ -723,6 +733,107 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific, } } +void VP9EncoderImpl::FillReferenceIndices(const vpx_codec_cx_pkt& pkt, + const size_t pic_num, + const bool inter_layer_predicted, + CodecSpecificInfoVP9* vp9_info) { + vpx_svc_layer_id_t layer_id = {0}; + vpx_codec_control(encoder_, VP9E_GET_SVC_LAYER_ID, &layer_id); + + vpx_svc_ref_frame_config_t enc_layer_conf = {{0}}; + vpx_codec_control(encoder_, VP9E_GET_SVC_REF_FRAME_CONFIG, &enc_layer_conf); + + std::vector ref_buf_list; + if (enc_layer_conf.reference_last[layer_id.spatial_layer_id]) { + const size_t fb_idx = enc_layer_conf.lst_fb_idx[layer_id.spatial_layer_id]; + RTC_DCHECK(ref_buf_.find(fb_idx) != ref_buf_.end()); + ref_buf_list.push_back(ref_buf_.at(fb_idx)); + } + + if (enc_layer_conf.reference_alt_ref[layer_id.spatial_layer_id]) { + const size_t fb_idx = enc_layer_conf.alt_fb_idx[layer_id.spatial_layer_id]; + RTC_DCHECK(ref_buf_.find(fb_idx) != ref_buf_.end()); + ref_buf_list.push_back(ref_buf_.at(fb_idx)); + } + + if (enc_layer_conf.reference_golden[layer_id.spatial_layer_id]) { + const size_t fb_idx = enc_layer_conf.gld_fb_idx[layer_id.spatial_layer_id]; + RTC_DCHECK(ref_buf_.find(fb_idx) != ref_buf_.end()); + ref_buf_list.push_back(ref_buf_.at(fb_idx)); + } + + size_t max_ref_temporal_layer_id = 0; + + vp9_info->num_ref_pics = 0; + for (const RefFrameBuffer& ref_buf : ref_buf_list) { + RTC_DCHECK_LE(ref_buf.pic_num, pic_num); + if (ref_buf.pic_num < pic_num) { + if (inter_layer_pred_ != InterLayerPredMode::kOn) { + // RTP spec limits temporal prediction to the same spatial layer. + // It is safe to ignore this requirement if inter-layer prediction is + // enabled for all frames when all base frames are relayed to receiver. + RTC_DCHECK_EQ(ref_buf.spatial_layer_id, layer_id.spatial_layer_id); + } + RTC_DCHECK_LE(ref_buf.temporal_layer_id, layer_id.temporal_layer_id); + + const size_t p_diff = pic_num - ref_buf.pic_num; + RTC_DCHECK_LE(p_diff, 127UL); + + vp9_info->p_diff[vp9_info->num_ref_pics] = static_cast(p_diff); + ++vp9_info->num_ref_pics; + + max_ref_temporal_layer_id = + std::max(max_ref_temporal_layer_id, ref_buf.temporal_layer_id); + } else { + RTC_DCHECK(inter_layer_predicted); + // RTP spec only allows to use previous spatial layer for inter-layer + // prediction. + RTC_DCHECK_EQ(ref_buf.spatial_layer_id + 1, layer_id.spatial_layer_id); + } + } + + vp9_info->temporal_up_switch = + (max_ref_temporal_layer_id < + static_cast(layer_id.temporal_layer_id)); +} + +void VP9EncoderImpl::UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt, + const size_t pic_num) { + vpx_svc_layer_id_t layer_id = {0}; + vpx_codec_control(encoder_, VP9E_GET_SVC_LAYER_ID, &layer_id); + + vpx_svc_ref_frame_config_t enc_layer_conf = {{0}}; + vpx_codec_control(encoder_, VP9E_GET_SVC_REF_FRAME_CONFIG, &enc_layer_conf); + + const bool is_key_frame = + (pkt.data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + + RefFrameBuffer frame_buf(pic_num, layer_id.spatial_layer_id, + layer_id.temporal_layer_id); + + if (is_key_frame && layer_id.spatial_layer_id == 0) { + // Key frame updates all ref buffers. + for (size_t i = 0; i < kNumVp9Buffers; ++i) { + ref_buf_[i] = frame_buf; + } + } else { + if (enc_layer_conf.update_last[layer_id.spatial_layer_id]) { + ref_buf_[enc_layer_conf.lst_fb_idx[layer_id.spatial_layer_id]] = + frame_buf; + } + + if (enc_layer_conf.update_alt_ref[layer_id.spatial_layer_id]) { + ref_buf_[enc_layer_conf.alt_fb_idx[layer_id.spatial_layer_id]] = + frame_buf; + } + + if (enc_layer_conf.update_golden[layer_id.spatial_layer_id]) { + ref_buf_[enc_layer_conf.gld_fb_idx[layer_id.spatial_layer_id]] = + frame_buf; + } + } +} + int VP9EncoderImpl::GetEncodedLayerFrame(const vpx_codec_cx_pkt* pkt) { RTC_DCHECK_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); @@ -766,6 +877,10 @@ int VP9EncoderImpl::GetEncodedLayerFrame(const vpx_codec_cx_pkt* pkt) { PopulateCodecSpecific(&codec_specific_, *pkt, input_image_->timestamp(), first_frame_in_picture); + if (is_flexible_mode_) { + UpdateReferenceBuffers(*pkt, pics_since_key_); + } + TRACE_COUNTER1("webrtc", "EncodedFrameSize", encoded_image_._length); encoded_image_._timeStamp = input_image_->timestamp(); encoded_image_.capture_time_ms_ = input_image_->render_time_ms(); diff --git a/modules/video_coding/codecs/vp9/vp9_impl.h b/modules/video_coding/codecs/vp9/vp9_impl.h index d7c9250eca..76a3e0b875 100644 --- a/modules/video_coding/codecs/vp9/vp9_impl.h +++ b/modules/video_coding/codecs/vp9/vp9_impl.h @@ -12,6 +12,7 @@ #ifndef MODULES_VIDEO_CODING_CODECS_VP9_VP9_IMPL_H_ #define MODULES_VIDEO_CODING_CODECS_VP9_VP9_IMPL_H_ +#include #include #include @@ -61,6 +62,12 @@ class VP9EncoderImpl : public VP9Encoder { const vpx_codec_cx_pkt& pkt, uint32_t timestamp, bool first_frame_in_picture); + void FillReferenceIndices(const vpx_codec_cx_pkt& pkt, + const size_t pic_num, + const bool inter_layer_predicted, + CodecSpecificInfoVP9* vp9_info); + void UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt, + const size_t pic_num); bool ExplicitlyConfiguredSpatialLayers() const; bool SetSvcRates(const VideoBitrateAllocation& bitrate_allocation); @@ -96,12 +103,13 @@ class VP9EncoderImpl : public VP9Encoder { vpx_image_t* raw_; vpx_svc_extra_cfg_t svc_params_; const VideoFrame* input_image_; - GofInfoVP9 gof_; // Contains each frame's temporal information for - // non-flexible mode. + GofInfoVP9 gof_; // Contains each frame's temporal information for + // non-flexible mode. bool force_key_frame_; size_t pics_since_key_; uint8_t num_temporal_layers_; uint8_t num_spatial_layers_; + bool is_svc_; InterLayerPredMode inter_layer_pred_; // Framerate controller. @@ -111,6 +119,19 @@ class VP9EncoderImpl : public VP9Encoder { // Used for flexible mode. bool is_flexible_mode_; + struct RefFrameBuffer { + RefFrameBuffer(size_t pic_num, + size_t spatial_layer_id, + size_t temporal_layer_id) + : pic_num(pic_num), + spatial_layer_id(spatial_layer_id), + temporal_layer_id(temporal_layer_id) {} + RefFrameBuffer() {} + size_t pic_num = 0; + size_t spatial_layer_id = 0; + size_t temporal_layer_id = 0; + }; + std::map ref_buf_; }; class VP9DecoderImpl : public VP9Decoder {