diff --git a/modules/video_coding/BUILD.gn b/modules/video_coding/BUILD.gn
index b1ba48a73b..f000f41f8a 100644
--- a/modules/video_coding/BUILD.gn
+++ b/modules/video_coding/BUILD.gn
@@ -493,6 +493,7 @@ rtc_static_library("webrtc_vp9") {
     "../../rtc_base:checks",
     "../../rtc_base:rtc_base",
     "../../system_wrappers",
+    "../../system_wrappers:field_trial",
     "../rtp_rtcp:rtp_rtcp_format",
     "//third_party/abseil-cpp/absl/memory",
   ]
@@ -733,6 +734,7 @@ if (rtc_include_tests) {
       "../../media:rtc_media_base",
       "../../media:rtc_vp9_profile",
       "../../rtc_base:rtc_base",
+      "../../test:field_trial",
       "../../test:fileutils",
       "../../test:test_common",
       "../../test:test_support",
diff --git a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
index 1b8c2ce876..4733ad979a 100644
--- a/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
+++ b/modules/video_coding/codecs/vp9/test/vp9_impl_unittest.cc
@@ -16,6 +16,7 @@
 #include "modules/video_coding/codecs/test/video_codec_unittest.h"
 #include "modules/video_coding/codecs/vp9/include/vp9.h"
 #include "modules/video_coding/codecs/vp9/svc_config.h"
+#include "test/field_trial.h"
 #include "test/video_codec_settings.h"
 
 namespace webrtc {
@@ -591,6 +592,45 @@ TEST_P(TestVp9ImplWithLayering, FlexibleMode) {
   }
 }
 
+TEST_P(TestVp9ImplWithLayering, ExternalRefControl) {
+  test::ScopedFieldTrials override_field_trials(
+      "WebRTC-Vp9ExternalRefCtrl/Enabled/");
+  codec_settings_.VP9()->flexibleMode = true;
+  codec_settings_.VP9()->frameDroppingOn = false;
+  codec_settings_.VP9()->numberOfSpatialLayers = num_spatial_layers_;
+  codec_settings_.VP9()->numberOfTemporalLayers = num_temporal_layers_;
+  EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+            encoder_->InitEncode(&codec_settings_, 1 /* number of cores */,
+                                 0 /* max payload size (unused) */));
+
+  GofInfoVP9 gof;
+  if (num_temporal_layers_ == 1) {
+    gof.SetGofInfoVP9(kTemporalStructureMode1);
+  } else if (num_temporal_layers_ == 2) {
+    gof.SetGofInfoVP9(kTemporalStructureMode2);
+  } else if (num_temporal_layers_ == 3) {
+    gof.SetGofInfoVP9(kTemporalStructureMode3);
+  }
+
+  // Encode at least (num_frames_in_gof + 1) frames to verify references
+  // of non-key frame with gof_idx = 0.
+  for (size_t frame_num = 0; frame_num < gof.num_frames_in_gof + 1;
+       ++frame_num) {
+    SetWaitForEncodedFramesThreshold(num_spatial_layers_);
+    EXPECT_EQ(WEBRTC_VIDEO_CODEC_OK,
+              encoder_->Encode(*NextInputFrame(), nullptr, nullptr));
+
+    const bool is_key_frame = frame_num == 0;
+    const size_t gof_idx = frame_num % gof.num_frames_in_gof;
+    const std::vector<uint8_t> p_diff(std::begin(gof.pid_diff[gof_idx]),
+                                      std::end(gof.pid_diff[gof_idx]));
+
+    ExpectFrameWith(num_spatial_layers_, gof.temporal_idx[gof_idx],
+                    gof.temporal_up_switch[gof_idx],
+                    is_key_frame ? 0 : gof.num_ref_pics[gof_idx], p_diff);
+  }
+}
+
 INSTANTIATE_TEST_CASE_P(,
                         TestVp9ImplWithLayering,
                         ::testing::Combine(::testing::Values(1, 2, 3),
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.cc b/modules/video_coding/codecs/vp9/vp9_impl.cc
index 256168d181..d7e2ce198c 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.cc
+++ b/modules/video_coding/codecs/vp9/vp9_impl.cc
@@ -32,10 +32,16 @@
 #include "rtc_base/logging.h"
 #include "rtc_base/timeutils.h"
 #include "rtc_base/trace_event.h"
+#include "system_wrappers/include/field_trial.h"
 
 namespace webrtc {
 
 namespace {
+// Maps from gof_idx to encoder internal reference frame buffer index. These
+// maps work for 1,2 and 3 temporal layers with GOF length of 1,2 and 4 frames.
+uint8_t kRefBufIdx[4] = {0, 0, 0, 1};
+uint8_t kUpdBufIdx[4] = {0, 0, 1, 0};
+
 // Only positive speeds, range for real-time coding currently is: 5 - 8.
 // Lower means slower/better quality, higher means fastest/lower quality.
 int GetCpuSpeed(int width, int height) {
@@ -153,8 +159,11 @@ VP9EncoderImpl::VP9EncoderImpl(const cricket::VideoCodec& codec)
       pics_since_key_(0),
       num_temporal_layers_(0),
       num_spatial_layers_(0),
+      num_active_spatial_layers_(0),
       is_svc_(false),
       inter_layer_pred_(InterLayerPredMode::kOn),
+      external_ref_control_(
+          webrtc::field_trial::IsEnabled("WebRTC-Vp9ExternalRefCtrl")),
       is_flexible_mode_(false) {
   memset(&codec_, 0, sizeof(codec_));
   memset(&svc_params_, 0, sizeof(vpx_svc_extra_cfg_t));
@@ -769,6 +778,11 @@ int VP9EncoderImpl::Encode(const VideoFrame& input_image,
     flags = VPX_EFLAG_FORCE_KF;
   }
 
+  if (external_ref_control_) {
+    vpx_svc_ref_frame_config_t ref_config = SetReferences(force_key_frame_);
+    vpx_codec_control(encoder_, VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_config);
+  }
+
   // TODO(ssilkin): Frame duration should be specified per spatial layer
   // since their frame rate can be different. For now calculate frame duration
   // based on target frame rate of the highest spatial layer, which frame rate
@@ -876,6 +890,8 @@ void VP9EncoderImpl::PopulateCodecSpecific(CodecSpecificInfo* codec_specific,
     vp9_info->gof_idx = kNoGofIdx;
     FillReferenceIndices(pkt, pics_since_key_, vp9_info->inter_layer_predicted,
                          vp9_info);
+    // TODO(webrtc:9794): Add fake reference to empty reference list to
+    // workaround the frame buffer issue on receiver.
   } else {
     vp9_info->gof_idx =
         static_cast<uint8_t>(pics_since_key_ % gof_.num_frames_in_gof);
@@ -1032,6 +1048,88 @@ void VP9EncoderImpl::UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
   }
 }
 
+vpx_svc_ref_frame_config_t VP9EncoderImpl::SetReferences(bool is_key_pic) {
+  // kRefBufIdx, kUpdBufIdx need to be updated to support longer GOFs.
+  RTC_DCHECK_LE(gof_.num_frames_in_gof, 4);
+
+  vpx_svc_ref_frame_config_t ref_config;
+  memset(&ref_config, 0, sizeof(ref_config));
+
+  const size_t num_temporal_refs = std::max(1, num_temporal_layers_ - 1);
+  const bool is_inter_layer_pred_allowed =
+      inter_layer_pred_ == InterLayerPredMode::kOn ||
+      (inter_layer_pred_ == InterLayerPredMode::kOnKeyPic && is_key_pic);
+  absl::optional<int> last_updated_buf_idx;
+
+  // Put temporal reference to LAST and spatial reference to GOLDEN. Update
+  // frame buffer (i.e. store encoded frame) if current frame is a temporal
+  // reference (i.e. it belongs to a low temporal layer) or it is a spatial
+  // reference. In later case, always store spatial reference in the last
+  // reference frame buffer.
+  // For the case of 3 temporal and 3 spatial layers we need 6 frame buffers
+  // for temporal references plus 1 buffer for spatial reference. 7 buffers
+  // in total.
+
+  for (size_t sl_idx = 0; sl_idx < num_active_spatial_layers_; ++sl_idx) {
+    const size_t gof_idx = pics_since_key_ % gof_.num_frames_in_gof;
+
+    if (!is_key_pic) {
+      // Set up temporal reference.
+      const int buf_idx = sl_idx * num_temporal_refs + kRefBufIdx[gof_idx];
+
+      // Last reference frame buffer is reserved for spatial reference. It is
+      // not supposed to be used for temporal prediction.
+      RTC_DCHECK_LT(buf_idx, kNumVp9Buffers - 1);
+
+      // Sanity check that reference picture number is smaller than current
+      // picture number.
+      const size_t curr_pic_num = pics_since_key_ + 1;
+      RTC_DCHECK_LT(ref_buf_[buf_idx].pic_num, curr_pic_num);
+      const size_t pid_diff = curr_pic_num - ref_buf_[buf_idx].pic_num;
+
+      // Below code assumes single temporal referecence.
+      RTC_DCHECK_EQ(gof_.num_ref_pics[gof_idx], 1);
+      if (pid_diff == gof_.pid_diff[gof_idx][0]) {
+        ref_config.lst_fb_idx[sl_idx] = buf_idx;
+        ref_config.reference_last[sl_idx] = 1;
+      } else {
+        // This reference doesn't match with one specified by GOF. This can
+        // only happen if spatial layer is enabled dynamically without key
+        // frame. Spatial prediction is supposed to be enabled in this case.
+        RTC_DCHECK(is_inter_layer_pred_allowed);
+      }
+    }
+
+    if (is_inter_layer_pred_allowed && sl_idx > 0) {
+      // Set up spatial reference.
+      RTC_DCHECK(last_updated_buf_idx);
+      ref_config.gld_fb_idx[sl_idx] = *last_updated_buf_idx;
+      ref_config.reference_golden[sl_idx] = 1;
+    } else {
+      RTC_DCHECK(ref_config.reference_last[sl_idx] != 0 || sl_idx == 0 ||
+                 inter_layer_pred_ == InterLayerPredMode::kOff);
+    }
+
+    last_updated_buf_idx.reset();
+
+    if (gof_.temporal_idx[gof_idx] <= num_temporal_layers_ - 1) {
+      last_updated_buf_idx = sl_idx * num_temporal_refs + kUpdBufIdx[gof_idx];
+
+      // Ensure last frame buffer is not used for temporal prediction (it is
+      // reserved for spatial reference).
+      RTC_DCHECK_LT(*last_updated_buf_idx, kNumVp9Buffers - 1);
+    } else if (is_inter_layer_pred_allowed) {
+      last_updated_buf_idx = kNumVp9Buffers - 1;
+    }
+
+    if (last_updated_buf_idx) {
+      ref_config.update_buffer_slot[sl_idx] = 1 << *last_updated_buf_idx;
+    }
+  }
+
+  return ref_config;
+}
+
 int VP9EncoderImpl::GetEncodedLayerFrame(const vpx_codec_cx_pkt* pkt) {
   RTC_DCHECK_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
 
diff --git a/modules/video_coding/codecs/vp9/vp9_impl.h b/modules/video_coding/codecs/vp9/vp9_impl.h
index abb4c9d968..a8ea011d64 100644
--- a/modules/video_coding/codecs/vp9/vp9_impl.h
+++ b/modules/video_coding/codecs/vp9/vp9_impl.h
@@ -71,6 +71,7 @@ class VP9EncoderImpl : public VP9Encoder {
                             CodecSpecificInfoVP9* vp9_info);
   void UpdateReferenceBuffers(const vpx_codec_cx_pkt& pkt,
                               const size_t pic_num);
+  vpx_svc_ref_frame_config_t SetReferences(bool is_key_pic);
 
   bool ExplicitlyConfiguredSpatialLayers() const;
   bool SetSvcRates(const VideoBitrateAllocation& bitrate_allocation);
@@ -116,6 +117,7 @@ class VP9EncoderImpl : public VP9Encoder {
   uint8_t num_active_spatial_layers_;  // Number of actively encoded SLs
   bool is_svc_;
   InterLayerPredMode inter_layer_pred_;
+  bool external_ref_control_;
 
   std::vector<FramerateController> framerate_controller_;