diff --git a/webrtc/modules/video_processing/BUILD.gn b/webrtc/modules/video_processing/BUILD.gn
index 43df0802ad..edeac947c7 100644
--- a/webrtc/modules/video_processing/BUILD.gn
+++ b/webrtc/modules/video_processing/BUILD.gn
@@ -6,6 +6,7 @@
 # in the file PATENTS.  All contributing project authors may
 # be found in the AUTHORS file in the root of the source tree.
 
+import("//build/config/arm.gni")
 import("../../build/webrtc.gni")
 
 build_video_processing_sse2 = current_cpu == "x86" || current_cpu == "x64"
@@ -26,8 +27,16 @@ source_set("video_processing") {
     "include/video_processing_defines.h",
     "spatial_resampler.cc",
     "spatial_resampler.h",
+    "util/denoiser_filter.cc",
+    "util/denoiser_filter.h",
+    "util/denoiser_filter_c.cc",
+    "util/denoiser_filter_c.h",
+    "util/skin_detection.cc",
+    "util/skin_detection.h",
     "video_decimator.cc",
     "video_decimator.h",
+    "video_denoiser.cc",
+    "video_denoiser.h",
     "video_processing_impl.cc",
     "video_processing_impl.h",
   ]
@@ -41,6 +50,9 @@ source_set("video_processing") {
   if (build_video_processing_sse2) {
     deps += [ ":video_processing_sse2" ]
   }
+  if (rtc_build_with_neon) {
+    deps += [ ":video_processing_neon" ]
+  }
 
   configs += [ "../..:common_config" ]
   public_configs = [ "../..:common_inherited_config" ]
@@ -56,6 +68,8 @@ if (build_video_processing_sse2) {
   source_set("video_processing_sse2") {
     sources = [
       "content_analysis_sse2.cc",
+      "util/denoiser_filter_sse2.cc",
+      "util/denoiser_filter_sse2.h",
     ]
 
     configs += [ "../..:common_config" ]
@@ -72,3 +86,16 @@ if (build_video_processing_sse2) {
     }
   }
 }
+
+if (rtc_build_with_neon) {
+  source_set("video_processing_neon") {  # NEON denoiser filter sources.
+    sources = [
+      "util/denoiser_filter_neon.cc",
+      "util/denoiser_filter_neon.h",
+    ]
+    if (current_cpu != "arm64") {  # Swap default FPU config for -mfpu=neon.
+      configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
+      cflags = [ "-mfpu=neon" ]
+    }
+  }
+}
diff --git a/webrtc/modules/video_processing/frame_preprocessor.cc b/webrtc/modules/video_processing/frame_preprocessor.cc
index a3ec3c8c1c..36e1b9bbfd 100644
--- a/webrtc/modules/video_processing/frame_preprocessor.cc
+++ b/webrtc/modules/video_processing/frame_preprocessor.cc
@@ -13,26 +13,34 @@
 namespace webrtc {
 
 VPMFramePreprocessor::VPMFramePreprocessor()
-    : content_metrics_(NULL),
+    : content_metrics_(nullptr),
       resampled_frame_(),
       enable_ca_(false),
+      enable_denoising_(false),
       frame_cnt_(0) {
   spatial_resampler_ = new VPMSimpleSpatialResampler();
   ca_ = new VPMContentAnalysis(true);
   vd_ = new VPMVideoDecimator();
+  // The denoiser is optional and only allocated when denoising is enabled;
+  // with the flag initialised to false above it currently stays null.
+  denoiser_ = nullptr;
+  if (enable_denoising_)
+    denoiser_ = new VideoDenoiser();
 }
 
 VPMFramePreprocessor::~VPMFramePreprocessor() {
   Reset();
-  delete spatial_resampler_;
   delete ca_;
   delete vd_;
+  // delete is a no-op on nullptr, so no enable_denoising_ guard is needed.
+  delete denoiser_;
+  delete spatial_resampler_;
 }
 
 void  VPMFramePreprocessor::Reset() {
   ca_->Release();
   vd_->Reset();
-  content_metrics_ = NULL;
+  content_metrics_ = nullptr;
   spatial_resampler_->Reset();
   enable_ca_ = false;
   frame_cnt_ = 0;
@@ -104,11 +112,22 @@ int32_t VPMFramePreprocessor::PreprocessFrame(const VideoFrame& frame,
     return 1;  // drop 1 frame
   }
 
-  // Resizing incoming frame if needed. Otherwise, remains NULL.
+  // Resizing incoming frame if needed. Otherwise, remains nullptr.
   // We are not allowed to resample the input frame (must make a copy of it).
-  *processed_frame = NULL;
+  *processed_frame = nullptr;
+  if (denoiser_ != nullptr) {
+    denoiser_->DenoiseFrame(frame, &denoised_frame_);
+    *processed_frame = &denoised_frame_;
+  }
+
   if (spatial_resampler_->ApplyResample(frame.width(), frame.height()))  {
-    int32_t ret = spatial_resampler_->ResampleFrame(frame, &resampled_frame_);
+    int32_t ret;
+    if (enable_denoising_) {
+      ret = spatial_resampler_->ResampleFrame(denoised_frame_,
+                                              &resampled_frame_);
+    } else {
+      ret = spatial_resampler_->ResampleFrame(frame, &resampled_frame_);
+    }
     if (ret != VPM_OK) return ret;
     *processed_frame = &resampled_frame_;
   }
@@ -118,14 +137,14 @@ int32_t VPMFramePreprocessor::PreprocessFrame(const VideoFrame& frame,
     // Compute new metrics every |kSkipFramesCA| frames, starting with
     // the first frame.
     if (frame_cnt_ % kSkipFrameCA == 0) {
-      if (*processed_frame == NULL)  {
+      if (*processed_frame == nullptr)  {
         content_metrics_ = ca_->ComputeContentMetrics(frame);
       } else {
-        content_metrics_ = ca_->ComputeContentMetrics(resampled_frame_);
+        content_metrics_ = ca_->ComputeContentMetrics(**processed_frame);
       }
     }
-    ++frame_cnt_;
   }
+  ++frame_cnt_;
   return VPM_OK;
 }
 
diff --git a/webrtc/modules/video_processing/frame_preprocessor.h b/webrtc/modules/video_processing/frame_preprocessor.h
index c5313b4066..27592603c7 100644
--- a/webrtc/modules/video_processing/frame_preprocessor.h
+++ b/webrtc/modules/video_processing/frame_preprocessor.h
@@ -18,7 +18,9 @@
 #include "webrtc/modules/video_processing/content_analysis.h"
 #include "webrtc/modules/video_processing/spatial_resampler.h"
 #include "webrtc/modules/video_processing/video_decimator.h"
+#include "webrtc/modules/video_processing/video_denoiser.h"
 #include "webrtc/typedefs.h"
+#include "webrtc/video_frame.h"
 
 namespace webrtc {
 
@@ -65,11 +67,14 @@ class VPMFramePreprocessor {
   enum { kSkipFrameCA = 2 };
 
   VideoContentMetrics* content_metrics_;
+  VideoFrame denoised_frame_;
   VideoFrame resampled_frame_;
   VPMSpatialResampler* spatial_resampler_;
   VPMContentAnalysis* ca_;
   VPMVideoDecimator* vd_;
+  VideoDenoiser* denoiser_;
   bool enable_ca_;
+  bool enable_denoising_;
   int frame_cnt_;
 
 };
diff --git a/webrtc/modules/video_processing/util/denoiser_filter.cc b/webrtc/modules/video_processing/util/denoiser_filter.cc
new file mode 100644
index 0000000000..a5819905b2
--- /dev/null
+++ b/webrtc/modules/video_processing/util/denoiser_filter.cc
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/video_processing/util/denoiser_filter.h"
+#include "webrtc/modules/video_processing/util/denoiser_filter_c.h"
+#include "webrtc/modules/video_processing/util/denoiser_filter_neon.h"
+#include "webrtc/modules/video_processing/util/denoiser_filter_sse2.h"
+#include "webrtc/system_wrappers/include/cpu_features_wrapper.h"
+
+namespace webrtc {
+
+const int kMotionMagnitudeThreshold = 8 * 3;  // Compared to motion_magnitude.
+const int kSumDiffThreshold = 16 * 16 * 2;  // Default limit on |sum_diff|.
+const int kSumDiffThresholdHigh = 600;  // Limit when increase_denoising != 0.
+
+// Factory for the block-level denoising kernels. Returns a newly allocated
+// implementation selected for the current CPU (SSE2, NEON or plain C);
+// ownership of the returned object passes to the caller.
+DenoiserFilter* DenoiserFilter::Create() {
+  // If we know the minimum architecture at compile time, avoid CPU detection.
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+  // x86 CPU detection required.
+  const bool has_sse2 = WebRtc_GetCPUInfo(kSSE2) != 0;
+  if (has_sse2)
+    return new DenoiserFilterSSE2();
+  return new DenoiserFilterC();
+#elif defined(WEBRTC_DETECT_NEON)
+  // NEON support has to be probed at run time in this configuration.
+  const bool has_neon = (WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0;
+  if (has_neon)
+    return new DenoiserFilterNEON();
+  return new DenoiserFilterC();
+#else
+  // NOTE(review): every target without WEBRTC_DETECT_NEON, including any
+  // where NEON might be guaranteed at build time, gets the C filter here --
+  // confirm that this is intended.
+  return new DenoiserFilterC();
+#endif
+}
+
+}  // namespace webrtc
diff --git a/webrtc/modules/video_processing/util/denoiser_filter.h b/webrtc/modules/video_processing/util/denoiser_filter.h
new file mode 100644
index 0000000000..19135b3b9e
--- /dev/null
+++ b/webrtc/modules/video_processing/util/denoiser_filter.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_H_
+#define WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_H_
+
+#include "webrtc/modules/include/module_common_types.h"
+#include "webrtc/modules/video_processing/include/video_processing_defines.h"
+
+namespace webrtc {
+
+extern const int kMotionMagnitudeThreshold;
+extern const int kSumDiffThreshold;
+extern const int kSumDiffThresholdHigh;
+
+enum DenoiserDecision { COPY_BLOCK, FILTER_BLOCK };  // Returned by MbDenoise().
+struct DenoiseMetrics {  // Field meanings below are presumed from the names.
+  uint32_t var;  // Presumably a block variance -- TODO confirm.
+  uint32_t sad;  // Presumably a sum of absolute differences -- TODO confirm.
+  uint8_t denoise;  // Presumably the per-block denoise decision -- TODO confirm.
+  bool is_skin;  // Presumably marks skin-tone blocks -- TODO confirm.
+};
+
+class DenoiserFilter {  // Interface for the block-level denoising kernels.
+ public:
+  static DenoiserFilter* Create();  // Picks an implementation for this CPU.
+  // Virtual so that implementations can be destroyed through this interface.
+  virtual ~DenoiserFilter() {}
+  // Pure virtual kernels; each stride is added to its row pointer per row.
+  virtual void CopyMem16x16(const uint8_t* src,
+                            int src_stride,
+                            uint8_t* dst,
+                            int dst_stride) = 0;
+  virtual void CopyMem8x8(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride) = 0;
+  virtual uint32_t Variance16x8(const uint8_t* a,
+                                int a_stride,
+                                const uint8_t* b,
+                                int b_stride,
+                                unsigned int* sse) = 0;  // *sse: squared sum.
+  virtual DenoiserDecision MbDenoise(uint8_t* mc_running_avg_y,
+                                     int mc_avg_y_stride,
+                                     uint8_t* running_avg_y,
+                                     int avg_y_stride,
+                                     const uint8_t* sig,
+                                     int sig_stride,
+                                     uint8_t motion_magnitude,
+                                     int increase_denoising) = 0;
+};
+
+}  // namespace webrtc
+
+#endif  // WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_H_
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_c.cc b/webrtc/modules/video_processing/util/denoiser_filter_c.cc
new file mode 100644
index 0000000000..e32bf83889
--- /dev/null
+++ b/webrtc/modules/video_processing/util/denoiser_filter_c.cc
@@ -0,0 +1,194 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "webrtc/modules/video_processing/util/denoiser_filter_c.h"
+
+namespace webrtc {
+
+void DenoiserFilterC::CopyMem16x16(const uint8_t* src,
+                                   int src_stride,
+                                   uint8_t* dst,
+                                   int dst_stride) {
+  // Copy a 16x16 byte block one row at a time; each pointer advances by its
+  // own stride after every row.
+  for (int row = 16; row > 0; --row, src += src_stride, dst += dst_stride) {
+    memcpy(dst, src, 16);
+  }
+}
+
+void DenoiserFilterC::CopyMem8x8(const uint8_t* src,
+                                 int src_stride,
+                                 uint8_t* dst,
+                                 int dst_stride) {
+  // Copy an 8x8 byte block one row at a time; each pointer advances by its
+  // own stride after every row.
+  for (int row = 8; row > 0; --row, src += src_stride, dst += dst_stride) {
+    memcpy(dst, src, 8);
+  }
+}
+
+uint32_t DenoiserFilterC::Variance16x8(const uint8_t* a,
+                                       int a_stride,
+                                       const uint8_t* b,
+                                       int b_stride,
+                                       uint32_t* sse) {
+  int sum = 0;
+  *sse = 0;  // Receives the sum of squared differences.
+  a_stride <<= 1;
+  b_stride <<= 1;
+  // Variance of a - b over 8 alternate rows x 16 columns (strides doubled).
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 16; j++) {
+      const int diff = a[j] - b[j];
+      sum += diff;
+      *sse += diff * diff;
+    }
+    // 16 * 8 = 128 samples in total, so the mean term below is sum^2 >> 7.
+    a += a_stride;
+    b += b_stride;
+  }
+  return *sse - ((static_cast<int64_t>(sum) * sum) >> 7);
+}
+
+DenoiserDecision DenoiserFilterC::MbDenoise(uint8_t* mc_running_avg_y,
+                                            int mc_avg_y_stride,
+                                            uint8_t* running_avg_y,
+                                            int avg_y_stride,
+                                            const uint8_t* sig,
+                                            int sig_stride,
+                                            uint8_t motion_magnitude,
+                                            int increase_denoising) {
+  int sum_diff_thresh = 0;
+  int sum_diff = 0;
+  int adj_val[3] = {3, 4, 6};  // Adjustment per difference level.
+  int shift_inc1 = 0;
+  int shift_inc2 = 1;
+  int col_sum[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  if (motion_magnitude <= kMotionMagnitudeThreshold) {
+    if (increase_denoising) {
+      shift_inc1 = 1;
+      shift_inc2 = 2;
+    }
+    adj_val[0] += shift_inc2;
+    adj_val[1] += shift_inc2;
+    adj_val[2] += shift_inc2;
+  }
+  // Filter the 16x16 block into running_avg_y, tracking per-column deltas.
+  for (int r = 0; r < 16; ++r) {
+    for (int c = 0; c < 16; ++c) {
+      int diff = 0;
+      int adjustment = 0;
+      int absdiff = 0;
+
+      diff = mc_running_avg_y[c] - sig[c];
+      absdiff = abs(diff);
+
+      // When |diff| <= |3 + shift_inc1|, use pixel value from
+      // last denoised raw.
+      if (absdiff <= 3 + shift_inc1) {
+        running_avg_y[c] = mc_running_avg_y[c];
+        col_sum[c] += diff;
+      } else {
+        if (absdiff >= 4 + shift_inc1 && absdiff <= 7)
+          adjustment = adj_val[0];
+        else if (absdiff >= 8 && absdiff <= 15)
+          adjustment = adj_val[1];
+        else
+          adjustment = adj_val[2];
+
+        if (diff > 0) {
+          if ((sig[c] + adjustment) > 255)
+            running_avg_y[c] = 255;
+          else
+            running_avg_y[c] = sig[c] + adjustment;
+
+          col_sum[c] += adjustment;
+        } else {
+          if ((sig[c] - adjustment) < 0)
+            running_avg_y[c] = 0;
+          else
+            running_avg_y[c] = sig[c] - adjustment;
+
+          col_sum[c] -= adjustment;
+        }
+      }
+    }
+
+    // Update pointers for next iteration.
+    sig += sig_stride;
+    mc_running_avg_y += mc_avg_y_stride;
+    running_avg_y += avg_y_stride;
+  }
+  // Total the per-column adjustments; a column sum >= 128 counts as 127.
+  for (int c = 0; c < 16; ++c) {
+    if (col_sum[c] >= 128) {
+      col_sum[c] = 127;
+    }
+    sum_diff += col_sum[c];
+  }
+  // Compare the total against the limit (higher when increase_denoising).
+  sum_diff_thresh = kSumDiffThreshold;
+  if (increase_denoising)
+    sum_diff_thresh = kSumDiffThresholdHigh;
+  if (abs(sum_diff) > sum_diff_thresh) {
+    int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+    // Only apply the adjustment for max delta up to 3.
+    if (delta < 4) {
+      sig -= sig_stride * 16;
+      mc_running_avg_y -= mc_avg_y_stride * 16;
+      running_avg_y -= avg_y_stride * 16;
+      for (int r = 0; r < 16; ++r) {
+        for (int c = 0; c < 16; ++c) {
+          int diff = mc_running_avg_y[c] - sig[c];
+          int adjustment = abs(diff);
+          if (adjustment > delta)
+            adjustment = delta;
+          if (diff > 0) {
+            // Bring denoised signal down.
+            if (running_avg_y[c] - adjustment < 0)
+              running_avg_y[c] = 0;
+            else
+              running_avg_y[c] = running_avg_y[c] - adjustment;
+            col_sum[c] -= adjustment;
+          } else if (diff < 0) {
+            // Bring denoised signal up.
+            if (running_avg_y[c] + adjustment > 255)
+              running_avg_y[c] = 255;
+            else
+              running_avg_y[c] = running_avg_y[c] + adjustment;
+            col_sum[c] += adjustment;
+          }
+        }
+        sig += sig_stride;
+        mc_running_avg_y += mc_avg_y_stride;
+        running_avg_y += avg_y_stride;
+      }
+      // Recompute the total after the second pass.
+      sum_diff = 0;
+      for (int c = 0; c < 16; ++c) {
+        if (col_sum[c] >= 128) {
+          col_sum[c] = 127;
+        }
+        sum_diff += col_sum[c];
+      }
+
+      if (abs(sum_diff) > sum_diff_thresh)
+        return COPY_BLOCK;
+    } else {
+      return COPY_BLOCK;  // delta >= 4: no second pass is attempted.
+    }
+  }
+
+  return FILTER_BLOCK;
+}
+
+}  // namespace webrtc
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_c.h b/webrtc/modules/video_processing/util/denoiser_filter_c.h
new file mode 100644
index 0000000000..830fcfcbbd
--- /dev/null
+++ b/webrtc/modules/video_processing/util/denoiser_filter_c.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_C_H_
+#define WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_C_H_
+
+#include "webrtc/modules/video_processing/util/denoiser_filter.h"
+
+namespace webrtc {
+
+class DenoiserFilterC : public DenoiserFilter {  // Plain C++ kernels, no SIMD.
+ public:
+  DenoiserFilterC() {}
+  void CopyMem16x16(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride) override;
+  void CopyMem8x8(const uint8_t* src,
+                  int src_stride,
+                  uint8_t* dst,
+                  int dst_stride) override;
+  uint32_t Variance16x8(const uint8_t* a,
+                        int a_stride,
+                        const uint8_t* b,
+                        int b_stride,
+                        unsigned int* sse) override;  // *sse: squared sum.
+  DenoiserDecision MbDenoise(uint8_t* mc_running_avg_y,
+                             int mc_avg_y_stride,
+                             uint8_t* running_avg_y,
+                             int avg_y_stride,
+                             const uint8_t* sig,
+                             int sig_stride,
+                             uint8_t motion_magnitude,
+                             int increase_denoising) override;
+};
+
+
+
+}  // namespace webrtc
+
+#endif  // WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_C_H_
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_neon.cc b/webrtc/modules/video_processing/util/denoiser_filter_neon.cc
new file mode 100644
index 0000000000..67d420cda7
--- /dev/null
+++ b/webrtc/modules/video_processing/util/denoiser_filter_neon.cc
@@ -0,0 +1,284 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "webrtc/modules/video_processing/util/denoiser_filter_neon.h"
+
+namespace webrtc {
+
+static int HorizontalAddS16x8(const int16x8_t v_16x8) {
+  // Widen pairwise (8x s16 -> 4x s32 -> 2x s64), then add the two halves.
+  const int64x2_t pairs = vpaddlq_s32(vpaddlq_s16(v_16x8));
+  const int32x2_t lo = vreinterpret_s32_s64(vget_low_s64(pairs));
+  const int32x2_t hi = vreinterpret_s32_s64(vget_high_s64(pairs));
+  return vget_lane_s32(vadd_s32(lo, hi), 0);
+}
+
+static int HorizontalAddS32x4(const int32x4_t v_32x4) {
+  const int64x2_t pairs = vpaddlq_s32(v_32x4);  // 4x s32 -> 2x s64.
+  const int32x2_t lo = vreinterpret_s32_s64(vget_low_s64(pairs));
+  const int32x2_t hi = vreinterpret_s32_s64(vget_high_s64(pairs));
+  return vget_lane_s32(vadd_s32(lo, hi), 0);  // Lane 0 of lo + hi.
+}
+
+static void VarianceNeonW8(const uint8_t* a,
+                           int a_stride,
+                           const uint8_t* b,
+                           int b_stride,
+                           int w,
+                           int h,
+                           uint32_t* sse,
+                           int64_t* sum) {
+  // Accumulates, over a w x h region, the sum of the differences a - b into
+  // |*sum| and the sum of their squares into |*sse|. Columns are consumed
+  // eight at a time, and both row pointers advance by their own stride after
+  // each row.
+  int16x8_t sum_acc = vdupq_n_s16(0);
+  int32x4_t sse_acc_lo = vdupq_n_s32(0);
+  int32x4_t sse_acc_hi = vdupq_n_s32(0);
+  for (int row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (int col = 0; col < w; col += 8) {
+      // a - b in 16-bit lanes; the unsigned wrap-around reads back as signed.
+      const int16x8_t diff =
+          vreinterpretq_s16_u16(vsubl_u8(vld1_u8(&a[col]), vld1_u8(&b[col])));
+      const int16x4_t diff_lo = vget_low_s16(diff);
+      const int16x4_t diff_hi = vget_high_s16(diff);
+      sum_acc = vaddq_s16(sum_acc, diff);
+      sse_acc_lo = vmlal_s16(sse_acc_lo, diff_lo, diff_lo);
+      sse_acc_hi = vmlal_s16(sse_acc_hi, diff_hi, diff_hi);
+    }
+  }
+  // Reduce the vector accumulators to the two scalar outputs.
+  *sum = HorizontalAddS16x8(sum_acc);
+  *sse = static_cast<uint32_t>(
+      HorizontalAddS32x4(vaddq_s32(sse_acc_lo, sse_acc_hi)));
+}
+
+void DenoiserFilterNEON::CopyMem16x16(const uint8_t* src,
+                                      int src_stride,
+                                      uint8_t* dst,
+                                      int dst_stride) {
+  // Copies a 16x16 block: every row is one 16-byte vector load and store.
+  for (int row = 0; row < 16; ++row) {
+    const uint8x16_t row_pixels = vld1q_u8(src);
+    vst1q_u8(dst, row_pixels);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void DenoiserFilterNEON::CopyMem8x8(const uint8_t* src,
+                                    int src_stride,
+                                    uint8_t* dst,
+                                    int dst_stride) {
+  // Copies an 8x8 block: every row is one 8-byte vector load and store,
+  // after which both pointers advance by their own stride.
+  for (int row = 0; row < 8; ++row) {
+    const uint8x8_t row_pixels = vld1_u8(src);
+    vst1_u8(dst, row_pixels);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+uint32_t DenoiserFilterNEON::Variance16x8(const uint8_t* a,
+                                          int a_stride,
+                                          const uint8_t* b,
+                                          int b_stride,
+                                          uint32_t* sse) {
+  int64_t diff_sum = 0;  // Sum of a - b over the 16 x 8 sampled pixels.
+  VarianceNeonW8(a, a_stride << 1, b, b_stride << 1, 16, 8, sse, &diff_sum);
+  return *sse - ((diff_sum * diff_sum) >> 7);  // sse - sum^2 / 128.
+}
+
+DenoiserDecision DenoiserFilterNEON::MbDenoise(uint8_t* mc_running_avg_y,
+                                               int mc_running_avg_y_stride,
+                                               uint8_t* running_avg_y,
+                                               int running_avg_y_stride,
+                                               const uint8_t* sig,
+                                               int sig_stride,
+                                               uint8_t motion_magnitude,
+                                               int increase_denoising) {
+  // If motion_magnitude is small, making the denoiser more aggressive by
+  // increasing the adjustment for each level, level1 adjustment is
+  // increased, the deltas stay the same.
+  int shift_inc =
+      (increase_denoising && motion_magnitude <= kMotionMagnitudeThreshold)
+          ? 1
+          : 0;
+  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+      (motion_magnitude <= kMotionMagnitudeThreshold) ? 4 + shift_inc : 3);
+  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+  const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+  int64x2_t v_sum_diff_total = vdupq_n_s64(0);
+  // First pass: filter the 16x16 block into running_avg_y, one row at a time.
+  // Go over lines.
+  for (int r = 0; r < 16; ++r) {
+    // Load inputs.
+    const uint8x16_t v_sig = vld1q_u8(sig);
+    const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+    // Calculate absolute difference and sign masks.
+    const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+    const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+    const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+    // Figure out which level that put us in.
+    const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
+    const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
+    const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);
+
+    // Calculate absolute adjustments for level 1, 2 and 3.
+    const uint8x16_t v_level2_adjustment =
+        vandq_u8(v_level2_mask, v_delta_level_1_and_2);
+    const uint8x16_t v_level3_adjustment =
+        vandq_u8(v_level3_mask, v_delta_level_2_and_3);
+    const uint8x16_t v_level1and2_adjustment =
+        vaddq_u8(v_level1_adjustment, v_level2_adjustment);
+    const uint8x16_t v_level1and2and3_adjustment =
+        vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);
+
+    // Figure adjustment absolute value by selecting between the absolute
+    // difference if in level0 or the value for level 1, 2 and 3.
+    const uint8x16_t v_abs_adjustment =
+        vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);
+
+    // Calculate positive and negative adjustments. Apply them to the signal
+    // and accumulate them. Adjustments are less than eight and the maximum
+    // sum of them (7 * 16) can fit in a signed char.
+    const uint8x16_t v_pos_adjustment =
+        vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+    const uint8x16_t v_neg_adjustment =
+        vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+    uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+    v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+    // Store results.
+    vst1q_u8(running_avg_y, v_running_avg_y);
+
+    // Sum all the accumulators to have the sum of all pixel differences
+    // for this macroblock.
+    {
+      const int8x16_t v_sum_diff =
+          vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+                    vreinterpretq_s8_u8(v_neg_adjustment));
+      const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
+      const int32x4_t fedc_ba98_7654_3210 =
+          vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+      const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
+
+      v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
+    }
+
+    // Update pointers for next iteration.
+    sig += sig_stride;
+    mc_running_avg_y += mc_running_avg_y_stride;
+    running_avg_y += running_avg_y_stride;
+  }
+
+  // Too much adjustments => copy block.
+  {
+    int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+                            vget_low_s64(v_sum_diff_total));
+    int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+    int sum_diff_thresh = kSumDiffThreshold;
+
+    if (increase_denoising)
+      sum_diff_thresh = kSumDiffThresholdHigh;
+    if (sum_diff > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, that would otherwise not be denoised at all. Simplest
+      // is to apply an additional adjustment to running_avg_y to bring it
+      // closer to sig. The adjustment is capped by a maximum delta, and
+      // chosen such that in most cases the resulting sum_diff will be
+      // within the acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over the
+      // threshold.
+      int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const uint8x16_t k_delta = vmovq_n_u8(delta);
+        sig -= sig_stride * 16;
+        mc_running_avg_y -= mc_running_avg_y_stride * 16;
+        running_avg_y -= running_avg_y_stride * 16;
+        for (int r = 0; r < 16; ++r) {
+          uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+          const uint8x16_t v_sig = vld1q_u8(sig);
+          const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+          // Calculate absolute difference and sign masks.
+          const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+          const uint8x16_t v_diff_pos_mask =
+              vcltq_u8(v_sig, v_mc_running_avg_y);
+          const uint8x16_t v_diff_neg_mask =
+              vcgtq_u8(v_sig, v_mc_running_avg_y);
+          // Clamp absolute difference to delta to get the adjustment.
+          const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta));
+
+          const uint8x16_t v_pos_adjustment =
+              vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+          const uint8x16_t v_neg_adjustment =
+              vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+          v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+          v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+          // Store results.
+          vst1q_u8(running_avg_y, v_running_avg_y);
+
+          {
+            const int8x16_t v_sum_diff =
+                vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+                          vreinterpretq_s8_u8(v_pos_adjustment));
+
+            const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
+            const int32x4_t fedc_ba98_7654_3210 =
+                vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+            const int64x2_t fedcba98_76543210 =
+                vpaddlq_s32(fedc_ba98_7654_3210);
+
+            v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
+          }
+          // Update pointers for next iteration.
+          sig += sig_stride;
+          mc_running_avg_y += mc_running_avg_y_stride;
+          running_avg_y += running_avg_y_stride;
+        }
+        {
+          // Update the sum of all pixel differences of this MB.
+          x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+                        vget_low_s64(v_sum_diff_total));
+          sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+
+          if (sum_diff > sum_diff_thresh) {
+            return COPY_BLOCK;
+          }
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+
+  // Tell above level that block was filtered.
+  running_avg_y -= running_avg_y_stride * 16;
+  sig -= sig_stride * 16;
+
+  return FILTER_BLOCK;
+}
+
+}  // namespace webrtc
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_neon.h b/webrtc/modules/video_processing/util/denoiser_filter_neon.h
new file mode 100644
index 0000000000..bc87ba788e
--- /dev/null
+++ b/webrtc/modules/video_processing/util/denoiser_filter_neon.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_NEON_H_
+#define WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_NEON_H_
+
+#include "webrtc/modules/video_processing/util/denoiser_filter.h"
+
+namespace webrtc {
+
+class DenoiserFilterNEON : public DenoiserFilter {  // ARM NEON kernels.
+ public:
+  DenoiserFilterNEON() {}
+  void CopyMem16x16(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride) override;
+  void CopyMem8x8(const uint8_t* src,
+                  int src_stride,
+                  uint8_t* dst,
+                  int dst_stride) override;
+  uint32_t Variance16x8(const uint8_t* a,
+                        int a_stride,
+                        const uint8_t* b,
+                        int b_stride,
+                        unsigned int* sse) override;  // *sse: squared sum.
+  DenoiserDecision MbDenoise(uint8_t* mc_running_avg_y,
+                             int mc_avg_y_stride,
+                             uint8_t* running_avg_y,
+                             int avg_y_stride,
+                             const uint8_t* sig,
+                             int sig_stride,
+                             uint8_t motion_magnitude,
+                             int increase_denoising) override;
+};
+
+}  // namespace webrtc
+
+#endif  // WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_NEON_H_
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_sse2.cc b/webrtc/modules/video_processing/util/denoiser_filter_sse2.cc
new file mode 100644
index 0000000000..82f11344c0
--- /dev/null
+++ b/webrtc/modules/video_processing/util/denoiser_filter_sse2.cc
@@ -0,0 +1,282 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "webrtc/modules/video_processing/util/denoiser_filter_sse2.h"
+
+namespace webrtc {
+
+static void Get8x8varSse2(const uint8_t* src,
+                          int src_stride,
+                          const uint8_t* ref,
+                          int ref_stride,
+                          unsigned int* sse,
+                          int* sum) {
+  // For one 8x8 block, writes the signed sum of (src - ref) to *sum and the
+  // sum of squared differences to *sse. Two rows are handled per iteration.
+  const __m128i zero = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
+  __m128i vsse = _mm_setzero_si128();
+
+  for (int i = 0; i < 8; i += 2) {
+    const __m128i src0 = _mm_unpacklo_epi8(
+        _mm_loadl_epi64((const __m128i*)(src + i * src_stride)), zero);
+    const __m128i ref0 = _mm_unpacklo_epi8(
+        _mm_loadl_epi64((const __m128i*)(ref + i * ref_stride)), zero);
+    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+    const __m128i src1 = _mm_unpacklo_epi8(
+        _mm_loadl_epi64((const __m128i*)(src + (i + 1) * src_stride)), zero);
+    const __m128i ref1 = _mm_unpacklo_epi8(
+        _mm_loadl_epi64((const __m128i*)(ref + (i + 1) * ref_stride)), zero);
+    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+    vsum = _mm_add_epi16(vsum, diff0);
+    vsum = _mm_add_epi16(vsum, diff1);
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+  }
+
+  // Fold the eight 16-bit lanes into lane 0. _mm_extract_epi16 zero-extends,
+  // so narrow to int16_t first; otherwise a negative sum comes out positive.
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+  *sum = static_cast<int16_t>(_mm_extract_epi16(vsum, 0));
+  // Fold the four 32-bit lanes into lane 0.
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+  *sse = _mm_cvtsi128_si32(vsse);
+}
+
+static void VarianceSSE2(const unsigned char* src,
+                         int src_stride,
+                         const unsigned char* ref,
+                         int ref_stride,
+                         int w,
+                         int h,
+                         uint32_t* sse,
+                         uint32_t* sum,
+                         int block_size) {
+  *sse = 0;
+  *sum = 0;
+  // Walk the w x h area in block_size steps; each step is one 8x8 kernel call
+  // whose partial sum and SSE are added into the caller's totals.
+  for (int y = 0; y < h; y += block_size) {
+    for (int x = 0; x < w; x += block_size) {
+      uint32_t blk_sse = 0;
+      int32_t blk_sum = 0;
+      Get8x8varSse2(src + y * src_stride + x, src_stride,
+                    ref + y * ref_stride + x, ref_stride, &blk_sse, &blk_sum);
+      *sse += blk_sse;
+      *sum += blk_sum;
+    }
+  }
+}
+
+// Returns the absolute value of the signed sum of the sixteen 8-bit lanes
+// of acc_diff.
+static uint32_t AbsSumDiff16x1(__m128i acc_diff) {
+  // Sign-extend bytes to words: interleaving a register with itself puts each
+  // byte in the high half of a word, and the arithmetic shift brings it down.
+  const __m128i lo_words =
+      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i hi_words =
+      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  const __m128i words = _mm_add_epi16(lo_words, hi_words);
+  // madd against a vector of 1s adds adjacent word pairs into 32-bit lanes.
+  const __m128i quads = _mm_madd_epi16(words, _mm_set1_epi16(1));
+  // Fold the four 32-bit lanes into lane 0.
+  const __m128i pairs = _mm_add_epi32(quads, _mm_srli_si128(quads, 8));
+  const __m128i total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 4));
+  return abs(_mm_cvtsi128_si32(total));
+}
+
+// TODO(jackychen): Optimize this function using SSE2.
+void DenoiserFilterSSE2::CopyMem16x16(const uint8_t* src,
+                                      int src_stride,
+                                      uint8_t* dst,
+                                      int dst_stride) {
+  // Plain row-wise copy; src and dst may use different strides.
+  for (int row = 0; row < 16; ++row) {
+    // Each row is 16 contiguous bytes.
+    memcpy(dst + row * dst_stride, src + row * src_stride, 16);
+  }
+}
+
+// TODO(jackychen): Optimize this function using SSE2.
+void DenoiserFilterSSE2::CopyMem8x8(const uint8_t* src,
+                                    int src_stride,
+                                    uint8_t* dst,
+                                    int dst_stride) {
+  // Plain row-wise copy; src and dst may use different strides.
+  for (int row = 0; row < 8; ++row) {
+    // Each row is 8 contiguous bytes.
+    memcpy(dst + row * dst_stride, src + row * src_stride, 8);
+  }
+}
+
+uint32_t DenoiserFilterSSE2::Variance16x8(const uint8_t* src,
+                                          int src_stride,
+                                          const uint8_t* ref,
+                                          int ref_stride,
+                                          unsigned int* sse) {
+  uint32_t diff_sum = 0;  // Receives the sum of (src - ref) over 16x8 pixels.
+  VarianceSSE2(src, src_stride, ref, ref_stride, 16, 8, sse, &diff_sum, 8);
+  return *sse - ((diff_sum * diff_sum) >> 7);  // 16 * 8 pixels == 1 << 7.
+}
+
+DenoiserDecision DenoiserFilterSSE2::MbDenoise(uint8_t* mc_running_avg_y,
+                                               int mc_avg_y_stride,
+                                               uint8_t* running_avg_y,
+                                               int avg_y_stride,
+                                               const uint8_t* sig,
+                                               int sig_stride,
+                                               uint8_t motion_magnitude,
+                                               int increase_denoising) {
+  int shift_inc =
+      (increase_denoising && motion_magnitude <= kMotionMagnitudeThreshold)
+          ? 1
+          : 0;
+  __m128i acc_diff = _mm_setzero_si128();
+  const __m128i k_0 = _mm_setzero_si128();
+  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+  const __m128i k_8 = _mm_set1_epi8(8);
+  const __m128i k_16 = _mm_set1_epi8(16);
+  // Modify each level's adjustment according to motion_magnitude.
+  const __m128i l3 = _mm_set1_epi8(
+      (motion_magnitude <= kMotionMagnitudeThreshold) ? 7 + shift_inc : 6);
+  // Difference between level 3 and level 2 is 2.
+  const __m128i l32 = _mm_set1_epi8(2);
+  // Difference between level 2 and level 1 is 1.
+  const __m128i l21 = _mm_set1_epi8(1);
+  // First pass: filter the 16 rows and accumulate the adjustments in acc_diff.
+  for (int r = 0; r < 16; ++r) {
+    // Calculate differences.
+    const __m128i v_sig =
+        _mm_loadu_si128(reinterpret_cast<const __m128i*>(&sig[0]));
+    const __m128i v_mc_running_avg_y =
+        _mm_loadu_si128(reinterpret_cast<__m128i*>(&mc_running_avg_y[0]));
+    __m128i v_running_avg_y;
+    const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+    const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+    // Obtain the sign: FF where mc_running_avg_y <= sig (diff <= 0).
+    const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+    // Clamp absolute difference to 16 to be used to get mask. Doing this
+    // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
+    const __m128i clamped_absdiff =
+        _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16);
+    // Get masks for l2 l1 and l0 adjustments.
+    const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
+    const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
+    const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
+    // Get adjustments for l2, l1, and l0.
+    __m128i adj2 = _mm_and_si128(mask2, l32);
+    const __m128i adj1 = _mm_and_si128(mask1, l21);
+    const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+    __m128i adj, padj, nadj;
+
+    // Combine the adjustments and get absolute adjustments.
+    adj2 = _mm_add_epi8(adj2, adj1);
+    adj = _mm_sub_epi8(l3, adj2);
+    adj = _mm_andnot_si128(mask0, adj);
+    adj = _mm_or_si128(adj, adj0);
+
+    // Restore the sign and get positive and negative adjustments.
+    padj = _mm_andnot_si128(diff_sign, adj);
+    nadj = _mm_and_si128(diff_sign, adj);
+
+    // Calculate filtered value.
+    v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+    v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(running_avg_y),
+                     v_running_avg_y);
+
+    // Each adjustment is at most l3 (7 + shift_inc, or 6); the saturating
+    // adds keep every element of acc_diff within signed char.
+    acc_diff = _mm_adds_epi8(acc_diff, padj);
+    acc_diff = _mm_subs_epi8(acc_diff, nadj);
+
+    // Update pointers for next iteration.
+    sig += sig_stride;
+    mc_running_avg_y += mc_avg_y_stride;
+    running_avg_y += avg_y_stride;
+  }
+
+  {
+    // Compute the sum of all pixel differences of this MB.
+    unsigned int abs_sum_diff = AbsSumDiff16x1(acc_diff);
+    unsigned int sum_diff_thresh = kSumDiffThreshold;
+    if (increase_denoising)
+      sum_diff_thresh = kSumDiffThresholdHigh;
+    if (abs_sum_diff > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, that would otherwise not be denoised at all. Simplest
+      // is to apply an additional adjustment to running_avg_y to bring it
+      // closer to sig. The adjustment is capped by a maximum delta, and
+      // chosen such that in most cases the resulting sum_diff will be
+      // within the acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over the
+      // threshold.
+      int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const __m128i k_delta = _mm_set1_epi8(delta);
+        sig -= sig_stride * 16;
+        mc_running_avg_y -= mc_avg_y_stride * 16;
+        running_avg_y -= avg_y_stride * 16;
+        for (int r = 0; r < 16; ++r) {
+          __m128i v_running_avg_y =
+              _mm_loadu_si128(reinterpret_cast<__m128i*>(&running_avg_y[0]));
+          // Calculate differences.
+          const __m128i v_sig =
+              _mm_loadu_si128(reinterpret_cast<const __m128i*>(&sig[0]));
+          const __m128i v_mc_running_avg_y = _mm_loadu_si128(
+              reinterpret_cast<__m128i*>(&mc_running_avg_y[0]));
+          const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+          const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+          // Obtain the sign: FF where mc_running_avg_y <= sig (diff <= 0).
+          const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+          // Clamp absolute difference to delta to get the adjustment.
+          const __m128i adj = _mm_min_epu8(
+              _mm_or_si128(pdiff, ndiff), k_delta);
+          // Restore the sign and get positive and negative adjustments.
+          __m128i padj, nadj;
+          padj = _mm_andnot_si128(diff_sign, adj);
+          nadj = _mm_and_si128(diff_sign, adj);
+          // Calculate filtered value.
+          v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+          v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+          _mm_storeu_si128(reinterpret_cast<__m128i*>(running_avg_y),
+                           v_running_avg_y);
+
+          // Accumulate the adjustments.
+          acc_diff = _mm_subs_epi8(acc_diff, padj);
+          acc_diff = _mm_adds_epi8(acc_diff, nadj);
+
+          // Update pointers for next iteration.
+          sig += sig_stride;
+          mc_running_avg_y += mc_avg_y_stride;
+          running_avg_y += avg_y_stride;
+        }
+        abs_sum_diff = AbsSumDiff16x1(acc_diff);
+        if (abs_sum_diff > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+}  // namespace webrtc
diff --git a/webrtc/modules/video_processing/util/denoiser_filter_sse2.h b/webrtc/modules/video_processing/util/denoiser_filter_sse2.h
new file mode 100644
index 0000000000..31d8510902
--- /dev/null
+++ b/webrtc/modules/video_processing/util/denoiser_filter_sse2.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_SSE2_H_
+#define WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_SSE2_H_
+
+#include "webrtc/modules/video_processing/util/denoiser_filter.h"
+
+namespace webrtc {
+
+class DenoiserFilterSSE2 : public DenoiserFilter {  // SSE2-target variant.
+ public:
+  DenoiserFilterSSE2() {}  // Nothing to initialize: no data members declared.
+  void CopyMem16x16(const uint8_t* src,  // Row-wise 16x16 byte copy.
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride) override;
+  void CopyMem8x8(const uint8_t* src,  // Row-wise 8x8 byte copy.
+                  int src_stride,
+                  uint8_t* dst,
+                  int dst_stride) override;
+  uint32_t Variance16x8(const uint8_t* a,  // Returns variance; writes *sse.
+                        int a_stride,
+                        const uint8_t* b,
+                        int b_stride,
+                        unsigned int* sse) override;
+  DenoiserDecision MbDenoise(uint8_t* mc_running_avg_y,  // One 16x16 MB.
+                             int mc_avg_y_stride,
+                             uint8_t* running_avg_y,
+                             int avg_y_stride,
+                             const uint8_t* sig,
+                             int sig_stride,
+                             uint8_t motion_magnitude,
+                             int increase_denoising) override;
+};
+
+}  // namespace webrtc
+
+#endif  // WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_DENOISER_FILTER_SSE2_H_
diff --git a/webrtc/modules/video_processing/util/skin_detection.cc b/webrtc/modules/video_processing/util/skin_detection.cc
new file mode 100755
index 0000000000..b3b2cd6aad
--- /dev/null
+++ b/webrtc/modules/video_processing/util/skin_detection.cc
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "webrtc/modules/video_processing/util/skin_detection.h"
+
+namespace webrtc {
+
+// Fixed-point skin color model parameters; index 0 is Cb, index 1 is Cr.
+static const int skin_mean[2] = {7463, 9614};                 // q6
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
+static const int skin_threshold = 1570636;                    // q18
+
+// Thresholds on luminance: y_avg outside [y_low, y_high] is never skin.
+static const int y_low = 20;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values,
+// using the fixed-point skin model constants defined above.
+static int EvaluateSkinColorDifference(int cb, int cr) {
+  // Offsets from the model mean, in q6.
+  const int cb_off_q6 = (cb << 6) - skin_mean[0];
+  const int cr_off_q6 = (cr << 6) - skin_mean[1];
+  // Products of two q6 values are q12; round and shift by 10 to get q2.
+  const int half = 1 << 9;
+  const int cb_sq_q2 = (cb_off_q6 * cb_off_q6 + half) >> 10;
+  const int cross_q2 = (cb_off_q6 * cr_off_q6 + half) >> 10;
+  const int cr_sq_q2 = (cr_off_q6 * cr_off_q6 + half) >> 10;
+  // Weighting q2 terms by the q16 inverse covariance yields a q18 value.
+  return skin_inv_cov[0] * cb_sq_q2 + skin_inv_cov[1] * cross_q2 +
+         skin_inv_cov[2] * cross_q2 + skin_inv_cov[3] * cr_sq_q2;
+}
+
+bool MbHasSkinColor(const uint8_t* y_src,
+                    const uint8_t* u_src,
+                    const uint8_t* v_src,
+                    const int stride_y,
+                    const int stride_u,
+                    const int stride_v,
+                    const int mb_row,
+                    const int mb_col) {
+  // Top-left sample of the central 2x2 patch: 16x16 luma, 8x8 chroma blocks.
+  const uint8_t* y_center =
+      y_src + ((mb_row << 4) + 8) * stride_y + (mb_col << 4) + 8;
+  const uint8_t* u_center =
+      u_src + ((mb_row << 3) + 4) * stride_u + (mb_col << 3) + 4;
+  const uint8_t* v_center =
+      v_src + ((mb_row << 3) + 4) * stride_v + (mb_col << 3) + 4;
+  // Use 2x2 average of center pixel to compute skin area.
+  const uint8_t y_avg = (y_center[0] + y_center[1] + y_center[stride_y] +
+                         y_center[stride_y + 1]) >> 2;
+  const uint8_t u_avg = (u_center[0] + u_center[1] + u_center[stride_u] +
+                         u_center[stride_u + 1]) >> 2;
+  const uint8_t v_avg = (v_center[0] + v_center[1] + v_center[stride_v] +
+                         v_center[stride_v + 1]) >> 2;
+  // Ignore MB with too high or low brightness.
+  if (y_avg < y_low || y_avg > y_high)
+    return false;
+  return EvaluateSkinColorDifference(u_avg, v_avg) < skin_threshold;
+}
+
+}  // namespace webrtc
diff --git a/webrtc/modules/video_processing/util/skin_detection.h b/webrtc/modules/video_processing/util/skin_detection.h
new file mode 100755
index 0000000000..561c03c425
--- /dev/null
+++ b/webrtc/modules/video_processing/util/skin_detection.h
@@ -0,0 +1,28 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_SKIN_DETECTION_H_
+#define WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_SKIN_DETECTION_H_
+
+#include <stdint.h>
+
+namespace webrtc {
+// Returns true if the center of macroblock (mb_row, mb_col) looks like skin.
+bool MbHasSkinColor(const uint8_t* y_src,
+                    const uint8_t* u_src,
+                    const uint8_t* v_src,
+                    const int stride_y,
+                    const int stride_u,
+                    const int stride_v,
+                    const int mb_row,
+                    const int mb_col);
+}  // namespace webrtc
+
+#endif  // WEBRTC_MODULES_VIDEO_PROCESSING_UTIL_SKIN_DETECTION_H_
diff --git a/webrtc/modules/video_processing/video_denoiser.cc b/webrtc/modules/video_processing/video_denoiser.cc
new file mode 100644
index 0000000000..83bcf7f1cc
--- /dev/null
+++ b/webrtc/modules/video_processing/video_denoiser.cc
@@ -0,0 +1,149 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "webrtc/common_video/libyuv/include/scaler.h"
+#include "webrtc/common_video/libyuv/include/webrtc_libyuv.h"
+#include "webrtc/modules/video_processing/video_denoiser.h"
+
+namespace webrtc {
+
+VideoDenoiser::VideoDenoiser()
+    : width_(0),  // 0x0: a nonzero-size first frame counts as a size change.
+      height_(0),
+      filter_(DenoiserFilter::Create()) {}
+
+void VideoDenoiser::TrailingReduction(int mb_rows,
+                                      int mb_cols,
+                                      const uint8_t* y_src,
+                                      int stride_y,
+                                      uint8_t* y_dst) {
+  for (int mb_row = 1; mb_row < mb_rows - 1; ++mb_row) {  // Skip border MBs.
+    for (int mb_col = 1; mb_col < mb_cols - 1; ++mb_col) {
+      int mb_index = mb_row * mb_cols + mb_col;
+      uint8_t* mb_dst = y_dst + (mb_row << 4) * stride_y + (mb_col << 4);
+      const uint8_t* mb_src = y_src + (mb_row << 4) * stride_y + (mb_col << 4);
+      // If the number of denoised neighbors is less than a threshold,
+      // do NOT denoise for the block. Set different threshold for skin MB.
+      // Only the skin branch clears .denoise, so only it can affect later MBs.
+      if (metrics_[mb_index].is_skin) {
+        // Skin MB: revert unless more than 2 of the 4 edge neighbors are
+        // denoised. The non-skin test below is stricter (all 8 neighbors).
+        if (metrics_[mb_index].denoise &&
+            metrics_[mb_index + 1].denoise +
+            metrics_[mb_index - 1].denoise +
+            metrics_[mb_index + mb_cols].denoise +
+            metrics_[mb_index - mb_cols].denoise <= 2) {
+          metrics_[mb_index].denoise = 0;
+          filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
+        }
+      } else if (metrics_[mb_index].denoise &&
+                 metrics_[mb_index + 1].denoise +
+                 metrics_[mb_index - 1].denoise +
+                 metrics_[mb_index + mb_cols + 1].denoise +
+                 metrics_[mb_index + mb_cols - 1].denoise +
+                 metrics_[mb_index - mb_cols + 1].denoise +
+                 metrics_[mb_index - mb_cols - 1].denoise +
+                 metrics_[mb_index + mb_cols].denoise +
+                 metrics_[mb_index - mb_cols].denoise <= 7) {
+        filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
+      }
+    }
+  }
+}
+
+// Denoises the Y plane of |frame| into |denoised_frame|; U/V are copied.
+void VideoDenoiser::DenoiseFrame(const VideoFrame& frame,
+                                 VideoFrame* denoised_frame) {
+  int stride_y = frame.stride(kYPlane);
+  int stride_u = frame.stride(kUPlane);
+  int stride_v = frame.stride(kVPlane);
+  // If previous width and height are different from current frame's, then no
+  // denoising for the current frame.
+  if (width_ != frame.width() || height_ != frame.height()) {
+    width_ = frame.width();
+    height_ = frame.height();
+    // The per-MB metrics were sized for the old dimensions; drop them so they
+    // are reallocated for the new size instead of being indexed out of range.
+    metrics_.reset();
+    denoised_frame->CreateFrame(frame.buffer(kYPlane), frame.buffer(kUPlane),
+                                frame.buffer(kVPlane), width_, height_,
+                                stride_y, stride_u, stride_v);
+    // Setting time parameters to the output frame.
+    denoised_frame->set_timestamp(frame.timestamp());
+    denoised_frame->set_render_time_ms(frame.render_time_ms());
+    return;
+  }
+  // For 16x16 block.
+  int mb_cols = width_ >> 4;
+  int mb_rows = height_ >> 4;
+  if (metrics_.get() == nullptr)
+    metrics_.reset(new DenoiseMetrics[mb_cols * mb_rows]);
+  // Denoise on Y plane.
+  uint8_t* y_dst = denoised_frame->buffer(kYPlane);
+  uint8_t* u_dst = denoised_frame->buffer(kUPlane);
+  uint8_t* v_dst = denoised_frame->buffer(kVPlane);
+  const uint8_t* y_src = frame.buffer(kYPlane);
+  const uint8_t* u_src = frame.buffer(kUPlane);
+  const uint8_t* v_src = frame.buffer(kVPlane);
+  // Temporary buffer to store denoising result.
+  uint8_t y_tmp[16 * 16] = {0};
+  for (int mb_row = 0; mb_row < mb_rows; ++mb_row) {
+    for (int mb_col = 0; mb_col < mb_cols; ++mb_col) {
+      const uint8_t* mb_src =
+          y_src + (mb_row << 4) * stride_y + (mb_col << 4);
+      uint8_t* mb_dst = y_dst + (mb_row << 4) * stride_y + (mb_col << 4);
+      int mb_index = mb_row * mb_cols + mb_col;
+      // Denoise each MB first and save the result to a temporary buffer.
+      if (filter_->MbDenoise(
+              mb_dst, stride_y, y_tmp, 16, mb_src, stride_y, 0, 1) ==
+          FILTER_BLOCK) {
+        // Save the variance, and the SSE in the |sad| field, to the buffer.
+        metrics_[mb_index].var = filter_->Variance16x8(
+            mb_dst, stride_y, y_tmp, 16, &metrics_[mb_index].sad);
+        // Get skin map.
+        metrics_[mb_index].is_skin =
+            MbHasSkinColor(y_src, u_src, v_src, stride_y, stride_u, stride_v,
+                           mb_row, mb_col);
+        // Variance threshold for skin/non-skin MB is different.
+        // Skin MB use a small threshold to reduce blockiness.
+        const uint32_t thr_var = metrics_[mb_index].is_skin ? 128 : 12 * 128;
+        if (metrics_[mb_index].var > thr_var) {
+          metrics_[mb_index].denoise = 0;
+          // Use the source MB.
+          filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
+        } else {
+          metrics_[mb_index].denoise = 1;
+          // Use the denoised MB.
+          filter_->CopyMem16x16(y_tmp, 16, mb_dst, stride_y);
+        }
+      } else {
+        metrics_[mb_index].denoise = 0;
+        filter_->CopyMem16x16(mb_src, stride_y, mb_dst, stride_y);
+      }
+      // Copy source U/V plane.
+      const uint8_t* mb_src_u =
+          u_src + (mb_row << 3) * stride_u + (mb_col << 3);
+      const uint8_t* mb_src_v =
+          v_src + (mb_row << 3) * stride_v + (mb_col << 3);
+      uint8_t* mb_dst_u = u_dst + (mb_row << 3) * stride_u + (mb_col << 3);
+      uint8_t* mb_dst_v = v_dst + (mb_row << 3) * stride_v + (mb_col << 3);
+      filter_->CopyMem8x8(mb_src_u, stride_u, mb_dst_u, stride_u);
+      filter_->CopyMem8x8(mb_src_v, stride_v, mb_dst_v, stride_v);
+    }
+  }
+  // Second round: reduce the trailing artifact and blockiness by referring
+  // to neighbors' denoising status.
+  TrailingReduction(mb_rows, mb_cols, y_src, stride_y, y_dst);
+
+  // Setting time parameters to the output frame.
+  denoised_frame->set_timestamp(frame.timestamp());
+  denoised_frame->set_render_time_ms(frame.render_time_ms());
+}
+
+}  // namespace webrtc
diff --git a/webrtc/modules/video_processing/video_denoiser.h b/webrtc/modules/video_processing/video_denoiser.h
new file mode 100644
index 0000000000..69edfaf956
--- /dev/null
+++ b/webrtc/modules/video_processing/video_denoiser.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_VIDEO_PROCESSING_VIDEO_DENOISER_H_
+#define WEBRTC_MODULES_VIDEO_PROCESSING_VIDEO_DENOISER_H_
+
+#include "webrtc/modules/video_processing/util/denoiser_filter.h"
+#include "webrtc/modules/video_processing/util/skin_detection.h"
+
+namespace webrtc {
+
+class VideoDenoiser {
+ public:
+  VideoDenoiser();
+  void DenoiseFrame(const VideoFrame& frame, VideoFrame* denoised_frame);
+
+ private:
+  void TrailingReduction(int mb_rows, int mb_cols, const uint8_t* y_src,
+                         int stride_y, uint8_t* y_dst);
+  int width_;   // Width of the last frame passed to DenoiseFrame.
+  int height_;  // Height of the last frame passed to DenoiseFrame.
+  rtc::scoped_ptr<DenoiseMetrics[]> metrics_;  // One entry per 16x16 MB.
+  rtc::scoped_ptr<DenoiserFilter> filter_;  // From DenoiserFilter::Create().
+};
+
+}  // namespace webrtc
+
+#endif  // WEBRTC_MODULES_VIDEO_PROCESSING_VIDEO_DENOISER_H_
diff --git a/webrtc/modules/video_processing/video_processing.gypi b/webrtc/modules/video_processing/video_processing.gypi
index 25e2097007..e054f895be 100644
--- a/webrtc/modules/video_processing/video_processing.gypi
+++ b/webrtc/modules/video_processing/video_processing.gypi
@@ -36,11 +36,22 @@
         'video_decimator.h',
         'video_processing_impl.cc',
         'video_processing_impl.h',
+        'video_denoiser.cc',
+        'video_denoiser.h',
+        'util/denoiser_filter.cc',
+        'util/denoiser_filter.h',
+        'util/denoiser_filter_c.cc',
+        'util/denoiser_filter_c.h',
+        'util/skin_detection.cc',
+        'util/skin_detection.h',
       ],
       'conditions': [
         ['target_arch=="ia32" or target_arch=="x64"', {
           'dependencies': [ 'video_processing_sse2', ],
         }],
+        ['target_arch=="arm" or target_arch == "arm64"', {
+          'dependencies': [ 'video_processing_neon', ],
+        }],
       ],
     },
   ],
@@ -52,6 +63,8 @@
           'type': 'static_library',
           'sources': [
             'content_analysis_sse2.cc',
+            'util/denoiser_filter_sse2.cc',
+            'util/denoiser_filter_sse2.h',
           ],
           'conditions': [
             ['os_posix==1 and OS!="mac"', {
@@ -66,6 +79,19 @@
         },
       ],
     }],
+    ['target_arch=="arm" or target_arch == "arm64"', {
+      'targets': [
+        {
+          'target_name': 'video_processing_neon',
+          'type': 'static_library',
+          'includes': [ '../../build/arm_neon.gypi', ],
+          'sources': [
+            'util/denoiser_filter_neon.cc',
+            'util/denoiser_filter_neon.h',
+          ],
+        },
+      ],
+    }],
   ],
 }