diff --git a/android-webrtc.mk b/android-webrtc.mk
index dc92aeb8e2..0dae14ceca 100644
--- a/android-webrtc.mk
+++ b/android-webrtc.mk
@@ -42,8 +42,5 @@ ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
 MY_WEBRTC_COMMON_DEFS += \
     '-DWEBRTC_ARCH_ARM_V7A'
 endif
-
-else ifeq ($(TARGET_ARCH),x86)
-MY_WEBRTC_COMMON_DEFS += \
-    '-DWEBRTC_USE_SSE2'
+# Closes the conditional that the removed x86 'else' branch was part of.
 endif
diff --git a/src/modules/audio_processing/aec/Android.mk b/src/modules/audio_processing/aec/Android.mk
index 698755acdb..7d539587ba 100644
--- a/src/modules/audio_processing/aec/Android.mk
+++ b/src/modules/audio_processing/aec/Android.mk
@@ -20,9 +20,12 @@ LOCAL_SRC_FILES := \
     aec_resampler.c \
     aec_core.c \
     aec_rdft.c \
+
+ifeq ($(TARGET_ARCH),x86)
+LOCAL_SRC_FILES += \
     aec_core_sse2.c \
     aec_rdft_sse2.c
-
+endif
 
 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
diff --git a/src/modules/audio_processing/aec/aec.gypi b/src/modules/audio_processing/aec/aec.gypi
index 7e86a900f3..4b3a08d95c 100644
--- a/src/modules/audio_processing/aec/aec.gypi
+++ b/src/modules/audio_processing/aec/aec.gypi
@@ -16,8 +16,8 @@
         'aec_debug_dump%': 0,
       },
       'dependencies': [
+        'apm_util',
         '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
-        'apm_util'
       ],
       'include_dirs': [
         'interface',
@@ -32,18 +32,37 @@
         'echo_cancellation.c',
         'aec_core.h',
         'aec_core.c',
-        'aec_core_sse2.c',
         'aec_rdft.h',
         'aec_rdft.c',
-        'aec_rdft_sse2.c',
         'aec_resampler.h',
         'aec_resampler.c',
       ],
       'conditions': [
+        ['target_arch=="ia32" or target_arch=="x64"', {
+          'dependencies': [ 'aec_sse2', ],
+        }],
         ['aec_debug_dump==1', {
           'defines': [ 'WEBRTC_AEC_DEBUG_DUMP', ],
         }],
       ],
     },
+    {
+      'target_name': 'aec_sse2',
+      'type': '<(library)',
+      'sources': [
+        'aec_core_sse2.c',
+        'aec_rdft_sse2.c',
+      ],
+      'conditions': [
+        ['os_posix==1 and OS!="mac"', {
+          'cflags': [ '-msse2', ],
+        }],
+        ['OS=="mac"', {
+          'xcode_settings': {
+            'OTHER_CFLAGS': [ '-msse2', ],
+          },
+        }],
+      ],
+    },
   ],
 }
diff --git a/src/modules/audio_processing/aec/aec_core.c b/src/modules/audio_processing/aec/aec_core.c
index 6718dec3fb..1637e6fdbc 100644
--- a/src/modules/audio_processing/aec/aec_core.c
+++ b/src/modules/audio_processing/aec/aec_core.c
@@ -21,6 +21,7 @@
 #include <string.h>
 
 #include "aec_rdft.h"
+#include "common_audio/signal_processing/include/signal_processing_library.h"
 #include "delay_estimator_wrapper.h"
 #include "ring_buffer.h"
 #include "system_wrappers/interface/cpu_features_wrapper.h"
@@ -516,11 +517,13 @@ int WebRtcAec_InitAec(aec_t *aec, int sampFreq)
     WebRtcAec_ScaleErrorSignal = ScaleErrorSignal;
     WebRtcAec_FilterAdaptation = FilterAdaptation;
     WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress;
+
+#if defined(WEBRTC_ARCH_X86_FAMILY)
     if (WebRtc_GetCPUInfo(kSSE2)) {
-#if defined(WEBRTC_USE_SSE2)
       WebRtcAec_InitAec_SSE2();
-#endif
     }
+#endif
+
     aec_rdft_init();
 
     return 0;
diff --git a/src/modules/audio_processing/aec/aec_core.h b/src/modules/audio_processing/aec/aec_core.h
index 1b9828ab17..d326a6842d 100644
--- a/src/modules/audio_processing/aec/aec_core.h
+++ b/src/modules/audio_processing/aec/aec_core.h
@@ -15,9 +15,10 @@
 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
 #define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_
 
+#ifdef WEBRTC_AEC_DEBUG_DUMP
 #include <stdio.h>
+#endif
 
-#include "signal_processing_library.h"
 #include "typedefs.h"
 
 #define FRAME_LEN 80
diff --git a/src/modules/audio_processing/aec/aec_core_sse2.c b/src/modules/audio_processing/aec/aec_core_sse2.c
index 8894f28a17..74a1c48bdb 100644
--- a/src/modules/audio_processing/aec/aec_core_sse2.c
+++ b/src/modules/audio_processing/aec/aec_core_sse2.c
@@ -12,13 +12,12 @@
  * The core AEC algorithm, SSE2 version of speed-critical functions.
  */
 
-#include "typedefs.h"
+#include "aec_core.h"
 
-#if defined(WEBRTC_USE_SSE2)
 #include <emmintrin.h>
 #include <math.h>
+#include <string.h>  // memset
 
-#include "aec_core.h"
 #include "aec_rdft.h"
 
 __inline static float MulRe(float aRe, float aIm, float bRe, float bIm)
@@ -414,4 +413,3 @@ void WebRtcAec_InitAec_SSE2(void) {
   WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
 }
 
-#endif   // WEBRTC_USE_SSE2
diff --git a/src/modules/audio_processing/aec/aec_rdft.c b/src/modules/audio_processing/aec/aec_rdft.c
index 92223343dc..19908d8541 100644
--- a/src/modules/audio_processing/aec/aec_rdft.c
+++ b/src/modules/audio_processing/aec/aec_rdft.c
@@ -576,11 +576,11 @@ void aec_rdft_init(void) {
   cftmdl_128 = cftmdl_128_C;
   rftfsub_128 = rftfsub_128_C;
   rftbsub_128 = rftbsub_128_C;
+#if defined(WEBRTC_ARCH_X86_FAMILY)
   if (WebRtc_GetCPUInfo(kSSE2)) {
-#if defined(WEBRTC_USE_SSE2)
     aec_rdft_init_sse2();
-#endif
   }
+#endif
   // init library constants.
   makewt_32();
   makect_32();
diff --git a/src/modules/audio_processing/aec/aec_rdft_sse2.c b/src/modules/audio_processing/aec/aec_rdft_sse2.c
index f936e2a7e2..eeb315245c 100644
--- a/src/modules/audio_processing/aec/aec_rdft_sse2.c
+++ b/src/modules/audio_processing/aec/aec_rdft_sse2.c
@@ -8,13 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "typedefs.h"
-
-#if defined(WEBRTC_USE_SSE2)
-#include <emmintrin.h>
-
 #include "aec_rdft.h"
 
+#include <emmintrin.h>
+
 static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] =
   {-1.f, 1.f, -1.f, 1.f};
 
@@ -428,4 +425,3 @@ void aec_rdft_init_sse2(void) {
   rftbsub_128 = rftbsub_128_SSE2;
 }
 
-#endif  // WEBRTC_USE_SS2
diff --git a/src/modules/audio_processing/aec/echo_cancellation.c b/src/modules/audio_processing/aec/echo_cancellation.c
index 66c9b979f1..021df052ba 100644
--- a/src/modules/audio_processing/aec/echo_cancellation.c
+++ b/src/modules/audio_processing/aec/echo_cancellation.c
@@ -22,6 +22,7 @@
 
 #include "aec_core.h"
 #include "aec_resampler.h"
+#include "common_audio/signal_processing/include/signal_processing_library.h"
 #include "ring_buffer.h"
 #include "typedefs.h"
 
diff --git a/src/modules/video_processing/main/source/Android.mk b/src/modules/video_processing/main/source/Android.mk
index 03d2d743fe..74d15cb443 100644
--- a/src/modules/video_processing/main/source/Android.mk
+++ b/src/modules/video_processing/main/source/Android.mk
@@ -18,7 +18,6 @@ LOCAL_MODULE := libwebrtc_video_processing
 LOCAL_MODULE_TAGS := optional
 LOCAL_CPP_EXTENSION := .cc
 LOCAL_SRC_FILES := \
-    video_processing_impl.cc \
     brightness_detection.cc \
     color_enhancement.cc \
     content_analysis.cc \
@@ -27,6 +26,12 @@ LOCAL_SRC_FILES := \
     frame_preprocessor.cc \
     spatial_resampler.cc \
     video_decimator.cc
+    video_processing_impl.cc \
+
+ifeq ($(TARGET_ARCH),x86)
+LOCAL_SRC_FILES += \
+    content_analysis_sse2.cc
+endif
 
 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
diff --git a/src/modules/video_processing/main/source/content_analysis.cc b/src/modules/video_processing/main/source/content_analysis.cc
index 45935ebe09..32ee09a18f 100644
--- a/src/modules/video_processing/main/source/content_analysis.cc
+++ b/src/modules/video_processing/main/source/content_analysis.cc
@@ -13,12 +13,10 @@
 
 #include <math.h>
 #include <stdlib.h>
-#if defined(WEBRTC_USE_SSE2)
-#include <emmintrin.h>
-#endif
+
 namespace webrtc {
 
-VPMContentAnalysis::VPMContentAnalysis(bool RTCD):
+VPMContentAnalysis::VPMContentAnalysis(bool runtime_cpu_detection):
 _origFrame(NULL),
 _prevFrame(NULL),
 _width(0),
@@ -40,16 +38,16 @@ _cMetrics(NULL)
     ComputeSpatialMetrics = &VPMContentAnalysis::ComputeSpatialMetrics_C;
     TemporalDiffMetric = &VPMContentAnalysis::TemporalDiffMetric_C;
 
-    if (RTCD)
+    if (runtime_cpu_detection)
     {
-        if(WebRtc_GetCPUInfo(kSSE2))
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+        if (WebRtc_GetCPUInfo(kSSE2))
         {
-#if defined(WEBRTC_USE_SSE2)
             ComputeSpatialMetrics =
                           &VPMContentAnalysis::ComputeSpatialMetrics_SSE2;
             TemporalDiffMetric = &VPMContentAnalysis::TemporalDiffMetric_SSE2;
-#endif
         }
+#endif
     }
 
     Release();
@@ -249,110 +247,6 @@ VPMContentAnalysis::TemporalDiffMetric_C()
 
 }
 
-#if defined(WEBRTC_USE_SSE2)
-WebRtc_Word32
-VPMContentAnalysis::TemporalDiffMetric_SSE2()
-{
-    WebRtc_UWord32 numPixels = 0;       // counter for # of pixels
-
-    const WebRtc_UWord8* imgBufO = _origFrame + _border*_width + _border;
-    const WebRtc_UWord8* imgBufP = _prevFrame + _border*_width + _border;
-
-    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
-
-    __m128i sad_64   = _mm_setzero_si128();
-    __m128i sum_64   = _mm_setzero_si128();
-    __m128i sqsum_64 = _mm_setzero_si128();
-    const __m128i z  = _mm_setzero_si128();
-
-    for(WebRtc_UWord16 i = 0; i < (_height - 2*_border); i += _skipNum)
-    {
-        __m128i sqsum_32  = _mm_setzero_si128();
-
-        const WebRtc_UWord8 *lineO = imgBufO;
-        const WebRtc_UWord8 *lineP = imgBufP;
-
-        // Work on 16 pixels at a time.  For HD content with a width of 1920
-        // this loop will run ~67 times (depending on border).  Maximum for
-        // abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit
-        // results which are then accumulated.  There is no chance of
-        // rollover for these two accumulators.
-        // o*o will have a maximum of 255*255 = 65025.  This will roll over
-        // a 16 bit accumulator as 67*65025 > 65535, but will fit in a
-        // 32 bit accumulator.
-        for(WebRtc_UWord16 j = 0; j < width_end - _border; j += 16)
-        {
-            const __m128i o = _mm_loadu_si128((__m128i*)(lineO));
-            const __m128i p = _mm_loadu_si128((__m128i*)(lineP));
-
-            lineO += 16;
-            lineP += 16;
-
-            // abs pixel difference between frames
-            sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p));
-
-            // sum of all pixels in frame
-            sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z));
-
-            // squared sum of all pixels in frame
-            const __m128i olo = _mm_unpacklo_epi8(o,z);
-            const __m128i ohi = _mm_unpackhi_epi8(o,z);
-
-            const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo);
-            const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi);
-
-            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo);
-            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi);
-        }
-
-        // Add to 64 bit running sum as to not roll over.
-        sqsum_64 = _mm_add_epi64(sqsum_64,
-                                _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z),
-                                              _mm_unpacklo_epi32(sqsum_32,z)));
-
-        imgBufO += _width * _skipNum;
-        imgBufP += _width * _skipNum;
-        numPixels += (width_end - _border);
-    }
-
-    WebRtc_Word64 sad_final_64[2];
-    WebRtc_Word64 sum_final_64[2];
-    WebRtc_Word64 sqsum_final_64[2];
-
-    // bring sums out of vector registers and into integer register
-    // domain, summing them along the way
-    _mm_store_si128 ((__m128i*)sad_final_64, sad_64);
-    _mm_store_si128 ((__m128i*)sum_final_64, sum_64);
-    _mm_store_si128 ((__m128i*)sqsum_final_64, sqsum_64);
-
-    const WebRtc_UWord32 pixelSum = sum_final_64[0] + sum_final_64[1];
-    const WebRtc_UWord64 pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1];
-    const WebRtc_UWord32 tempDiffSum = sad_final_64[0] + sad_final_64[1];
-
-    // default
-    _motionMagnitudeNZ = 0.0f;
-
-    if (tempDiffSum == 0)
-    {
-        return VPM_OK;
-    }
-
-    // normalize over all pixels
-    const float tempDiffAvg = (float)tempDiffSum / (float)(numPixels);
-    const float pixelSumAvg = (float)pixelSum / (float)(numPixels);
-    const float pixelSqSumAvg = (float)pixelSqSum / (float)(numPixels);
-    float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg);
-
-    if (contrast > 0.0)
-    {
-        contrast = sqrt(contrast);
-       _motionMagnitudeNZ = tempDiffAvg/contrast;
-    }
-
-    return VPM_OK;
-}
-#endif
-
 // Compute spatial metrics:
 // To reduce complexity, we compute the metric for a reduced set of points.
 // The spatial metrics are rough estimates of the prediction error cost for
@@ -427,172 +321,6 @@ VPMContentAnalysis::ComputeSpatialMetrics_C()
     return VPM_OK;
 }
 
-#if defined(WEBRTC_USE_SSE2)
-WebRtc_Word32
-VPMContentAnalysis::ComputeSpatialMetrics_SSE2()
-{
-    const WebRtc_UWord8* imgBuf = _origFrame + _border*_width;
-    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
-
-    __m128i se_32  = _mm_setzero_si128();
-    __m128i sev_32 = _mm_setzero_si128();
-    __m128i seh_32 = _mm_setzero_si128();
-    __m128i msa_32 = _mm_setzero_si128();
-    const __m128i z = _mm_setzero_si128();
-
-    // Error is accumulated as a 32 bit value.  Looking at HD content with a
-    // height of 1080 lines, or about 67 macro blocks.  If the 16 bit row
-    // value is maxed out at 65529 for every row, 65529*1080 = 70777800, which
-    // will not roll over a 32 bit accumulator.
-    // _skipNum is also used to reduce the number of rows
-    for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum)
-    {
-        __m128i se_16  = _mm_setzero_si128();
-        __m128i sev_16 = _mm_setzero_si128();
-        __m128i seh_16 = _mm_setzero_si128();
-        __m128i msa_16 = _mm_setzero_si128();
-
-        // Row error is accumulated as a 16 bit value.  There are 8
-        // accumulators.  Max value of a 16 bit number is 65529.  Looking
-        // at HD content, 1080p, has a width of 1920, 120 macro blocks.
-        // A mb at a time is processed at a time.  Absolute max error at
-        // a point would be abs(0-255+255+255+255) which equals 1020.
-        // 120*1020 = 122400.  The probability of hitting this is quite low
-        // on well behaved content.  A specially crafted image could roll over.
-        // _border could also be adjusted to concentrate on just the center of
-        // the images for an HD capture in order to reduce the possiblity of
-        // rollover.
-        const WebRtc_UWord8 *lineTop = imgBuf - _width + _border;
-        const WebRtc_UWord8 *lineCen = imgBuf + _border;
-        const WebRtc_UWord8 *lineBot = imgBuf + _width + _border;
-
-        for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16)
-        {
-            const __m128i t = _mm_loadu_si128((__m128i*)(lineTop));
-            const __m128i l = _mm_loadu_si128((__m128i*)(lineCen - 1));
-            const __m128i c = _mm_loadu_si128((__m128i*)(lineCen));
-            const __m128i r = _mm_loadu_si128((__m128i*)(lineCen + 1));
-            const __m128i b = _mm_loadu_si128((__m128i*)(lineBot));
-
-            lineTop += 16;
-            lineCen += 16;
-            lineBot += 16;
-
-            // center pixel unpacked
-            __m128i clo = _mm_unpacklo_epi8(c,z);
-            __m128i chi = _mm_unpackhi_epi8(c,z);
-
-            // left right pixels unpacked and added together
-            const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z),
-                                               _mm_unpacklo_epi8(r,z));
-            const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z),
-                                               _mm_unpackhi_epi8(r,z));
-
-            // top & bottom pixels unpacked and added together
-            const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z),
-                                               _mm_unpacklo_epi8(b,z));
-            const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z),
-                                               _mm_unpackhi_epi8(b,z));
-
-            // running sum of all pixels
-            msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo));
-
-            clo = _mm_slli_epi16(clo, 1);
-            chi = _mm_slli_epi16(chi, 1);
-            const __m128i sevtlo = _mm_subs_epi16(clo, tblo);
-            const __m128i sevthi = _mm_subs_epi16(chi, tbhi);
-            const __m128i sehtlo = _mm_subs_epi16(clo, lrlo);
-            const __m128i sehthi = _mm_subs_epi16(chi, lrhi);
-
-            clo = _mm_slli_epi16(clo, 1);
-            chi = _mm_slli_epi16(chi, 1);
-            const __m128i setlo = _mm_subs_epi16(clo,
-                                                 _mm_add_epi16(lrlo, tblo));
-            const __m128i sethi = _mm_subs_epi16(chi,
-                                                 _mm_add_epi16(lrhi, tbhi));
-
-            // Add to 16 bit running sum
-            se_16  = _mm_add_epi16(se_16,
-                                   _mm_max_epi16(setlo,
-                                                 _mm_subs_epi16(z, setlo)));
-            se_16  = _mm_add_epi16(se_16,
-                                   _mm_max_epi16(sethi,
-                                                 _mm_subs_epi16(z, sethi)));
-            sev_16 = _mm_add_epi16(sev_16,
-                                   _mm_max_epi16(sevtlo,
-                                                 _mm_subs_epi16(z, sevtlo)));
-            sev_16 = _mm_add_epi16(sev_16,
-                                   _mm_max_epi16(sevthi,
-                                                 _mm_subs_epi16(z, sevthi)));
-            seh_16 = _mm_add_epi16(seh_16,
-                                   _mm_max_epi16(sehtlo,
-                                                 _mm_subs_epi16(z, sehtlo)));
-            seh_16 = _mm_add_epi16(seh_16,
-                                   _mm_max_epi16(sehthi,
-                                                 _mm_subs_epi16(z, sehthi)));
-        }
-
-        // Add to 32 bit running sum as to not roll over.
-        se_32  = _mm_add_epi32(se_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(se_16,z),
-                                             _mm_unpacklo_epi16(se_16,z)));
-        sev_32 = _mm_add_epi32(sev_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(sev_16,z),
-                                             _mm_unpacklo_epi16(sev_16,z)));
-        seh_32 = _mm_add_epi32(seh_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(seh_16,z),
-                                             _mm_unpacklo_epi16(seh_16,z)));
-        msa_32 = _mm_add_epi32(msa_32,
-                               _mm_add_epi32(_mm_unpackhi_epi16(msa_16,z),
-                                             _mm_unpacklo_epi16(msa_16,z)));
-
-        imgBuf += _width * _skipNum;
-    }
-
-    WebRtc_Word64 se_64[2];
-    WebRtc_Word64 sev_64[2];
-    WebRtc_Word64 seh_64[2];
-    WebRtc_Word64 msa_64[2];
-
-    // bring sums out of vector registers and into integer register
-    // domain, summing them along the way
-    _mm_store_si128 ((__m128i*)se_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(se_32,z),
-                                   _mm_unpacklo_epi32(se_32,z)));
-    _mm_store_si128 ((__m128i*)sev_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(sev_32,z),
-                                   _mm_unpacklo_epi32(sev_32,z)));
-    _mm_store_si128 ((__m128i*)seh_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(seh_32,z),
-                                   _mm_unpacklo_epi32(seh_32,z)));
-    _mm_store_si128 ((__m128i*)msa_64,
-                     _mm_add_epi64(_mm_unpackhi_epi32(msa_32,z),
-                                   _mm_unpacklo_epi32(msa_32,z)));
-
-    const WebRtc_UWord32 spatialErrSum  = se_64[0] + se_64[1];
-    const WebRtc_UWord32 spatialErrVSum = sev_64[0] + sev_64[1];
-    const WebRtc_UWord32 spatialErrHSum = seh_64[0] + seh_64[1];
-    const WebRtc_UWord32 pixelMSA = msa_64[0] + msa_64[1];
-
-    // normalize over all pixels
-    const float spatialErr  = (float)(spatialErrSum >> 2);
-    const float spatialErrH = (float)(spatialErrHSum >> 1);
-    const float spatialErrV = (float)(spatialErrVSum >> 1);
-    const float norm = (float)pixelMSA;
-
-    // 2X2:
-    _spatialPredErr = spatialErr / norm;
-
-    // 1X2:
-    _spatialPredErrH = spatialErrH / norm;
-
-    // 2X1:
-    _spatialPredErrV = spatialErrV / norm;
-
-    return VPM_OK;
-}
-#endif // #if defined(WEBRTC_USE_SSE2)
-
 VideoContentMetrics*
 VPMContentAnalysis::ContentMetrics()
 {
diff --git a/src/modules/video_processing/main/source/content_analysis.h b/src/modules/video_processing/main/source/content_analysis.h
index e0810d364a..5051650291 100644
--- a/src/modules/video_processing/main/source/content_analysis.h
+++ b/src/modules/video_processing/main/source/content_analysis.h
@@ -8,10 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-/*
- * content_analysis.h
- */
-
 #ifndef VPM_CONTENT_ANALYSIS_H
 #define VPM_CONTENT_ANALYSIS_H
 
@@ -24,7 +20,9 @@ namespace webrtc {
 class VPMContentAnalysis
 {
 public:
-    VPMContentAnalysis(bool RTCD = true);
+    // When |runtime_cpu_detection| is true, runtime selection of an optimized
+    // code path is allowed.
+    VPMContentAnalysis(bool runtime_cpu_detection);
     ~VPMContentAnalysis();
 
     // Initialize ContentAnalysis - should be called prior to
@@ -62,7 +60,7 @@ private:
     ComputeSpatialMetricsFunc ComputeSpatialMetrics;
     WebRtc_Word32 ComputeSpatialMetrics_C();
 
-#if defined(WEBRTC_USE_SSE2)
+#if defined(WEBRTC_ARCH_X86_FAMILY)
     WebRtc_Word32 ComputeSpatialMetrics_SSE2();
     WebRtc_Word32 TemporalDiffMetric_SSE2();
 #endif
diff --git a/src/modules/video_processing/main/source/content_analysis_sse2.cc b/src/modules/video_processing/main/source/content_analysis_sse2.cc
new file mode 100644
index 0000000000..347fa5b607
--- /dev/null
+++ b/src/modules/video_processing/main/source/content_analysis_sse2.cc
@@ -0,0 +1,284 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "content_analysis.h"
+
+#include <emmintrin.h>
+#include <math.h>
+
+namespace webrtc {
+
+// SSE2 path: sets _motionMagnitudeNZ = mean |orig - prev| / stddev(orig).
+WebRtc_Word32 VPMContentAnalysis::TemporalDiffMetric_SSE2()
+{
+    WebRtc_UWord32 numPixels = 0;       // counter for # of pixels
+
+    const WebRtc_UWord8* imgBufO = _origFrame + _border*_width + _border;
+    const WebRtc_UWord8* imgBufP = _prevFrame + _border*_width + _border;
+
+    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
+
+    __m128i sad_64   = _mm_setzero_si128();
+    __m128i sum_64   = _mm_setzero_si128();
+    __m128i sqsum_64 = _mm_setzero_si128();
+    const __m128i z  = _mm_setzero_si128();
+
+    for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum)
+    {
+        __m128i sqsum_32  = _mm_setzero_si128();
+
+        const WebRtc_UWord8 *lineO = imgBufO;
+        const WebRtc_UWord8 *lineP = imgBufP;
+
+        // Work on 16 pixels at a time.  For HD content with a width of 1920
+        // this loop will run ~120 times (depending on border).  Maximum for
+        // abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit
+        // results which are then accumulated.  There is no chance of
+        // rollover for these two accumulators.
+        // o*o will have a maximum of 255*255 = 65025.  This will roll over
+        // a 16 bit accumulator as 120*65025 > 65535, but will fit in a
+        // 32 bit accumulator.
+        for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16)
+        {
+            const __m128i o = _mm_loadu_si128((const __m128i*)(lineO));
+            const __m128i p = _mm_loadu_si128((const __m128i*)(lineP));
+
+            lineO += 16;
+            lineP += 16;
+
+            // abs pixel difference between frames
+            sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p));
+
+            // sum of all pixels in frame
+            sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z));
+
+            // squared sum of all pixels in frame
+            const __m128i olo = _mm_unpacklo_epi8(o,z);
+            const __m128i ohi = _mm_unpackhi_epi8(o,z);
+
+            const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo);
+            const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi);
+
+            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo);
+            sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi);
+        }
+
+        // Add to 64 bit running sum as to not roll over.
+        sqsum_64 = _mm_add_epi64(sqsum_64,
+                                _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z),
+                                              _mm_unpacklo_epi32(sqsum_32,z)));
+
+        imgBufO += _width * _skipNum;
+        imgBufP += _width * _skipNum;
+        numPixels += (width_end - _border);
+    }
+
+    WebRtc_Word64 sad_final_64[2];
+    WebRtc_Word64 sum_final_64[2];
+    WebRtc_Word64 sqsum_final_64[2];
+
+    // Bring the sums out of the vector registers.  The stack arrays are not
+    // guaranteed to be 16-byte aligned, so unaligned stores are required.
+    _mm_storeu_si128 ((__m128i*)sad_final_64, sad_64);
+    _mm_storeu_si128 ((__m128i*)sum_final_64, sum_64);
+    _mm_storeu_si128 ((__m128i*)sqsum_final_64, sqsum_64);
+
+    const WebRtc_UWord32 pixelSum = sum_final_64[0] + sum_final_64[1];
+    const WebRtc_UWord64 pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1];
+    const WebRtc_UWord32 tempDiffSum = sad_final_64[0] + sad_final_64[1];
+
+    // default: reported when the frames are identical or contrast is not > 0
+    _motionMagnitudeNZ = 0.0f;
+
+    if (tempDiffSum == 0)
+    {
+        return VPM_OK;
+    }
+
+    // normalize over all pixels
+    const float tempDiffAvg = (float)tempDiffSum / (float)(numPixels);
+    const float pixelSumAvg = (float)pixelSum / (float)(numPixels);
+    const float pixelSqSumAvg = (float)pixelSqSum / (float)(numPixels);
+    float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg);
+
+    if (contrast > 0.0)
+    {
+        contrast = sqrt(contrast);
+        _motionMagnitudeNZ = tempDiffAvg/contrast;
+    }
+
+    return VPM_OK;
+}
+
+// SSE2 path: sets _spatialPredErr, _spatialPredErrH and _spatialPredErrV.
+WebRtc_Word32 VPMContentAnalysis::ComputeSpatialMetrics_SSE2()
+{
+    const WebRtc_UWord8* imgBuf = _origFrame + _border*_width;
+    const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border;
+
+    __m128i se_32  = _mm_setzero_si128();
+    __m128i sev_32 = _mm_setzero_si128();
+    __m128i seh_32 = _mm_setzero_si128();
+    __m128i msa_32 = _mm_setzero_si128();
+    const __m128i z = _mm_setzero_si128();
+
+    // Error is accumulated as a 32 bit value.  Looking at HD content with a
+    // height of 1080 lines, or about 67 macro blocks.  If the 16 bit row
+    // value is maxed out at 65535 for every row, 65535*1080 = 70777800, which
+    // will not roll over a 32 bit accumulator.
+    // _skipNum is also used to reduce the number of rows
+    for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum)
+    {
+        __m128i se_16  = _mm_setzero_si128();
+        __m128i sev_16 = _mm_setzero_si128();
+        __m128i seh_16 = _mm_setzero_si128();
+        __m128i msa_16 = _mm_setzero_si128();
+
+        // Row error is accumulated as a 16 bit value.  There are 8
+        // accumulators.  Max value of a 16 bit number is 65535.  Looking
+        // at HD content, 1080p, has a width of 1920, 120 macro blocks.
+        // One mb is processed at a time.  Absolute max error at
+        // a point would be abs(0-255+255+255+255) which equals 1020.
+        // 120*1020 = 122400.  The probability of hitting this is quite low
+        // on well behaved content.  A specially crafted image could roll over.
+        // _border could also be adjusted to concentrate on just the center of
+        // the images for an HD capture in order to reduce the possibility of
+        // rollover.
+        const WebRtc_UWord8 *lineTop = imgBuf - _width + _border;
+        const WebRtc_UWord8 *lineCen = imgBuf + _border;
+        const WebRtc_UWord8 *lineBot = imgBuf + _width + _border;
+
+        for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16)
+        {
+            const __m128i t = _mm_loadu_si128((const __m128i*)(lineTop));
+            const __m128i l = _mm_loadu_si128((const __m128i*)(lineCen - 1));
+            const __m128i c = _mm_loadu_si128((const __m128i*)(lineCen));
+            const __m128i r = _mm_loadu_si128((const __m128i*)(lineCen + 1));
+            const __m128i b = _mm_loadu_si128((const __m128i*)(lineBot));
+
+            lineTop += 16;
+            lineCen += 16;
+            lineBot += 16;
+
+            // center pixel unpacked
+            __m128i clo = _mm_unpacklo_epi8(c,z);
+            __m128i chi = _mm_unpackhi_epi8(c,z);
+
+            // left right pixels unpacked and added together
+            const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z),
+                                               _mm_unpacklo_epi8(r,z));
+            const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z),
+                                               _mm_unpackhi_epi8(r,z));
+
+            // top & bottom pixels unpacked and added together
+            const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z),
+                                               _mm_unpacklo_epi8(b,z));
+            const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z),
+                                               _mm_unpackhi_epi8(b,z));
+
+            // running sum of all pixels
+            msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo));
+
+            clo = _mm_slli_epi16(clo, 1);
+            chi = _mm_slli_epi16(chi, 1);
+            const __m128i sevtlo = _mm_subs_epi16(clo, tblo);
+            const __m128i sevthi = _mm_subs_epi16(chi, tbhi);
+            const __m128i sehtlo = _mm_subs_epi16(clo, lrlo);
+            const __m128i sehthi = _mm_subs_epi16(chi, lrhi);
+
+            clo = _mm_slli_epi16(clo, 1);
+            chi = _mm_slli_epi16(chi, 1);
+            const __m128i setlo = _mm_subs_epi16(clo,
+                                                 _mm_add_epi16(lrlo, tblo));
+            const __m128i sethi = _mm_subs_epi16(chi,
+                                                 _mm_add_epi16(lrhi, tbhi));
+
+            // Add absolute values (max(x, 0 - x)) to 16 bit running sum
+            se_16  = _mm_add_epi16(se_16,
+                                   _mm_max_epi16(setlo,
+                                                 _mm_subs_epi16(z, setlo)));
+            se_16  = _mm_add_epi16(se_16,
+                                   _mm_max_epi16(sethi,
+                                                 _mm_subs_epi16(z, sethi)));
+            sev_16 = _mm_add_epi16(sev_16,
+                                   _mm_max_epi16(sevtlo,
+                                                 _mm_subs_epi16(z, sevtlo)));
+            sev_16 = _mm_add_epi16(sev_16,
+                                   _mm_max_epi16(sevthi,
+                                                 _mm_subs_epi16(z, sevthi)));
+            seh_16 = _mm_add_epi16(seh_16,
+                                   _mm_max_epi16(sehtlo,
+                                                 _mm_subs_epi16(z, sehtlo)));
+            seh_16 = _mm_add_epi16(seh_16,
+                                   _mm_max_epi16(sehthi,
+                                                 _mm_subs_epi16(z, sehthi)));
+        }
+
+        // Add to 32 bit running sum as to not roll over.
+        se_32  = _mm_add_epi32(se_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(se_16,z),
+                                             _mm_unpacklo_epi16(se_16,z)));
+        sev_32 = _mm_add_epi32(sev_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(sev_16,z),
+                                             _mm_unpacklo_epi16(sev_16,z)));
+        seh_32 = _mm_add_epi32(seh_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(seh_16,z),
+                                             _mm_unpacklo_epi16(seh_16,z)));
+        msa_32 = _mm_add_epi32(msa_32,
+                               _mm_add_epi32(_mm_unpackhi_epi16(msa_16,z),
+                                             _mm_unpacklo_epi16(msa_16,z)));
+
+        imgBuf += _width * _skipNum;
+    }
+
+    WebRtc_Word64 se_64[2];
+    WebRtc_Word64 sev_64[2];
+    WebRtc_Word64 seh_64[2];
+    WebRtc_Word64 msa_64[2];
+
+    // Bring the sums out of the vector registers.  The stack arrays are not
+    // guaranteed to be 16-byte aligned, so unaligned stores are required.
+    _mm_storeu_si128 ((__m128i*)se_64,
+                      _mm_add_epi64(_mm_unpackhi_epi32(se_32,z),
+                                    _mm_unpacklo_epi32(se_32,z)));
+    _mm_storeu_si128 ((__m128i*)sev_64,
+                      _mm_add_epi64(_mm_unpackhi_epi32(sev_32,z),
+                                    _mm_unpacklo_epi32(sev_32,z)));
+    _mm_storeu_si128 ((__m128i*)seh_64,
+                      _mm_add_epi64(_mm_unpackhi_epi32(seh_32,z),
+                                    _mm_unpacklo_epi32(seh_32,z)));
+    _mm_storeu_si128 ((__m128i*)msa_64,
+                      _mm_add_epi64(_mm_unpackhi_epi32(msa_32,z),
+                                    _mm_unpacklo_epi32(msa_32,z)));
+
+    const WebRtc_UWord32 spatialErrSum  = se_64[0] + se_64[1];
+    const WebRtc_UWord32 spatialErrVSum = sev_64[0] + sev_64[1];
+    const WebRtc_UWord32 spatialErrHSum = seh_64[0] + seh_64[1];
+    const WebRtc_UWord32 pixelMSA = msa_64[0] + msa_64[1];
+
+    // normalize over all pixels
+    const float spatialErr  = (float)(spatialErrSum >> 2);
+    const float spatialErrH = (float)(spatialErrHSum >> 1);
+    const float spatialErrV = (float)(spatialErrVSum >> 1);
+    const float norm = (float)pixelMSA;
+
+    // 2X2:
+    _spatialPredErr = spatialErr / norm;
+
+    // 1X2:
+    _spatialPredErrH = spatialErrH / norm;
+
+    // 2X1:
+    _spatialPredErrV = spatialErrV / norm;
+
+    return VPM_OK;
+}
+
+}  // namespace webrtc
diff --git a/src/modules/video_processing/main/source/frame_preprocessor.cc b/src/modules/video_processing/main/source/frame_preprocessor.cc
index 57bc84daf9..14ced41476 100644
--- a/src/modules/video_processing/main/source/frame_preprocessor.cc
+++ b/src/modules/video_processing/main/source/frame_preprocessor.cc
@@ -22,7 +22,7 @@ _resampledFrame(),
 _enableCA(false)
 {
     _spatialResampler = new VPMSimpleSpatialResampler();
-    _ca = new VPMContentAnalysis();
+    _ca = new VPMContentAnalysis(true);
     _vd = new VPMVideoDecimator();
 }
 
diff --git a/src/modules/video_processing/main/source/video_processing.gypi b/src/modules/video_processing/main/source/video_processing.gypi
index 8ca831d96e..dda0a780a4 100644
--- a/src/modules/video_processing/main/source/video_processing.gypi
+++ b/src/modules/video_processing/main/source/video_processing.gypi
@@ -14,7 +14,7 @@
       'dependencies': [
         'webrtc_utility',
         '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing',
-         '<(webrtc_root)/common_video/common_video.gyp:webrtc_libyuv',
+        '<(webrtc_root)/common_video/common_video.gyp:webrtc_libyuv',
         '<(webrtc_root)/system_wrappers/source/system_wrappers.gyp:system_wrappers',
       ],
       'include_dirs': [
@@ -26,41 +26,57 @@
         ],
       },
       'sources': [
-        # interfaces
         '../interface/video_processing.h',
         '../interface/video_processing_defines.h',
-
-        # headers
-        'video_processing_impl.h',
+        'brighten.cc',
+        'brighten.h',
+        'brightness_detection.cc',
         'brightness_detection.h',
-	'brighten.h',
+        'color_enhancement.cc',
         'color_enhancement.h',
         'color_enhancement_private.h',
-        'content_analysis.h',
-        'deflickering.h',
-        'denoising.h',
-        'frame_preprocessor.h',
-        'spatial_resampler.h',
-        'video_decimator.h',
-
-        # sources
-        'video_processing_impl.cc',
-        'brightness_detection.cc',
-	'brighten.cc',
-        'color_enhancement.cc',
         'content_analysis.cc',
+        'content_analysis.h',
         'deflickering.cc',
+        'deflickering.h',
         'denoising.cc',
+        'denoising.h',
         'frame_preprocessor.cc',
+        'frame_preprocessor.h',
         'spatial_resampler.cc',
+        'spatial_resampler.h',
         'video_decimator.cc',
-      ], # source
+        'video_decimator.h',
+        'video_processing_impl.cc',
+        'video_processing_impl.h',
+      ],
+      'conditions': [
+        ['target_arch=="ia32" or target_arch=="x64"', {
+          'dependencies': [ 'video_processing_sse2', ],
+        }],
+      ],
+    },
+    {  # Target that builds the SSE2 source file listed under 'sources'.
+      'target_name': 'video_processing_sse2',  # depended on for ia32/x64 only
+      'type': '<(library)',  # expands the 'library' variable (defined elsewhere)
+      'sources': [
+        'content_analysis_sse2.cc',
+      ],
+      'include_dirs': [
+        '../interface',
+        '../../../interface',
+      ],
+      'conditions': [  # non-POSIX OSes get no extra flag from this list
+        ['os_posix==1 and OS!="mac"', {
+          'cflags': [ '-msse2', ],  # non-Mac POSIX: pass -msse2 via cflags
+        }],
+        ['OS=="mac"', {
+          'xcode_settings': {
+            'OTHER_CFLAGS': [ '-msse2', ],  # Mac: pass -msse2 via Xcode
+          },
+        }],
+      ],
     },
   ],
 }
 
-# Local Variables:
-# tab-width:2
-# indent-tabs-mode:nil
-# End:
-# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc b/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc
index 20e803c782..b25c45fc6d 100644
--- a/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc
+++ b/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc
@@ -17,7 +17,7 @@ namespace webrtc {
 TEST_F(VideoProcessingModuleTest, ContentAnalysis)
 {
     VPMContentAnalysis    _ca_c(false);
-    VPMContentAnalysis    _ca_sse;
+    VPMContentAnalysis    _ca_sse(true);
     VideoContentMetrics  *_cM_c, *_cM_SSE;
 
     _ca_c.Initialize(_width,_height);