diff --git a/android-webrtc.mk b/android-webrtc.mk index dc92aeb8e2..0dae14ceca 100644 --- a/android-webrtc.mk +++ b/android-webrtc.mk @@ -42,8 +42,3 @@ ifeq ($(ARCH_ARM_HAVE_ARMV7A),true) MY_WEBRTC_COMMON_DEFS += \ '-DWEBRTC_ARCH_ARM_V7A' endif - -else ifeq ($(TARGET_ARCH),x86) -MY_WEBRTC_COMMON_DEFS += \ - '-DWEBRTC_USE_SSE2' -endif diff --git a/src/modules/audio_processing/aec/Android.mk b/src/modules/audio_processing/aec/Android.mk index 698755acdb..7d539587ba 100644 --- a/src/modules/audio_processing/aec/Android.mk +++ b/src/modules/audio_processing/aec/Android.mk @@ -20,9 +20,12 @@ LOCAL_SRC_FILES := \ aec_resampler.c \ aec_core.c \ aec_rdft.c \ + +ifeq ($(TARGET_ARCH),x86) +LOCAL_SRC_FILES += \ aec_core_sse2.c \ aec_rdft_sse2.c - +endif # Flags passed to both C and C++ files. LOCAL_CFLAGS := \ diff --git a/src/modules/audio_processing/aec/aec.gypi b/src/modules/audio_processing/aec/aec.gypi index 7e86a900f3..4b3a08d95c 100644 --- a/src/modules/audio_processing/aec/aec.gypi +++ b/src/modules/audio_processing/aec/aec.gypi @@ -16,8 +16,8 @@ 'aec_debug_dump%': 0, }, 'dependencies': [ + 'apm_util', '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing', - 'apm_util' ], 'include_dirs': [ 'interface', @@ -32,18 +32,37 @@ 'echo_cancellation.c', 'aec_core.h', 'aec_core.c', - 'aec_core_sse2.c', 'aec_rdft.h', 'aec_rdft.c', - 'aec_rdft_sse2.c', 'aec_resampler.h', 'aec_resampler.c', ], 'conditions': [ + ['target_arch=="ia32" or target_arch=="x64"', { + 'dependencies': [ 'aec_sse2', ], + }], ['aec_debug_dump==1', { 'defines': [ 'WEBRTC_AEC_DEBUG_DUMP', ], }], ], }, + { + 'target_name': 'aec_sse2', + 'type': '<(library)', + 'sources': [ + 'aec_core_sse2.c', + 'aec_rdft_sse2.c', + ], + 'conditions': [ + ['os_posix==1 and OS!="mac"', { + 'cflags': [ '-msse2', ], + }], + ['OS=="mac"', { + 'xcode_settings': { + 'OTHER_CFLAGS': [ '-msse2', ], + }, + }], + ], + }, ], } diff --git a/src/modules/audio_processing/aec/aec_core.c 
b/src/modules/audio_processing/aec/aec_core.c index 6718dec3fb..1637e6fdbc 100644 --- a/src/modules/audio_processing/aec/aec_core.c +++ b/src/modules/audio_processing/aec/aec_core.c @@ -21,6 +21,7 @@ #include #include "aec_rdft.h" +#include "common_audio/signal_processing/include/signal_processing_library.h" #include "delay_estimator_wrapper.h" #include "ring_buffer.h" #include "system_wrappers/interface/cpu_features_wrapper.h" @@ -516,11 +517,13 @@ int WebRtcAec_InitAec(aec_t *aec, int sampFreq) WebRtcAec_ScaleErrorSignal = ScaleErrorSignal; WebRtcAec_FilterAdaptation = FilterAdaptation; WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress; + +#if defined(WEBRTC_ARCH_X86_FAMILY) if (WebRtc_GetCPUInfo(kSSE2)) { -#if defined(WEBRTC_USE_SSE2) WebRtcAec_InitAec_SSE2(); -#endif } +#endif + aec_rdft_init(); return 0; diff --git a/src/modules/audio_processing/aec/aec_core.h b/src/modules/audio_processing/aec/aec_core.h index 1b9828ab17..d326a6842d 100644 --- a/src/modules/audio_processing/aec/aec_core.h +++ b/src/modules/audio_processing/aec/aec_core.h @@ -15,9 +15,10 @@ #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_ #define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_CORE_H_ +#ifdef WEBRTC_AEC_DEBUG_DUMP #include +#endif -#include "signal_processing_library.h" #include "typedefs.h" #define FRAME_LEN 80 diff --git a/src/modules/audio_processing/aec/aec_core_sse2.c b/src/modules/audio_processing/aec/aec_core_sse2.c index 8894f28a17..74a1c48bdb 100644 --- a/src/modules/audio_processing/aec/aec_core_sse2.c +++ b/src/modules/audio_processing/aec/aec_core_sse2.c @@ -12,13 +12,12 @@ * The core AEC algorithm, SSE2 version of speed-critical functions. 
*/ -#include "typedefs.h" +#include "aec_core.h" -#if defined(WEBRTC_USE_SSE2) #include #include +#include // memset -#include "aec_core.h" #include "aec_rdft.h" __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) @@ -414,4 +413,3 @@ void WebRtcAec_InitAec_SSE2(void) { WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; } -#endif // WEBRTC_USE_SSE2 diff --git a/src/modules/audio_processing/aec/aec_rdft.c b/src/modules/audio_processing/aec/aec_rdft.c index 92223343dc..19908d8541 100644 --- a/src/modules/audio_processing/aec/aec_rdft.c +++ b/src/modules/audio_processing/aec/aec_rdft.c @@ -576,11 +576,11 @@ void aec_rdft_init(void) { cftmdl_128 = cftmdl_128_C; rftfsub_128 = rftfsub_128_C; rftbsub_128 = rftbsub_128_C; +#if defined(WEBRTC_ARCH_X86_FAMILY) if (WebRtc_GetCPUInfo(kSSE2)) { -#if defined(WEBRTC_USE_SSE2) aec_rdft_init_sse2(); -#endif } +#endif // init library constants. makewt_32(); makect_32(); diff --git a/src/modules/audio_processing/aec/aec_rdft_sse2.c b/src/modules/audio_processing/aec/aec_rdft_sse2.c index f936e2a7e2..eeb315245c 100644 --- a/src/modules/audio_processing/aec/aec_rdft_sse2.c +++ b/src/modules/audio_processing/aec/aec_rdft_sse2.c @@ -8,13 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "typedefs.h" - -#if defined(WEBRTC_USE_SSE2) -#include - #include "aec_rdft.h" +#include + static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f}; @@ -428,4 +425,3 @@ void aec_rdft_init_sse2(void) { rftbsub_128 = rftbsub_128_SSE2; } -#endif // WEBRTC_USE_SS2 diff --git a/src/modules/audio_processing/aec/echo_cancellation.c b/src/modules/audio_processing/aec/echo_cancellation.c index 66c9b979f1..021df052ba 100644 --- a/src/modules/audio_processing/aec/echo_cancellation.c +++ b/src/modules/audio_processing/aec/echo_cancellation.c @@ -22,6 +22,7 @@ #include "aec_core.h" #include "aec_resampler.h" +#include "common_audio/signal_processing/include/signal_processing_library.h" #include "ring_buffer.h" #include "typedefs.h" diff --git a/src/modules/video_processing/main/source/Android.mk b/src/modules/video_processing/main/source/Android.mk index 03d2d743fe..74d15cb443 100644 --- a/src/modules/video_processing/main/source/Android.mk +++ b/src/modules/video_processing/main/source/Android.mk @@ -18,7 +18,6 @@ LOCAL_MODULE := libwebrtc_video_processing LOCAL_MODULE_TAGS := optional LOCAL_CPP_EXTENSION := .cc LOCAL_SRC_FILES := \ - video_processing_impl.cc \ brightness_detection.cc \ color_enhancement.cc \ content_analysis.cc \ @@ -27,6 +26,12 @@ LOCAL_SRC_FILES := \ frame_preprocessor.cc \ spatial_resampler.cc \ - video_decimator.cc + video_decimator.cc \ + video_processing_impl.cc + +ifeq ($(TARGET_ARCH),x86) +LOCAL_SRC_FILES += \ + content_analysis_sse2.cc +endif # Flags passed to both C and C++ files. 
LOCAL_CFLAGS := \ diff --git a/src/modules/video_processing/main/source/content_analysis.cc b/src/modules/video_processing/main/source/content_analysis.cc index 45935ebe09..32ee09a18f 100644 --- a/src/modules/video_processing/main/source/content_analysis.cc +++ b/src/modules/video_processing/main/source/content_analysis.cc @@ -13,12 +13,10 @@ #include #include -#if defined(WEBRTC_USE_SSE2) -#include -#endif + namespace webrtc { -VPMContentAnalysis::VPMContentAnalysis(bool RTCD): +VPMContentAnalysis::VPMContentAnalysis(bool runtime_cpu_detection): _origFrame(NULL), _prevFrame(NULL), _width(0), @@ -40,16 +38,16 @@ _cMetrics(NULL) ComputeSpatialMetrics = &VPMContentAnalysis::ComputeSpatialMetrics_C; TemporalDiffMetric = &VPMContentAnalysis::TemporalDiffMetric_C; - if (RTCD) + if (runtime_cpu_detection) { - if(WebRtc_GetCPUInfo(kSSE2)) +#if defined(WEBRTC_ARCH_X86_FAMILY) + if (WebRtc_GetCPUInfo(kSSE2)) { -#if defined(WEBRTC_USE_SSE2) ComputeSpatialMetrics = &VPMContentAnalysis::ComputeSpatialMetrics_SSE2; TemporalDiffMetric = &VPMContentAnalysis::TemporalDiffMetric_SSE2; -#endif } +#endif } Release(); @@ -249,110 +247,6 @@ VPMContentAnalysis::TemporalDiffMetric_C() } -#if defined(WEBRTC_USE_SSE2) -WebRtc_Word32 -VPMContentAnalysis::TemporalDiffMetric_SSE2() -{ - WebRtc_UWord32 numPixels = 0; // counter for # of pixels - - const WebRtc_UWord8* imgBufO = _origFrame + _border*_width + _border; - const WebRtc_UWord8* imgBufP = _prevFrame + _border*_width + _border; - - const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border; - - __m128i sad_64 = _mm_setzero_si128(); - __m128i sum_64 = _mm_setzero_si128(); - __m128i sqsum_64 = _mm_setzero_si128(); - const __m128i z = _mm_setzero_si128(); - - for(WebRtc_UWord16 i = 0; i < (_height - 2*_border); i += _skipNum) - { - __m128i sqsum_32 = _mm_setzero_si128(); - - const WebRtc_UWord8 *lineO = imgBufO; - const WebRtc_UWord8 *lineP = imgBufP; - - // Work on 16 pixels at a time. 
For HD content with a width of 1920 - // this loop will run ~67 times (depending on border). Maximum for - // abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit - // results which are then accumulated. There is no chance of - // rollover for these two accumulators. - // o*o will have a maximum of 255*255 = 65025. This will roll over - // a 16 bit accumulator as 67*65025 > 65535, but will fit in a - // 32 bit accumulator. - for(WebRtc_UWord16 j = 0; j < width_end - _border; j += 16) - { - const __m128i o = _mm_loadu_si128((__m128i*)(lineO)); - const __m128i p = _mm_loadu_si128((__m128i*)(lineP)); - - lineO += 16; - lineP += 16; - - // abs pixel difference between frames - sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p)); - - // sum of all pixels in frame - sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z)); - - // squared sum of all pixels in frame - const __m128i olo = _mm_unpacklo_epi8(o,z); - const __m128i ohi = _mm_unpackhi_epi8(o,z); - - const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo); - const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi); - - sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo); - sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi); - } - - // Add to 64 bit running sum as to not roll over. 
- sqsum_64 = _mm_add_epi64(sqsum_64, - _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z), - _mm_unpacklo_epi32(sqsum_32,z))); - - imgBufO += _width * _skipNum; - imgBufP += _width * _skipNum; - numPixels += (width_end - _border); - } - - WebRtc_Word64 sad_final_64[2]; - WebRtc_Word64 sum_final_64[2]; - WebRtc_Word64 sqsum_final_64[2]; - - // bring sums out of vector registers and into integer register - // domain, summing them along the way - _mm_store_si128 ((__m128i*)sad_final_64, sad_64); - _mm_store_si128 ((__m128i*)sum_final_64, sum_64); - _mm_store_si128 ((__m128i*)sqsum_final_64, sqsum_64); - - const WebRtc_UWord32 pixelSum = sum_final_64[0] + sum_final_64[1]; - const WebRtc_UWord64 pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1]; - const WebRtc_UWord32 tempDiffSum = sad_final_64[0] + sad_final_64[1]; - - // default - _motionMagnitudeNZ = 0.0f; - - if (tempDiffSum == 0) - { - return VPM_OK; - } - - // normalize over all pixels - const float tempDiffAvg = (float)tempDiffSum / (float)(numPixels); - const float pixelSumAvg = (float)pixelSum / (float)(numPixels); - const float pixelSqSumAvg = (float)pixelSqSum / (float)(numPixels); - float contrast = pixelSqSumAvg - (pixelSumAvg * pixelSumAvg); - - if (contrast > 0.0) - { - contrast = sqrt(contrast); - _motionMagnitudeNZ = tempDiffAvg/contrast; - } - - return VPM_OK; -} -#endif - // Compute spatial metrics: // To reduce complexity, we compute the metric for a reduced set of points. 
// The spatial metrics are rough estimates of the prediction error cost for @@ -427,172 +321,6 @@ VPMContentAnalysis::ComputeSpatialMetrics_C() return VPM_OK; } -#if defined(WEBRTC_USE_SSE2) -WebRtc_Word32 -VPMContentAnalysis::ComputeSpatialMetrics_SSE2() -{ - const WebRtc_UWord8* imgBuf = _origFrame + _border*_width; - const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border; - - __m128i se_32 = _mm_setzero_si128(); - __m128i sev_32 = _mm_setzero_si128(); - __m128i seh_32 = _mm_setzero_si128(); - __m128i msa_32 = _mm_setzero_si128(); - const __m128i z = _mm_setzero_si128(); - - // Error is accumulated as a 32 bit value. Looking at HD content with a - // height of 1080 lines, or about 67 macro blocks. If the 16 bit row - // value is maxed out at 65529 for every row, 65529*1080 = 70777800, which - // will not roll over a 32 bit accumulator. - // _skipNum is also used to reduce the number of rows - for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum) - { - __m128i se_16 = _mm_setzero_si128(); - __m128i sev_16 = _mm_setzero_si128(); - __m128i seh_16 = _mm_setzero_si128(); - __m128i msa_16 = _mm_setzero_si128(); - - // Row error is accumulated as a 16 bit value. There are 8 - // accumulators. Max value of a 16 bit number is 65529. Looking - // at HD content, 1080p, has a width of 1920, 120 macro blocks. - // A mb at a time is processed at a time. Absolute max error at - // a point would be abs(0-255+255+255+255) which equals 1020. - // 120*1020 = 122400. The probability of hitting this is quite low - // on well behaved content. A specially crafted image could roll over. - // _border could also be adjusted to concentrate on just the center of - // the images for an HD capture in order to reduce the possiblity of - // rollover. 
- const WebRtc_UWord8 *lineTop = imgBuf - _width + _border; - const WebRtc_UWord8 *lineCen = imgBuf + _border; - const WebRtc_UWord8 *lineBot = imgBuf + _width + _border; - - for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16) - { - const __m128i t = _mm_loadu_si128((__m128i*)(lineTop)); - const __m128i l = _mm_loadu_si128((__m128i*)(lineCen - 1)); - const __m128i c = _mm_loadu_si128((__m128i*)(lineCen)); - const __m128i r = _mm_loadu_si128((__m128i*)(lineCen + 1)); - const __m128i b = _mm_loadu_si128((__m128i*)(lineBot)); - - lineTop += 16; - lineCen += 16; - lineBot += 16; - - // center pixel unpacked - __m128i clo = _mm_unpacklo_epi8(c,z); - __m128i chi = _mm_unpackhi_epi8(c,z); - - // left right pixels unpacked and added together - const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z), - _mm_unpacklo_epi8(r,z)); - const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z), - _mm_unpackhi_epi8(r,z)); - - // top & bottom pixels unpacked and added together - const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z), - _mm_unpacklo_epi8(b,z)); - const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z), - _mm_unpackhi_epi8(b,z)); - - // running sum of all pixels - msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo)); - - clo = _mm_slli_epi16(clo, 1); - chi = _mm_slli_epi16(chi, 1); - const __m128i sevtlo = _mm_subs_epi16(clo, tblo); - const __m128i sevthi = _mm_subs_epi16(chi, tbhi); - const __m128i sehtlo = _mm_subs_epi16(clo, lrlo); - const __m128i sehthi = _mm_subs_epi16(chi, lrhi); - - clo = _mm_slli_epi16(clo, 1); - chi = _mm_slli_epi16(chi, 1); - const __m128i setlo = _mm_subs_epi16(clo, - _mm_add_epi16(lrlo, tblo)); - const __m128i sethi = _mm_subs_epi16(chi, - _mm_add_epi16(lrhi, tbhi)); - - // Add to 16 bit running sum - se_16 = _mm_add_epi16(se_16, - _mm_max_epi16(setlo, - _mm_subs_epi16(z, setlo))); - se_16 = _mm_add_epi16(se_16, - _mm_max_epi16(sethi, - _mm_subs_epi16(z, sethi))); - sev_16 = _mm_add_epi16(sev_16, - _mm_max_epi16(sevtlo, - 
_mm_subs_epi16(z, sevtlo))); - sev_16 = _mm_add_epi16(sev_16, - _mm_max_epi16(sevthi, - _mm_subs_epi16(z, sevthi))); - seh_16 = _mm_add_epi16(seh_16, - _mm_max_epi16(sehtlo, - _mm_subs_epi16(z, sehtlo))); - seh_16 = _mm_add_epi16(seh_16, - _mm_max_epi16(sehthi, - _mm_subs_epi16(z, sehthi))); - } - - // Add to 32 bit running sum as to not roll over. - se_32 = _mm_add_epi32(se_32, - _mm_add_epi32(_mm_unpackhi_epi16(se_16,z), - _mm_unpacklo_epi16(se_16,z))); - sev_32 = _mm_add_epi32(sev_32, - _mm_add_epi32(_mm_unpackhi_epi16(sev_16,z), - _mm_unpacklo_epi16(sev_16,z))); - seh_32 = _mm_add_epi32(seh_32, - _mm_add_epi32(_mm_unpackhi_epi16(seh_16,z), - _mm_unpacklo_epi16(seh_16,z))); - msa_32 = _mm_add_epi32(msa_32, - _mm_add_epi32(_mm_unpackhi_epi16(msa_16,z), - _mm_unpacklo_epi16(msa_16,z))); - - imgBuf += _width * _skipNum; - } - - WebRtc_Word64 se_64[2]; - WebRtc_Word64 sev_64[2]; - WebRtc_Word64 seh_64[2]; - WebRtc_Word64 msa_64[2]; - - // bring sums out of vector registers and into integer register - // domain, summing them along the way - _mm_store_si128 ((__m128i*)se_64, - _mm_add_epi64(_mm_unpackhi_epi32(se_32,z), - _mm_unpacklo_epi32(se_32,z))); - _mm_store_si128 ((__m128i*)sev_64, - _mm_add_epi64(_mm_unpackhi_epi32(sev_32,z), - _mm_unpacklo_epi32(sev_32,z))); - _mm_store_si128 ((__m128i*)seh_64, - _mm_add_epi64(_mm_unpackhi_epi32(seh_32,z), - _mm_unpacklo_epi32(seh_32,z))); - _mm_store_si128 ((__m128i*)msa_64, - _mm_add_epi64(_mm_unpackhi_epi32(msa_32,z), - _mm_unpacklo_epi32(msa_32,z))); - - const WebRtc_UWord32 spatialErrSum = se_64[0] + se_64[1]; - const WebRtc_UWord32 spatialErrVSum = sev_64[0] + sev_64[1]; - const WebRtc_UWord32 spatialErrHSum = seh_64[0] + seh_64[1]; - const WebRtc_UWord32 pixelMSA = msa_64[0] + msa_64[1]; - - // normalize over all pixels - const float spatialErr = (float)(spatialErrSum >> 2); - const float spatialErrH = (float)(spatialErrHSum >> 1); - const float spatialErrV = (float)(spatialErrVSum >> 1); - const float norm = 
(float)pixelMSA; - - // 2X2: - _spatialPredErr = spatialErr / norm; - - // 1X2: - _spatialPredErrH = spatialErrH / norm; - - // 2X1: - _spatialPredErrV = spatialErrV / norm; - - return VPM_OK; -} -#endif // #if defined(WEBRTC_USE_SSE2) - VideoContentMetrics* VPMContentAnalysis::ContentMetrics() { diff --git a/src/modules/video_processing/main/source/content_analysis.h b/src/modules/video_processing/main/source/content_analysis.h index e0810d364a..5051650291 100644 --- a/src/modules/video_processing/main/source/content_analysis.h +++ b/src/modules/video_processing/main/source/content_analysis.h @@ -8,10 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -/* - * content_analysis.h - */ - #ifndef VPM_CONTENT_ANALYSIS_H #define VPM_CONTENT_ANALYSIS_H @@ -24,7 +20,9 @@ namespace webrtc { class VPMContentAnalysis { public: - VPMContentAnalysis(bool RTCD = true); + // When |runtime_cpu_detection| is true, runtime selection of an optimized + // code path is allowed. + VPMContentAnalysis(bool runtime_cpu_detection); ~VPMContentAnalysis(); // Initialize ContentAnalysis - should be called prior to @@ -62,7 +60,7 @@ private: ComputeSpatialMetricsFunc ComputeSpatialMetrics; WebRtc_Word32 ComputeSpatialMetrics_C(); -#if defined(WEBRTC_USE_SSE2) +#if defined(WEBRTC_ARCH_X86_FAMILY) WebRtc_Word32 ComputeSpatialMetrics_SSE2(); WebRtc_Word32 TemporalDiffMetric_SSE2(); #endif diff --git a/src/modules/video_processing/main/source/content_analysis_sse2.cc b/src/modules/video_processing/main/source/content_analysis_sse2.cc new file mode 100644 index 0000000000..347fa5b607 --- /dev/null +++ b/src/modules/video_processing/main/source/content_analysis_sse2.cc @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "content_analysis.h" + +#include <emmintrin.h> +#include <math.h> + +namespace webrtc { + +WebRtc_Word32 +VPMContentAnalysis::TemporalDiffMetric_SSE2() +{ + WebRtc_UWord32 numPixels = 0; // counter for # of pixels + + const WebRtc_UWord8* imgBufO = _origFrame + _border*_width + _border; + const WebRtc_UWord8* imgBufP = _prevFrame + _border*_width + _border; + + const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border; + + __m128i sad_64 = _mm_setzero_si128(); + __m128i sum_64 = _mm_setzero_si128(); + __m128i sqsum_64 = _mm_setzero_si128(); + const __m128i z = _mm_setzero_si128(); + + for(WebRtc_UWord16 i = 0; i < (_height - 2*_border); i += _skipNum) + { + __m128i sqsum_32 = _mm_setzero_si128(); + + const WebRtc_UWord8 *lineO = imgBufO; + const WebRtc_UWord8 *lineP = imgBufP; + + // Work on 16 pixels at a time. For HD content with a width of 1920 + // this loop will run ~120 times (depending on border). Maximum for + // abs(o-p) and sum(o) will be 255. _mm_sad_epu8 produces 2 64 bit + // results which are then accumulated. There is no chance of + // rollover for these two accumulators. + // o*o will have a maximum of 255*255 = 65025. This will roll over + // a 16 bit accumulator as 120*65025 > 65535, but will fit in a + // 32 bit accumulator. 
+ for(WebRtc_UWord16 j = 0; j < width_end - _border; j += 16) + { + const __m128i o = _mm_loadu_si128((__m128i*)(lineO)); + const __m128i p = _mm_loadu_si128((__m128i*)(lineP)); + + lineO += 16; + lineP += 16; + + // abs pixel difference between frames + sad_64 = _mm_add_epi64 (sad_64, _mm_sad_epu8(o, p)); + + // sum of all pixels in frame + sum_64 = _mm_add_epi64 (sum_64, _mm_sad_epu8(o, z)); + + // squared sum of all pixels in frame + const __m128i olo = _mm_unpacklo_epi8(o,z); + const __m128i ohi = _mm_unpackhi_epi8(o,z); + + const __m128i sqsum_32_lo = _mm_madd_epi16(olo, olo); + const __m128i sqsum_32_hi = _mm_madd_epi16(ohi, ohi); + + sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_lo); + sqsum_32 = _mm_add_epi32(sqsum_32, sqsum_32_hi); + } + + // Add to 64 bit running sum as to not roll over. + sqsum_64 = _mm_add_epi64(sqsum_64, + _mm_add_epi64(_mm_unpackhi_epi32(sqsum_32,z), + _mm_unpacklo_epi32(sqsum_32,z))); + + imgBufO += _width * _skipNum; + imgBufP += _width * _skipNum; + numPixels += (width_end - _border); + } + + WebRtc_Word64 sad_final_64[2]; + WebRtc_Word64 sum_final_64[2]; + WebRtc_Word64 sqsum_final_64[2]; + + // bring sums out of vector registers into the integer domain; unaligned + // stores are used as the 64 bit arrays may not be 16-byte aligned + _mm_storeu_si128 ((__m128i*)sad_final_64, sad_64); + _mm_storeu_si128 ((__m128i*)sum_final_64, sum_64); + _mm_storeu_si128 ((__m128i*)sqsum_final_64, sqsum_64); + + const WebRtc_UWord32 pixelSum = sum_final_64[0] + sum_final_64[1]; + const WebRtc_UWord64 pixelSqSum = sqsum_final_64[0] + sqsum_final_64[1]; + const WebRtc_UWord32 tempDiffSum = sad_final_64[0] + sad_final_64[1]; + + // default + _motionMagnitudeNZ = 0.0f; + + if (tempDiffSum == 0) + { + return VPM_OK; + } + + // normalize over all pixels + const float tempDiffAvg = (float)tempDiffSum / (float)(numPixels); + const float pixelSumAvg = (float)pixelSum / (float)(numPixels); + const float pixelSqSumAvg = (float)pixelSqSum / (float)(numPixels); + float contrast = pixelSqSumAvg - 
(pixelSumAvg * pixelSumAvg); + + if (contrast > 0.0) + { + contrast = sqrt(contrast); + _motionMagnitudeNZ = tempDiffAvg/contrast; + } + + return VPM_OK; +} + +WebRtc_Word32 +VPMContentAnalysis::ComputeSpatialMetrics_SSE2() +{ + const WebRtc_UWord8* imgBuf = _origFrame + _border*_width; + const WebRtc_Word32 width_end = ((_width - 2*_border) & -16) + _border; + + __m128i se_32 = _mm_setzero_si128(); + __m128i sev_32 = _mm_setzero_si128(); + __m128i seh_32 = _mm_setzero_si128(); + __m128i msa_32 = _mm_setzero_si128(); + const __m128i z = _mm_setzero_si128(); + + // Error is accumulated as a 32 bit value. Looking at HD content with a + // height of 1080 lines, or about 67 macro blocks. If the 16 bit row + // value is maxed out at 65535 for every row, 65535*1080 = 70777800, which + // will not roll over a 32 bit accumulator. + // _skipNum is also used to reduce the number of rows + for(WebRtc_Word32 i = 0; i < (_height - 2*_border); i += _skipNum) + { + __m128i se_16 = _mm_setzero_si128(); + __m128i sev_16 = _mm_setzero_si128(); + __m128i seh_16 = _mm_setzero_si128(); + __m128i msa_16 = _mm_setzero_si128(); + + // Row error is accumulated as a 16 bit value. There are 8 + // accumulators. Max value of a 16 bit number is 65535. Looking + // at HD content, 1080p, has a width of 1920, 120 macro blocks. + // One mb is processed at a time. Absolute max error at + // a point would be abs(0-255+255+255+255) which equals 1020. + // 120*1020 = 122400. The probability of hitting this is quite low + // on well behaved content. A specially crafted image could roll over. + // _border could also be adjusted to concentrate on just the center of + // the images for an HD capture in order to reduce the possibility of + // rollover. 
+ const WebRtc_UWord8 *lineTop = imgBuf - _width + _border; + const WebRtc_UWord8 *lineCen = imgBuf + _border; + const WebRtc_UWord8 *lineBot = imgBuf + _width + _border; + + for(WebRtc_Word32 j = 0; j < width_end - _border; j += 16) + { + const __m128i t = _mm_loadu_si128((__m128i*)(lineTop)); + const __m128i l = _mm_loadu_si128((__m128i*)(lineCen - 1)); + const __m128i c = _mm_loadu_si128((__m128i*)(lineCen)); + const __m128i r = _mm_loadu_si128((__m128i*)(lineCen + 1)); + const __m128i b = _mm_loadu_si128((__m128i*)(lineBot)); + + lineTop += 16; + lineCen += 16; + lineBot += 16; + + // center pixel unpacked + __m128i clo = _mm_unpacklo_epi8(c,z); + __m128i chi = _mm_unpackhi_epi8(c,z); + + // left right pixels unpacked and added together + const __m128i lrlo = _mm_add_epi16(_mm_unpacklo_epi8(l,z), + _mm_unpacklo_epi8(r,z)); + const __m128i lrhi = _mm_add_epi16(_mm_unpackhi_epi8(l,z), + _mm_unpackhi_epi8(r,z)); + + // top & bottom pixels unpacked and added together + const __m128i tblo = _mm_add_epi16(_mm_unpacklo_epi8(t,z), + _mm_unpacklo_epi8(b,z)); + const __m128i tbhi = _mm_add_epi16(_mm_unpackhi_epi8(t,z), + _mm_unpackhi_epi8(b,z)); + + // running sum of all pixels + msa_16 = _mm_add_epi16(msa_16, _mm_add_epi16(chi, clo)); + + clo = _mm_slli_epi16(clo, 1); + chi = _mm_slli_epi16(chi, 1); + const __m128i sevtlo = _mm_subs_epi16(clo, tblo); + const __m128i sevthi = _mm_subs_epi16(chi, tbhi); + const __m128i sehtlo = _mm_subs_epi16(clo, lrlo); + const __m128i sehthi = _mm_subs_epi16(chi, lrhi); + + clo = _mm_slli_epi16(clo, 1); + chi = _mm_slli_epi16(chi, 1); + const __m128i setlo = _mm_subs_epi16(clo, + _mm_add_epi16(lrlo, tblo)); + const __m128i sethi = _mm_subs_epi16(chi, + _mm_add_epi16(lrhi, tbhi)); + + // Add to 16 bit running sum + se_16 = _mm_add_epi16(se_16, + _mm_max_epi16(setlo, + _mm_subs_epi16(z, setlo))); + se_16 = _mm_add_epi16(se_16, + _mm_max_epi16(sethi, + _mm_subs_epi16(z, sethi))); + sev_16 = _mm_add_epi16(sev_16, + _mm_max_epi16(sevtlo, + 
_mm_subs_epi16(z, sevtlo))); + sev_16 = _mm_add_epi16(sev_16, + _mm_max_epi16(sevthi, + _mm_subs_epi16(z, sevthi))); + seh_16 = _mm_add_epi16(seh_16, + _mm_max_epi16(sehtlo, + _mm_subs_epi16(z, sehtlo))); + seh_16 = _mm_add_epi16(seh_16, + _mm_max_epi16(sehthi, + _mm_subs_epi16(z, sehthi))); + } + + // Add to 32 bit running sum as to not roll over. + se_32 = _mm_add_epi32(se_32, + _mm_add_epi32(_mm_unpackhi_epi16(se_16,z), + _mm_unpacklo_epi16(se_16,z))); + sev_32 = _mm_add_epi32(sev_32, + _mm_add_epi32(_mm_unpackhi_epi16(sev_16,z), + _mm_unpacklo_epi16(sev_16,z))); + seh_32 = _mm_add_epi32(seh_32, + _mm_add_epi32(_mm_unpackhi_epi16(seh_16,z), + _mm_unpacklo_epi16(seh_16,z))); + msa_32 = _mm_add_epi32(msa_32, + _mm_add_epi32(_mm_unpackhi_epi16(msa_16,z), + _mm_unpacklo_epi16(msa_16,z))); + + imgBuf += _width * _skipNum; + } + + WebRtc_Word64 se_64[2]; + WebRtc_Word64 sev_64[2]; + WebRtc_Word64 seh_64[2]; + WebRtc_Word64 msa_64[2]; + + // widen to 64 bit and bring sums out of vector registers; unaligned + // stores are used as the 64 bit arrays may not be 16-byte aligned + _mm_storeu_si128 ((__m128i*)se_64, + _mm_add_epi64(_mm_unpackhi_epi32(se_32,z), + _mm_unpacklo_epi32(se_32,z))); + _mm_storeu_si128 ((__m128i*)sev_64, + _mm_add_epi64(_mm_unpackhi_epi32(sev_32,z), + _mm_unpacklo_epi32(sev_32,z))); + _mm_storeu_si128 ((__m128i*)seh_64, + _mm_add_epi64(_mm_unpackhi_epi32(seh_32,z), + _mm_unpacklo_epi32(seh_32,z))); + _mm_storeu_si128 ((__m128i*)msa_64, + _mm_add_epi64(_mm_unpackhi_epi32(msa_32,z), + _mm_unpacklo_epi32(msa_32,z))); + + const WebRtc_UWord32 spatialErrSum = se_64[0] + se_64[1]; + const WebRtc_UWord32 spatialErrVSum = sev_64[0] + sev_64[1]; + const WebRtc_UWord32 spatialErrHSum = seh_64[0] + seh_64[1]; + const WebRtc_UWord32 pixelMSA = msa_64[0] + msa_64[1]; + + // normalize over all pixels + const float spatialErr = (float)(spatialErrSum >> 2); + const float spatialErrH = (float)(spatialErrHSum >> 1); + const float spatialErrV = (float)(spatialErrVSum >> 1); + const float norm = 
(float)pixelMSA; + + // 2X2: + _spatialPredErr = spatialErr / norm; + + // 1X2: + _spatialPredErrH = spatialErrH / norm; + + // 2X1: + _spatialPredErrV = spatialErrV / norm; + + return VPM_OK; +} + +} // namespace webrtc diff --git a/src/modules/video_processing/main/source/frame_preprocessor.cc b/src/modules/video_processing/main/source/frame_preprocessor.cc index 57bc84daf9..14ced41476 100644 --- a/src/modules/video_processing/main/source/frame_preprocessor.cc +++ b/src/modules/video_processing/main/source/frame_preprocessor.cc @@ -22,7 +22,7 @@ _resampledFrame(), _enableCA(false) { _spatialResampler = new VPMSimpleSpatialResampler(); - _ca = new VPMContentAnalysis(); + _ca = new VPMContentAnalysis(true); _vd = new VPMVideoDecimator(); } diff --git a/src/modules/video_processing/main/source/video_processing.gypi b/src/modules/video_processing/main/source/video_processing.gypi index 8ca831d96e..dda0a780a4 100644 --- a/src/modules/video_processing/main/source/video_processing.gypi +++ b/src/modules/video_processing/main/source/video_processing.gypi @@ -14,7 +14,7 @@ 'dependencies': [ 'webrtc_utility', '<(webrtc_root)/common_audio/common_audio.gyp:signal_processing', - '<(webrtc_root)/common_video/common_video.gyp:webrtc_libyuv', + '<(webrtc_root)/common_video/common_video.gyp:webrtc_libyuv', '<(webrtc_root)/system_wrappers/source/system_wrappers.gyp:system_wrappers', ], 'include_dirs': [ @@ -26,41 +26,57 @@ ], }, 'sources': [ - # interfaces '../interface/video_processing.h', '../interface/video_processing_defines.h', - - # headers - 'video_processing_impl.h', + 'brighten.cc', + 'brighten.h', + 'brightness_detection.cc', 'brightness_detection.h', - 'brighten.h', + 'color_enhancement.cc', 'color_enhancement.h', 'color_enhancement_private.h', - 'content_analysis.h', - 'deflickering.h', - 'denoising.h', - 'frame_preprocessor.h', - 'spatial_resampler.h', - 'video_decimator.h', - - # sources - 'video_processing_impl.cc', - 'brightness_detection.cc', - 'brighten.cc', - 
'color_enhancement.cc', 'content_analysis.cc', + 'content_analysis.h', 'deflickering.cc', + 'deflickering.h', 'denoising.cc', + 'denoising.h', 'frame_preprocessor.cc', + 'frame_preprocessor.h', 'spatial_resampler.cc', + 'spatial_resampler.h', 'video_decimator.cc', - ], # source + 'video_decimator.h', + 'video_processing_impl.cc', + 'video_processing_impl.h', + ], + 'conditions': [ + ['target_arch=="ia32" or target_arch=="x64"', { + 'dependencies': [ 'video_processing_sse2', ], + }], + ], + }, + { + 'target_name': 'video_processing_sse2', + 'type': '<(library)', + 'sources': [ + 'content_analysis_sse2.cc', + ], + 'include_dirs': [ + '../interface', + '../../../interface', + ], + 'conditions': [ + ['os_posix==1 and OS!="mac"', { + 'cflags': [ '-msse2', ], + }], + ['OS=="mac"', { + 'xcode_settings': { + 'OTHER_CFLAGS': [ '-msse2', ], + }, + }], + ], }, ], } -# Local Variables: -# tab-width:2 -# indent-tabs-mode:nil -# End: -# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc b/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc index 20e803c782..b25c45fc6d 100644 --- a/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc +++ b/src/modules/video_processing/main/test/unit_test/content_metrics_test.cc @@ -17,7 +17,7 @@ namespace webrtc { TEST_F(VideoProcessingModuleTest, ContentAnalysis) { VPMContentAnalysis _ca_c(false); - VPMContentAnalysis _ca_sse; + VPMContentAnalysis _ca_sse(true); VideoContentMetrics *_cM_c, *_cM_SSE; _ca_c.Initialize(_width,_height);