diff --git a/src/modules/audio_processing/aec/main/source/aec_core_sse2.c b/src/modules/audio_processing/aec/main/source/aec_core_sse2.c
index 616abcead2..8894f28a17 100644
--- a/src/modules/audio_processing/aec/main/source/aec_core_sse2.c
+++ b/src/modules/audio_processing/aec/main/source/aec_core_sse2.c
@@ -235,10 +235,9 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
       {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
   static const int shift_exponent_into_top_mantissa = 8;
   const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask));
-  const __m128 n_1 = (__m128)_mm_srli_epi32((__m128i)two_n,
-      shift_exponent_into_top_mantissa);
-  const __m128 n_0 = _mm_or_ps(
-      (__m128)n_1, *((__m128 *)eight_biased_exponent));
+  const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(two_n),
+      shift_exponent_into_top_mantissa));
+  const __m128 n_0 = _mm_or_ps(n_1, *((__m128 *)eight_biased_exponent));
   const __m128 n = _mm_sub_ps(n_0, *((__m128 *)implicit_leading_one));
 
   // Compute y.
@@ -317,8 +316,8 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
   static const int float_exponent_shift = 23;
   const __m128i two_n_exponent = _mm_add_epi32(
       x_minus_half_floor, *((__m128i *)float_exponent_bias));
-  const __m128 two_n = (__m128)_mm_slli_epi32(
-      two_n_exponent, float_exponent_shift);
+  const __m128 two_n = _mm_castsi128_ps(_mm_slli_epi32(
+      two_n_exponent, float_exponent_shift));
   // Compute y.
   const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
   // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
diff --git a/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c b/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c
index 89aea87422..f936e2a7e2 100644
--- a/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c
+++ b/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c
@@ -42,27 +42,33 @@ static void cft1st_128_SSE2(float *a) {
     const __m128 x1v = _mm_sub_ps(a01v, a23v);
     const __m128 x2v = _mm_add_ps(a45v, a67v);
     const __m128 x3v = _mm_sub_ps(a45v, a67v);
+    __m128 x0w;
     a01v = _mm_add_ps(x0v, x2v);
     x0v = _mm_sub_ps(x0v, x2v);
-    __m128 x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
-
-    const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
-    const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
-    a45v = _mm_add_ps(a45_0v, a45_1v);
-
-    const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0 ,1));
-    const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
-    x0v = _mm_add_ps(x1v, x3s);
     x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
-    const __m128 a23_0v = _mm_mul_ps(wk1rv, x0v);
-    const __m128 a23_1v = _mm_mul_ps(wk1iv, x0w);
-    a23v = _mm_add_ps(a23_0v, a23_1v);
+    {
+      const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
+      const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
+      a45v = _mm_add_ps(a45_0v, a45_1v);
+    }
+    {
+      __m128 a23_0v, a23_1v;
+      const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0 ,1));
+      const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
+      x0v = _mm_add_ps(x1v, x3s);
+      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
+      a23_0v = _mm_mul_ps(wk1rv, x0v);
+      a23_1v = _mm_mul_ps(wk1iv, x0w);
+      a23v = _mm_add_ps(a23_0v, a23_1v);
 
-    x0v = _mm_sub_ps(x1v, x3s);
-    x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
-    const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
-    const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
-    a67v = _mm_add_ps(a67_0v, a67_1v);
+      x0v = _mm_sub_ps(x1v, x3s);
+      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1));
+    }
+    {
+      const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
+      const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
+      a67v = _mm_add_ps(a67_0v, a67_1v);
+    }
 
     a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1 ,0));
     a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1 ,0));
@@ -78,7 +84,7 @@
 static void cftmdl_128_SSE2(float *a) {
   const int l = 8;
   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
-  int j0, k, k1, k2;
+  int j0;
   __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
 
   for (j0 = 0; j0 < l; j0 += 2) {
@@ -86,9 +92,11 @@ static void cftmdl_128_SSE2(float *a) {
     const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
     const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
     const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
-    const __m128 a_00_32 = _mm_shuffle_ps((__m128)a_00, (__m128)a_32,
+    const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
+                                          _mm_castsi128_ps(a_32),
                                           _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128 a_08_40 = _mm_shuffle_ps((__m128)a_08, (__m128)a_40,
+    const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
+                                          _mm_castsi128_ps(a_40),
                                           _MM_SHUFFLE(1, 0, 1 ,0));
     __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
     const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
@@ -97,30 +105,24 @@ static void cftmdl_128_SSE2(float *a) {
     const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
     const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
     const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
-    const __m128 a_16_48 = _mm_shuffle_ps((__m128)a_16, (__m128)a_48,
+    const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
+                                          _mm_castsi128_ps(a_48),
                                           _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128 a_24_56 = _mm_shuffle_ps((__m128)a_24, (__m128)a_56,
+    const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
+                                          _mm_castsi128_ps(a_56),
                                           _MM_SHUFFLE(1, 0, 1 ,0));
     const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
     const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
 
     const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
-    _mm_storel_epi64((__m128i*)&a[j0 + 0], (__m128i)xx0);
-    _mm_storel_epi64((__m128i*)&a[j0 + 32],
-        _mm_shuffle_epi32((__m128i)xx0, _MM_SHUFFLE(3, 2, 3, 2)));
 
     const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
-    _mm_storel_epi64((__m128i*)&a[j0 + 16], (__m128i)xx1);
-    _mm_storel_epi64((__m128i*)&a[j0 + 48],
-        _mm_shuffle_epi32((__m128i)xx1, _MM_SHUFFLE(2, 3, 2, 3)));
-    a[j0 + 48] = -a[j0 + 48];
-    const __m128 x3i0_3r0_3i1_x3r1 = (__m128)
-        _mm_shuffle_epi32((__m128i)x3r0_3i0_3r1_x3i1, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
+        _mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
+                          _MM_SHUFFLE(2, 3, 0, 1)));
     const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
     const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
     const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
-    _mm_storel_epi64((__m128i*)&a[j0 + 8], (__m128i)x1_x3_add);
-    _mm_storel_epi64((__m128i*)&a[j0 + 24], (__m128i)x1_x3_sub);
 
     const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub,
                                       _MM_SHUFFLE(2, 2, 2 ,2));
@@ -129,79 +131,111 @@ static void cftmdl_128_SSE2(float *a) {
     const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
     const __m128 yy3 = _mm_add_ps(yy0, yy2);
     const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
-    _mm_storel_epi64((__m128i*)&a[j0 + 40], (__m128i)yy4);
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
+    _mm_storel_epi64((__m128i*)&a[j0 + 32],
+                     _mm_shuffle_epi32(_mm_castps_si128(xx0),
+                                       _MM_SHUFFLE(3, 2, 3, 2)));
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
+    _mm_storel_epi64((__m128i*)&a[j0 + 48],
+                     _mm_shuffle_epi32(_mm_castps_si128(xx1),
+                                       _MM_SHUFFLE(2, 3, 2, 3)));
+    a[j0 + 48] = -a[j0 + 48];
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
+    _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
     _mm_storel_epi64((__m128i*)&a[j0 + 56],
-        _mm_shuffle_epi32((__m128i)yy4, _MM_SHUFFLE(2, 3, 2, 3)));
+                     _mm_shuffle_epi32(_mm_castps_si128(yy4),
+                                       _MM_SHUFFLE(2, 3, 2, 3)));
   }
 
-  k1 = 0;
-  k = 64;
-  k1 += 2;
-  k2 = 2 * k1;
-  const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]);
-  const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]);
-  wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]);
-  const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]);
-  const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]);
-  const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]);
-  for (j0 = k; j0 < l + k; j0 += 2) {
-    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
-    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
-    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
-    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
-    const __m128 a_00_32 = _mm_shuffle_ps((__m128)a_00, (__m128)a_32,
-                                          _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128 a_08_40 = _mm_shuffle_ps((__m128)a_08, (__m128)a_40,
-                                          _MM_SHUFFLE(1, 0, 1 ,0));
-    __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
-    const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
+  {
+    int k = 64;
+    int k1 = 2;
+    int k2 = 2 * k1;
+    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]);
+    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]);
+    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]);
+    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]);
+    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]);
+    wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]);
+    for (j0 = k; j0 < l + k; j0 += 2) {
+      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
+      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
+      const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
+      const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
+      const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
+                                            _mm_castsi128_ps(a_32),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+      const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
+                                            _mm_castsi128_ps(a_40),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+      __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
+      const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
 
-    const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
-    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
-    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
-    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
-    const __m128 a_16_48 = _mm_shuffle_ps((__m128)a_16, (__m128)a_48,
-                                          _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128 a_24_56 = _mm_shuffle_ps((__m128)a_24, (__m128)a_56,
-                                          _MM_SHUFFLE(1, 0, 1 ,0));
-    const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
-    const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
+      const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
+      const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
+      const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
+      const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
+      const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
+                                            _mm_castsi128_ps(a_48),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+      const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
+                                            _mm_castsi128_ps(a_56),
+                                            _MM_SHUFFLE(1, 0, 1 ,0));
+      const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
+      const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
 
-    const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
-    _mm_storel_epi64((__m128i*)&a[j0 + 0], (__m128i)xx);
-    _mm_storel_epi64((__m128i*)&a[j0 + 32],
-        _mm_shuffle_epi32((__m128i)xx, _MM_SHUFFLE(3, 2, 3, 2)));
+      const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv);
+      const __m128 xx3 = _mm_mul_ps(wk2iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
+                                             _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx4 = _mm_add_ps(xx2, xx3);
 
-    const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
-    const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv);
-    const __m128 xx3 = _mm_mul_ps(wk2iv,
-        (__m128)_mm_shuffle_epi32((__m128i)xx1, _MM_SHUFFLE(2, 3, 0, 1)));
-    const __m128 xx4 = _mm_add_ps(xx2, xx3);
-    _mm_storel_epi64((__m128i*)&a[j0 + 16], (__m128i)xx4);
-    _mm_storel_epi64((__m128i*)&a[j0 + 48],
-        _mm_shuffle_epi32((__m128i)xx4, _MM_SHUFFLE(3, 2, 3, 2)));
+      const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(
+          _mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1),
+                            _MM_SHUFFLE(2, 3, 0, 1)));
+      const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
+      const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+      const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
 
-    const __m128 x3i0_3r0_3i1_x3r1 = (__m128)
-        _mm_shuffle_epi32((__m128i)x3r0_3i0_3r1_x3i1, _MM_SHUFFLE(2, 3, 0, 1));
-    const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
-    const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
-    const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+      const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
+      const __m128 xx11 = _mm_mul_ps(wk1iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
+                                             _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx12 = _mm_add_ps(xx10, xx11);
 
-    const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
-    const __m128 xx11 = _mm_mul_ps(wk1iv,
-        (__m128)_mm_shuffle_epi32((__m128i)x1_x3_add, _MM_SHUFFLE(2, 3, 0, 1)));
-    const __m128 xx12 = _mm_add_ps(xx10, xx11);
-    _mm_storel_epi64((__m128i*)&a[j0 + 8], (__m128i)xx12);
-    _mm_storel_epi64((__m128i*)&a[j0 + 40],
-        _mm_shuffle_epi32((__m128i)xx12, _MM_SHUFFLE(3, 2, 3, 2)));
+      const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
+      const __m128 xx21 = _mm_mul_ps(wk3iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
+                                             _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx22 = _mm_add_ps(xx20, xx21);
 
-    const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
-    const __m128 xx21 = _mm_mul_ps(wk3iv,
-        (__m128)_mm_shuffle_epi32((__m128i)x1_x3_sub, _MM_SHUFFLE(2, 3, 0, 1)));
-    const __m128 xx22 = _mm_add_ps(xx20, xx21);
-    _mm_storel_epi64((__m128i*)&a[j0 + 24], (__m128i)xx22);
-    _mm_storel_epi64((__m128i*)&a[j0 + 56],
-        _mm_shuffle_epi32((__m128i)xx22, _MM_SHUFFLE(3, 2, 3, 2)));
+      _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
+      _mm_storel_epi64((__m128i*)&a[j0 + 32],
+                       _mm_shuffle_epi32(_mm_castps_si128(xx),
+                                         _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
+      _mm_storel_epi64((__m128i*)&a[j0 + 48],
+                       _mm_shuffle_epi32(_mm_castps_si128(xx4),
+                                         _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
+      _mm_storel_epi64((__m128i*)&a[j0 + 40],
+                       _mm_shuffle_epi32(_mm_castps_si128(xx12),
+                                         _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
+      _mm_storel_epi64((__m128i*)&a[j0 + 56],
+                       _mm_shuffle_epi32(_mm_castps_si128(xx22),
+                                         _MM_SHUFFLE(3, 2, 3, 2)));
+    }
   }
 
 }
diff --git a/src/modules/video_processing/main/source/content_analysis.cc b/src/modules/video_processing/main/source/content_analysis.cc
index dacdbb9ec1..45935ebe09 100644
--- a/src/modules/video_processing/main/source/content_analysis.cc
+++ b/src/modules/video_processing/main/source/content_analysis.cc
@@ -14,7 +14,7 @@
 #include <math.h>
 #include <stdlib.h>
 #if defined(WEBRTC_USE_SSE2)
-#include <x86intrin.h>
+#include <emmintrin.h>
 #endif
 
 namespace webrtc {
diff --git a/src/typedefs.h b/src/typedefs.h
index 61565149ba..ca3b509c00 100644
--- a/src/typedefs.h
+++ b/src/typedefs.h
@@ -80,10 +80,7 @@
 #error Please add support for your architecture in typedefs.h
 #endif
 
-// TODO(andrew): SSE2 is disabled on Windows for the moment, because AEC
-// optimization is broken. Enable it as soon as AEC is fixed.
-//#if defined(__SSE2__) || defined(_MSC_VER)
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(_MSC_VER)
 #define WEBRTC_USE_SSE2
 #endif
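
Two notes for reviewers follow; they are commentary only, not part of the patch.

The cast rewrites exist because GCC accepts C-style casts between __m128 and
__m128i as a vector extension, while MSVC rejects them. _mm_castps_si128() and
_mm_castsi128_ps() are the portable spelling and emit no instructions: they
only reinterpret the lane bits. A minimal stand-alone sketch of the pattern
used in mm_pow_ps() (the values and file layout here are illustrative, not
taken from the patch):

    #include <emmintrin.h>  /* SSE2: __m128i, _mm_srli_epi32, cast intrinsics */
    #include <stdio.h>

    int main(void) {
      const __m128 a = _mm_set1_ps(1.0f);      /* bits 0x3F800000 per lane */
      /* Reinterpret float lanes as integer lanes (no instruction emitted),
       * shift each 32-bit lane, and reinterpret back. This compiles on both
       * GCC and MSVC, unlike (__m128i)a / (__m128)bits. */
      const __m128i bits = _mm_castps_si128(a);
      const __m128i shifted = _mm_srli_epi32(bits, 8);
      const __m128 back = _mm_castsi128_ps(shifted);
      float out[4];
      _mm_storeu_ps(out, back);
      printf("%g\n", out[0]);                  /* lane bits now 0x003F8000 */
      return 0;
    }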
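The bare { } blocks added in cft1st_128_SSE2() and cftmdl_128_SSE2() address a
second MSVC limitation: .c files are compiled as C89, which requires every
declaration in a block to precede the first statement (mixed declarations and
code are C99, which GCC allows by default). Opening a nested block partway
through a function creates a fresh declaration region; this is also why k, k1
and k2 moved from function scope into the new block, and why the assignment
wk1rv = _mm_load_ps(...) now sits below the const declarations. A hypothetical
sketch of the rewrite pattern, with illustrative names:

    void c89_pattern(float *out, float in) {
      float tmp;            /* C89: declarations come first... */
      tmp = in * 2.0f;      /* ...statements second. */
      {
        /* A nested block opens a new declaration region, so a const
         * derived from the statements above can still be introduced. */
        const float scaled = tmp + 1.0f;
        out[0] = scaled;
      }
    }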