From 0e07d82f479cc1bf77c816e2bcd7e787491e3182 Mon Sep 17 00:00:00 2001 From: "cduvivier@google.com" Date: Mon, 25 Jul 2011 23:54:20 +0000 Subject: [PATCH] Optimization of 'cft1st': * scalar optimization, vectorization. * 1.7% AEC overall speedup for the straight C path. * 12.2% AEC overall speedup for the SSE2 path. Review URL: http://webrtc-codereview.appspot.com/88006 git-svn-id: http://webrtc.googlecode.com/svn/trunk@253 4adac7df-926f-26a2-2b94-8c16560cd09d --- .../aec/main/source/aec_rdft.c | 84 ++++++++++++++++--- .../aec/main/source/aec_rdft.h | 19 +++++ .../aec/main/source/aec_rdft_sse2.c | 66 +++++++++++++-- 3 files changed, 150 insertions(+), 19 deletions(-) diff --git a/src/modules/audio_processing/aec/main/source/aec_rdft.c b/src/modules/audio_processing/aec/main/source/aec_rdft.c index 2e5d2162cf..a090506b68 100644 --- a/src/modules/audio_processing/aec/main/source/aec_rdft.c +++ b/src/modules/audio_processing/aec/main/source/aec_rdft.c @@ -24,7 +24,19 @@ #include "aec_rdft.h" #include "system_wrappers/interface/cpu_features_wrapper.h" +// constants shared by all paths (C, SSE2). float rdft_w[64]; +// constants used by the C path. +float rdft_wk3ri_first[32]; +float rdft_wk3ri_second[32]; +// constants used by SSE2 but initialized in C path. +ALIGN16_BEG float ALIGN16_END rdft_wk1r[32]; +ALIGN16_BEG float ALIGN16_END rdft_wk2r[32]; +ALIGN16_BEG float ALIGN16_END rdft_wk3r[32]; +ALIGN16_BEG float ALIGN16_END rdft_wk1i[32]; +ALIGN16_BEG float ALIGN16_END rdft_wk2i[32]; +ALIGN16_BEG float ALIGN16_END rdft_wk3i[32]; + static int ip[16]; static void bitrv2_32or128(int n, int *ip, float *a) { @@ -101,7 +113,7 @@ static void bitrv2_32or128(int n, int *ip, float *a) { } } -static void makewt_32() { +static void makewt_32(void) { const int nw = 32; int j, nwh; float delta, x, y; @@ -123,9 +135,55 @@ static void makewt_32() { rdft_w[nw - j + 1] = x; } bitrv2_32or128(nw, ip + 2, rdft_w); + + // pre-calculate constants used by cft1st_128 ... + { + int k1; + + for (k1 = 0, j = 0; j < 128; j += 16, k1 += 2) { + const int k2 = 2 * k1; + const float wk2r = rdft_w[k1 + 0]; + const float wk2i = rdft_w[k1 + 1]; + float wk1r, wk1i; + // ... scalar version. + wk1r = rdft_w[k2 + 0]; + wk1i = rdft_w[k2 + 1]; + rdft_wk3ri_first[k1 + 0] = wk1r - 2 * wk2i * wk1i; + rdft_wk3ri_first[k1 + 1] = 2 * wk2i * wk1r - wk1i; + wk1r = rdft_w[k2 + 2]; + wk1i = rdft_w[k2 + 3]; + rdft_wk3ri_second[k1 + 0] = wk1r - 2 * wk2r * wk1i; + rdft_wk3ri_second[k1 + 1] = 2 * wk2r * wk1r - wk1i; + // ... vector version. + rdft_wk1r[k2 + 0] = rdft_w[k2 + 0]; + rdft_wk1r[k2 + 1] = rdft_w[k2 + 0]; + rdft_wk1r[k2 + 2] = rdft_w[k2 + 2]; + rdft_wk1r[k2 + 3] = rdft_w[k2 + 2]; + rdft_wk2r[k2 + 0] = rdft_w[k1 + 0]; + rdft_wk2r[k2 + 1] = rdft_w[k1 + 0]; + rdft_wk2r[k2 + 2] = -rdft_w[k1 + 1]; + rdft_wk2r[k2 + 3] = -rdft_w[k1 + 1]; + rdft_wk3r[k2 + 0] = rdft_wk3ri_first[k1 + 0]; + rdft_wk3r[k2 + 1] = rdft_wk3ri_first[k1 + 0]; + rdft_wk3r[k2 + 2] = rdft_wk3ri_second[k1 + 0]; + rdft_wk3r[k2 + 3] = rdft_wk3ri_second[k1 + 0]; + rdft_wk1i[k2 + 0] = -rdft_w[k2 + 1]; + rdft_wk1i[k2 + 1] = rdft_w[k2 + 1]; + rdft_wk1i[k2 + 2] = -rdft_w[k2 + 3]; + rdft_wk1i[k2 + 3] = rdft_w[k2 + 3]; + rdft_wk2i[k2 + 0] = -rdft_w[k1 + 1]; + rdft_wk2i[k2 + 1] = rdft_w[k1 + 1]; + rdft_wk2i[k2 + 2] = -rdft_w[k1 + 0]; + rdft_wk2i[k2 + 3] = rdft_w[k1 + 0]; + rdft_wk3i[k2 + 0] = -rdft_wk3ri_first[k1 + 1]; + rdft_wk3i[k2 + 1] = rdft_wk3ri_first[k1 + 1]; + rdft_wk3i[k2 + 2] = -rdft_wk3ri_second[k1 + 1]; + rdft_wk3i[k2 + 3] = rdft_wk3ri_second[k1 + 1]; + } + } } -static void makect_32() { +static void makect_32(void) { float *c = rdft_w + 32; const int nc = 32; int j, nch; @@ -142,7 +200,7 @@ static void makect_32() { } } -static void cft1st_128(float *a) { +static void cft1st_128_C(float *a) { const int n = 128; int j, k1, k2; float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; @@ -189,21 +247,21 @@ static void cft1st_128(float *a) { for (j = 16; j < n; j += 16) { k1 += 2; k2 = 2 * k1; - wk2r = rdft_w[k1]; + wk2r = rdft_w[k1 + 0]; wk2i = rdft_w[k1 + 1]; - wk1r = rdft_w[k2]; + wk1r = rdft_w[k2 + 0]; wk1i = rdft_w[k2 + 1]; - wk3r = wk1r - 2 * wk2i * wk1i; - wk3i = 2 * wk2i * wk1r - wk1i; - x0r = a[j] + a[j + 2]; + wk3r = rdft_wk3ri_first[k1 + 0]; + wk3i = rdft_wk3ri_first[k1 + 1]; + x0r = a[j + 0] + a[j + 2]; x0i = a[j + 1] + a[j + 3]; - x1r = a[j] - a[j + 2]; + x1r = a[j + 0] - a[j + 2]; x1i = a[j + 1] - a[j + 3]; x2r = a[j + 4] + a[j + 6]; x2i = a[j + 5] + a[j + 7]; x3r = a[j + 4] - a[j + 6]; x3i = a[j + 5] - a[j + 7]; - a[j] = x0r + x2r; + a[j + 0] = x0r + x2r; a[j + 1] = x0i + x2i; x0r -= x2r; x0i -= x2i; @@ -219,8 +277,8 @@ static void cft1st_128(float *a) { a[j + 7] = wk3r * x0i + wk3i * x0r; wk1r = rdft_w[k2 + 2]; wk1i = rdft_w[k2 + 3]; - wk3r = wk1r - 2 * wk2r * wk1i; - wk3i = 2 * wk2r * wk1r - wk1i; + wk3r = rdft_wk3ri_second[k1 + 0]; + wk3i = rdft_wk3ri_second[k1 + 1]; x0r = a[j + 8] + a[j + 10]; x0i = a[j + 9] + a[j + 11]; x1r = a[j + 8] - a[j + 10]; @@ -504,10 +562,12 @@ void aec_rdft_inverse_128(float *a) { } // code path selection +rft_sub_128_t cft1st_128; rft_sub_128_t rftfsub_128; rft_sub_128_t rftbsub_128; void aec_rdft_init(void) { + cft1st_128 = cft1st_128_C; rftfsub_128 = rftfsub_128_C; rftbsub_128 = rftbsub_128_C; if (WebRtc_GetCPUInfo(kSSE2)) { diff --git a/src/modules/audio_processing/aec/main/source/aec_rdft.h b/src/modules/audio_processing/aec/main/source/aec_rdft.h index cf908822a6..a7e67bb6b2 100644 --- a/src/modules/audio_processing/aec/main/source/aec_rdft.h +++ b/src/modules/audio_processing/aec/main/source/aec_rdft.h @@ -8,13 +8,32 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifdef _MSC_VER /* visual c++ */ +# define ALIGN16_BEG __declspec(align(16)) +# define ALIGN16_END +#else /* gcc or icc */ +# define ALIGN16_BEG +# define ALIGN16_END __attribute__((aligned(16))) +#endif + // constants shared by all paths (C, SSE2). extern float rdft_w[64]; +// constants used by the C path. +extern float rdft_wk3ri_first[32]; +extern float rdft_wk3ri_second[32]; +// constants used by SSE2 but initialized in C path. +extern float rdft_wk1r[32]; +extern float rdft_wk2r[32]; +extern float rdft_wk3r[32]; +extern float rdft_wk1i[32]; +extern float rdft_wk2i[32]; +extern float rdft_wk3i[32]; // code path selection function pointers typedef void (*rft_sub_128_t)(float *a); extern rft_sub_128_t rftfsub_128; extern rft_sub_128_t rftbsub_128; +extern rft_sub_128_t cft1st_128; // entry points void aec_rdft_init(void); diff --git a/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c b/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c index 1d7c4572b2..ad094f9863 100644 --- a/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c +++ b/src/modules/audio_processing/aec/main/source/aec_rdft_sse2.c @@ -13,13 +13,64 @@ #include "aec_rdft.h" -#ifdef _MSC_VER /* visual c++ */ -# define ALIGN16_BEG __declspec(align(16)) -# define ALIGN16_END -#else /* gcc or icc */ -# define ALIGN16_BEG -# define ALIGN16_END __attribute__((aligned(16))) -#endif +static void cft1st_128_SSE2(float *a) { + static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = + {-1.f, 1.f, -1.f, 1.f}; + const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); + int j, k2; + + for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { + __m128 a00v = _mm_loadu_ps(&a[j + 0]); + __m128 a04v = _mm_loadu_ps(&a[j + 4]); + __m128 a08v = _mm_loadu_ps(&a[j + 8]); + __m128 a12v = _mm_loadu_ps(&a[j + 12]); + __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1 ,0)); + __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3 ,2)); + __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1 ,0)); + __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3 ,2)); + + const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]); + const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]); + const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]); + const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]); + const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]); + const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]); + __m128 x0v = _mm_add_ps(a01v, a23v); + const __m128 x1v = _mm_sub_ps(a01v, a23v); + const __m128 x2v = _mm_add_ps(a45v, a67v); + const __m128 x3v = _mm_sub_ps(a45v, a67v); + a01v = _mm_add_ps(x0v, x2v); + x0v = _mm_sub_ps(x0v, x2v); + __m128 x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1)); + + const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v); + const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w); + a45v = _mm_add_ps(a45_0v, a45_1v); + + const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0 ,1)); + const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w); + x0v = _mm_add_ps(x1v, x3s); + x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1)); + const __m128 a23_0v = _mm_mul_ps(wk1rv, x0v); + const __m128 a23_1v = _mm_mul_ps(wk1iv, x0w); + a23v = _mm_add_ps(a23_0v, a23_1v); + + x0v = _mm_sub_ps(x1v, x3s); + x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1)); + const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v); + const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w); + a67v = _mm_add_ps(a67_0v, a67_1v); + + a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1 ,0)); + a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1 ,0)); + a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3 ,2)); + a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3 ,2)); + _mm_storeu_ps(&a[j + 0], a00v); + _mm_storeu_ps(&a[j + 4], a04v); + _mm_storeu_ps(&a[j + 8], a08v); + _mm_storeu_ps(&a[j + 12], a12v); + } +} static void rftfsub_128_SSE2(float *a) { const float *c = rdft_w + 32; @@ -205,6 +256,7 @@ static void rftbsub_128_SSE2(float *a) { } void aec_rdft_init_sse2(void) { + cft1st_128 = cft1st_128_SSE2; rftfsub_128 = rftfsub_128_SSE2; rftbsub_128 = rftbsub_128_SSE2; }