diff --git a/webrtc/common_audio/common_audio.gyp b/webrtc/common_audio/common_audio.gyp index 7a8e12c97d..59ebfacdf0 100644 --- a/webrtc/common_audio/common_audio.gyp +++ b/webrtc/common_audio/common_audio.gyp @@ -89,6 +89,9 @@ 'vad/vad_sp.h', ], 'conditions': [ + ['target_arch=="ia32" or target_arch=="x64"', { + 'dependencies': ['common_audio_sse2',], + }], ['target_arch=="arm"', { 'sources': [ 'signal_processing/complex_bit_reverse_arm.S', @@ -122,6 +125,21 @@ }, ], # targets 'conditions': [ + ['target_arch=="ia32" or target_arch=="x64"', { + 'targets': [ + { + 'target_name': 'common_audio_sse2', + 'type': 'static_library', + 'sources': [ + 'resampler/sinc_resampler_sse.cc', + ], + 'cflags': ['-msse2',], + 'xcode_settings': { + 'OTHER_CFLAGS': ['-msse2',], + }, + }, + ], # targets + }], ['target_arch=="arm" and armv7==1', { 'targets': [ { @@ -129,6 +147,7 @@ 'type': 'static_library', 'includes': ['../build/arm_neon.gypi',], 'sources': [ + 'resampler/sinc_resampler_neon.cc', 'signal_processing/cross_correlation_neon.S', 'signal_processing/downsample_fast_neon.S', 'signal_processing/min_max_operations_neon.S', diff --git a/webrtc/common_audio/resampler/sinc_resampler.cc b/webrtc/common_audio/resampler/sinc_resampler.cc index 6795beeac1..2e2ac453a6 100644 --- a/webrtc/common_audio/resampler/sinc_resampler.cc +++ b/webrtc/common_audio/resampler/sinc_resampler.cc @@ -50,42 +50,25 @@ #include #include - -#if defined(WEBRTC_USE_SSE2) -#include -#endif - -// TODO(ajm): See note below in Convolve_NEON. -//#if defined(WEBRTC_ARCH_ARM_NEON) || defined(WEBRTC_DETECT_ARM_NEON) -//#include -//#endif +#include namespace webrtc { -namespace { +static double SincScaleFactor(double io_ratio) { + // |sinc_scale_factor| is basically the normalized cutoff frequency of the + // low-pass filter. + double sinc_scale_factor = io_ratio > 1.0 ? 1.0 / io_ratio : 1.0; -enum { - // The kernel size can be adjusted for quality (higher is better) at the - // expense of performance. 
Must be a multiple of 32. - // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. - kKernelSize = 32, + // The sinc function is an idealized brick-wall filter, but since we're + // windowing it the transition from pass to stop does not happen right away. + // So we should adjust the low pass filter cutoff slightly downward to avoid + // some aliasing at the very high-end. + // TODO(crogers): this value is empirical and to be more exact should vary + // depending on kKernelSize. + sinc_scale_factor *= 0.9; - // The number of destination frames generated per processing pass. Affects - // how often and for how much SincResampler calls back for input. Must be - // greater than kKernelSize. - kDefaultBlockSize = 512, - - // The kernel offset count is used for interpolation and is the number of - // sub-sample kernel shifts. Can be adjusted for quality (higher is better) - // at the expense of allocating more memory. - kKernelOffsetCount = 32, - kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1), - - // The size (in samples) of the internal buffer used by the resampler. - kDefaultBufferSize = kDefaultBlockSize + kKernelSize -}; - -} // namespace + return sinc_scale_factor; +} SincResampler::SincResampler(double io_sample_rate_ratio, SincResamplerCallback* read_cb, @@ -99,8 +82,18 @@ SincResampler::SincResampler(double io_sample_rate_ratio, // Create input buffers with a 16-byte alignment for SSE optimizations. kernel_storage_(static_cast( AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), + kernel_pre_sinc_storage_(static_cast( + AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), + kernel_window_storage_(static_cast( + AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), input_buffer_(static_cast( AlignedMalloc(sizeof(float) * buffer_size_, 16))), +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__) + convolve_proc_(WebRtc_GetCPUInfo(kSSE2) ? 
Convolve_SSE : Convolve_C), +#elif defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON) + convolve_proc_(WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ? + Convolve_NEON : Convolve_C), +#endif // Setup various region pointers in the buffer (see diagram above). r0_(input_buffer_.get() + kKernelSize / 2), r1_(input_buffer_.get()), @@ -123,8 +116,18 @@ SincResampler::SincResampler(double io_sample_rate_ratio, // Create input buffers with a 16-byte alignment for SSE optimizations. kernel_storage_(static_cast( AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), + kernel_pre_sinc_storage_(static_cast( + AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), + kernel_window_storage_(static_cast( + AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), input_buffer_(static_cast( AlignedMalloc(sizeof(float) * buffer_size_, 16))), +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__) + convolve_proc_(WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C), +#elif defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON) + convolve_proc_(WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ? + Convolve_NEON : Convolve_C), +#endif // Setup various region pointers in the buffer (see diagram above). r0_(input_buffer_.get() + kKernelSize / 2), r1_(input_buffer_.get()), @@ -160,6 +163,10 @@ void SincResampler::Initialize() { memset(kernel_storage_.get(), 0, sizeof(*kernel_storage_.get()) * kKernelStorageSize); + memset(kernel_pre_sinc_storage_.get(), 0, + sizeof(*kernel_pre_sinc_storage_.get()) * kKernelStorageSize); + memset(kernel_window_storage_.get(), 0, + sizeof(*kernel_window_storage_.get()) * kKernelStorageSize); memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * buffer_size_); } @@ -170,42 +177,84 @@ void SincResampler::InitializeKernel() { static const double kA1 = 0.5; static const double kA2 = 0.5 * kAlpha; - // |sinc_scale_factor| is basically the normalized cutoff frequency of the - // low-pass filter. 
- double sinc_scale_factor = - io_sample_rate_ratio_ > 1.0 ? 1.0 / io_sample_rate_ratio_ : 1.0; - - // The sinc function is an idealized brick-wall filter, but since we're - // windowing it the transition from pass to stop does not happen right away. - // So we should adjust the low pass filter cutoff slightly downward to avoid - // some aliasing at the very high-end. - // TODO(crogers): this value is empirical and to be more exact should vary - // depending on kKernelSize. - sinc_scale_factor *= 0.9; - // Generates a set of windowed sinc() kernels. // We generate a range of sub-sample offsets from 0.0 to 1.0. + const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { - double subsample_offset = - static_cast(offset_idx) / kKernelOffsetCount; + const float subsample_offset = + static_cast(offset_idx) / kKernelOffsetCount; for (int i = 0; i < kKernelSize; ++i) { - // Compute the sinc with offset. - double s = - sinc_scale_factor * M_PI * (i - kKernelSize / 2 - subsample_offset); - double sinc = (!s ? 1.0 : sin(s) / s) * sinc_scale_factor; + const int idx = i + offset_idx * kKernelSize; + const float pre_sinc = M_PI * (i - kKernelSize / 2 - subsample_offset); + kernel_pre_sinc_storage_.get()[idx] = pre_sinc; // Compute Blackman window, matching the offset of the sinc(). - double x = (i - subsample_offset) / kKernelSize; - double window = kA0 - kA1 * cos(2.0 * M_PI * x) + kA2 + const float x = (i - subsample_offset) / kKernelSize; + const float window = kA0 - kA1 * cos(2.0 * M_PI * x) + kA2 * cos(4.0 * M_PI * x); + kernel_window_storage_.get()[idx] = window; - // Window the sinc() function and store at the correct offset. - kernel_storage_.get()[i + offset_idx * kKernelSize] = sinc * window; + // Compute the sinc with offset, then window the sinc() function and store + // at the correct offset. 
+ if (pre_sinc == 0) { + kernel_storage_.get()[idx] = sinc_scale_factor * window; + } else { + kernel_storage_.get()[idx] = + window * sin(sinc_scale_factor * pre_sinc) / pre_sinc; + } } } } +void SincResampler::SetRatio(double io_sample_rate_ratio) { + if (fabs(io_sample_rate_ratio_ - io_sample_rate_ratio) < + std::numeric_limits::epsilon()) { + return; + } + + io_sample_rate_ratio_ = io_sample_rate_ratio; + + // Optimize reinitialization by reusing values which are independent of + // |sinc_scale_factor|. Provides a 3x speedup. + const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_); + for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) { + for (int i = 0; i < kKernelSize; ++i) { + const int idx = i + offset_idx * kKernelSize; + const float window = kernel_window_storage_.get()[idx]; + const float pre_sinc = kernel_pre_sinc_storage_.get()[idx]; + + if (pre_sinc == 0) { + kernel_storage_.get()[idx] = sinc_scale_factor * window; + } else { + kernel_storage_.get()[idx] = + window * sin(sinc_scale_factor * pre_sinc) / pre_sinc; + } + } + } +} + +// If we know the minimum architecture avoid function hopping for CPU detection. +#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(__SSE__) +#define CONVOLVE_FUNC Convolve_SSE +#else +// X86 CPU detection required. |convolve_proc_| will be set upon construction. +// TODO(dalecurtis): Once Chrome moves to a SSE baseline this can be removed. +#define CONVOLVE_FUNC convolve_proc_ +#endif +#elif defined(WEBRTC_ARCH_ARM_V7) +#if defined(WEBRTC_ARCH_ARM_NEON) +#define CONVOLVE_FUNC Convolve_NEON +#else +// NEON CPU detection required. |convolve_proc_| will be set upon construction. +#define CONVOLVE_FUNC convolve_proc_ +#endif +#else +// Unknown architecture. 
+#define CONVOLVE_FUNC Convolve_C +#endif + void SincResampler::Resample(float* destination, int frames) { int remaining_frames = frames; @@ -231,12 +280,17 @@ void SincResampler::Resample(float* destination, int frames) { float* k1 = kernel_storage_.get() + offset_idx * kKernelSize; float* k2 = k1 + kKernelSize; + // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be + // true so long as kKernelSize is a multiple of 16. + assert((reinterpret_cast(k1) & 0x0F) == 0u); + assert((reinterpret_cast(k2) & 0x0F) == 0u); + // Initialize input pointer based on quantized |virtual_source_idx_|. float* input_ptr = r1_ + source_idx; // Figure out how much to weight each kernel's "convolution". double kernel_interpolation_factor = virtual_offset_idx - offset_idx; - *destination++ = Convolve( + *destination++ = CONVOLVE_FUNC( input_ptr, k1, k2, kernel_interpolation_factor); // Advance the virtual index. @@ -260,6 +314,8 @@ void SincResampler::Resample(float* destination, int frames) { } } +#undef CONVOLVE_FUNC + int SincResampler::ChunkSize() { return block_size_ / io_sample_rate_ratio_; } @@ -274,30 +330,6 @@ void SincResampler::Flush() { memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * buffer_size_); } -float SincResampler::Convolve(const float* input_ptr, const float* k1, - const float* k2, - double kernel_interpolation_factor) { - // Rely on function level static initialization to keep ConvolveProc selection - // thread safe. - typedef float (*ConvolveProc)(const float* src, const float* k1, - const float* k2, - double kernel_interpolation_factor); -#if defined(WEBRTC_USE_SSE2) - static const ConvolveProc kConvolveProc = - WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C; -#elif defined(WEBRTC_ARCH_ARM_NEON) - static const ConvolveProc kConvolveProc = Convolve_NEON; -#elif defined(WEBRTC_DETECT_ARM_NEON) - static const ConvolveProc kConvolveProc = - WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON ? 
Convolve_NEON : - Convolve_C; -#else - static const ConvolveProc kConvolveProc = Convolve_C; -#endif - - return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); -} - float SincResampler::Convolve_C(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor) { @@ -317,80 +349,4 @@ float SincResampler::Convolve_C(const float* input_ptr, const float* k1, + kernel_interpolation_factor * sum2; } -#if defined(WEBRTC_USE_SSE2) -float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, - const float* k2, - double kernel_interpolation_factor) { - // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true - // so long as kKernelSize is a multiple of 16. - assert(0u == (reinterpret_cast(k1) & 0x0F)); - assert(0u == (reinterpret_cast(k2) & 0x0F)); - - __m128 m_input; - __m128 m_sums1 = _mm_setzero_ps(); - __m128 m_sums2 = _mm_setzero_ps(); - - // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling - // these loops hurt performance in local testing. - if (reinterpret_cast(input_ptr) & 0x0F) { - for (int i = 0; i < kKernelSize; i += 4) { - m_input = _mm_loadu_ps(input_ptr + i); - m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); - m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); - } - } else { - for (int i = 0; i < kKernelSize; i += 4) { - m_input = _mm_load_ps(input_ptr + i); - m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); - m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); - } - } - - // Linearly interpolate the two "convolutions". - m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); - m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); - m_sums1 = _mm_add_ps(m_sums1, m_sums2); - - // Sum components together. 
- float result; - m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); - _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( - m_sums2, m_sums2, 1))); - - return result; -} -#endif - -#if defined(WEBRTC_ARCH_ARM_NEON) || defined(WEBRTC_DETECT_ARM_NEON) -float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, - const float* k2, - double kernel_interpolation_factor) { - // TODO(ajm): The AndroidNDK bot is giving compile errors in this function. - // Fallback to the plain C version until it's resolved. - return Convolve_C(input_ptr, k1, k2, kernel_interpolation_factor); - //float32x4_t m_input; - //float32x4_t m_sums1 = vmovq_n_f32(0); - //float32x4_t m_sums2 = vmovq_n_f32(0); - - //const float* upper = input_ptr + kKernelSize; - //for (; input_ptr < upper; ) { - // m_input = vld1q_f32(input_ptr); - // input_ptr += 4; - // m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); - // k1 += 4; - // m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); - // k2 += 4; - //} - - // Linearly interpolate the two "convolutions". - //m_sums1 = vmlaq_f32( - // vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), - // m_sums2, vmovq_n_f32(kernel_interpolation_factor)); - - // Sum components together. - //float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); - //return vget_lane_f32(vpadd_f32(m_half, m_half), 0); -} -#endif - } // namespace webrtc diff --git a/webrtc/common_audio/resampler/sinc_resampler.h b/webrtc/common_audio/resampler/sinc_resampler.h index e207b8755a..5a1534354b 100644 --- a/webrtc/common_audio/resampler/sinc_resampler.h +++ b/webrtc/common_audio/resampler/sinc_resampler.h @@ -18,10 +18,13 @@ #include "webrtc/system_wrappers/interface/constructor_magic.h" #include "webrtc/system_wrappers/interface/scoped_ptr.h" #include "webrtc/test/testsupport/gtest_prod_util.h" +#include "webrtc/typedefs.h" namespace webrtc { -// Callback class to provide SincResampler with input. 
+// Callback class for providing more data into the resampler. Expects |frames| +// of data to be rendered into |destination|; zero padded if not enough frames +// are available to satisfy the request. class SincResamplerCallback { public: virtual ~SincResamplerCallback() {} @@ -31,6 +34,27 @@ class SincResamplerCallback { // SincResampler is a high-quality single-channel sample-rate converter. class SincResampler { public: + enum { + // The kernel size can be adjusted for quality (higher is better) at the + // expense of performance. Must be a multiple of 32. + // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. + kKernelSize = 32, + + // The number of destination frames generated per processing pass. Affects + // how often and for how much SincResampler calls back for input. Must be + // greater than kKernelSize. + kDefaultBlockSize = 512, + + // The kernel offset count is used for interpolation and is the number of + // sub-sample kernel shifts. Can be adjusted for quality (higher is better) + // at the expense of allocating more memory. + kKernelOffsetCount = 32, + kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1), + + // The size (in samples) of the internal buffer used by the resampler. + kDefaultBufferSize = kDefaultBlockSize + kKernelSize, + }; + // Constructs a SincResampler with the specified |read_cb|, which is used to // acquire audio data for resampling. |io_sample_rate_ratio| is the ratio of // input / output sample rates. If desired, the number of destination frames @@ -54,9 +78,20 @@ class SincResampler { // more to prime the buffer. int BlockSize(); - // Flush all buffered data and reset internal indices. + // Flush all buffered data and reset internal indices. Not thread safe, do + // not call while Resample() is in progress. void Flush(); + // Update |io_sample_rate_ratio_|. SetRatio() will cause a reconstruction of + // the kernels used for resampling. 
Not thread safe, do not call while + // Resample() is in progress. + // + // TODO(ajm): use this in PushSincResampler rather than reconstructing + // SincResampler. + void SetRatio(double io_sample_rate_ratio); + + float* get_kernel_for_testing() { return kernel_storage_.get(); } + private: FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, Convolve); FRIEND_TEST_ALL_PREFIXES(SincResamplerTest, ConvolveBenchmark); @@ -68,16 +103,17 @@ class SincResampler { // linearly interpolated using |kernel_interpolation_factor|. On x86, the // underlying implementation is chosen at run time based on SSE support. On // ARM, NEON support is chosen at compile time based on compilation flags. - static float Convolve(const float* input_ptr, const float* k1, - const float* k2, double kernel_interpolation_factor); static float Convolve_C(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); +#if defined(WEBRTC_ARCH_X86_FAMILY) static float Convolve_SSE(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); +#elif defined(WEBRTC_ARCH_ARM_V7) static float Convolve_NEON(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); +#endif // The ratio of input / output sample rates. double io_sample_rate_ratio_; @@ -102,10 +138,20 @@ class SincResampler { // The kernel offsets are sub-sample shifts of a windowed sinc shifted from // 0.0 to 1.0 sample. scoped_ptr_malloc kernel_storage_; + scoped_ptr_malloc kernel_pre_sinc_storage_; + scoped_ptr_malloc kernel_window_storage_; // Data from the source is copied into this buffer for each processing pass. scoped_ptr_malloc input_buffer_; + // Stores the runtime selection of which Convolve function to use. 
+#if (defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE__)) || \ + (defined(WEBRTC_ARCH_ARM_V7) && !defined(WEBRTC_ARCH_ARM_NEON)) + typedef float (*ConvolveProc)(const float*, const float*, const float*, + double); + const ConvolveProc convolve_proc_; +#endif + // Pointers to the various regions inside |input_buffer_|. See the diagram at // the top of the .cc file for more information. float* const r0_; diff --git a/webrtc/common_audio/resampler/sinc_resampler_neon.cc b/webrtc/common_audio/resampler/sinc_resampler_neon.cc new file mode 100644 index 0000000000..e909a6c5de --- /dev/null +++ b/webrtc/common_audio/resampler/sinc_resampler_neon.cc @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Modified from the Chromium original: +// src/media/base/sinc_resampler.cc + +#include "webrtc/common_audio/resampler/sinc_resampler.h" + +#include + +namespace webrtc { + +float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, + const float* k2, + double kernel_interpolation_factor) { + float32x4_t m_input; + float32x4_t m_sums1 = vmovq_n_f32(0); + float32x4_t m_sums2 = vmovq_n_f32(0); + + const float* upper = input_ptr + kKernelSize; + for (; input_ptr < upper; ) { + m_input = vld1q_f32(input_ptr); + input_ptr += 4; + m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); + k1 += 4; + m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); + k2 += 4; + } + + // Linearly interpolate the two "convolutions". 
+ m_sums1 = vmlaq_f32( + vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), + m_sums2, vmovq_n_f32(kernel_interpolation_factor)); + + // Sum components together. + float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); + return vget_lane_f32(vpadd_f32(m_half, m_half), 0); +} + +} // namespace webrtc diff --git a/webrtc/common_audio/resampler/sinc_resampler_sse.cc b/webrtc/common_audio/resampler/sinc_resampler_sse.cc new file mode 100644 index 0000000000..bdbe7b1aec --- /dev/null +++ b/webrtc/common_audio/resampler/sinc_resampler_sse.cc @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Modified from the Chromium original: +// src/media/base/simd/sinc_resampler_sse.cc + +#include "webrtc/common_audio/resampler/sinc_resampler.h" + +#include + +namespace webrtc { + +float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, + const float* k2, + double kernel_interpolation_factor) { + __m128 m_input; + __m128 m_sums1 = _mm_setzero_ps(); + __m128 m_sums2 = _mm_setzero_ps(); + + // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling + // these loops hurt performance in local testing. 
+ if (reinterpret_cast(input_ptr) & 0x0F) { + for (int i = 0; i < kKernelSize; i += 4) { + m_input = _mm_loadu_ps(input_ptr + i); + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); + } + } else { + for (int i = 0; i < kKernelSize; i += 4) { + m_input = _mm_load_ps(input_ptr + i); + m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); + m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); + } + } + + // Linearly interpolate the two "convolutions". + m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); + m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); + m_sums1 = _mm_add_ps(m_sums1, m_sums2); + + // Sum components together. + float result; + m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); + _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( + m_sums2, m_sums2, 1))); + + return result; +} + +} // namespace webrtc diff --git a/webrtc/common_audio/resampler/sinc_resampler_unittest.cc b/webrtc/common_audio/resampler/sinc_resampler_unittest.cc index 98b8d4ac26..3a4b97670f 100644 --- a/webrtc/common_audio/resampler/sinc_resampler_unittest.cc +++ b/webrtc/common_audio/resampler/sinc_resampler_unittest.cc @@ -11,12 +11,16 @@ // Modified from the Chromium original: // src/media/base/sinc_resampler_unittest.cc +// MSVC++ requires this to be set before any other includes to get M_PI. 
+#define _USE_MATH_DEFINES + #include #include "testing/gmock/include/gmock/gmock.h" #include "testing/gtest/include/gtest/gtest.h" #include "webrtc/common_audio/resampler/sinc_resampler.h" #include "webrtc/common_audio/resampler/sinusoidal_linear_chirp_source.h" +#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h" #include "webrtc/system_wrappers/interface/scoped_ptr.h" #include "webrtc/system_wrappers/interface/stringize_macros.h" #include "webrtc/system_wrappers/interface/tick_util.h" @@ -94,9 +98,9 @@ TEST(SincResamplerTest, Flush) { } // Define platform independent function name for Convolve* tests. -#if defined(WEBRTC_USE_SSE2) && defined(__SSE__) +#if defined(WEBRTC_ARCH_X86_FAMILY) #define CONVOLVE_FUNC Convolve_SSE -#elif defined(WEBRTC_ARCH_ARM_NEON) || defined(WEBRTC_DETECT_ARM_NEON) +#elif defined(WEBRTC_ARCH_ARM_V7) #define CONVOLVE_FUNC Convolve_NEON #endif @@ -105,6 +109,12 @@ TEST(SincResamplerTest, Flush) { // will be tested by the parameterized SincResampler tests below. #if defined(CONVOLVE_FUNC) TEST(SincResamplerTest, Convolve) { +#if defined(WEBRTC_ARCH_X86_FAMILY) + ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2)); +#elif defined(WEBRTC_ARCH_ARM_V7) + ASSERT_TRUE(WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON); +#endif + // Initialize a dummy resampler. MockSource mock_source; SincResampler resampler(kSampleRateRatio, &mock_source); @@ -159,6 +169,12 @@ TEST(SincResamplerTest, ConvolveBenchmark) { printf("Convolve_C took %.2fms.\n", total_time_c_us / 1000); #if defined(CONVOLVE_FUNC) +#if defined(WEBRTC_ARCH_X86_FAMILY) + ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2)); +#elif defined(WEBRTC_ARCH_ARM_V7) + ASSERT_TRUE(WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON); +#endif + // Benchmark with unaligned input pointer. 
start = TickTime::Now(); for (int j = 0; j < kConvolveIterations; ++j) { @@ -226,10 +242,23 @@ TEST_P(SincResamplerTest, Resample) { SinusoidalLinearChirpSource resampler_source( input_rate_, input_samples, input_nyquist_freq, 0); + const double io_ratio = input_rate_ / static_cast(output_rate_); SincResampler resampler( - input_rate_ / static_cast(output_rate_), + io_ratio, &resampler_source); + // Force an update to the sample rate ratio to ensure dynamic sample rate + // changes are working correctly. + scoped_array kernel(new float[SincResampler::kKernelStorageSize]); + memcpy(kernel.get(), resampler.get_kernel_for_testing(), + SincResampler::kKernelStorageSize); + resampler.SetRatio(M_PI); + ASSERT_NE(0, memcmp(kernel.get(), resampler.get_kernel_for_testing(), + SincResampler::kKernelStorageSize)); + resampler.SetRatio(io_ratio); + ASSERT_EQ(0, memcmp(kernel.get(), resampler.get_kernel_for_testing(), + SincResampler::kKernelStorageSize)); + // TODO(dalecurtis): If we switch to AVX/SSE optimization, we'll need to // allocate these on 32-byte boundaries and ensure they're sized % 32 bytes. scoped_array resampled_destination(new float[output_samples]);