diff --git a/BUILD.gn b/BUILD.gn index c844e3853f..b676d1e15e 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -129,10 +129,6 @@ config("common_inherited_config") { defines += [ "RTC_DISABLE_CHECK_MSG" ] } - if (rtc_enable_avx2) { - defines += [ "WEBRTC_ENABLE_AVX2" ] - } - # Some tests need to declare their own trace event handlers. If this define is # not set, the first time TRACE_EVENT_* is called it will store the return # value for the current handler in an static variable, so that subsequent diff --git a/common_audio/BUILD.gn b/common_audio/BUILD.gn index fc76351c10..4077486d87 100644 --- a/common_audio/BUILD.gn +++ b/common_audio/BUILD.gn @@ -67,7 +67,6 @@ rtc_library("common_audio") { if (current_cpu == "x86" || current_cpu == "x64") { deps += [ ":common_audio_sse2" ] - deps += [ ":common_audio_avx2" ] } } @@ -236,7 +235,6 @@ rtc_library("fir_filter_factory") { ] if (current_cpu == "x86" || current_cpu == "x64") { deps += [ ":common_audio_sse2" ] - deps += [ ":common_audio_avx2" ] } if (rtc_build_with_neon) { deps += [ ":common_audio_neon" ] @@ -263,27 +261,6 @@ if (current_cpu == "x86" || current_cpu == "x64") { "../rtc_base/memory:aligned_malloc", ] } - - rtc_library("common_audio_avx2") { - sources = [ "resampler/sinc_resampler_avx2.cc" ] - - if (is_win) { - cflags = [ "/arch:AVX2" ] - } else { - cflags = [ - "-mavx2", - "-mfma", - ] - } - - deps = [ - ":fir_filter", - ":sinc_resampler", - "../rtc_base:checks", - "../rtc_base:rtc_base_approved", - "../rtc_base/memory:aligned_malloc", - ] - } } if (rtc_build_with_neon) { diff --git a/common_audio/resampler/sinc_resampler.cc b/common_audio/resampler/sinc_resampler.cc index 831ce53d4a..21707e9e4e 100644 --- a/common_audio/resampler/sinc_resampler.cc +++ b/common_audio/resampler/sinc_resampler.cc @@ -122,22 +122,28 @@ double SincScaleFactor(double io_ratio) { const size_t SincResampler::kKernelSize; // If we know the minimum architecture at compile time, avoid CPU detection. -void SincResampler::InitializeCPUSpecificFeatures() { -#if defined(WEBRTC_HAS_NEON) - convolve_proc_ = Convolve_NEON; -#elif defined(WEBRTC_ARCH_X86_FAMILY) - // Using AVX2 instead of SSE2 when AVX2 supported. - if (WebRtc_GetCPUInfo(kAVX2)) - convolve_proc_ = Convolve_AVX2; - else if (WebRtc_GetCPUInfo(kSSE2)) - convolve_proc_ = Convolve_SSE; - else - convolve_proc_ = Convolve_C; +#if defined(WEBRTC_ARCH_X86_FAMILY) +#if defined(__SSE2__) +#define CONVOLVE_FUNC Convolve_SSE +void SincResampler::InitializeCPUSpecificFeatures() {} #else - // Unknown architecture. - convolve_proc_ = Convolve_C; -#endif +// x86 CPU detection required. Function will be set by +// InitializeCPUSpecificFeatures(). +// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed. +#define CONVOLVE_FUNC convolve_proc_ + +void SincResampler::InitializeCPUSpecificFeatures() { + convolve_proc_ = WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C; } +#endif +#elif defined(WEBRTC_HAS_NEON) +#define CONVOLVE_FUNC Convolve_NEON +void SincResampler::InitializeCPUSpecificFeatures() {} +#else +// Unknown architecture. +#define CONVOLVE_FUNC Convolve_C +void SincResampler::InitializeCPUSpecificFeatures() {} +#endif SincResampler::SincResampler(double io_sample_rate_ratio, size_t request_frames, @@ -146,20 +152,24 @@ SincResampler::SincResampler(double io_sample_rate_ratio, read_cb_(read_cb), request_frames_(request_frames), input_buffer_size_(request_frames_ + kKernelSize), - // Create input buffers with a 32-byte alignment for SIMD optimizations. + // Create input buffers with a 16-byte alignment for SSE optimizations. kernel_storage_(static_cast( - AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))), + AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), kernel_pre_sinc_storage_(static_cast( - AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))), + AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), kernel_window_storage_(static_cast( - AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))), + AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), input_buffer_(static_cast( - AlignedMalloc(sizeof(float) * input_buffer_size_, 32))), + AlignedMalloc(sizeof(float) * input_buffer_size_, 16))), +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__) convolve_proc_(nullptr), +#endif r1_(input_buffer_.get()), r2_(input_buffer_.get() + kKernelSize / 2) { +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__) InitializeCPUSpecificFeatures(); RTC_DCHECK(convolve_proc_); +#endif RTC_DCHECK_GT(request_frames_, 0); Flush(); RTC_DCHECK_GT(block_size_, kKernelSize); @@ -292,10 +302,10 @@ void SincResampler::Resample(size_t frames, float* destination) { const float* const k1 = kernel_ptr + offset_idx * kKernelSize; const float* const k2 = k1 + kKernelSize; - // Ensure |k1|, |k2| are 32-byte aligned for SIMD usage. Should always be - // true so long as kKernelSize is a multiple of 32. - RTC_DCHECK_EQ(0, reinterpret_cast(k1) % 32); - RTC_DCHECK_EQ(0, reinterpret_cast(k2) % 32); + // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be + // true so long as kKernelSize is a multiple of 16. + RTC_DCHECK_EQ(0, reinterpret_cast(k1) % 16); + RTC_DCHECK_EQ(0, reinterpret_cast(k2) % 16); // Initialize input pointer based on quantized |virtual_source_idx_|. const float* const input_ptr = r1_ + source_idx; @@ -304,7 +314,7 @@ void SincResampler::Resample(size_t frames, float* destination) { const double kernel_interpolation_factor = virtual_offset_idx - offset_idx; *destination++ = - convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor); + CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor); // Advance the virtual index. virtual_source_idx_ += current_io_ratio; diff --git a/common_audio/resampler/sinc_resampler.h b/common_audio/resampler/sinc_resampler.h index a72a0c62c4..5181c18dac 100644 --- a/common_audio/resampler/sinc_resampler.h +++ b/common_audio/resampler/sinc_resampler.h @@ -112,10 +112,6 @@ class SincResampler { const float* k1, const float* k2, double kernel_interpolation_factor); - static float Convolve_AVX2(const float* input_ptr, - const float* k1, - const float* k2, - double kernel_interpolation_factor); #elif defined(WEBRTC_HAS_NEON) static float Convolve_NEON(const float* input_ptr, const float* k1, @@ -159,11 +155,13 @@ class SincResampler { // TODO(ajm): Move to using a global static which must only be initialized // once by the user. We're not doing this initially, because we don't have // e.g. a LazyInstance helper in webrtc. +#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__) typedef float (*ConvolveProc)(const float*, const float*, const float*, double); ConvolveProc convolve_proc_; +#endif // Pointers to the various regions inside |input_buffer_|. See the diagram at // the top of the .cc file for more information. diff --git a/common_audio/resampler/sinc_resampler_avx2.cc b/common_audio/resampler/sinc_resampler_avx2.cc deleted file mode 100644 index 3eb5d4a1b1..0000000000 --- a/common_audio/resampler/sinc_resampler_avx2.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include -#include - -#include "common_audio/resampler/sinc_resampler.h" - -namespace webrtc { - -float SincResampler::Convolve_AVX2(const float* input_ptr, - const float* k1, - const float* k2, - double kernel_interpolation_factor) { - __m256 m_input; - __m256 m_sums1 = _mm256_setzero_ps(); - __m256 m_sums2 = _mm256_setzero_ps(); - - // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling - // these loops has not been tested or benchmarked. - bool aligned_input = (reinterpret_cast(input_ptr) & 0x1F) == 0; - if (!aligned_input) { - for (size_t i = 0; i < kKernelSize; i += 8) { - m_input = _mm256_loadu_ps(input_ptr + i); - m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1); - m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2); - } - } else { - for (size_t i = 0; i < kKernelSize; i += 8) { - m_input = _mm256_load_ps(input_ptr + i); - m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1); - m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2); - } - } - - // Linearly interpolate the two "convolutions". - __m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0), - _mm256_extractf128_ps(m_sums1, 1)); - __m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0), - _mm256_extractf128_ps(m_sums2, 1)); - m128_sums1 = _mm_mul_ps( - m128_sums1, - _mm_set_ps1(static_cast(1.0 - kernel_interpolation_factor))); - m128_sums2 = _mm_mul_ps( - m128_sums2, _mm_set_ps1(static_cast(kernel_interpolation_factor))); - m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2); - - // Sum components together. - float result; - m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1); - _mm_store_ss(&result, _mm_add_ss(m128_sums2, - _mm_shuffle_ps(m128_sums2, m128_sums2, 1))); - - return result; -} - -} // namespace webrtc diff --git a/common_audio/resampler/sinc_resampler_unittest.cc b/common_audio/resampler/sinc_resampler_unittest.cc index ece6af0689..b067b23b88 100644 --- a/common_audio/resampler/sinc_resampler_unittest.cc +++ b/common_audio/resampler/sinc_resampler_unittest.cc @@ -116,9 +116,17 @@ TEST(SincResamplerTest, DISABLED_SetRatioBench) { printf("SetRatio() took %.2fms.\n", total_time_c_us / 1000); } +// Define platform independent function name for Convolve* tests. +#if defined(WEBRTC_ARCH_X86_FAMILY) +#define CONVOLVE_FUNC Convolve_SSE +#elif defined(WEBRTC_ARCH_ARM_V7) +#define CONVOLVE_FUNC Convolve_NEON +#endif + // Ensure various optimized Convolve() methods return the same value. Only run // this test if other optimized methods exist, otherwise the default Convolve() // will be tested by the parameterized SincResampler tests below. +#if defined(CONVOLVE_FUNC) TEST(SincResamplerTest, Convolve) { #if defined(WEBRTC_ARCH_X86_FAMILY) ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2)); @@ -140,7 +148,7 @@ TEST(SincResamplerTest, Convolve) { double result = resampler.Convolve_C( resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); - double result2 = resampler.convolve_proc_( + double result2 = resampler.CONVOLVE_FUNC( resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); EXPECT_NEAR(result2, result, kEpsilon); @@ -149,11 +157,12 @@ TEST(SincResamplerTest, Convolve) { result = resampler.Convolve_C( resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); - result2 = resampler.convolve_proc_( + result2 = resampler.CONVOLVE_FUNC( resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); EXPECT_NEAR(result2, result, kEpsilon); } +#endif // Benchmark for the various Convolve() methods. Make sure to build with // branding=Chrome so that RTC_DCHECKs are compiled out when benchmarking. @@ -181,6 +190,7 @@ TEST(SincResamplerTest, ConvolveBenchmark) { (rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec; printf("Convolve_C took %.2fms.\n", total_time_c_us / 1000); +#if defined(CONVOLVE_FUNC) #if defined(WEBRTC_ARCH_X86_FAMILY) ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2)); #elif defined(WEBRTC_ARCH_ARM_V7) @@ -190,33 +200,36 @@ TEST(SincResamplerTest, ConvolveBenchmark) { // Benchmark with unaligned input pointer. start = rtc::TimeNanos(); for (int j = 0; j < kConvolveIterations; ++j) { - resampler.convolve_proc_( + resampler.CONVOLVE_FUNC( resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); } double total_time_optimized_unaligned_us = (rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec; - printf(STRINGIZE(convolve_proc_) "(unaligned) took %.2fms; which is %.2fx " + printf(STRINGIZE(CONVOLVE_FUNC) "(unaligned) took %.2fms; which is %.2fx " "faster than Convolve_C.\n", total_time_optimized_unaligned_us / 1000, total_time_c_us / total_time_optimized_unaligned_us); // Benchmark with aligned input pointer. start = rtc::TimeNanos(); for (int j = 0; j < kConvolveIterations; ++j) { - resampler.convolve_proc_( + resampler.CONVOLVE_FUNC( resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); } double total_time_optimized_aligned_us = (rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec; - printf(STRINGIZE(convolve_proc_) " (aligned) took %.2fms; which is %.2fx " + printf(STRINGIZE(CONVOLVE_FUNC) " (aligned) took %.2fms; which is %.2fx " "faster than Convolve_C and %.2fx faster than " - STRINGIZE(convolve_proc_) " (unaligned).\n", + STRINGIZE(CONVOLVE_FUNC) " (unaligned).\n", total_time_optimized_aligned_us / 1000, total_time_c_us / total_time_optimized_aligned_us, total_time_optimized_unaligned_us / total_time_optimized_aligned_us); +#endif } +#undef CONVOLVE_FUNC + typedef std::tuple SincResamplerTestData; class SincResamplerTest : public ::testing::TestWithParam { @@ -339,7 +352,7 @@ INSTANTIATE_TEST_SUITE_P( std::make_tuple(16000, 44100, kResamplingRMSError, -62.54), std::make_tuple(22050, 44100, kResamplingRMSError, -73.53), std::make_tuple(32000, 44100, kResamplingRMSError, -63.32), - std::make_tuple(44100, 44100, kResamplingRMSError, -73.52), + std::make_tuple(44100, 44100, kResamplingRMSError, -73.53), std::make_tuple(48000, 44100, -15.01, -64.04), std::make_tuple(96000, 44100, -18.49, -25.51), std::make_tuple(192000, 44100, -20.50, -13.31), @@ -347,7 +360,7 @@ INSTANTIATE_TEST_SUITE_P( // To 48kHz std::make_tuple(8000, 48000, kResamplingRMSError, -63.43), std::make_tuple(11025, 48000, kResamplingRMSError, -62.61), - std::make_tuple(16000, 48000, kResamplingRMSError, -63.95), + std::make_tuple(16000, 48000, kResamplingRMSError, -63.96), std::make_tuple(22050, 48000, kResamplingRMSError, -62.42), std::make_tuple(32000, 48000, kResamplingRMSError, -64.04), std::make_tuple(44100, 48000, kResamplingRMSError, -62.63), diff --git a/system_wrappers/include/cpu_features_wrapper.h b/system_wrappers/include/cpu_features_wrapper.h index 02d54b4516..739161afca 100644 --- a/system_wrappers/include/cpu_features_wrapper.h +++ b/system_wrappers/include/cpu_features_wrapper.h @@ -18,7 +18,7 @@ extern "C" { #endif // List of features in x86. -typedef enum { kSSE2, kSSE3, kAVX2 } CPUFeature; +typedef enum { kSSE2, kSSE3 } CPUFeature; // List of features in ARM. enum { diff --git a/system_wrappers/source/cpu_features.cc b/system_wrappers/source/cpu_features.cc index 1667e46c10..ebcb48c15f 100644 --- a/system_wrappers/source/cpu_features.cc +++ b/system_wrappers/source/cpu_features.cc @@ -24,20 +24,6 @@ int GetCPUInfoNoASM(CPUFeature feature) { } #if defined(WEBRTC_ARCH_X86_FAMILY) - -// xgetbv returns the value of an Intel Extended Control Register (XCR). -// Currently only XCR0 is defined by Intel so |xcr| should always be zero. -uint64_t xgetbv(uint32_t xcr) { -#if defined(_MSC_VER) - return _xgetbv(xcr); -#else - uint32_t eax, edx; - - __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); - return (static_cast(edx) << 32) | eax; -#endif // _MSC_VER -} - #ifndef _MSC_VER // Intrinsic for "cpuid". #if defined(__pic__) && defined(__i386__) @@ -55,7 +41,7 @@ static inline void __cpuid(int cpu_info[4], int info_type) { __asm__ volatile("cpuid\n" : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type), "c"(0)); + : "a"(info_type)); } #endif #endif // _MSC_VER @@ -65,8 +51,6 @@ static inline void __cpuid(int cpu_info[4], int info_type) { // Actual feature detection for x86. static int GetCPUInfo(CPUFeature feature) { int cpu_info[4]; - __cpuid(cpu_info, 0); - int num_ids = cpu_info[0]; __cpuid(cpu_info, 1); if (feature == kSSE2) { return 0 != (cpu_info[3] & 0x04000000); @@ -74,23 +58,6 @@ static int GetCPUInfo(CPUFeature feature) { if (feature == kSSE3) { return 0 != (cpu_info[2] & 0x00000001); } - if (feature == kAVX2) { - // Interpret CPU feature information. - int cpu_info7[4] = {-1}; - if (num_ids >= 7) { - __cpuid(cpu_info7, 7); - } - -#if defined(WEBRTC_ENABLE_AVX2) - return (cpu_info[2] & 0x10000000) != 0 && - (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ && - (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ && - (xgetbv(0) & 0x00000006) == 6 /* XSAVE enabled by kernel */ && - (cpu_info7[1] & 0x00000020) != 0; -#else - return 0; -#endif // WEBRTC_ENABLE_AVX2 - } return 0; } #else diff --git a/webrtc.gni b/webrtc.gni index 17a66f974c..b3f9a7142f 100644 --- a/webrtc.gni +++ b/webrtc.gni @@ -242,10 +242,6 @@ declare_args() { # standalone WebRTC. rtc_include_internal_audio_device = !build_with_chromium - # Set this to true to enable the avx2 support in webrtc. - # TODO(bugs.webrtc.org/11663): Default this to true and eventually remove. - rtc_enable_avx2 = false - # Include tests in standalone checkout. rtc_include_tests = !build_with_chromium && !build_with_mozilla