diff --git a/webrtc/common_audio/BUILD.gn b/webrtc/common_audio/BUILD.gn index b3e3ff5116..c9c3c5b9b3 100644 --- a/webrtc/common_audio/BUILD.gn +++ b/webrtc/common_audio/BUILD.gn @@ -124,6 +124,10 @@ source_set("common_audio") { } } + if (current_cpu == "arm64") { + deps += [ ":common_audio_neon" ] + } + if (current_cpu == "mipsel") { sources += [ "signal_processing/include/spl_inl_mips.h", @@ -194,30 +198,23 @@ if (current_cpu == "x86" || current_cpu == "x64") { } } -if (rtc_build_armv7_neon) { +if (rtc_build_armv7_neon || current_cpu == "arm64") { source_set("common_audio_neon") { sources = [ "fir_filter_neon.cc", "resampler/sinc_resampler_neon.cc", - "signal_processing/cross_correlation_neon.S", - "signal_processing/downsample_fast_neon.S", - "signal_processing/min_max_operations_neon.S", + "signal_processing/cross_correlation_neon.c", + "signal_processing/downsample_fast_neon.c", + "signal_processing/min_max_operations_neon.c", ] configs += [ "..:common_config" ] public_configs = [ "..:common_inherited_config" ] - - # Enable compilation for the ARM v7 Neon instruction set. This is needed - # since //build/config/arm.gni only enables Neon for iOS, not Android. - # This provides the same functionality as webrtc/build/arm_neon.gypi. - # TODO(kjellander): Investigate if this can be moved into webrtc.gni or - # //build/config/arm.gni instead, to reduce code duplication. - # Remove the -mfpu=vfpv3-d16 cflag. - configs -= [ "//build/config/compiler:compiler_arm_fpu" ] - cflags = [ - "-mfpu=neon", - ] + if (!arm_use_neon) { + configs -= [ "//build/config/compiler:compiler_arm_fpu" ] + cflags = [ "-mfpu=neon" ] + } # Disable LTO in audio_processing_neon target due to compiler bug. if (rtc_use_lto) { diff --git a/webrtc/common_audio/common_audio.gyp b/webrtc/common_audio/common_audio.gyp index f74a5b1944..f7bc3a80e7 100644 --- a/webrtc/common_audio/common_audio.gyp +++ b/webrtc/common_audio/common_audio.gyp @@ -146,6 +146,9 @@ }], ], # conditions }], + ['target_arch=="arm64"', { + 'dependencies': ['common_audio_neon',], + }], ['target_arch=="mipsel" and mips_arch_variant!="r6" and android_webview_build==0', { 'sources': [ 'signal_processing/include/spl_inl_mips.h', @@ -194,7 +197,7 @@ }, ], # targets }], - ['target_arch=="arm" and arm_version>=7', { + ['target_arch=="arm" and arm_version>=7 or target_arch=="arm64"', { 'targets': [ { 'target_name': 'common_audio_neon', @@ -203,9 +206,9 @@ 'sources': [ 'fir_filter_neon.cc', 'resampler/sinc_resampler_neon.cc', - 'signal_processing/cross_correlation_neon.S', - 'signal_processing/downsample_fast_neon.S', - 'signal_processing/min_max_operations_neon.S', + 'signal_processing/cross_correlation_neon.c', + 'signal_processing/downsample_fast_neon.c', + 'signal_processing/min_max_operations_neon.c', ], 'conditions': [ # Disable LTO in common_audio_neon target due to compiler bug diff --git a/webrtc/common_audio/resampler/sinc_resampler.h b/webrtc/common_audio/resampler/sinc_resampler.h index 4428359e02..be84a99624 100644 --- a/webrtc/common_audio/resampler/sinc_resampler.h +++ b/webrtc/common_audio/resampler/sinc_resampler.h @@ -107,7 +107,7 @@ class SincResampler { static float Convolve_SSE(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); -#elif defined(WEBRTC_ARCH_ARM_V7) +#elif defined(WEBRTC_ARCH_ARM_V7) || defined(WEBRTC_ARCH_ARM64_NEON) static float Convolve_NEON(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor); diff --git a/webrtc/common_audio/signal_processing/cross_correlation_neon.S b/webrtc/common_audio/signal_processing/cross_correlation_neon.S deleted file mode 100644 index 15b25b8f5b..0000000000 --- a/webrtc/common_audio/signal_processing/cross_correlation_neon.S +++ /dev/null @@ -1,159 +0,0 @@ -@ -@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. -@ - -@ cross_correlation_neon.s -@ This file contains the function WebRtcSpl_CrossCorrelationNeon(), -@ optimized for ARM Neon platform. -@ -@ Reference Ccode at end of this file. -@ Output is bit-exact with the reference C code, but not with the generic -@ C code in file cross_correlation.c, due to reduction of shift operations -@ from using Neon registers. - -@ Register usage: -@ -@ r0: *cross_correlation (function argument) -@ r1: *seq1 (function argument) -@ r2: *seq2 (function argument) -@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ -@ r4: counter for LOOP_DIM_CROSS_CORRELATION -@ r5: seq2_ptr -@ r6: seq1_ptr -@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL -@ r8, r9, r10, r11, r12: scratch - -#include "webrtc/system_wrappers/interface/asm_defines.h" - -GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon -.align 2 -DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon - push {r4-r11} - - @ Put the shift value (-right_shifts) into a Neon register. - ldrsh r10, [sp, #36] - rsb r10, r10, #0 - mov r8, r10, asr #31 - vmov d16, r10, r8 - - @ Initialize loop counters. - and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8; - asr r3, r3, #3 @ inner_loop_len1 = dim_seq / 8; - ldrsh r4, [sp, #32] @ dim_cross_correlation - -LOOP_DIM_CROSS_CORRELATION: - vmov.i32 q9, #0 - vmov.i32 q14, #0 - movs r8, r3 @ inner_loop_len1 - mov r6, r1 @ seq1_ptr - mov r5, r2 @ seq2_ptr - ble POST_LOOP_DIM_SEQ - -LOOP_DIM_SEQ: - vld1.16 {d20, d21}, [r6]! @ seq1_ptr - vld1.16 {d22, d23}, [r5]! @ seq2_ptr - subs r8, r8, #1 - vmull.s16 q12, d20, d22 - vmull.s16 q13, d21, d23 - vpadal.s32 q9, q12 - vpadal.s32 q14, q13 - bgt LOOP_DIM_SEQ - -POST_LOOP_DIM_SEQ: - movs r10, r7 @ Loop counter - mov r12, #0 - mov r8, #0 - ble POST_LOOP_DIM_SEQ_RESIDUAL - -LOOP_DIM_SEQ_RESIDUAL: - ldrh r11, [r6], #2 - ldrh r9, [r5], #2 - smulbb r11, r11, r9 - adds r8, r8, r11 - adc r12, r12, r11, asr #31 - subs r10, #1 - bgt LOOP_DIM_SEQ_RESIDUAL - -POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift. - vadd.i64 d18, d19 - vadd.i64 d28, d29 - vadd.i64 d18, d28 - vmov.32 d17[0], r8 - vmov.32 d17[1], r12 - vadd.i64 d17, d18 - vshl.s64 d17, d16 - vst1.32 d17[0], [r0]! @ Store the output - - ldr r8, [sp, #40] @ step_seq2 - add r2, r8, lsl #1 @ prepare for seq2_ptr(r5) in the next loop. - - subs r4, #1 - bgt LOOP_DIM_CROSS_CORRELATION - - pop {r4-r11} - bx lr - -@ TODO(kma): Place this piece of reference code into a C code file. -@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation, -@ int16_t* seq1, -@ int16_t* seq2, -@ int16_t dim_seq, -@ int16_t dim_cross_correlation, -@ int16_t right_shifts, -@ int16_t step_seq2) { -@ int i = 0; -@ int j = 0; -@ int inner_loop_len1 = dim_seq >> 3; -@ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3); -@ -@ assert(dim_cross_correlation > 0); -@ assert(dim_seq > 0); -@ -@ for (i = 0; i < dim_cross_correlation; i++) { -@ int16_t *seq1_ptr = seq1; -@ int16_t *seq2_ptr = seq2 + (step_seq2 * i); -@ int64_t sum = 0; -@ -@ for (j = inner_loop_len1; j > 0; j -= 1) { -@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); -@ seq1_ptr++; -@ seq2_ptr++; -@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); -@ seq1_ptr++; -@ seq2_ptr++; -@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); -@ seq1_ptr++; -@ seq2_ptr++; -@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); -@ seq1_ptr++; -@ seq2_ptr++; -@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); -@ seq1_ptr++; -@ seq2_ptr++; -@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); -@ seq1_ptr++; -@ seq2_ptr++; -@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); -@ seq1_ptr++; -@ seq2_ptr++; -@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); -@ seq1_ptr++; -@ seq2_ptr++; -@ } -@ -@ // Calculate the rest of the samples. -@ for (j = inner_loop_len2; j > 0; j -= 1) { -@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); -@ seq1_ptr++; -@ seq2_ptr++; -@ } -@ -@ *cross_correlation++ = (int32_t)(sum >> right_shifts); -@ } -@ } diff --git a/webrtc/common_audio/signal_processing/downsample_fast_neon.S b/webrtc/common_audio/signal_processing/downsample_fast_neon.S deleted file mode 100644 index 4e348ec646..0000000000 --- a/webrtc/common_audio/signal_processing/downsample_fast_neon.S +++ /dev/null @@ -1,215 +0,0 @@ -@ -@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. -@ - -@ This file contains the function WebRtcSpl_DownsampleFastNeon(), optimized for -@ ARM Neon platform. The description header can be found in -@ signal_processing_library.h -@ -@ The reference C code is in file downsample_fast.c. Bit-exact. - -#include "webrtc/system_wrappers/interface/asm_defines.h" - -GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon -.align 2 -DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon - push {r4-r11} - - cmp r3, #0 @ data_out_length <= 0? - movle r0, #-1 - ble END - - ldrsh r12, [sp, #44] - ldr r5, [sp, #40] @ r5: factor - add r4, r12, #1 @ r4: delay + 1 - sub r3, r3, #1 @ r3: data_out_length - 1 - smulbb r3, r5, r3 - ldr r8, [sp, #32] @ &coefficients[0] - mov r9, r12 @ Iteration counter for outer loops. - add r3, r4 @ delay + factor * (out_length-1) +1 - - cmp r3, r1 @ data_in_length < endpos? - movgt r0, #-1 - bgt END - - @ Initializations. - sub r3, r5, asl #3 - add r11, r0, r12, asl #1 @ &data_in[delay] - ldr r0, [sp, #36] @ coefficients_length - add r3, r5 @ endpos - factor * 7 - - cmp r0, #0 @ coefficients_length <= 0 ? - movle r0, #-1 - ble END - - add r8, r0, asl #1 @ &coeffieient[coefficients_length] - cmp r9, r3 - bge POST_LOOP_ENDPOS @ branch when Iteration < 8 times. - -@ -@ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others) -@ - mov r4, #-2 - - @ Direct program flow to the right channel. - - @ r10 is an offset to &data_in[] in the loop. After an iteration, we need to - @ move the pointer back to original after advancing 16 bytes by a vld1, and - @ then move 2 bytes forward to increment one more sample. - cmp r5, #2 - moveq r10, #-14 - beq LOOP_ENDPOS_FACTOR2 @ Branch when factor == 2 - - @ Similar here, for r10, we need to move the pointer back to original after - @ advancing 32 bytes, then move 2 bytes forward to increment one sample. - cmp r5, #4 - moveq r10, #-30 - beq LOOP_ENDPOS_FACTOR4 @ Branch when factor == 4 - - @ For r10, we need to move the pointer back to original after advancing - @ (factor * 7 * 2) bytes, then move 2 bytes forward to increment one sample. - mov r10, r5, asl #4 - rsb r10, #2 - add r10, r5, asl #1 - lsl r5, #1 @ r5 = factor * sizeof(data_in) - -@ The general case (factor != 2 && factor != 4) -LOOP_ENDPOS_GENERAL: - @ Initializations. - vmov.i32 q2, #2048 - vmov.i32 q3, #2048 - sub r7, r8, #2 - sub r12, r0, #1 @ coefficients_length - 1 - sub r1, r11, r12, asl #1 @ &data_in[i - j] - -LOOP_COEFF_LENGTH_GENERAL: - vld1.16 {d2[], d3[]}, [r7], r4 @ coefficients[j] - vld1.16 d0[0], [r1], r5 @ data_in[i - j] - vld1.16 d0[1], [r1], r5 @ data_in[i + factor - j] - vld1.16 d0[2], [r1], r5 @ data_in[i + factor * 2 - j] - vld1.16 d0[3], [r1], r5 @ data_in[i + factor * 3 - j] - vld1.16 d1[0], [r1], r5 @ data_in[i + factor * 4 - j] - vld1.16 d1[1], [r1], r5 @ data_in[i + factor * 5 - j] - vld1.16 d1[2], [r1], r5 @ data_in[i + factor * 6 - j] - vld1.16 d1[3], [r1], r10 @ data_in[i + factor * 7 - j] - subs r12, #1 - vmlal.s16 q2, d0, d2 - vmlal.s16 q3, d1, d3 - bge LOOP_COEFF_LENGTH_GENERAL - - @ Shift, saturate, and store the result. - vqshrn.s32 d0, q2, #12 - vqshrn.s32 d1, q3, #12 - vst1.16 {d0, d1}, [r2]! - - add r11, r5, asl #3 @ r11 -> &data_in[i + factor * 8] - add r9, r5, asl #2 @ Counter i = delay + factor * 8. - cmp r9, r3 @ i < endpos - factor * 7 ? - blt LOOP_ENDPOS_GENERAL - asr r5, #1 @ Restore r5 to the value of factor. - b POST_LOOP_ENDPOS - -@ The case for factor == 2. -LOOP_ENDPOS_FACTOR2: - @ Initializations. - vmov.i32 q2, #2048 - vmov.i32 q3, #2048 - sub r7, r8, #2 - sub r12, r0, #1 @ coefficients_length - 1 - sub r1, r11, r12, asl #1 @ &data_in[i - j] - -LOOP_COEFF_LENGTH_FACTOR2: - vld1.16 {d16[], d17[]}, [r7], r4 @ coefficients[j] - vld2.16 {d0, d1}, [r1]! @ data_in[] - vld2.16 {d2, d3}, [r1], r10 @ data_in[] - subs r12, #1 - vmlal.s16 q2, d0, d16 - vmlal.s16 q3, d2, d17 - bge LOOP_COEFF_LENGTH_FACTOR2 - - @ Shift, saturate, and store the result. - vqshrn.s32 d0, q2, #12 - vqshrn.s32 d1, q3, #12 - vst1.16 {d0, d1}, [r2]! - - add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8] - add r9, r5, asl #3 @ Counter i = delay + factor * 8. - cmp r9, r3 @ i < endpos - factor * 7 ? - blt LOOP_ENDPOS_FACTOR2 - b POST_LOOP_ENDPOS - -@ The case for factor == 4. -LOOP_ENDPOS_FACTOR4: - @ Initializations. - vmov.i32 q2, #2048 - vmov.i32 q3, #2048 - sub r7, r8, #2 - sub r12, r0, #1 @ coefficients_length - 1 - sub r1, r11, r12, asl #1 @ &data_in[i - j] - -LOOP_COEFF_LENGTH_FACTOR4: - vld1.16 {d16[], d17[]}, [r7], r4 @ coefficients[j] - vld4.16 {d0, d1, d2, d3}, [r1]! @ data_in[] - vld4.16 {d18, d19, d20, d21}, [r1], r10 @ data_in[] - subs r12, #1 - vmlal.s16 q2, d0, d16 - vmlal.s16 q3, d18, d17 - bge LOOP_COEFF_LENGTH_FACTOR4 - - add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8] - add r9, r5, asl #3 @ Counter i = delay + factor * 8. - - @ Shift, saturate, and store the result. - vqshrn.s32 d0, q2, #12 - vqshrn.s32 d1, q3, #12 - cmp r9, r3 @ i < endpos - factor * 7 ? - vst1.16 {d0, d1}, [r2]! - - blt LOOP_ENDPOS_FACTOR4 - -@ -@ Second part, do the rest iterations (if any). -@ - -POST_LOOP_ENDPOS: - add r3, r5, asl #3 - sub r3, r5 @ Restore r3 to endpos. - cmp r9, r3 - movge r0, #0 - bge END - -LOOP2_ENDPOS: - @ Initializations. - mov r7, r8 - sub r12, r0, #1 @ coefficients_length - 1 - sub r6, r11, r12, asl #1 @ &data_in[i - j] - - mov r1, #2048 - -LOOP2_COEFF_LENGTH: - ldrsh r4, [r7, #-2]! @ coefficients[j] - ldrsh r10, [r6], #2 @ data_in[i - j] - smlabb r1, r4, r10, r1 - subs r12, #1 - bge LOOP2_COEFF_LENGTH - - @ Shift, saturate, and store the result. - ssat r1, #16, r1, asr #12 - strh r1, [r2], #2 - - add r11, r5, asl #1 @ r11 -> &data_in[i + factor] - add r9, r5 @ Counter i = delay + factor. - cmp r9, r3 @ i < endpos? - blt LOOP2_ENDPOS - - mov r0, #0 - -END: - pop {r4-r11} - bx lr diff --git a/webrtc/common_audio/signal_processing/include/signal_processing_library.h b/webrtc/common_audio/signal_processing/include/signal_processing_library.h index d987e9ad85..a1dc6aa06f 100644 --- a/webrtc/common_audio/signal_processing/include/signal_processing_library.h +++ b/webrtc/common_audio/signal_processing/include/signal_processing_library.h @@ -154,7 +154,8 @@ void WebRtcSpl_ZerosArrayW32(int32_t* vector, typedef int16_t (*MaxAbsValueW16)(const int16_t* vector, int length); extern MaxAbsValueW16 WebRtcSpl_MaxAbsValueW16; int16_t WebRtcSpl_MaxAbsValueW16C(const int16_t* vector, int length); -#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \ + (defined WEBRTC_ARCH_ARM64_NEON) int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length); #endif #if defined(MIPS32_LE) @@ -172,7 +173,8 @@ int16_t WebRtcSpl_MaxAbsValueW16_mips(const int16_t* vector, int length); typedef int32_t (*MaxAbsValueW32)(const int32_t* vector, int length); extern MaxAbsValueW32 WebRtcSpl_MaxAbsValueW32; int32_t WebRtcSpl_MaxAbsValueW32C(const int32_t* vector, int length); -#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \ + (defined WEBRTC_ARCH_ARM64_NEON) int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length); #endif #if defined(MIPS_DSP_R1_LE) @@ -192,7 +194,8 @@ int32_t WebRtcSpl_MaxAbsValueW32_mips(const int32_t* vector, int length); typedef int16_t (*MaxValueW16)(const int16_t* vector, int length); extern MaxValueW16 WebRtcSpl_MaxValueW16; int16_t WebRtcSpl_MaxValueW16C(const int16_t* vector, int length); -#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \ + (defined WEBRTC_ARCH_ARM64_NEON) int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length); #endif #if defined(MIPS32_LE) @@ -212,7 +215,8 @@ int16_t WebRtcSpl_MaxValueW16_mips(const int16_t* vector, int length); typedef int32_t (*MaxValueW32)(const int32_t* vector, int length); extern MaxValueW32 WebRtcSpl_MaxValueW32; int32_t WebRtcSpl_MaxValueW32C(const int32_t* vector, int length); -#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \ + (defined WEBRTC_ARCH_ARM64_NEON) int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length); #endif #if defined(MIPS32_LE) @@ -232,7 +236,8 @@ int32_t WebRtcSpl_MaxValueW32_mips(const int32_t* vector, int length); typedef int16_t (*MinValueW16)(const int16_t* vector, int length); extern MinValueW16 WebRtcSpl_MinValueW16; int16_t WebRtcSpl_MinValueW16C(const int16_t* vector, int length); -#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \ + (defined WEBRTC_ARCH_ARM64_NEON) int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length); #endif #if defined(MIPS32_LE) @@ -252,7 +257,8 @@ int16_t WebRtcSpl_MinValueW16_mips(const int16_t* vector, int length); typedef int32_t (*MinValueW32)(const int32_t* vector, int length); extern MinValueW32 WebRtcSpl_MinValueW32; int32_t WebRtcSpl_MinValueW32C(const int32_t* vector, int length); -#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \ + (defined WEBRTC_ARCH_ARM64_NEON) int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length); #endif #if defined(MIPS32_LE) @@ -552,7 +558,8 @@ void WebRtcSpl_CrossCorrelationC(int32_t* cross_correlation, int16_t dim_cross_correlation, int16_t right_shifts, int16_t step_seq2); -#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \ + (defined WEBRTC_ARCH_ARM64_NEON) void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation, const int16_t* seq1, const int16_t* seq2, @@ -717,7 +724,8 @@ int WebRtcSpl_DownsampleFastC(const int16_t* data_in, int coefficients_length, int factor, int delay); -#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) +#if (defined WEBRTC_DETECT_ARM_NEON) || (defined WEBRTC_ARCH_ARM_NEON) || \ + (defined WEBRTC_ARCH_ARM64_NEON) int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in, int data_in_length, int16_t* data_out, diff --git a/webrtc/common_audio/signal_processing/min_max_operations_neon.S b/webrtc/common_audio/signal_processing/min_max_operations_neon.S deleted file mode 100644 index f427e68164..0000000000 --- a/webrtc/common_audio/signal_processing/min_max_operations_neon.S +++ /dev/null @@ -1,283 +0,0 @@ -@ -@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. -@ -@ Use of this source code is governed by a BSD-style license -@ that can be found in the LICENSE file in the root of the source -@ tree. An additional intellectual property rights grant can be found -@ in the file PATENTS. All contributing project authors may -@ be found in the AUTHORS file in the root of the source tree. -@ - -@ This file contains some minimum and maximum functions, optimized for -@ ARM Neon platform. The description header can be found in -@ signal_processing_library.h -@ -@ The reference C code is in file min_max_operations.c. Code here is basically -@ a loop unrolling by 8 with Neon instructions. Bit-exact. - -#include "webrtc/system_wrappers/interface/asm_defines.h" - -GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon -GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon -GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon -GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon -GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon -GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon - -.align 2 -@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length); -DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon - mov r2, #-1 @ Initialize the return value. - cmp r0, #0 - beq END_MAX_ABS_VALUE_W16 - cmp r1, #0 - ble END_MAX_ABS_VALUE_W16 - - cmp r1, #8 - blt LOOP_MAX_ABS_VALUE_W16 - - vmov.i16 q12, #0 - sub r1, #8 @ Counter for loops - -LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16: - vld1.16 {q13}, [r0]! - subs r1, #8 - vabs.s16 q13, q13 @ Note vabs doesn't change the value of -32768. - vmax.u16 q12, q13 @ Use u16 so we don't lose the value -32768. - bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16 - - @ Find the maximum value in the Neon registers and move it to r2. - vmax.u16 d24, d25 - vpmax.u16 d24, d24, d24 - vpmax.u16 d24, d24, d24 - adds r1, #8 - vmov.u16 r2, d24[0] - beq END_MAX_ABS_VALUE_W16 - -LOOP_MAX_ABS_VALUE_W16: - ldrsh r3, [r0], #2 - eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value. - sub r12, r12, r3, asr #31 - cmp r2, r12 - movlt r2, r12 - subs r1, #1 - bne LOOP_MAX_ABS_VALUE_W16 - -END_MAX_ABS_VALUE_W16: - cmp r2, #0x8000 @ Guard against the case for -32768. - subeq r2, #1 - mov r0, r2 - bx lr - - - -@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length); -DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon - cmp r0, #0 - moveq r0, #-1 - beq EXIT @ Return -1 for a NULL pointer. - cmp r1, #0 @ length - movle r0, #-1 - ble EXIT @ Return -1 if length <= 0. - - vmov.i32 q11, #0 - vmov.i32 q12, #0 - cmp r1, #8 - blt LOOP_MAX_ABS_VALUE_W32 - - sub r1, #8 @ Counter for loops - -LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32: - vld1.32 {q13, q14}, [r0]! - subs r1, #8 @ Counter for loops - vabs.s32 q13, q13 @ vabs doesn't change the value of 0x80000000. - vabs.s32 q14, q14 - vmax.u32 q11, q13 @ Use u32 so we don't lose the value 0x80000000. - vmax.u32 q12, q14 - bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32 - - @ Find the maximum value in the Neon registers and move it to r2. - vmax.u32 q12, q11 - vmax.u32 d24, d25 - vpmax.u32 d24, d24, d24 - adds r1, #8 - vmov.u32 r2, d24[0] - beq END_MAX_ABS_VALUE_W32 - -LOOP_MAX_ABS_VALUE_W32: - ldr r3, [r0], #4 - eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value. - sub r12, r12, r3, asr #31 - cmp r2, r12 - movcc r2, r12 - subs r1, #1 - bne LOOP_MAX_ABS_VALUE_W32 - -END_MAX_ABS_VALUE_W32: - mvn r0, #0x80000000 @ Guard against the case for 0x80000000. - cmp r2, r0 - movcc r0, r2 - -EXIT: - bx lr - -@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length); -DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon - mov r2, #0x8000 @ Initialize the return value. - cmp r0, #0 - beq END_MAX_VALUE_W16 - cmp r1, #0 - ble END_MAX_VALUE_W16 - - vmov.i16 q12, #0x8000 - cmp r1, #8 - blt LOOP_MAX_VALUE_W16 - - sub r1, #8 @ Counter for loops - -LOOP_UNROLLED_BY_8_MAX_VALUE_W16: - vld1.16 {q13}, [r0]! - subs r1, #8 - vmax.s16 q12, q13 - bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16 - - @ Find the maximum value in the Neon registers and move it to r2. - vmax.s16 d24, d25 - vpmax.s16 d24, d24, d24 - vpmax.s16 d24, d24, d24 - adds r1, #8 - vmov.u16 r2, d24[0] - beq END_MAX_VALUE_W16 - -LOOP_MAX_VALUE_W16: - ldrsh r3, [r0], #2 - cmp r2, r3 - movlt r2, r3 - subs r1, #1 - bne LOOP_MAX_VALUE_W16 - -END_MAX_VALUE_W16: - mov r0, r2 - bx lr - -@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length); -DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon - mov r2, #0x80000000 @ Initialize the return value. - cmp r0, #0 - beq END_MAX_VALUE_W32 - cmp r1, #0 - ble END_MAX_VALUE_W32 - - vmov.i32 q11, #0x80000000 - vmov.i32 q12, #0x80000000 - cmp r1, #8 - blt LOOP_MAX_VALUE_W32 - - sub r1, #8 @ Counter for loops - -LOOP_UNROLLED_BY_8_MAX_VALUE_W32: - vld1.32 {q13, q14}, [r0]! - subs r1, #8 - vmax.s32 q11, q13 - vmax.s32 q12, q14 - bge LOOP_UNROLLED_BY_8_MAX_VALUE_W32 - - @ Find the maximum value in the Neon registers and move it to r2. - vmax.s32 q12, q11 - vpmax.s32 d24, d24, d25 - vpmax.s32 d24, d24, d24 - adds r1, #8 - vmov.s32 r2, d24[0] - beq END_MAX_VALUE_W32 - -LOOP_MAX_VALUE_W32: - ldr r3, [r0], #4 - cmp r2, r3 - movlt r2, r3 - subs r1, #1 - bne LOOP_MAX_VALUE_W32 - -END_MAX_VALUE_W32: - mov r0, r2 - bx lr - -@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length); -DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon - movw r2, #0x7FFF @ Initialize the return value. - cmp r0, #0 - beq END_MIN_VALUE_W16 - cmp r1, #0 - ble END_MIN_VALUE_W16 - - vdup.16 q12, r2 - cmp r1, #8 - blt LOOP_MIN_VALUE_W16 - - sub r1, #8 @ Counter for loops - -LOOP_UNROLLED_BY_8_MIN_VALUE_W16: - vld1.16 {q13}, [r0]! - subs r1, #8 - vmin.s16 q12, q13 - bge LOOP_UNROLLED_BY_8_MIN_VALUE_W16 - - @ Find the maximum value in the Neon registers and move it to r2. - vmin.s16 d24, d25 - vpmin.s16 d24, d24, d24 - vpmin.s16 d24, d24, d24 - adds r1, #8 - vmov.s16 r2, d24[0] - sxth r2, r2 - beq END_MIN_VALUE_W16 - -LOOP_MIN_VALUE_W16: - ldrsh r3, [r0], #2 - cmp r2, r3 - movge r2, r3 - subs r1, #1 - bne LOOP_MIN_VALUE_W16 - -END_MIN_VALUE_W16: - mov r0, r2 - bx lr - -@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length); -DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon - mov r2, #0x7FFFFFFF @ Initialize the return value. - cmp r0, #0 - beq END_MIN_VALUE_W32 - cmp r1, #0 - ble END_MIN_VALUE_W32 - - vdup.32 q11, r2 - vdup.32 q12, r2 - cmp r1, #8 - blt LOOP_MIN_VALUE_W32 - - sub r1, #8 @ Counter for loops - -LOOP_UNROLLED_BY_8_MIN_VALUE_W32: - vld1.32 {q13, q14}, [r0]! - subs r1, #8 - vmin.s32 q11, q13 - vmin.s32 q12, q14 - bge LOOP_UNROLLED_BY_8_MIN_VALUE_W32 - - @ Find the maximum value in the Neon registers and move it to r2. - vmin.s32 q12, q11 - vpmin.s32 d24, d24, d25 - vpmin.s32 d24, d24, d24 - adds r1, #8 - vmov.s32 r2, d24[0] - beq END_MIN_VALUE_W32 - -LOOP_MIN_VALUE_W32: - ldr r3, [r0], #4 - cmp r2, r3 - movge r2, r3 - subs r1, #1 - bne LOOP_MIN_VALUE_W32 - -END_MIN_VALUE_W32: - mov r0, r2 - bx lr diff --git a/webrtc/common_audio/signal_processing/spl_init.c b/webrtc/common_audio/signal_processing/spl_init.c index c9a1673b0c..0a493796cb 100644 --- a/webrtc/common_audio/signal_processing/spl_init.c +++ b/webrtc/common_audio/signal_processing/spl_init.c @@ -29,7 +29,7 @@ DownsampleFast WebRtcSpl_DownsampleFast; ScaleAndAddVectorsWithRound WebRtcSpl_ScaleAndAddVectorsWithRound; #if (defined(WEBRTC_DETECT_ARM_NEON) || !defined(WEBRTC_ARCH_ARM_NEON)) && \ - !defined(MIPS32_LE) + !defined(MIPS32_LE) && !defined(WEBRTC_ARCH_ARM64_NEON) /* Initialize function pointers to the generic C version. */ static void InitPointersToC() { WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16C; @@ -45,7 +45,8 @@ static void InitPointersToC() { } #endif -#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON) +#if defined(WEBRTC_DETECT_ARM_NEON) || defined(WEBRTC_ARCH_ARM_NEON) || \ + (defined WEBRTC_ARCH_ARM64_NEON) /* Initialize function pointers to the Neon version. */ static void InitPointersToNeon() { WebRtcSpl_MaxAbsValueW16 = WebRtcSpl_MaxAbsValueW16Neon; @@ -92,7 +93,7 @@ static void InitFunctionPointers(void) { } else { InitPointersToC(); } -#elif defined(WEBRTC_ARCH_ARM_NEON) +#elif defined(WEBRTC_ARCH_ARM_NEON) || defined(WEBRTC_ARCH_ARM64_NEON) InitPointersToNeon(); #elif defined(MIPS32_LE) InitPointersToMIPS();