From b0abbd353dd5cd0cfb2458728a7c6a87931d887e Mon Sep 17 00:00:00 2001 From: "kma@webrtc.org" Date: Fri, 6 Jan 2012 19:50:20 +0000 Subject: [PATCH] Optimized spl function WebRtcSpl_CrossCorrelation for ARM Neon platforms. When used in Neteq, Neteq performance improved from 13 to 33% with different test configurations. Output is not bit-exact with generic C code in file cross_correlation.c, due to reduction of shift operations from using Neon registers, although in theory now the result is more accurate than before. Review URL: http://webrtc-codereview.appspot.com/333013 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1350 4adac7df-926f-26a2-2b94-8c16560cd09d --- src/common_audio/signal_processing/Android.mk | 7 +- .../signal_processing/cross_correlation.c | 4 + .../cross_correlation_neon.s | 168 ++++++++++++++++++ 3 files changed, 177 insertions(+), 2 deletions(-) create mode 100644 src/common_audio/signal_processing/cross_correlation_neon.s diff --git a/src/common_audio/signal_processing/Android.mk b/src/common_audio/signal_processing/Android.mk index 787e5c1400..4ef1ef0a7d 100644 --- a/src/common_audio/signal_processing/Android.mk +++ b/src/common_audio/signal_processing/Android.mk @@ -22,7 +22,6 @@ LOCAL_SRC_FILES := \ complex_fft.c \ complex_bit_reverse.c \ copy_set_operations.c \ - cross_correlation.c \ division_operations.c \ dot_product_with_scale.c \ downsample_fast.c \ @@ -60,9 +59,13 @@ LOCAL_C_INCLUDES := \ ifeq ($(ARCH_ARM_HAVE_NEON),true) LOCAL_SRC_FILES += \ - min_max_operations_neon.c + min_max_operations_neon.c \ + cross_correlation_neon.s LOCAL_CFLAGS += \ $(MY_ARM_CFLAGS_NEON) +else +LOCAL_SRC_FILES += \ + cross_correlation.c endif LOCAL_SHARED_LIBRARIES := libstlport diff --git a/src/common_audio/signal_processing/cross_correlation.c b/src/common_audio/signal_processing/cross_correlation.c index 1133d0933d..726a74967b 100644 --- a/src/common_audio/signal_processing/cross_correlation.c +++ 
b/src/common_audio/signal_processing/cross_correlation.c @@ -15,6 +15,10 @@ * */ +/* TODO(kma): Clean up the code in this file, and break it up for + * various platforms (Xscale, ARM/Neon etc.). + */ + #include "signal_processing_library.h" void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation, WebRtc_Word16* seq1, diff --git a/src/common_audio/signal_processing/cross_correlation_neon.s b/src/common_audio/signal_processing/cross_correlation_neon.s new file mode 100644 index 0000000000..e9b1c69bc9 --- /dev/null +++ b/src/common_audio/signal_processing/cross_correlation_neon.s @@ -0,0 +1,168 @@ +@ +@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ + +@ cross_correlation_neon.s +@ This file contains the function WebRtcSpl_CrossCorrelation(), +@ optimized for the ARM Neon platform. +@ +@ Reference C code at end of this file. +@ Output is bit-exact with the reference C code, but not with the generic +@ C code in file cross_correlation.c, due to reduction of shift operations +@ from using Neon registers. + +@ Register usage: +@ +@ r0: *cross_correlation (function argument) +@ r1: *seq1 (function argument) +@ r2: *seq2 (function argument) +@ r3: dim_seq (function argument); then, total iterations of LOOP_DIM_SEQ +@ r4: counter for LOOP_DIM_CROSS_CORRELATION +@ r5: seq2_ptr +@ r6: seq1_ptr +@ r7: Total iterations of LOOP_DIM_SEQ_RESIDUAL +@ r8, r9, r10, r11, r12: scratch + +.arch armv7-a +.fpu neon + +.align 2 +.global WebRtcSpl_CrossCorrelation + +WebRtcSpl_CrossCorrelation: + +.fnstart + +.save {r4-r11} + push {r4-r11} + + @ Put the shift value (-right_shifts) into a Neon register. 
+ ldrsh r10, [sp, #36] + rsb r10, r10, #0 + mov r8, r10, asr #31 + vmov.32 d16, r10, r8 + + @ Initialize loop counters. + and r7, r3, #7 @ inner_loop_len2 = dim_seq % 8; + asr r3, r3, #3 @ inner_loop_len1 = dim_seq / 8; + ldrsh r4, [sp, #32] @ dim_cross_correlation + +LOOP_DIM_CROSS_CORRELATION: + vmov.i32 q9, #0 + vmov.i32 q14, #0 + movs r8, r3 @ inner_loop_len1 + mov r6, r1 @ seq1_ptr + mov r5, r2 @ seq2_ptr + ble POST_LOOP_DIM_SEQ + +LOOP_DIM_SEQ: + vld1.16 {d20, d21}, [r6]! @ seq1_ptr + vld1.16 {d22, d23}, [r5]! @ seq2_ptr + subs r8, r8, #1 + vmull.s16 q12, d20, d22 + vmull.s16 q13, d21, d23 + vpadal.s32 q9, q12 + vpadal.s32 q14, q13 + bgt LOOP_DIM_SEQ + +POST_LOOP_DIM_SEQ: + movs r10, r7 @ Loop counter + mov r12, #0 + mov r8, #0 + ble POST_LOOP_DIM_SEQ_RESIDUAL + +LOOP_DIM_SEQ_RESIDUAL: + ldrh r11, [r6], #2 + ldrh r9, [r5], #2 + smulbb r11, r11, r9 + adds r8, r8, r11 + adc r12, r12, r11, asr #31 + subs r10, #1 + bgt LOOP_DIM_SEQ_RESIDUAL + +POST_LOOP_DIM_SEQ_RESIDUAL: @ Sum the results up and do the shift. + vadd.i64 d18, d19 + vadd.i64 d28, d29 + vadd.i64 d18, d28 + vmov.32 d17[0], r8 + vmov.32 d17[1], r12 + vadd.i64 d17, d18 + vshl.s64 d17, d16 + vst1.32 d17[0], [r0]! @ Store the output + + ldr r8, [sp, #40] @ step_seq2 + add r2, r8, lsl #1 @ prepare for seq2_ptr(r5) in the next loop. + + subs r4, #1 + bgt LOOP_DIM_CROSS_CORRELATION + + pop {r4-r11} + bx lr + +.fnend + + +@ TODO(kma): Place this piece of reference code into a C code file. 
+@ void WebRtcSpl_CrossCorrelation(WebRtc_Word32* cross_correlation, +@ WebRtc_Word16* seq1, +@ WebRtc_Word16* seq2, +@ WebRtc_Word16 dim_seq, +@ WebRtc_Word16 dim_cross_correlation, +@ WebRtc_Word16 right_shifts, +@ WebRtc_Word16 step_seq2) { +@ int i = 0; +@ int j = 0; +@ int inner_loop_len1 = dim_seq >> 3; +@ int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3); +@ +@ assert(dim_cross_correlation > 0); +@ assert(dim_seq > 0); +@ +@ for (i = 0; i < dim_cross_correlation; i++) { +@ int16_t *seq1_ptr = seq1; +@ int16_t *seq2_ptr = seq2 + (step_seq2 * i); +@ int64_t sum = 0; +@ +@ for (j = inner_loop_len1; j > 0; j -= 1) { +@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); +@ seq1_ptr++; +@ seq2_ptr++; +@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); +@ seq1_ptr++; +@ seq2_ptr++; +@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); +@ seq1_ptr++; +@ seq2_ptr++; +@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); +@ seq1_ptr++; +@ seq2_ptr++; +@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); +@ seq1_ptr++; +@ seq2_ptr++; +@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); +@ seq1_ptr++; +@ seq2_ptr++; +@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); +@ seq1_ptr++; +@ seq2_ptr++; +@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); +@ seq1_ptr++; +@ seq2_ptr++; +@ } +@ +@ // Calculate the rest of the samples. +@ for (j = inner_loop_len2; j > 0; j -= 1) { +@ sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr); +@ seq1_ptr++; +@ seq2_ptr++; +@ } +@ +@ *cross_correlation++ = (int32_t)(sum >> right_shifts); +@ } +@ }