diff --git a/src/common_audio/signal_processing/Android.mk b/src/common_audio/signal_processing/Android.mk index b7151cf1ec..3a2ddc71f5 100644 --- a/src/common_audio/signal_processing/Android.mk +++ b/src/common_audio/signal_processing/Android.mk @@ -24,7 +24,6 @@ LOCAL_SRC_FILES := \ copy_set_operations.c \ division_operations.c \ dot_product_with_scale.c \ - downsample_fast.c \ energy.c \ filter_ar.c \ filter_ma_fast_q12.c \ @@ -58,12 +57,14 @@ LOCAL_C_INCLUDES := \ ifeq ($(ARCH_ARM_HAVE_NEON),true) LOCAL_SRC_FILES += \ min_max_operations_neon.c \ - cross_correlation_neon.s + cross_correlation_neon.s \ + downsample_fast_neon.s LOCAL_CFLAGS += \ $(MY_ARM_CFLAGS_NEON) else LOCAL_SRC_FILES += \ - cross_correlation.c + cross_correlation.c \ + downsample_fast.c endif ifeq ($(ARCH_ARM_HAVE_ARMV7A),true) diff --git a/src/common_audio/signal_processing/downsample_fast.c b/src/common_audio/signal_processing/downsample_fast.c index cce463c5d3..526cdca844 100644 --- a/src/common_audio/signal_processing/downsample_fast.c +++ b/src/common_audio/signal_processing/downsample_fast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,52 +8,40 @@ * be found in the AUTHORS file in the root of the source tree. */ - -/* - * This file contains the function WebRtcSpl_DownsampleFast(). - * The description header can be found in signal_processing_library.h - * - */ - #include "signal_processing_library.h" -int WebRtcSpl_DownsampleFast(WebRtc_Word16 *in_ptr, WebRtc_Word16 in_length, - WebRtc_Word16 *out_ptr, WebRtc_Word16 out_length, - WebRtc_Word16 *B, WebRtc_Word16 B_length, WebRtc_Word16 factor, - WebRtc_Word16 delay) -{ - WebRtc_Word32 o; - int i, j; +// TODO(Bjornv): Change the function parameter order to WebRTC code style. 
+int WebRtcSpl_DownsampleFast(const int16_t* data_in, + int data_in_length, + int16_t* data_out, + int data_out_length, + const int16_t* __restrict coefficients, + int coefficients_length, + int factor, + int delay) { + int i = 0; + int j = 0; + int32_t out_s32 = 0; + int endpos = delay + factor * (data_out_length - 1) + 1; - WebRtc_Word16 *downsampled_ptr = out_ptr; - WebRtc_Word16 *b_ptr; - WebRtc_Word16 *x_ptr; - WebRtc_Word16 endpos = delay - + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(factor, (out_length - 1)) + 1; + // Return error if any of the running conditions is not met. + if (data_out_length <= 0 || coefficients_length <= 0 + || data_in_length < endpos) { + return -1; + } - if (in_length < endpos) - { - return -1; + for (i = delay; i < endpos; i += factor) { + out_s32 = 2048; // Round value, 0.5 in Q12. + + for (j = 0; j < coefficients_length; j++) { + out_s32 += coefficients[j] * data_in[i - j]; // Q12. } - for (i = delay; i < endpos; i += factor) - { - b_ptr = &B[0]; - x_ptr = &in_ptr[i]; + out_s32 >>= 12; // Q0. - o = (WebRtc_Word32)2048; // Round val + // Saturate and store the output. + *data_out++ = WebRtcSpl_SatW32ToW16(out_s32); + } - for (j = 0; j < B_length; j++) - { - o += WEBRTC_SPL_MUL_16_16(*b_ptr++, *x_ptr--); - } - - o = WEBRTC_SPL_RSHIFT_W32(o, 12); - - // If output is higher than 32768, saturate it. Same with negative side - - *downsampled_ptr++ = WebRtcSpl_SatW32ToW16(o); - } - - return 0; + return 0; } diff --git a/src/common_audio/signal_processing/downsample_fast_neon.s b/src/common_audio/signal_processing/downsample_fast_neon.s new file mode 100644 index 0000000000..906b0a10be --- /dev/null +++ b/src/common_audio/signal_processing/downsample_fast_neon.s @@ -0,0 +1,222 @@ +@ +@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. 
An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ + +@ This file contains the function WebRtcSpl_DownsampleFast(), optimized for +@ ARM Neon platform. The description header can be found in +@ signal_processing_library.h +@ +@ The reference C code is in file downsample_fast.c. Bit-exact. + +.arch armv7-a +.fpu neon + +.align 2 +.global WebRtcSpl_DownsampleFast + +WebRtcSpl_DownsampleFast: + +.fnstart + +.save {r4-r11} + push {r4-r11} + + cmp r3, #0 @ data_out_length <= 0? + movle r0, #-1 + ble END + + ldrsh r12, [sp, #44] @ r12: delay (low 16 bits, sign-extended) + ldr r5, [sp, #40] @ r5: factor + add r4, r12, #1 @ r4: delay + 1 + sub r3, r3, #1 @ r3: data_out_length - 1 + smulbb r3, r5, r3 + ldr r8, [sp, #32] @ &coefficients[0] + mov r9, r12 @ Iteration counter for outer loops. + add r3, r4 @ delay + factor * (out_length-1) +1 + + cmp r3, r1 @ data_in_length < endpos? + movgt r0, #-1 + bgt END + + @ Initializations. + sub r3, r5, asl #3 + add r11, r0, r12, asl #1 @ &data_in[delay] + ldr r0, [sp, #36] @ coefficients_length + add r3, r5 @ endpos - factor * 7 + + cmp r0, #0 @ coefficients_length <= 0 ? + movle r0, #-1 + ble END + + add r8, r0, asl #1 @ &coefficients[coefficients_length] + cmp r9, r3 + bge POST_LOOP_ENDPOS @ branch when Iteration < 8 times. + +@ +@ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others) +@ + mov r4, #-2 + + @ Direct program flow to the right channel. + + @ r10 is an offset to &data_in[] in the loop. After an iteration, we need to + @ move the pointer back to original after advancing 16 bytes by a vld2, and + @ then move 2 bytes forward to increment one more sample. + cmp r5, #2 + moveq r10, #-14 + beq LOOP_ENDPOS_FACTOR2 @ Branch when factor == 2 + + @ Similar here, for r10, we need to move the pointer back to original after + @ advancing 32 bytes, then move 2 bytes forward to increment one sample. 
+ cmp r5, #4 + moveq r10, #-30 + beq LOOP_ENDPOS_FACTOR4 @ Branch when factor == 4 + + @ For r10, we need to move the pointer back to original after advancing + @ (factor * 7 * 2) bytes, then move 2 bytes forward to increment one sample. + mov r10, r5, asl #4 + rsb r10, #2 + add r10, r5, asl #1 + lsl r5, #1 @ r5 = factor * sizeof(data_in) + +@ The general case (factor != 2 && factor != 4) +LOOP_ENDPOS_GENERAL: + @ Initializations. + vmov.i32 q2, #2048 + vmov.i32 q3, #2048 + sub r7, r8, #2 + sub r12, r0, #1 @ coefficients_length - 1 + sub r1, r11, r12, asl #1 @ &data_in[i - j] + +LOOP_COEFF_LENGTH_GENERAL: + vld1.16 {d2[], d3[]}, [r7], r4 @ coefficients[j] + vld1.16 d0[0], [r1], r5 @ data_in[i - j] + vld1.16 d0[1], [r1], r5 @ data_in[i + factor - j] + vld1.16 d0[2], [r1], r5 @ data_in[i + factor * 2 - j] + vld1.16 d0[3], [r1], r5 @ data_in[i + factor * 3 - j] + vld1.16 d1[0], [r1], r5 @ data_in[i + factor * 4 - j] + vld1.16 d1[1], [r1], r5 @ data_in[i + factor * 5 - j] + vld1.16 d1[2], [r1], r5 @ data_in[i + factor * 6 - j] + vld1.16 d1[3], [r1], r10 @ data_in[i + factor * 7 - j] + subs r12, #1 + vmlal.s16 q2, d0, d2 + vmlal.s16 q3, d1, d3 + bge LOOP_COEFF_LENGTH_GENERAL + + @ Shift, saturate, and store the result. + vqshrn.s32 d0, q2, #12 + vqshrn.s32 d1, q3, #12 + vst1.16 {d0, d1}, [r2]! + + add r11, r5, asl #3 @ r11 -> &data_in[i + factor * 8] + add r9, r5, asl #2 @ Counter i = delay + factor * 8. + cmp r9, r3 @ i < endpos - factor * 7 ? + blt LOOP_ENDPOS_GENERAL + asr r5, #1 @ Restore r5 to the value of factor. + b POST_LOOP_ENDPOS + +@ The case for factor == 2. +LOOP_ENDPOS_FACTOR2: + @ Initializations. + vmov.i32 q2, #2048 + vmov.i32 q3, #2048 + sub r7, r8, #2 + sub r12, r0, #1 @ coefficients_length - 1 + sub r1, r11, r12, asl #1 @ &data_in[i - j] + +LOOP_COEFF_LENGTH_FACTOR2: + vld1.16 {d16[], d17[]}, [r7], r4 @ coefficients[j] + vld2.16 {d0, d1}, [r1]! 
@ data_in[] + vld2.16 {d2, d3}, [r1], r10 @ data_in[] + subs r12, #1 + vmlal.s16 q2, d0, d16 + vmlal.s16 q3, d2, d17 + bge LOOP_COEFF_LENGTH_FACTOR2 + + @ Shift, saturate, and store the result. + vqshrn.s32 d0, q2, #12 + vqshrn.s32 d1, q3, #12 + vst1.16 {d0, d1}, [r2]! + + add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8] + add r9, r5, asl #3 @ Counter i = delay + factor * 8. + cmp r9, r3 @ i < endpos - factor * 7 ? + blt LOOP_ENDPOS_FACTOR2 + b POST_LOOP_ENDPOS + +@ The case for factor == 4. +LOOP_ENDPOS_FACTOR4: + @ Initializations. + vmov.i32 q2, #2048 + vmov.i32 q3, #2048 + sub r7, r8, #2 + sub r12, r0, #1 @ coefficients_length - 1 + sub r1, r11, r12, asl #1 @ &data_in[i - j] + +LOOP_COEFF_LENGTH_FACTOR4: + vld1.16 {d16[], d17[]}, [r7], r4 @ coefficients[j] + vld4.16 {d0, d1, d2, d3}, [r1]! @ data_in[] + vld4.16 {d18, d19, d20, d21}, [r1], r10 @ data_in[] + subs r12, #1 + vmlal.s16 q2, d0, d16 + vmlal.s16 q3, d18, d17 + bge LOOP_COEFF_LENGTH_FACTOR4 + + @ Shift, saturate, and store the result. + vqshrn.s32 d0, q2, #12 + vqshrn.s32 d1, q3, #12 + vst1.16 {d0, d1}, [r2]! + + add r11, r5, asl #4 @ r11 -> &data_in[i + factor * 8] + add r9, r5, asl #3 @ Counter i = delay + factor * 8. + cmp r9, r3 @ i < endpos - factor * 7 ? + blt LOOP_ENDPOS_FACTOR4 + +@ +@ Second part, do the rest iterations (if any). +@ + +POST_LOOP_ENDPOS: + add r3, r5, asl #3 + sub r3, r5 @ Restore r3 to endpos. + cmp r9, r3 + movge r0, #0 + bge END + +LOOP2_ENDPOS: + @ Initializations. + mov r7, r8 + sub r12, r0, #1 @ coefficients_length - 1 + sub r6, r11, r12, asl #1 @ &data_in[i - j] + + mov r1, #2048 + +LOOP2_COEFF_LENGTH: + ldrsh r4, [r7, #-2]! @ coefficients[j] + ldrsh r10, [r6], #2 @ data_in[i - j] + smlabb r1, r4, r10, r1 + subs r12, #1 + bge LOOP2_COEFF_LENGTH + + @ Shift, saturate, and store the result. + ssat r1, #16, r1, asr #12 + strh r1, [r2], #2 + + add r11, r5, asl #1 @ r11 -> &data_in[i + factor] + add r9, r5 @ Counter i = delay + factor. + cmp r9, r3 @ i < endpos? 
+ blt LOOP2_ENDPOS + + mov r0, #0 + +END: + pop {r4-r11} + bx lr + +.fnend diff --git a/src/common_audio/signal_processing/filter_ar_fast_q12.c b/src/common_audio/signal_processing/filter_ar_fast_q12.c index cadfaf1480..0402302732 100644 --- a/src/common_audio/signal_processing/filter_ar_fast_q12.c +++ b/src/common_audio/signal_processing/filter_ar_fast_q12.c @@ -13,9 +13,9 @@ // TODO(bjornv): Change the return type to report errors. -void WebRtcSpl_FilterARFastQ12(int16_t* data_in, +void WebRtcSpl_FilterARFastQ12(const int16_t* data_in, int16_t* data_out, - int16_t* __restrict coefficients, + const int16_t* __restrict coefficients, int coefficients_length, int data_length) { int i = 0; diff --git a/src/common_audio/signal_processing/include/signal_processing_library.h b/src/common_audio/signal_processing/include/signal_processing_library.h index 27d7060dd2..d9008c1460 100644 --- a/src/common_audio/signal_processing/include/signal_processing_library.h +++ b/src/common_audio/signal_processing/include/signal_processing_library.h @@ -386,35 +386,46 @@ void WebRtcSpl_FilterMAFastQ12(WebRtc_Word16* in_vector, WebRtc_Word16 ma_coef_length, WebRtc_Word16 vector_length); -// WebRtcSpl_FilterARFastQ12(...) -// // Performs a AR filtering on a vector in Q12 -// // Input: -// - data_in : Input samples -// - data_out : State information in positions -// data_out[-order] .. data_out[-1] -// - coefficients : Filter coefficients (in Q12) -// - coefficients_length : Number of coefficients (order+1) -// - data_length : Number of samples to be filtered -// +// - data_in : Input samples +// - data_out : State information in positions +// data_out[-order] .. 
data_out[-1] +// - coefficients : Filter coefficients (in Q12) +// - coefficients_length: Number of coefficients (order+1) +// - data_length : Number of samples to be filtered // Output: -// - data_out : Filtered samples - -void WebRtcSpl_FilterARFastQ12(int16_t* data_in, +// - data_out : Filtered samples +void WebRtcSpl_FilterARFastQ12(const int16_t* data_in, int16_t* data_out, - int16_t* __restrict coefficients, + const int16_t* __restrict coefficients, int coefficients_length, int data_length); -int WebRtcSpl_DownsampleFast(WebRtc_Word16* in_vector, - WebRtc_Word16 in_vector_length, - WebRtc_Word16* out_vector, - WebRtc_Word16 out_vector_length, - WebRtc_Word16* ma_coef, - WebRtc_Word16 ma_coef_length, - WebRtc_Word16 factor, - WebRtc_Word16 delay); +// Performs an MA down sampling filter on a vector +// Input: +// - data_in : Input samples (state in positions +// data_in[-order] .. data_in[-1]) +// - data_in_length : Number of samples in |data_in| to be filtered. +// This must be at least +// |delay| + |factor|*(|data_out_length|-1) + 1 +// - data_out_length : Number of down sampled samples desired +// - coefficients : Filter coefficients (in Q12) +// - coefficients_length: Number of coefficients (order+1) +// - factor : Decimation factor (must be positive) +// - delay : Delay of filter (compensated for in data_out) +// Output: +// - data_out : Filtered samples +// Return value : 0 if OK, -1 if |data_in| is too short +int WebRtcSpl_DownsampleFast(const int16_t* data_in, + int data_in_length, + int16_t* data_out, + int data_out_length, + const int16_t* __restrict coefficients, + int coefficients_length, + int factor, + int delay); + // End: Filter operations. // FFT operations @@ -1454,28 +1465,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band, // - out_vector : Filtered samples // -// -// WebRtcSpl_DownsampleFast(...) 
-// -// Performs a MA down sampling filter on a vector -// -// Input: -// - in_vector : Input samples (state in positions -// in_vector[-order] .. in_vector[-1]) -// - in_vector_length : Number of samples in |in_vector| to be filtered. -// This must be at least -// |delay| + |factor|*(|out_vector_length|-1) + 1) -// - out_vector_length : Number of down sampled samples desired -// - ma_coef : Filter coefficients (in Q12) -// - ma_coef_length : Number of B coefficients (order+1) -// - factor : Decimation factor -// - delay : Delay of filter (compensated for in out_vector) -// -// Output: -// - out_vector : Filtered samples -// -// Return value : 0 if OK, -1 if |in_vector| is too short -// // // WebRtcSpl_DotProductWithScale(...)