diff --git a/common_audio/signal_processing_library/main/source/Android.mk b/common_audio/signal_processing_library/main/source/Android.mk index 8b08676f85..b46cf11afc 100644 --- a/common_audio/signal_processing_library/main/source/Android.mk +++ b/common_audio/signal_processing_library/main/source/Android.mk @@ -53,7 +53,6 @@ LOCAL_SRC_FILES := add_sat_w16.c \ resample_fractional.c \ sin_table.c \ sin_table_1024.c \ - spl_sqrt.c \ spl_version.c \ splitting_filter.c \ sqrt_of_one_minus_x_squared.c \ @@ -61,6 +60,14 @@ LOCAL_SRC_FILES := add_sat_w16.c \ sub_sat_w32.c \ vector_scaling_operations.c +ifeq ($(TARGET_ARCH), arm) +LOCAL_SRC_FILES += \ + spl_sqrt.s +else +LOCAL_SRC_FILES += \ + spl_sqrt.c +endif + # Flags passed to both C and C++ files. MY_CFLAGS := MY_CFLAGS_C := diff --git a/common_audio/signal_processing_library/main/source/spl_sqrt.c b/common_audio/signal_processing_library/main/source/spl_sqrt.c index cfe2cd3f34..dd34fd7b29 100644 --- a/common_audio/signal_processing_library/main/source/spl_sqrt.c +++ b/common_audio/signal_processing_library/main/source/spl_sqrt.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /* * This file contains the function WebRtcSpl_Sqrt(). * The description header can be found in signal_processing_library.h @@ -17,168 +16,23 @@ #include "signal_processing_library.h" -WebRtc_Word32 WebRtcSpl_SqrtLocal(WebRtc_Word32 in); +#define iter1(N) \ + try1 = root + (1 << (N)); \ + if (value >= try1 << (N)) \ + { \ + value -= try1 << (N); \ + root |= 2 << (N); \ + } -WebRtc_Word32 WebRtcSpl_SqrtLocal(WebRtc_Word32 in) -{ +// (out) Square root of input parameter +WebRtc_Word32 WebRtcSpl_Sqrt(WebRtc_Word32 value) { + // new routine for performance, 4 cycles/bit in ARM + // output precision is 16 bits - WebRtc_Word16 x_half, t16; - WebRtc_Word32 A, B, x2; - - /* The following block performs: - y=in/2 - x=y-2^30 - x_half=x/2^31 - t = 1 + (x_half) - 0.5*((x_half)^2) + 0.5*((x_half)^3) - 0.625*((x_half)^4) - + 0.875*((x_half)^5) - */ - - B = in; - - B = WEBRTC_SPL_RSHIFT_W32(B, 1); // B = in/2 - B = B - ((WebRtc_Word32)0x40000000); // B = in/2 - 1/2 - x_half = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(B, 16);// x_half = x/2 = (in-1)/2 - B = B + ((WebRtc_Word32)0x40000000); // B = 1 + x/2 - B = B + ((WebRtc_Word32)0x40000000); // Add 0.5 twice (since 1.0 does not exist in Q31) - - x2 = ((WebRtc_Word32)x_half) * ((WebRtc_Word32)x_half) * 2; // A = (x/2)^2 - A = -x2; // A = -(x/2)^2 - B = B + (A >> 1); // B = 1 + x/2 - 0.5*(x/2)^2 - - A = WEBRTC_SPL_RSHIFT_W32(A, 16); - A = A * A * 2; // A = (x/2)^4 - t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16); - B = B + WEBRTC_SPL_MUL_16_16(-20480, t16) * 2; // B = B - 0.625*A - // After this, B = 1 + x/2 - 0.5*(x/2)^2 - 0.625*(x/2)^4 - - t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16); - A = WEBRTC_SPL_MUL_16_16(x_half, t16) * 2; // A = (x/2)^5 - t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16); - B = B + WEBRTC_SPL_MUL_16_16(28672, t16) * 2; // B = B + 0.875*A - // After this, B = 1 + x/2 - 0.5*(x/2)^2 - 0.625*(x/2)^4 + 0.875*(x/2)^5 - - t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(x2, 16); - A = WEBRTC_SPL_MUL_16_16(x_half, t16) * 2; // A = x/2^3 - - B = B + (A >> 1); // B = B + 0.5*A - // After this, B = 1 + x/2 - 0.5*(x/2)^2 + 0.5*(x/2)^3 - 0.625*(x/2)^4 + 0.875*(x/2)^5 - - B = B + ((WebRtc_Word32)32768); // Round off bit - - return B; -} - -WebRtc_Word32 WebRtcSpl_Sqrt(WebRtc_Word32 value) -{ - /* - Algorithm: - - Six term Taylor Series is used here to compute the square root of a number - y^0.5 = (1+x)^0.5 where x = y-1 - = 1+(x/2)-0.5*((x/2)^2+0.5*((x/2)^3-0.625*((x/2)^4+0.875*((x/2)^5) - 0.5 <= x < 1 - - Example of how the algorithm works, with ut=sqrt(in), and - with in=73632 and ut=271 (even shift value case): - - in=73632 - y= in/131072 - x=y-1 - t = 1 + (x/2) - 0.5*((x/2)^2) + 0.5*((x/2)^3) - 0.625*((x/2)^4) + 0.875*((x/2)^5) - ut=t*(1/sqrt(2))*512 - - or: - - in=73632 - in2=73632*2^14 - y= in2/2^31 - x=y-1 - t = 1 + (x/2) - 0.5*((x/2)^2) + 0.5*((x/2)^3) - 0.625*((x/2)^4) + 0.875*((x/2)^5) - ut=t*(1/sqrt(2)) - ut2=ut*2^9 - - which gives: - - in = 73632 - in2 = 1206386688 - y = 0.56176757812500 - x = -0.43823242187500 - t = 0.74973506527313 - ut = 0.53014274874797 - ut2 = 2.714330873589594e+002 - - or: - - in=73632 - in2=73632*2^14 - y=in2/2 - x=y-2^30 - x_half=x/2^31 - t = 1 + (x_half) - 0.5*((x_half)^2) + 0.5*((x_half)^3) - 0.625*((x_half)^4) - + 0.875*((x_half)^5) - ut=t*(1/sqrt(2)) - ut2=ut*2^9 - - which gives: - - in = 73632 - in2 = 1206386688 - y = 603193344 - x = -470548480 - x_half = -0.21911621093750 - t = 0.74973506527313 - ut = 0.53014274874797 - ut2 = 2.714330873589594e+002 - - */ - - WebRtc_Word16 x_norm, nshift, t16, sh; - WebRtc_Word32 A; - - WebRtc_Word16 k_sqrt_2 = 23170; // 1/sqrt2 (==5a82) - - A = value; - - if (A == 0) - return (WebRtc_Word32)0; // sqrt(0) = 0 - - sh = WebRtcSpl_NormW32(A); // # shifts to normalize A - A = WEBRTC_SPL_LSHIFT_W32(A, sh); // Normalize A - if (A < (WEBRTC_SPL_WORD32_MAX - 32767)) - { - A = A + ((WebRtc_Word32)32768); // Round off bit - } else - { - A = WEBRTC_SPL_WORD32_MAX; - } - - x_norm = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16); // x_norm = AH - - nshift = WEBRTC_SPL_RSHIFT_W16(sh, 1); // nshift = sh>>1 - nshift = -nshift; // Negate the power for later de-normalization - - A = (WebRtc_Word32)WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)x_norm, 16); - A = WEBRTC_SPL_ABS_W32(A); // A = abs(x_norm<<16) - A = WebRtcSpl_SqrtLocal(A); // A = sqrt(A) - - if ((-2 * nshift) == sh) - { // Even shift value case - - t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16); // t16 = AH - - A = WEBRTC_SPL_MUL_16_16(k_sqrt_2, t16) * 2; // A = 1/sqrt(2)*t16 - A = A + ((WebRtc_Word32)32768); // Round off - A = A & ((WebRtc_Word32)0x7fff0000); // Round off - - A = WEBRTC_SPL_RSHIFT_W32(A, 15); // A = A>>16 - - } else - { - A = WEBRTC_SPL_RSHIFT_W32(A, 16); // A = A>>16 - } - - A = A & ((WebRtc_Word32)0x0000ffff); - A = (WebRtc_Word32)WEBRTC_SPL_SHIFT_W32(A, nshift); // De-normalize the result - - return A; + WebRtc_Word32 root = 0, try1; + iter1 (15); iter1 (14); iter1 (13); iter1 (12); + iter1 (11); iter1 (10); iter1 ( 9); iter1 ( 8); + iter1 ( 7); iter1 ( 6); iter1 ( 5); iter1 ( 4); + iter1 ( 3); iter1 ( 2); iter1 ( 1); iter1 ( 0); + return root >> 1; } diff --git a/common_audio/signal_processing_library/main/source/spl_sqrt.s b/common_audio/signal_processing_library/main/source/spl_sqrt.s new file mode 100644 index 0000000000..f546fce4aa --- /dev/null +++ b/common_audio/signal_processing_library/main/source/spl_sqrt.s @@ -0,0 +1,93 @@ +@ +@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. + +@ sqrt() routine. 3 cycles/bit, total 51 cycles. +@ IN : r0 32 bit unsigned integer +@ OUT: r0 = INT (SQRT (r0)), precision is 16 bits +@ TMP: r1, r2 + +.global WebRtcSpl_Sqrt + +.align 2 +.section .text.WebRtcSpl_Sqrt: +WebRtcSpl_Sqrt: +.fnstart + + MOV r1, #3 << 30 + MOV r2, #1 << 30 + + @ unroll for i = 0 .. 15 + + CMP r0, r2, ROR #2 * 0 + SUBHS r0, r0, r2, ROR #2 * 0 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 1 + SUBHS r0, r0, r2, ROR #2 * 1 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 2 + SUBHS r0, r0, r2, ROR #2 * 2 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 3 + SUBHS r0, r0, r2, ROR #2 * 3 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 4 + SUBHS r0, r0, r2, ROR #2 * 4 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 5 + SUBHS r0, r0, r2, ROR #2 * 5 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 6 + SUBHS r0, r0, r2, ROR #2 * 6 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 7 + SUBHS r0, r0, r2, ROR #2 * 7 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 8 + SUBHS r0, r0, r2, ROR #2 * 8 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 9 + SUBHS r0, r0, r2, ROR #2 * 9 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 10 + SUBHS r0, r0, r2, ROR #2 * 10 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 11 + SUBHS r0, r0, r2, ROR #2 * 11 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 12 + SUBHS r0, r0, r2, ROR #2 * 12 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 13 + SUBHS r0, r0, r2, ROR #2 * 13 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 14 + SUBHS r0, r0, r2, ROR #2 * 14 + ADC r2, r1, r2, LSL #1 + + CMP r0, r2, ROR #2 * 15 + SUBHS r0, r0, r2, ROR #2 * 15 + ADC r2, r1, r2, LSL #1 + + BIC r0, r2, #3 << 30 @ for rounding add: CMP r0, r2 ADC r2, #1 + +.fnend