optimized sqrt in general and for android
Review URL: http://webrtc-codereview.appspot.com/42001
git-svn-id: http://webrtc.googlecode.com/svn/trunk@95 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/common_audio/signal_processing_library/main/source/Android.mk b/common_audio/signal_processing_library/main/source/Android.mk
index 8b08676..b46cf11 100644
--- a/common_audio/signal_processing_library/main/source/Android.mk
+++ b/common_audio/signal_processing_library/main/source/Android.mk
@@ -53,7 +53,6 @@
resample_fractional.c \
sin_table.c \
sin_table_1024.c \
- spl_sqrt.c \
spl_version.c \
splitting_filter.c \
sqrt_of_one_minus_x_squared.c \
@@ -61,6 +60,14 @@
sub_sat_w32.c \
vector_scaling_operations.c
+ifeq ($(TARGET_ARCH), arm)
+LOCAL_SRC_FILES += \
+ spl_sqrt.s
+else
+LOCAL_SRC_FILES += \
+ spl_sqrt.c
+endif
+
# Flags passed to both C and C++ files.
MY_CFLAGS :=
MY_CFLAGS_C :=
diff --git a/common_audio/signal_processing_library/main/source/spl_sqrt.c b/common_audio/signal_processing_library/main/source/spl_sqrt.c
index cfe2cd3..dd34fd7 100644
--- a/common_audio/signal_processing_library/main/source/spl_sqrt.c
+++ b/common_audio/signal_processing_library/main/source/spl_sqrt.c
@@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
/*
* This file contains the function WebRtcSpl_Sqrt().
* The description header can be found in signal_processing_library.h
@@ -17,168 +16,23 @@
#include "signal_processing_library.h"
-WebRtc_Word32 WebRtcSpl_SqrtLocal(WebRtc_Word32 in);
+#define iter1(N) \
+ try1 = root + (1 << (N)); \
+ if (value >= try1 << (N)) \
+ { \
+ value -= try1 << (N); \
+ root |= 2 << (N); \
+ }
-WebRtc_Word32 WebRtcSpl_SqrtLocal(WebRtc_Word32 in)
-{
+// (out) Square root of input parameter
+WebRtc_Word32 WebRtcSpl_Sqrt(WebRtc_Word32 value) {
+ // new routine for performance, 4 cycles/bit in ARM
+ // output precision is 16 bits
- WebRtc_Word16 x_half, t16;
- WebRtc_Word32 A, B, x2;
-
- /* The following block performs:
- y=in/2
- x=y-2^30
- x_half=x/2^31
- t = 1 + (x_half) - 0.5*((x_half)^2) + 0.5*((x_half)^3) - 0.625*((x_half)^4)
- + 0.875*((x_half)^5)
- */
-
- B = in;
-
- B = WEBRTC_SPL_RSHIFT_W32(B, 1); // B = in/2
- B = B - ((WebRtc_Word32)0x40000000); // B = in/2 - 1/2
- x_half = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(B, 16);// x_half = x/2 = (in-1)/2
- B = B + ((WebRtc_Word32)0x40000000); // B = 1 + x/2
- B = B + ((WebRtc_Word32)0x40000000); // Add 0.5 twice (since 1.0 does not exist in Q31)
-
- x2 = ((WebRtc_Word32)x_half) * ((WebRtc_Word32)x_half) * 2; // A = (x/2)^2
- A = -x2; // A = -(x/2)^2
- B = B + (A >> 1); // B = 1 + x/2 - 0.5*(x/2)^2
-
- A = WEBRTC_SPL_RSHIFT_W32(A, 16);
- A = A * A * 2; // A = (x/2)^4
- t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16);
- B = B + WEBRTC_SPL_MUL_16_16(-20480, t16) * 2; // B = B - 0.625*A
- // After this, B = 1 + x/2 - 0.5*(x/2)^2 - 0.625*(x/2)^4
-
- t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16);
- A = WEBRTC_SPL_MUL_16_16(x_half, t16) * 2; // A = (x/2)^5
- t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16);
- B = B + WEBRTC_SPL_MUL_16_16(28672, t16) * 2; // B = B + 0.875*A
- // After this, B = 1 + x/2 - 0.5*(x/2)^2 - 0.625*(x/2)^4 + 0.875*(x/2)^5
-
- t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(x2, 16);
- A = WEBRTC_SPL_MUL_16_16(x_half, t16) * 2; // A = x/2^3
-
- B = B + (A >> 1); // B = B + 0.5*A
- // After this, B = 1 + x/2 - 0.5*(x/2)^2 + 0.5*(x/2)^3 - 0.625*(x/2)^4 + 0.875*(x/2)^5
-
- B = B + ((WebRtc_Word32)32768); // Round off bit
-
- return B;
-}
-
-WebRtc_Word32 WebRtcSpl_Sqrt(WebRtc_Word32 value)
-{
- /*
- Algorithm:
-
- Six term Taylor Series is used here to compute the square root of a number
- y^0.5 = (1+x)^0.5 where x = y-1
- = 1+(x/2)-0.5*((x/2)^2+0.5*((x/2)^3-0.625*((x/2)^4+0.875*((x/2)^5)
- 0.5 <= x < 1
-
- Example of how the algorithm works, with ut=sqrt(in), and
- with in=73632 and ut=271 (even shift value case):
-
- in=73632
- y= in/131072
- x=y-1
- t = 1 + (x/2) - 0.5*((x/2)^2) + 0.5*((x/2)^3) - 0.625*((x/2)^4) + 0.875*((x/2)^5)
- ut=t*(1/sqrt(2))*512
-
- or:
-
- in=73632
- in2=73632*2^14
- y= in2/2^31
- x=y-1
- t = 1 + (x/2) - 0.5*((x/2)^2) + 0.5*((x/2)^3) - 0.625*((x/2)^4) + 0.875*((x/2)^5)
- ut=t*(1/sqrt(2))
- ut2=ut*2^9
-
- which gives:
-
- in = 73632
- in2 = 1206386688
- y = 0.56176757812500
- x = -0.43823242187500
- t = 0.74973506527313
- ut = 0.53014274874797
- ut2 = 2.714330873589594e+002
-
- or:
-
- in=73632
- in2=73632*2^14
- y=in2/2
- x=y-2^30
- x_half=x/2^31
- t = 1 + (x_half) - 0.5*((x_half)^2) + 0.5*((x_half)^3) - 0.625*((x_half)^4)
- + 0.875*((x_half)^5)
- ut=t*(1/sqrt(2))
- ut2=ut*2^9
-
- which gives:
-
- in = 73632
- in2 = 1206386688
- y = 603193344
- x = -470548480
- x_half = -0.21911621093750
- t = 0.74973506527313
- ut = 0.53014274874797
- ut2 = 2.714330873589594e+002
-
- */
-
- WebRtc_Word16 x_norm, nshift, t16, sh;
- WebRtc_Word32 A;
-
- WebRtc_Word16 k_sqrt_2 = 23170; // 1/sqrt2 (==5a82)
-
- A = value;
-
- if (A == 0)
- return (WebRtc_Word32)0; // sqrt(0) = 0
-
- sh = WebRtcSpl_NormW32(A); // # shifts to normalize A
- A = WEBRTC_SPL_LSHIFT_W32(A, sh); // Normalize A
- if (A < (WEBRTC_SPL_WORD32_MAX - 32767))
- {
- A = A + ((WebRtc_Word32)32768); // Round off bit
- } else
- {
- A = WEBRTC_SPL_WORD32_MAX;
- }
-
- x_norm = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16); // x_norm = AH
-
- nshift = WEBRTC_SPL_RSHIFT_W16(sh, 1); // nshift = sh>>1
- nshift = -nshift; // Negate the power for later de-normalization
-
- A = (WebRtc_Word32)WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)x_norm, 16);
- A = WEBRTC_SPL_ABS_W32(A); // A = abs(x_norm<<16)
- A = WebRtcSpl_SqrtLocal(A); // A = sqrt(A)
-
- if ((-2 * nshift) == sh)
- { // Even shift value case
-
- t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16); // t16 = AH
-
- A = WEBRTC_SPL_MUL_16_16(k_sqrt_2, t16) * 2; // A = 1/sqrt(2)*t16
- A = A + ((WebRtc_Word32)32768); // Round off
- A = A & ((WebRtc_Word32)0x7fff0000); // Round off
-
- A = WEBRTC_SPL_RSHIFT_W32(A, 15); // A = A>>16
-
- } else
- {
- A = WEBRTC_SPL_RSHIFT_W32(A, 16); // A = A>>16
- }
-
- A = A & ((WebRtc_Word32)0x0000ffff);
- A = (WebRtc_Word32)WEBRTC_SPL_SHIFT_W32(A, nshift); // De-normalize the result
-
- return A;
+ WebRtc_Word32 root = 0, try1;
+ iter1 (15); iter1 (14); iter1 (13); iter1 (12);
+ iter1 (11); iter1 (10); iter1 ( 9); iter1 ( 8);
+ iter1 ( 7); iter1 ( 6); iter1 ( 5); iter1 ( 4);
+ iter1 ( 3); iter1 ( 2); iter1 ( 1); iter1 ( 0);
+ return root >> 1;
}
diff --git a/common_audio/signal_processing_library/main/source/spl_sqrt.s b/common_audio/signal_processing_library/main/source/spl_sqrt.s
new file mode 100644
index 0000000..f546fce
--- /dev/null
+++ b/common_audio/signal_processing_library/main/source/spl_sqrt.s
@@ -0,0 +1,93 @@
+@
+@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+
+@ sqrt() routine. 3 cycles/bit, total 51 cycles.
+@ IN : r0 32 bit unsigned integer
+@ OUT: r0 = INT (SQRT (r0)), precision is 16 bits
+@ TMP: r1, r2
+
+.global WebRtcSpl_Sqrt
+
+.align 2
+.section .text.WebRtcSpl_Sqrt:
+WebRtcSpl_Sqrt:
+.fnstart
+
+ MOV r1, #3 << 30
+ MOV r2, #1 << 30
+
+ @ unroll for i = 0 .. 15
+
+ CMP r0, r2, ROR #2 * 0
+ SUBHS r0, r0, r2, ROR #2 * 0
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 1
+ SUBHS r0, r0, r2, ROR #2 * 1
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 2
+ SUBHS r0, r0, r2, ROR #2 * 2
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 3
+ SUBHS r0, r0, r2, ROR #2 * 3
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 4
+ SUBHS r0, r0, r2, ROR #2 * 4
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 5
+ SUBHS r0, r0, r2, ROR #2 * 5
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 6
+ SUBHS r0, r0, r2, ROR #2 * 6
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 7
+ SUBHS r0, r0, r2, ROR #2 * 7
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 8
+ SUBHS r0, r0, r2, ROR #2 * 8
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 9
+ SUBHS r0, r0, r2, ROR #2 * 9
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 10
+ SUBHS r0, r0, r2, ROR #2 * 10
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 11
+ SUBHS r0, r0, r2, ROR #2 * 11
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 12
+ SUBHS r0, r0, r2, ROR #2 * 12
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 13
+ SUBHS r0, r0, r2, ROR #2 * 13
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 14
+ SUBHS r0, r0, r2, ROR #2 * 14
+ ADC r2, r1, r2, LSL #1
+
+ CMP r0, r2, ROR #2 * 15
+ SUBHS r0, r0, r2, ROR #2 * 15
+ ADC r2, r1, r2, LSL #1
+
+ BIC r0, r2, #3 << 30 @ for rounding add: CMP r0, r2 ADC r2, #1
+
+.fnend