arm64: clang assembler compatibility
Fixes to compile with clang (3.7)
- Fix fmul syntax:
fmul v0.2s,v1.2s,v2.2s[0] -> v0.2s,v1.2s,v2.s[0]
- lowercase RSB macro call to -> rsb
- add w modifier and use U32 in and out for fastlog2()
With these changes openmax_dl compiles as part of chromium on arm64
with both gcc 5.2 and clang 3.7. Unfortunately there were no
instructions on how to run the included tests. The generated .o
files look the same before and after (minor changes in debug sections)
BUG=5090
R=andrew@webrtc.org, rtoy@google.com
Review URL: https://codereview.webrtc.org/1420973006 .
Patch from Riku Voipio <riku.voipio@linaro.org>.
diff --git a/dl/sp/api/armSP.h b/dl/sp/api/armSP.h
index 2592679..8f804ac 100644
--- a/dl/sp/api/armSP.h
+++ b/dl/sp/api/armSP.h
@@ -93,8 +93,8 @@
/*
* Compute log2(x), where x must be a power of 2.
*/
-static inline OMX_U32 fastlog2(long x) {
- OMX_U32 out;
+static inline long fastlog2(long x) {
+ long out;
asm("clz %0,%1\n\t"
"sub %0, %0, #63\n\t"
"neg %0, %0\n\t"
diff --git a/dl/sp/src/arm/arm64/ComplexToRealFixup.S b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
index 9b30093..2580e7e 100644
--- a/dl/sp/src/arm/arm64/ComplexToRealFixup.S
+++ b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
@@ -94,6 +94,7 @@
#define qT3 v20.2s
#define half v0.2s
+#define halfs v0.s
#define dZip v21.2s
#define dZip8b v21.8b
@@ -106,7 +107,7 @@
clz order, subFFTNum // N = 2^order
- RSB order,order,#63
+ rsb order,order,#63
MOV subFFTSize,subFFTNum // subFFTSize = N/2
//MOV subFFTNum,N
mov argDst, pDst
@@ -127,7 +128,7 @@
MOV zero,#0
mov dX0rs[1],zero
lsl step,subFFTSize, #3 // step = N/2 * 8 bytes
- mov dX0i[1],zero
+ mov dX0is[1],zero
// twStep = 3N/8 * 8 bytes pointing to W^1
SUB twStep,step,subFFTSize,LSL #1
@@ -185,8 +186,8 @@
fadd dT0,dX0r,dX1r // a+c
fsub dT1,dX0i,dX1i // b-d
fadd dT3,dX0i,dX1i // b+d
- fmul dT0,dT0,half[0]
- fmul dT1,dT1,half[0]
+ fmul dT0,dT0,halfs[0]
+ fmul dT1,dT1,halfs[0]
// VZIP dW1r,dW1i
// VZIP dW0r,dW0i
zip1 dZip, dW1r, dW1i
@@ -208,8 +209,8 @@
fmla qT3,dW0i,dT2
- fmul dX1r,qT0,half[0]
- fmul dX1i,qT1,half[0]
+ fmul dX1r,qT0,halfs[0]
+ fmul dX1i,qT1,halfs[0]
fsub dY1r,dT0,dX1i // F(N/2 -1)
fadd dY1i,dT1,dX1r
@@ -219,8 +220,8 @@
rev64 dY1i,dY1i
- fmul dX0r,qT2,half[0]
- fmul dX0i,qT3,half[0]
+ fmul dX0r,qT2,halfs[0]
+ fmul dX0i,qT3,halfs[0]
fsub dY0r,dT0,dX0i // F(1)
fadd dY0i,dT1,dX0r
diff --git a/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
index 02ff1c2..f93aa97 100644
--- a/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
+++ b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
@@ -112,6 +112,7 @@
#define dZip v19.2s
#define dZip8b v19.8b
#define half v13.2s
+#define halfs v13.s
.macro FFTSTAGE scaled, inverse, name
@@ -140,8 +141,8 @@
fadd dY0,dX0,dX1 // [b+d | a+c]
fsub dY1,dX0,dX1 // [b-d | a-c]
- fmul dY0, dY0, half[0]
- fmul dY1, dY1, half[0]
+ fmul dY0, dY0, halfs[0]
+ fmul dY1, dY1, halfs[0]
// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
// VZIP dY0,dY1
@@ -201,11 +202,11 @@
fsub dT1,dX0i,dX1i // b-d
SUB step1,step1,#8
- fmul dT2, dT2, half[0]
- fmul dT3, dT3, half[0]
+ fmul dT2, dT2, halfs[0]
+ fmul dT3, dT3, halfs[0]
- fmul dT0, dT0, half[0]
- fmul dT1, dT1, half[0]
+ fmul dT0, dT0, halfs[0]
+ fmul dT1, dT1, halfs[0]
// VZIP dW1r,dW1i
// VZIP dW0r,dW0i
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
index 6e732a8..3bc92d3 100644
--- a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
@@ -70,6 +70,7 @@
// Neon Registers
#define dW v0.2s
+#define dWs v0.s
#define dX0 v2.2s
#define dX1 v3.2s
#define dX2 v4.2s
@@ -135,17 +136,17 @@
SUBS setCount,setCount,#2
.ifeqs "\inverse", "TRUE"
- fmul qT0,dX2,dW[0]
- fmla qT0,dX3,dW[1] // real part
- fmul qT1,dX3,dW[0]
- fmls qT1,dX2,dW[1] // imag part
+ fmul qT0,dX2,dWs[0]
+ fmla qT0,dX3,dWs[1] // real part
+ fmul qT1,dX3,dWs[0]
+ fmls qT1,dX2,dWs[1] // imag part
.else
- fmul qT0,dX2,dW[0]
- fmls qT0,dX3,dW[1] // real part
- fmul qT1,dX3,dW[0]
- fmla qT1,dX2,dW[1] // imag part
+ fmul qT0,dX2,dWs[0]
+ fmls qT0,dX3,dWs[1] // real part
+ fmul qT1,dX3,dWs[0]
+ fmla qT1,dX2,dWs[1] // imag part
.endif
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
index 7442e0d..047597d 100644
--- a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
@@ -80,8 +80,11 @@
// Neon Registers
#define dW1 v0.2s
+#define dW1s v0.s
#define dW2 v1.2s
+#define dW2s v1.s
#define dW3 v2.2s
+#define dW3s v2.s
#define dXr0 v4.2s
#define dXi0 v5.2s
@@ -182,49 +185,49 @@
.ifeqs "\inverse", "TRUE"
- fmul dZr1,dXr1,dW1[0]
- fmul dZi1,dXi1,dW1[0]
- fmul dZr2,dXr2,dW2[0]
- fmul dZi2,dXi2,dW2[0]
- fmul dZr3,dXr3,dW3[0]
- fmul dZi3,dXi3,dW3[0]
+ fmul dZr1,dXr1,dW1s[0]
+ fmul dZi1,dXi1,dW1s[0]
+ fmul dZr2,dXr2,dW2s[0]
+ fmul dZi2,dXi2,dW2s[0]
+ fmul dZr3,dXr3,dW3s[0]
+ fmul dZi3,dXi3,dW3s[0]
- fmla dZr1,dXi1,dW1[1] // real part
- fmls dZi1,dXr1,dW1[1] // imag part
+ fmla dZr1,dXi1,dW1s[1] // real part
+ fmls dZi1,dXr1,dW1s[1] // imag part
// data[1] for next iteration
ld2 {dXr1,dXi1},[pSrc],pointStep
- fmla dZr2,dXi2,dW2[1] // real part
- fmls dZi2,dXr2,dW2[1] // imag part
+ fmla dZr2,dXi2,dW2s[1] // real part
+ fmls dZi2,dXr2,dW2s[1] // imag part
// data[2] for next iteration
ld2 {dXr2,dXi2},[pSrc],pointStep
- fmla dZr3,dXi3,dW3[1] // real part
- fmls dZi3,dXr3,dW3[1] // imag part
+ fmla dZr3,dXi3,dW3s[1] // real part
+ fmls dZi3,dXr3,dW3s[1] // imag part
.else
- fmul dZr1,dXr1,dW1[0]
- fmul dZi1,dXi1,dW1[0]
- fmul dZr2,dXr2,dW2[0]
- fmul dZi2,dXi2,dW2[0]
- fmul dZr3,dXr3,dW3[0]
- fmul dZi3,dXi3,dW3[0]
+ fmul dZr1,dXr1,dW1s[0]
+ fmul dZi1,dXi1,dW1s[0]
+ fmul dZr2,dXr2,dW2s[0]
+ fmul dZi2,dXi2,dW2s[0]
+ fmul dZr3,dXr3,dW3s[0]
+ fmul dZi3,dXi3,dW3s[0]
- fmls dZr1,dXi1,dW1[1] // real part
- fmla dZi1,dXr1,dW1[1] // imag part
+ fmls dZr1,dXi1,dW1s[1] // real part
+ fmla dZi1,dXr1,dW1s[1] // imag part
// data[1] for next iteration
ld2 {dXr1,dXi1},[pSrc],pointStep
- fmls dZr2,dXi2,dW2[1] // real part
- fmla dZi2,dXr2,dW2[1] // imag part
+ fmls dZr2,dXi2,dW2s[1] // real part
+ fmla dZi2,dXr2,dW2s[1] // imag part
// data[2] for next iteration
ld2 {dXr2,dXi2},[pSrc],pointStep
- fmls dZr3,dXi3,dW3[1] // real part
- fmla dZi3,dXr3,dW3[1] // imag part
+ fmls dZr3,dXi3,dW3s[1] // real part
+ fmla dZi3,dXr3,dW3s[1] // imag part
.endif
// data[3] & update pSrc to data[0]
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
index 03969be..5fe4925 100644
--- a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
@@ -352,10 +352,10 @@
.ifeqs "\inverse", "TRUE"
// calculate a*v5
- fmul dT1,dVr5,dT0[0] // use dVi0 for dT1
+ fmul dT1,dVr5,dT0s[0] // use dVi0 for dT1
ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
- fmul dVi5,dVi5,dT0[0]
+ fmul dVi5,dVi5,dT0s[0]
ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
fsub dVr5,dT1,dVi5 // a * V5
@@ -364,8 +364,8 @@
ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
// calculate b*v7
- fmul dT1,dVr7,dT0[0]
- fmul dVi7,dVi7,dT0[0]
+ fmul dT1,dVr7,dT0s[0]
+ fmul dVi7,dVi7,dT0s[0]
// fadd qY1,qV1,qV5
// fsub qY5,qV1,qV5
@@ -399,9 +399,9 @@
.else
// calculate b*v7
- fmul dT1,dVr7,dT0[0]
+ fmul dT1,dVr7,dT0s[0]
ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
- fmul dVi7,dVi7,dT0[0]
+ fmul dVi7,dVi7,dT0s[0]
ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
fadd dVr7,dT1,dVi7 // b * V7
@@ -410,8 +410,8 @@
ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
// calculate a*v5
- fmul dT1,dVr5,dT0[0] // use dVi0 for dT1
- fmul dVi5,dVi5,dT0[0]
+ fmul dT1,dVr5,dT0s[0] // use dVi0 for dT1
+ fmul dVi5,dVi5,dT0s[0]
fadd dYr7,dVr3,dVr7
fadd dYi7,dVi3,dVi7