arm64: clang assembler compatibility

Fixes to compile with clang (3.7)

- Fix fmul syntax:
 fmul  v0.2s,v1.2s,v2.2s[0] ->  v0.2s,v1.2s,v2.s[0]
- lowercase RSB macro call to -> rsb
- add w modifier and use U32 in and out for fastlog2()

With these changes openmax_dl compiles as part of chromium on arm64
 with both gcc 5.2 and clang 3.7. Unfortunately there were no
instructions on how to run the included tests. The generated .o
files look the same before and after (minor changes in debug sections).

BUG=5090
R=andrew@webrtc.org, rtoy@google.com

Review URL: https://codereview.webrtc.org/1420973006 .

Patch from Riku Voipio <riku.voipio@linaro.org>.
diff --git a/dl/sp/api/armSP.h b/dl/sp/api/armSP.h
index 2592679..8f804ac 100644
--- a/dl/sp/api/armSP.h
+++ b/dl/sp/api/armSP.h
@@ -93,8 +93,8 @@
 /*
  * Compute log2(x), where x must be a power of 2.
  */
-static inline OMX_U32 fastlog2(long x) {
-  OMX_U32 out;
+static inline long fastlog2(long x) {
+  long out;
   asm("clz %0,%1\n\t"
       "sub %0, %0, #63\n\t"
       "neg %0, %0\n\t"
diff --git a/dl/sp/src/arm/arm64/ComplexToRealFixup.S b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
index 9b30093..2580e7e 100644
--- a/dl/sp/src/arm/arm64/ComplexToRealFixup.S
+++ b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
@@ -94,6 +94,7 @@
 #define qT3       v20.2s
 
 #define half      v0.2s
+#define halfs     v0.s
 #define dZip      v21.2s
 #define dZip8b    v21.8b
         
@@ -106,7 +107,7 @@
 
         clz     order, subFFTNum                    // N = 2^order
 
-        RSB     order,order,#63
+        rsb     order,order,#63
         MOV     subFFTSize,subFFTNum            // subFFTSize = N/2
         //MOV     subFFTNum,N
         mov     argDst, pDst
@@ -127,7 +128,7 @@
         MOV     zero,#0
         mov    dX0rs[1],zero
         lsl     step,subFFTSize, #3               // step = N/2 * 8 bytes
-        mov    dX0i[1],zero
+        mov    dX0is[1],zero
         // twStep = 3N/8 * 8 bytes pointing to W^1
         SUB     twStep,step,subFFTSize,LSL #1
 
@@ -185,8 +186,8 @@
         fadd    dT0,dX0r,dX1r                     // a+c
         fsub    dT1,dX0i,dX1i                     // b-d
         fadd    dT3,dX0i,dX1i                     // b+d
-        fmul   dT0,dT0,half[0]
-        fmul   dT1,dT1,half[0]
+        fmul   dT0,dT0,halfs[0]
+        fmul   dT1,dT1,halfs[0]
         // VZIP    dW1r,dW1i
         // VZIP    dW0r,dW0i
         zip1    dZip, dW1r, dW1i
@@ -208,8 +209,8 @@
         fmla   qT3,dW0i,dT2
 
 
-        fmul  dX1r,qT0,half[0]
-        fmul  dX1i,qT1,half[0]
+        fmul  dX1r,qT0,halfs[0]
+        fmul  dX1i,qT1,halfs[0]
 
         fsub    dY1r,dT0,dX1i                     // F(N/2 -1)
         fadd    dY1i,dT1,dX1r
@@ -219,8 +220,8 @@
         rev64   dY1i,dY1i
 
 
-        fmul  dX0r,qT2,half[0]
-        fmul  dX0i,qT3,half[0]
+        fmul  dX0r,qT2,halfs[0]
+        fmul  dX0i,qT3,halfs[0]
 
         fsub    dY0r,dT0,dX0i                     // F(1)
         fadd    dY0i,dT1,dX0r
diff --git a/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
index 02ff1c2..f93aa97 100644
--- a/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
+++ b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
@@ -112,6 +112,7 @@
 #define dZip    v19.2s
 #define dZip8b  v19.8b
 #define half    v13.2s
+#define halfs   v13.s
 
         .macro FFTSTAGE scaled, inverse, name
 
@@ -140,8 +141,8 @@
 
         fadd    dY0,dX0,dX1                   // [b+d | a+c]
         fsub    dY1,dX0,dX1                   // [b-d | a-c]
-        fmul    dY0, dY0, half[0]
-        fmul    dY1, dY1, half[0]
+        fmul    dY0, dY0, halfs[0]
+        fmul    dY1, dY1, halfs[0]
 
         // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
         // VZIP    dY0,dY1
@@ -201,11 +202,11 @@
         fsub    dT1,dX0i,dX1i                 // b-d
         SUB     step1,step1,#8
 
-        fmul    dT2, dT2, half[0]
-        fmul    dT3, dT3, half[0]
+        fmul    dT2, dT2, halfs[0]
+        fmul    dT3, dT3, halfs[0]
 
-        fmul    dT0, dT0, half[0]
-        fmul    dT1, dT1, half[0]
+        fmul    dT0, dT0, halfs[0]
+        fmul    dT1, dT1, halfs[0]
 
         // VZIP    dW1r,dW1i
         // VZIP    dW0r,dW0i
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
index 6e732a8..3bc92d3 100644
--- a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
@@ -70,6 +70,7 @@
 // Neon Registers
 
 #define dW      v0.2s
+#define dWs     v0.s
 #define dX0     v2.2s
 #define dX1     v3.2s
 #define dX2     v4.2s
@@ -135,17 +136,17 @@
         SUBS    setCount,setCount,#2
 
         .ifeqs  "\inverse", "TRUE"
-            fmul   qT0,dX2,dW[0]
-            fmla   qT0,dX3,dW[1]                       // real part
-            fmul   qT1,dX3,dW[0]
-            fmls   qT1,dX2,dW[1]                       // imag part
+            fmul   qT0,dX2,dWs[0]
+            fmla   qT0,dX3,dWs[1]                       // real part
+            fmul   qT1,dX3,dWs[0]
+            fmls   qT1,dX2,dWs[1]                       // imag part
 
         .else
 
-            fmul   qT0,dX2,dW[0]
-            fmls   qT0,dX3,dW[1]                       // real part
-            fmul   qT1,dX3,dW[0]
-            fmla   qT1,dX2,dW[1]                       // imag part
+            fmul   qT0,dX2,dWs[0]
+            fmls   qT0,dX3,dWs[1]                       // real part
+            fmul   qT1,dX3,dWs[0]
+            fmla   qT1,dX2,dWs[1]                       // imag part
 
         .endif
 
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
index 7442e0d..047597d 100644
--- a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
@@ -80,8 +80,11 @@
 // Neon Registers
 
 #define dW1     v0.2s
+#define dW1s    v0.s
 #define dW2     v1.2s
+#define dW2s    v1.s
 #define dW3     v2.2s
+#define dW3s    v2.s
 
 #define dXr0    v4.2s
 #define dXi0    v5.2s
@@ -182,49 +185,49 @@
 
 
         .ifeqs  "\inverse", "TRUE"
-            fmul   dZr1,dXr1,dW1[0]
-            fmul   dZi1,dXi1,dW1[0]
-            fmul   dZr2,dXr2,dW2[0]
-            fmul   dZi2,dXi2,dW2[0]
-            fmul   dZr3,dXr3,dW3[0]
-            fmul   dZi3,dXi3,dW3[0]
+            fmul   dZr1,dXr1,dW1s[0]
+            fmul   dZi1,dXi1,dW1s[0]
+            fmul   dZr2,dXr2,dW2s[0]
+            fmul   dZi2,dXi2,dW2s[0]
+            fmul   dZr3,dXr3,dW3s[0]
+            fmul   dZi3,dXi3,dW3s[0]
 
-            fmla   dZr1,dXi1,dW1[1]                // real part
-            fmls   dZi1,dXr1,dW1[1]                // imag part
+            fmla   dZr1,dXi1,dW1s[1]                // real part
+            fmls   dZi1,dXr1,dW1s[1]                // imag part
 
             //  data[1] for next iteration
             ld2     {dXr1,dXi1},[pSrc],pointStep
 
-            fmla   dZr2,dXi2,dW2[1]                // real part
-            fmls   dZi2,dXr2,dW2[1]                // imag part
+            fmla   dZr2,dXi2,dW2s[1]                // real part
+            fmls   dZi2,dXr2,dW2s[1]                // imag part
 
             //  data[2] for next iteration
             ld2     {dXr2,dXi2},[pSrc],pointStep
 
-            fmla   dZr3,dXi3,dW3[1]                // real part
-            fmls   dZi3,dXr3,dW3[1]                // imag part
+            fmla   dZr3,dXi3,dW3s[1]                // real part
+            fmls   dZi3,dXr3,dW3s[1]                // imag part
         .else
-            fmul   dZr1,dXr1,dW1[0]
-            fmul   dZi1,dXi1,dW1[0]
-            fmul   dZr2,dXr2,dW2[0]
-            fmul   dZi2,dXi2,dW2[0]
-            fmul   dZr3,dXr3,dW3[0]
-            fmul   dZi3,dXi3,dW3[0]
+            fmul   dZr1,dXr1,dW1s[0]
+            fmul   dZi1,dXi1,dW1s[0]
+            fmul   dZr2,dXr2,dW2s[0]
+            fmul   dZi2,dXi2,dW2s[0]
+            fmul   dZr3,dXr3,dW3s[0]
+            fmul   dZi3,dXi3,dW3s[0]
 
-            fmls   dZr1,dXi1,dW1[1]                // real part
-            fmla   dZi1,dXr1,dW1[1]                // imag part
+            fmls   dZr1,dXi1,dW1s[1]                // real part
+            fmla   dZi1,dXr1,dW1s[1]                // imag part
 
             //  data[1] for next iteration
             ld2     {dXr1,dXi1},[pSrc],pointStep
 
-            fmls   dZr2,dXi2,dW2[1]                // real part
-            fmla   dZi2,dXr2,dW2[1]                // imag part
+            fmls   dZr2,dXi2,dW2s[1]                // real part
+            fmla   dZi2,dXr2,dW2s[1]                // imag part
 
             //  data[2] for next iteration
             ld2     {dXr2,dXi2},[pSrc],pointStep
 
-            fmls   dZr3,dXi3,dW3[1]                // real part
-            fmla   dZi3,dXr3,dW3[1]                // imag part
+            fmls   dZr3,dXi3,dW3s[1]                // real part
+            fmla   dZi3,dXr3,dW3s[1]                // imag part
         .endif
 
         //  data[3] & update pSrc to data[0]
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
index 03969be..5fe4925 100644
--- a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
@@ -352,10 +352,10 @@
         .ifeqs  "\inverse", "TRUE"
 
             // calculate a*v5
-            fmul    dT1,dVr5,dT0[0]              // use dVi0 for dT1
+            fmul    dT1,dVr5,dT0s[0]              // use dVi0 for dT1
 
             ld2     {dXr4,dXi4},[pSrc],pointStep //  data[4]
-            fmul    dVi5,dVi5,dT0[0]
+            fmul    dVi5,dVi5,dT0s[0]
 
             ld2     {dXr5,dXi5},[pSrc],pointStep //  data[5]
             fsub    dVr5,dT1,dVi5                // a * V5
@@ -364,8 +364,8 @@
             ld2     {dXr6,dXi6},[pSrc],pointStep //  data[6]
 
             // calculate  b*v7
-            fmul    dT1,dVr7,dT0[0]
-            fmul    dVi7,dVi7,dT0[0]
+            fmul    dT1,dVr7,dT0s[0]
+            fmul    dVi7,dVi7,dT0s[0]
 
             // fadd    qY1,qV1,qV5
             // fsub    qY5,qV1,qV5
@@ -399,9 +399,9 @@
         .else
 
             // calculate  b*v7
-            fmul    dT1,dVr7,dT0[0]
+            fmul    dT1,dVr7,dT0s[0]
             ld2     {dXr4,dXi4},[pSrc],pointStep //  data[4]
-            fmul    dVi7,dVi7,dT0[0]
+            fmul    dVi7,dVi7,dT0s[0]
 
             ld2     {dXr5,dXi5},[pSrc],pointStep //  data[5]
             fadd    dVr7,dT1,dVi7                     // b * V7
@@ -410,8 +410,8 @@
             ld2     {dXr6,dXi6},[pSrc],pointStep //  data[6]
 
             // calculate a*v5
-            fmul    dT1,dVr5,dT0[0]              // use dVi0 for dT1
-            fmul    dVi5,dVi5,dT0[0]
+            fmul    dT1,dVr5,dT0s[0]              // use dVi0 for dT1
+            fmul    dVi5,dVi5,dT0s[0]
 
             fadd    dYr7,dVr3,dVr7
             fadd    dYi7,dVi3,dVi7