Add non-NEON float FFT implementation.
BUG=
R=aedla@chromium.org, andrew@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/2028004

git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@5051 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/api/arm/armCOMM_s.h b/dl/api/arm/armCOMM_s.h
index 6b0d2be..6ce1e2f 100644
--- a/dl/api/arm/armCOMM_s.h
+++ b/dl/api/arm/armCOMM_s.h
@@ -371,6 +371,17 @@
 	
 	.endm
 
+        @// M_ALLOC8 name, size: reserve |size| bytes for |name| at the next
+        @// 8-byte aligned offset and define the symbol |name|_F as that offset.
+	.macro	M_ALLOC8 name, size
+	.if	(_SBytes & 7) != 0			@// running offset not a multiple of 8?
+	.set	_SBytes, _SBytes + (8 - (_SBytes & 7))	@// pad up to the next multiple of 8
+	.endif
+	.set	\name\()_F, _SBytes			@// <name>_F = aligned offset of the new area
+	.set	_SBytes, _SBytes + \size		@// running offset now points past the area
+	
+	.endm
+
         @ Load word from stack
 	.macro M_LDR r, a0, a1, a2, a3
 	_M_DATA "ldr", 4, \r, \a0, \a1, \a2, \a3
@@ -381,6 +392,16 @@
 	_M_DATA "str", 4, \r, \a0, \a1, \a2, \a3
 	.endm
 
+        @ Load a doubleword from the stack slot named a0 into r0, r1 (ldrd via _M_DATA2; a1-a3 are forwarded)
+	.macro M_LDRD r0, r1, a0, a1, a2, a3
+	_M_DATA2 "ldrd", 8, \r0, \r1, \a0, \a1, \a2, \a3
+	.endm
+
+        @ Store the pair r0, r1 as a doubleword to the stack slot named a0 (strd via _M_DATA2; a1-a3 are forwarded)
+	.macro M_STRD r0, r1, a0, a1, a2, a3
+	_M_DATA2 "strd", 8, \r0, \r1, \a0, \a1, \a2, \a3
+	.endm
+
         @ Macro to perform a data access operation
         @ Such as LDR or STR
         @ The addressing mode is modified such that
@@ -407,3 +428,31 @@
 	.set	_Offset, _Workspace + \a0\()_F
 	\i\a1	\r, [sp, #_Offset]	
 	.endm
+
+        @ Macro to perform a two-register data access operation
+        @ such as LDRD or STRD (two-register counterpart of _M_DATA).
+        @ Only the stack-slot form is implemented: a0 is taken as the
+        @ name of a stack slot and the access is emitted relative to sp
+        @ at offset _Workspace + <a0>_F.
+        @ None of the other addressing modes listed below is handled,
+        @ and no substitute addressing mode is generated for any
+        @ instruction-set state.
+        @
+        @ On Entry:
+        @ $i = Instruction to perform (eg "ldrd")
+        @ $a = Required byte alignment (not referenced by the macro body)
+        @ $r0,$r1 = Registers to transfer (eg "r4", "r5")
+        @ $a0,$a1. Slot name and optional condition code:
+        @     label {,cc}               <- the only supported form
+        @     [base]                    {,,,cc}   (not supported)
+        @     [base, offset]{!}         {,,cc}    (not supported)
+        @     [base, offset, shift]{!}  {,cc}     (not supported)
+        @     [base], offset            {,,cc}    (not supported)
+        @     [base], offset, shift     {,cc}     (not supported)
+	@ $a2,$a3 are accepted but not referenced by the macro body.
+	@ WARNING: Most of the above are not supported, except the first case.
+	.macro _M_DATA2 i, a, r0, r1, a0, a1, a2, a3
+	.set	_Offset, _Workspace + \a0\()_F
+	\i\a1	\r0, \r1, [sp, #_Offset]	
+	.endm
+	
\ No newline at end of file
diff --git a/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S b/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
new file mode 100644
index 0000000..75d6711
--- /dev/null
+++ b/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
@@ -0,0 +1,260 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of 
+@//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S to support float
+@//  instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
+@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+@// It implements the "scaled"(by 1/2) version of the above formula.
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@//        M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@//    IF  ARM1136JS
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+
+
+@// Output registers
+#define result          r0
+
+@//Local Scratch Registers
+@// NOTE: several names below are aliases of one register (for example r2 is
+@// pFFTSpec, argDst, diffMinusOne and pOut1), so they cannot be live together.
+#define argTwiddle      r1              /* same register as pDst */
+#define argDst          r2
+#define argScale        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6              /* same register as step1 */
+#define order           r14
+#define diff            r9
+#define count           r8
+#define diffMinusOne    r2
+#define round           r3
+
+#define pOut1           r2              /* same register as pFFTSpec */
+#define size            r7
+#define step            r3
+#define step1           r6              /* same register as N */
+#define twStep          r12
+#define pTwiddleTmp     r14
+#define t0              r12             /* same register as twStep */
+
+#define x0r     s0
+#define x0i     s1
+#define x1r     s2
+#define x1i     s3
+#define w0r     s4
+#define w0i     s5
+#define y0r     s6
+#define y0i     s7
+#define w1r     s6
+#define w1i     s7
+#define y1r     s6              /* y1r,y1i share s6,s7 with w1r,w1i */
+#define y1i     s7
+#define st0     s8
+#define st1     s9
+#define st2     s10
+#define st3     s11
+#define st4     s12
+#define st5     s13
+/* half holds the constant 0.5; FFTSTAGE loads it before use */
+#define half    s15
+
+
+@// FFTSTAGE: preTwiddleRadix2 stage, scaled by 1/2 (see the Description above).
+@// Reads pSrc (r0) and N, pTwiddle, pBuf from pFFTSpec (r2); results are stored
+@// starting at pBuf + N/2*8 bytes. The scaled and inverse arguments are not referenced.
+        .MACRO FFTSTAGE scaled, inverse,name
+
+        @// Initialize half now.
+        movw    N, #0x0000              @// N is only a temporary here
+        movt    N, #0x3f00              @// N = 0x3f000000, the float bit pattern of 0.5
+        vmov.f32 half, N                @// half = 0.5
+
+        @// Read the FFT size N from the spec structure
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+
+        MOV     size,N,ASR #1           @// size = N/2; r6 (N) is reused as step1 below
+
+        MOV     step,size,LSL #3        @// step = N/2 * 8 bytes
+        ADD     pTwiddleTmp,pTwiddle,#8 @// W^2
+
+        ADD     pOut1,pOut,step         @// pOut1 = pOut+ N/2*8 bytes
+        @// twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,size,LSL #1
+        MOV     step1,size,LSL #2       @// step1 = N/4 * 8 = N/2*4 bytes
+        SUB     step1,step1,#8          @// (N/4-1)*8 bytes
+        ADD     argTwiddle,pTwiddle,twStep      @// W^1
+
+        @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        @// Note: W^(k) is stored as negated value and also need to
+        @// conjugate the values from the table
+
+        @// Z(0) : no need of twiddle multiply
+        @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+
+
+        add      pSrc, step             @// step = N/2*8 bytes
+        vldm.f32 pSrc, {x1r, x1i}       @// {x1r, x1i} = [pSrc, step]
+        sub      pSrc, step
+        vldm.f32 pSrc!, {x0r, x0i}
+
+        SUBS    size,size,#2            @// flags are tested by BLT/BEQ after the Z(0) store
+
+        vadd.f32 st0, x0r, x1r          @// a+c
+        vsub.f32 st1, x0r, x1r          @// a-c
+        vmov.f32 x0r, st0
+        vmov.f32 x1r, st1
+        vsub.f32 st0, x0i, x1i          @// b-d
+        vadd.f32 x1i, x0i, x1i          @// b+d
+        vmov.f32 x0i, st0
+
+
+        vsub.f32     x0r,x0r,x1i        @// Z(0).r
+        vadd.f32     x0i,x0i,x1r        @// Z(0).i
+
+        vmul.f32 x0r, half
+        vmul.f32 x0i, half
+        vstm.f32 pOut1!, {x0r, x0i}     @// pOut1 = pOut+ N/2*8 bytes
+
+        BLT     end\name                @// N/2 < 2: nothing further to compute
+        BEQ     lastElement\name        @// N/2 == 2: only the middle element remains
+
+        ASR     size,size,#1            @// loop count; each pass stores two outputs
+evenOddButterflyLoop\name:
+
+        SUB     step,step,#16           @// (N/2-2)*8 bytes
+
+        add      pSrc, step             @// (N/2-1)*8 bytes
+        vldm.f32 pSrc, {x1r, x1i}       @// {x1r, x1i} = [pSrc, step]
+        sub      pSrc, step
+        vldm.f32 pSrc!, {x0r, x0i}
+        add      argTwiddle, step1
+        vldm.f32 argTwiddle, {w1r, w1i} @// {w1r, w1i} = [argTwiddle, step1]
+        sub      argTwiddle, step1
+        vldm.f32 argTwiddle!, {w0r, w0i}
+
+        SUB     step1,step1,#8
+        SUBS    size,size,#1            @// flags are tested by BGT at the loop end
+
+
+        vsub.f32     st2,x0r,x1r        @// a-c
+        vadd.f32     st3,x0i,x1i        @// b+d
+        vadd.f32     st0,x0r,x1r        @// a+c
+        vsub.f32     st1,x0i,x1i        @// b-d
+
+        vmul.f32  x1r,w1r,st2           @// x1 = w1 * (st2 + j*st3)
+        vmul.f32  x1i,w1r,st3
+        vmls.f32  x1r,w1i,st3
+        vmla.f32  x1i,w1i,st2
+
+        vadd.f32     y1r,st0,x1i        @// F(N/2 -1)
+        vsub.f32     y1i,x1r,st1        @// y1r,y1i same as w1r, w1i
+
+
+        vmul.f32  x0r,w0r,st2           @// x0 = conj(w0) * (st2 + j*st3)
+        vmul.f32  x0i,w0r,st3
+        vmla.f32  x0r,w0i,st3
+        vmls.f32  x0i,w0i,st2
+
+
+        vadd.f32     st4,st0,x0i        @// F(1)
+        vsub.f32     st5,st1,x0r
+
+
+        vmul.f32 y1r, half              @// scale both outputs by 1/2
+        vmul.f32 y1i, half
+        vmul.f32 st4, half
+        vmul.f32 st5, half
+        add      pOut1, step            @// (N/2-1)*8 bytes
+        vstm.f32 pOut1, {y1r, y1i}      @// {y1r,y1i} = [pOut1, step]
+        sub      pOut1, step
+        vstm.f32 pOut1!, {st4, st5}
+
+        MOV     t0,argTwiddle           @// swap ptr for even and odd twiddles
+        MOV     argTwiddle,pTwiddleTmp
+        MOV     pTwiddleTmp,t0
+
+        BGT     evenOddButterflyLoop\name
+
+
+        @// Last element can be expanded as follows
+        @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)]
+        @// (since W^k is stored as -ve)
+        @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] + j (c-jd) [0+j2b]
+        @// (a+bc, -bd)
+        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name:
+        vldm.f32 pSrc, {x0r, x0i}
+
+        vneg.f32 x0i, x0i               @// (a, b) -> (a, -b)
+        vstm.f32 pOut1, {x0r, x0i}
+end\name:
+
+
+        .endm
+
+
+@ Structure offsets for FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+@// Entry point. The FFTSTAGE body writes r0-r7, r12, r14, s0-s13 and s15.
+@// NOTE(review): which of these M_START/M_END preserve is not visible here - confirm.
+        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp,r4
+             FFTSTAGE "FALSE","TRUE",Inv
+        M_END
+
+@//    ENDIF                                           @//ARM1136JS
+
+
+      @// Guarding implementation by the processor name
+
+
+
+    .end
diff --git a/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
new file mode 100644
index 0000000..c2feb0b
--- /dev/null
+++ b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
@@ -0,0 +1,145 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@//        M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@//    IF  ARM1136JS
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define pPingPongBuf    r5
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define pDstBuf         r3                   /* Temporarily hold pingpong buffer ptr (not referenced by FFTSTAGE in this file) */
+#define grpSize         r14
+#define outPointStep    r12                  /* same register as pointStep */
+#define setCount        r14                  /* same register as grpSize */
+#define pointStep       r12
+
+@// Real and Imaginary parts
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define y1r s4
+#define y1i s5
+#define y0r s6
+#define y0i s7
+
+@// FFTSTAGE: first-stage radix-2 butterflies, out of place, no twiddle multiply.
+@// Uses pSrc, pDst, pPingPongBuf and subFFTNum; the scaled and inverse arguments are not referenced.
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Update subFFTSize and subFFTNum right away; grpSize (r14) is
+        @// also the loop counter setCount.
+
+        mov     subFFTSize, #2
+        lsr     grpSize, subFFTNum, #1          @// grpSize = subFFTNum/2
+        mov     subFFTNum, grpSize
+
+        @// One complex float is 8 bytes.
+        @// pointStep = grpSize * 8 bytes
+        @// Note: outPointStep = pointStep for firststage (same register)
+        @// Note: setCount shares r14 with grpSize, so the loop runs grpSize times
+        MOV     pointStep,grpSize,LSL #3
+
+
+
+        @// Loop on the sets for grp zero
+
+grpZeroSetLoop\name:
+
+        add      pSrc, pSrc, pointStep
+        @// {x1r,x1i} = [pSrc, pointStep]
+        vldm.f32 pSrc, {x1r, x1i}
+        sub      pSrc, pSrc, pointStep
+        vldm.f32 pSrc!, {x0r, x0i}
+
+        SUBS    setCount,setCount,#1            @// decrement the loop counter
+
+
+
+        vsub.f32     y1r,x0r,x1r                @// y1 = x0 - x1
+        vsub.f32     y1i,x0i,x1i
+
+        vadd.f32     y0r,x0r,x1r                @// y0 = x0 + x1
+        vadd.f32     y0i,x0i,x1i
+
+        add     pDst, pDst, outPointStep
+        @// {y1r,y1i} -> [pDst, outPointStep]
+        vstm    pDst, {y1r, y1i}
+        sub     pDst, pDst, outPointStep
+        vstm    pDst!, {y0r, y0i}
+
+        BGT     grpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep             @// pSrc = pDst - pointStep (pDst moved 8 bytes per pass)
+        mov     pDst, pPingPongBuf
+
+        .endm
+
+@// Forward entry point: FFTSTAGE expanded with label suffix FWD (the body does not reference its inverse argument).
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+@// Inverse entry point: FFTSTAGE expanded with label suffix INV (the body does not reference its inverse argument).
+        M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+@//    ENDIF                                                           @//ARM1136JS
+
+
+@// Guarding implementation by the processor name
+
+
+
+    .end
diff --git a/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
new file mode 100644
index 0000000..3bd4725
--- /dev/null
+++ b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
@@ -0,0 +1,213 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a first stage Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@//        M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@//    IF  ARM1136JS
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define pPingPongBuf    r5
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize         r14
+#define outPointStep    r12                  /* same register as pointStep */
+#define setStep         r3
+#define setCount        r14                  /*@// Reuse grpSize as setCount*/
+#define pointStep       r12
+
+@// Real and Imaginary parts
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define x2r s4
+#define x2i s5
+#define x3r s6
+#define x3i s7
+#define t3r s0                 /* t3r,t3i share s0,s1 with x0r,x0i; FFTSTAGE writes them only after storing x0 */
+#define t3i s1
+#define sr  s8                 /* sr,si: scratch pair holding a doubled value */
+#define si  s9
+
+@// FFTSTAGE: first-stage radix-4 butterflies, out of place, no twiddle multiplies.
+@// inverse = "TRUE" swaps which result is stored second and fourth; scaled is not referenced.
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// (No stack arguments are defined for this stage.)
+
+
+        @// Update subFFTSize and subFFTNum right away; grpSize (r14) is
+        @// also the loop counter setCount.
+        mov     subFFTSize, #4
+        lsr     grpSize, subFFTNum, #2          @// grpSize = subFFTNum/4
+        mov     subFFTNum, grpSize
+
+
+        @// One complex float is 8 bytes.
+        @// pointStep = grpSize * 8 bytes
+        @// Note: outPointStep = pointStep for firststage (same register)
+        @// Note: setCount shares r14 with grpSize, so the loop runs grpSize times
+        MOV     pointStep,grpSize,LSL #3
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     setStep,pointStep,LSL #1
+        MOV     setStep,grpSize,LSL #4          @// setStep = 2*pointStep
+        @// setStep = 3*pointStep
+        ADD     setStep,setStep,pointStep
+        @// setStep = - 3*pointStep+8
+        RSB     setStep,setStep,#8
+
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets
+
+grpZeroSetLoop\name:
+
+        vldm.f32 pSrc, {x0r, x0i}
+        add     pSrc, pSrc, pointStep
+        vldm.f32 pSrc, {x1r, x1i}
+        add     pSrc, pSrc, pointStep
+        vldm.f32 pSrc, {x2r, x2i}
+        add     pSrc, pSrc, pointStep
+        vldm.f32 pSrc, {x3r, x3i}
+        add     pSrc, pSrc, setStep             @// net pSrc advance per pass: 8 bytes
+
+
+        @// Decrement setcount; the flags are tested by BGT at the loop end
+        SUBS    setCount,setCount,#1
+
+
+
+        @// finish first stage of 4 point FFT
+
+        vadd.f32     x0r,x0r,x2r                @// x0 = x0 + x2
+        vadd.f32     x0i,x0i,x2i
+
+        vadd.f32     sr, x2r, x2r
+        vadd.f32     si, x2i, x2i
+        vsub.f32     x2r,x0r,sr                 @// x2 = x0 - x2
+        vsub.f32     x2i,x0i,si
+
+        vadd.f32     x1r,x1r,x3r                @// x1 = x1 + x3
+        vadd.f32     x1i,x1i,x3i
+
+        vadd.f32     sr, x3r, x3r
+        vadd.f32     si, x3i, x3i
+        vsub.f32     x3r,x1r,sr                 @// x3 = x1 - x3
+        vsub.f32     x3i,x1i,si
+
+
+        @// finish second stage of 4 point FFT
+
+
+        vadd.f32     x0r,x0r,x1r                @// x0 = x0 + x1
+        vadd.f32     x0i,x0i,x1i
+
+        vadd.f32     sr, x1r, x1r
+        vadd.f32     si, x1i, x1i
+        vsub.f32     x1r,x0r,sr                 @// x1 = x0 - x1
+        vsub.f32     x1i,x0i,si
+
+        vstm.f32 pDst, {x0r, x0i}               @// x0 is stored before s0,s1 are reused as t3
+        add      pDst, pDst, outPointStep
+
+        vadd.f32     x2r,x2r,x3i                @// x2 = x2 - j*x3
+        vsub.f32     x2i,x2i,x3r
+
+        vadd.f32     sr, x3r, x3r
+        vadd.f32     si, x3i, x3i
+        vsub.f32     t3r, x2r, si               @// t3 = x2 + j*x3 (x2, x3 as before the update above)
+        vadd.f32     t3i, x2i, sr
+
+        .ifeqs  "\inverse", "TRUE"
+            vstm.f32 pDst, {t3r, t3i}
+            add      pDst, pDst, outPointStep
+            vstm.f32 pDst, {x1r, x1i}
+            add      pDst, pDst, outPointStep
+            vstm.f32 pDst, {x2r, x2i}
+            add      pDst, pDst, setStep
+        .else
+            vstm.f32 pDst, {x2r, x2i}
+            add      pDst, pDst, outPointStep
+            vstm.f32 pDst, {x1r, x1i}
+            add      pDst, pDst, outPointStep
+            vstm.f32 pDst, {t3r, t3i}
+            add      pDst, pDst, setStep
+        .endif
+
+
+        BGT     grpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep             @// pSrc = pDst - pointStep (pDst moved 8 bytes per pass)
+        mov     pDst, pPingPongBuf
+
+        .endm
+
+@// Forward entry point: FFTSTAGE expanded with inverse = "FALSE" and label suffix FWD.
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+@// Inverse entry point: FFTSTAGE expanded with inverse = "TRUE" and label suffix INV.
+        M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+@//    ENDIF                                                           @//ARM1136JS
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+    .end
diff --git a/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
new file mode 100644
index 0000000..00e48d1
--- /dev/null
+++ b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
@@ -0,0 +1,310 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@//        M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@//    IF  ARM1136JS
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpCount        r12
+#define step            r12                  /*@// Reuse grpCount*/
+#define outPointStep    r3
+#define setCount        r8
+#define diff            r9
+#define pointStep       r14
+
+#define t1              r3                 /*@// Reuse outPointStep*/
+
+@// Real and Imaginary parts used in the inner grp loop
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define x2r s4
+#define x2i s5
+#define x3r s6
+#define x3i s7
+
+@// Temporary reg to hold the twiddle multiplies
+
+#define t0r s8
+#define t0i s9
+#define t2r s10                /* t2r,t2i: separate pair (s10,s11), not shared with x2 */
+#define t2i s11
+#define sr  s12                /* sr,si: scratch pair holding a doubled value */
+#define si  s13
+
+
+@// FFTSTAGE: one radix-4 stage with twiddle multiplies, out of place.
+@// inverse = "TRUE" selects the conjugated multiplies and the output order; scaled is not referenced.
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        @// (No stack arguments are defined for this stage.)
+
+
+        @// Update subFFTSize and subFFTNum right away so that r12
+        @// (grpCount) can be reused as step
+
+        LSL     grpCount,subFFTSize,#2          @// grpCount = 4 * subFFTSize
+        lsr     subFFTNum, subFFTNum, #2        @// subFFTNum /= 4
+        mov     subFFTSize, grpCount
+
+
+        @// One complex float is 8 bytes.
+        @// pointStep = 2*subFFTNum for now; it is scaled to bytes below
+        mov     pointStep, subFFTNum, lsl #1
+
+
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// outPointStep = low word of grpCount * pointStep, computed while
+        @// pointStep is still 2*subFFTNum
+
+        @// smull writes a second register: setCount is a dummy here and is set correctly below.
+        smull   outPointStep, setCount, grpCount, pointStep
+
+        LSL     pointStep,pointStep,#2                      @// pointStep = subFFTNum * 8 bytes
+
+
+        MOV     setCount,pointStep,LSR #3       @// setCount = subFFTNum
+
+        @// Interchange grpLoop and setLoop
+
+setLoop\name:
+
+        MOV     step,#0
+        @// Set pSrc and pDst for the grpLoop
+
+        SUB      diff,outPointStep,pointStep
+
+        @// (setCount keeps its own register; nothing is saved on the stack)
+
+        ADD      pSrc,pSrc,diff,LSL #2  @// pSrc += (grpCount-1)*grpStep
+        ADD      pDst,pDst,diff         @// pDst += (grpCount-1)*setCount
+        ADD      step,step,diff         @// step += (grpCount-1)*setCount
+
+
+
+        @// Loop on the grps
+
+grpLoop\name:
+
+
+
+        @// butterfly loop
+        add         pSrc, pointStep
+        vldm.f32    pSrc, {x3r, x3i}                    @// data[1]
+        add         pTwiddle, step
+        vldm.f32    pTwiddle, {x1r, x1i}                @// coef[1]
+        add         pTwiddle, step
+        vldm.f32    pTwiddle, {x2r, x2i}                @// coef[2]
+        add         pSrc, pointStep
+        vldm.f32    pSrc, {x0r, x0i}                    @// data[2]
+
+        @// do first complex multiply
+        vmul.f32 t0r, x3r, x1r
+        vmul.f32 t0i, x3i, x1r
+
+        .ifeqs  "\inverse", "TRUE"
+            vmla.f32 t0r, x3i, x1i
+            vmls.f32 t0i, x3r, x1i
+            vmov.f32 x1r, t0r
+            vmov.f32 x1i, t0i
+        .else
+            vmls.f32 t0r, x3i, x1i
+            vmla.f32 t0i, x3r, x1i
+            vmov.f32 x1r, t0r
+            vmov.f32 x1i, t0i
+        .endif
+
+        add     pTwiddle, pTwiddle, step
+        vldm    pTwiddle, {x3r, x3i}                    @// coef[3]
+        sub     pTwiddle, pTwiddle, step
+
+        @// do second complex multiply
+        vmul.f32 t0r, x0r, x2r
+        vmul.f32 t0i, x0i, x2r
+
+        .ifeqs  "\inverse", "TRUE"
+            vmla.f32 t0r, x0i, x2i
+            vmls.f32 t0i, x0r, x2i
+            vmov.f32 x2r, t0r
+            vmov.f32 x2i, t0i
+        .else
+            vmls.f32 t0r, x0i, x2i
+            vmla.f32 t0i, x0r, x2i
+            vmov.f32 x2r, t0r
+            vmov.f32 x2i, t0i
+        .endif
+
+        add     pSrc, pointStep
+        vldm    pSrc, {x0r, x0i}                @// data[3]
+        sub     pSrc, pointStep
+
+        SUB     pTwiddle,pTwiddle,step,LSL #1   @// reset pTwiddle
+        SUBS    step,step,pointStep             @// decrement loop counter; flags feed SUBGE/BGE below
+
+        @// do third complex multiply
+        SUB     pSrc,pSrc,pointStep,LSL #1      @// reset pSrc to data[0]
+        vmul.f32 t0r, x0r, x3r
+        vmul.f32 t0i, x0i, x3r
+
+        .ifeqs  "\inverse", "TRUE"
+            vmla.f32 t0r, x0i, x3i
+            vmls.f32 t0i, x0r, x3i
+            vmov.f32 x3r, t0r
+            vmov.f32 x3i, t0i
+        .else
+            vmls.f32 t0r, x0i, x3i
+            vmla.f32 t0i, x0r, x3i
+            vmov.f32 x3r, t0r
+            vmov.f32 x3i, t0i
+        .endif
+
+        vldm    pSrc, {x0r, x0i}                @// data[0]
+
+        @// finish first stage of 4 point FFT
+        vadd.f32     x0r,x0r,x2r                @// x0 = x0 + x2 (u0)
+        vadd.f32     x0i,x0i,x2i
+
+        vadd.f32     sr, x2r, x2r
+        vadd.f32     si, x2i, x2i
+        vsub.f32     x2r,x0r,sr                 @// x2 = x0 - x2 (u1)
+        vsub.f32     x2i,x0i,si
+
+        vadd.f32     x1r,x1r,x3r                @// x1 = x1 + x3 (u2)
+        vadd.f32     x1i,x1i,x3i
+
+        vadd.f32     sr, x3r, x3r
+        vadd.f32     si, x3i, x3i
+        vsub.f32     x3r,x1r,sr                 @// x3 = x1 - x3 (u3)
+        vsub.f32     x3i,x1i,si
+
+
+        @// finish second stage of 4 point FFT
+
+        @// y0 = u1-u2 since twiddle's are stored as -ve values
+        vsub.f32     x2r,x2r,x1r
+        vsub.f32     x2i,x2i,x1i
+
+        vadd.f32     sr, x1r, x1r
+        vadd.f32     si, x1i, x1i
+        vadd.f32     x1r,x2r,sr                 @// y2 = u1+u2
+        vadd.f32     x1i,x2i,si
+        vstm    pDst, {x2r, x2i}                @// store y0
+
+        vsub.f32     x0r,x0r,x3i                @// x0 = u0+ju3
+        vadd.f32     x0i,x0i,x3r
+
+        vadd.f32     sr, x3r, x3r
+        vadd.f32     si, x3i, x3i
+        vadd.f32     t2r,x0r,si                 @// t2 = u0-ju3
+        vsub.f32     t2i,x0i,sr                 @// t2 is its own pair (s10,s11)
+
+        .ifeqs  "\inverse", "TRUE"
+            add     pDst, outPointStep
+            vstm    pDst, {t2r, t2i}            @// store y1
+            add     pDst, outPointStep
+            vstm    pDst, {x1r, x1i}            @// store y2
+            add     pDst, outPointStep
+            vstm    pDst, {x0r, x0i}            @// store y3
+            sub     pDst, outPointStep
+        .else
+            add     pDst, outPointStep
+            vstm    pDst, {x0r, x0i}            @// store y1
+            add     pDst, outPointStep
+            vstm    pDst, {x1r, x1i}            @// store y2
+            add     pDst, outPointStep
+            vstm    pDst, {t2r, t2i}            @// store y3
+            sub     pDst, outPointStep
+        .endif
+
+        SUB     pDst,pDst,outPointStep, LSL #1  @// reset pDst
+        @// update the pDst for the next grp
+        SUBGE   pDst,pDst,pointStep
+        @// update the pSrc for the next grp
+        SUBGE   pSrc,pSrc,pointStep,LSL #2
+
+
+        BGE     grpLoop\name
+
+        ADD     pSrc,pSrc,#8                    @// pSrc += 1; for the next set
+        ADD     pDst,pDst,#8                    @// pDst += 1; for the next set
+
+        SUBS    setCount,setCount,#1            @// decrement loop counter
+
+
+        BGT     setLoop\name
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     t1,pDst
+        SUB     pDst,pSrc,subFFTNum,LSL #3
+        SUB     pSrc,t1,subFFTNum,LSL #3
+
+        .endm
+
+@// Forward entry point: FFTSTAGE expanded with inverse = "FALSE" and label suffix FWD.
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+@// Inverse entry point: FFTSTAGE expanded with inverse = "TRUE" and label suffix INV.
+        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+@//    ENDIF                                                           @//ARM1136JS
+
+
+
+@// Guarding implementation by the processor name
+
+    .end
diff --git a/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
new file mode 100644
index 0000000..4ac2da4
--- /dev/null
+++ b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
@@ -0,0 +1,386 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a first stage Radix 8 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@//        M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@//    IF  ARM1136JS
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+#define pPingPongBuf    r5
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize         r14
+#define step1           r3
+#define step2           r8
+#define setCount        r14             /*@// Reuse grpSize as setCount*/
+#define pointStep       r12
+
+#define t0              r4
+@// Real and Imaginary parts
+
+#define x0r             s0
+#define x0i             s1
+#define x1r             s2
+#define x1i             s3
+#define x2r             s4
+#define x2i             s5
+#define x3r             s6
+#define x3i             s7
+#define t3r             s8              /*@// Temporarily hold x3r and x3i*/
+#define t3i             s9
+#define t1r             s4
+#define t1i             s5
+#define sr              s10
+#define si              s11
+#define roothalf        s12
+
+@// Define macros to load/store two float regs from/to the stack.
+        .macro M_VSTM r0, r1, p                 @// store float regs {r0, r1} to stack slot |p|
+        .set    _Offset, _Workspace + \p\()_F   @// sp-relative byte offset of slot |p|
+        add     t0, sp, #_Offset                @// t0 (r4) is clobbered with the slot address
+        vstm.f32 t0, {\r0, \r1}
+        .endm
+
+        .macro M_VLDM r0, r1, p                 @// load float regs {r0, r1} from stack slot |p|
+        .set    _Offset, _Workspace + \p\()_F   @// sp-relative byte offset of slot |p|
+        add     t0, sp, #_Offset                @// t0 (r4) is clobbered with the slot address
+        vldm.f32 t0, {\r0, \r1}
+        .endm
+
+@// Define constants
+
+        .MACRO FFTSTAGE scaled, inverse , name
+        @// First-stage radix-8 butterflies (grp = 0: all twiddle factors are 1).
+        @// |scaled| is not referenced in this body, |inverse| selects the store
+        @// order of the outputs and |name| makes the loop label unique.
+        @// t0 (r4) and the pU* stack slots are scratch (see M_VSTM/M_VLDM).
+        @// Update grpCount and grpSize right away in order to reuse
+        @// pSubFFTSize and pSubFFTNum regs
+
+        mov     subFFTSize, #8
+        lsr     grpSize, subFFTNum, #3          @// grpSize = subFFTNum / 8
+        mov     subFFTNum, grpSize
+
+
+        @// Each complex point is 8 bytes (two floats).
+        @// pointStep = 8*grpSize bytes is the distance between the butterfly
+        @// inputs x(k) and x(k+1). setCount shares r14 with the updated
+        @// grpSize, so the set loop below counts down from grpSize.
+        MOV     pointStep,grpSize,LSL #3
+
+
+        @// Pointer strides used below: step1 = 2*pointStep, step2 = 7*pointStep
+        MOV     step1,grpSize,LSL #4
+        MOV     step2,pointStep,LSL #3
+        SUB     step2,step2,pointStep           @// step2 = 7*pointStep
+
+
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets
+
+        movw    t0,#0x04f3
+        movt    t0,#0x3f35
+        vmov.f32 roothalf, t0                   @// roothalf = sqrt(1/2)
+
+grpZeroSetLoop\name:
+
+        vldm.f32 pSrc, {x0r, x0i}               @// x0
+        add      pSrc, step1
+        vldm.f32 pSrc, {x1r, x1i}               @// x2
+        add      pSrc, step1
+        vldm.f32 pSrc, {x2r, x2i}               @// x4
+        add      pSrc, step1
+        vldm.f32 pSrc, {x3r, x3i}               @// x6
+        add      pSrc, step1
+
+        SUB     pSrc, pSrc, step2               @// 8*pointStep - 7*pointStep: pSrc -> x1
+
+        @// finish first stage of 8 point FFT and save on stack
+
+        vadd.f32     x0r,x0r,x2r                @// u0
+        vadd.f32     x0i,x0i,x2i
+
+        vadd.f32     sr, x2r, x2r
+        vadd.f32     si, x2i, x2i
+        vsub.f32     x2r,x0r,sr                 @// u1
+        vsub.f32     x2i,x0i,si
+
+        M_VSTM   x0r,x0i, pU0
+        M_VSTM   x2r,x2i, pU1
+
+        vadd.f32     x1r,x1r,x3r                @// u4
+        vadd.f32     x1i,x1i,x3i
+
+        vadd.f32     sr, x3r, x3r
+        vadd.f32     si, x3i, x3i
+        vsub.f32     x3r,x1r,sr                 @// u5
+        vsub.f32     x3i,x1i,si
+
+        M_VSTM   x1r,x1i, pU4
+        M_VSTM   x3r,x3i, pU5
+
+
+        vldm    pSrc, {x0r, x0i}                @// x1
+        add     pSrc, step1
+        vldm    pSrc, {x1r, x1i}                @// x3
+        add     pSrc, step1
+        vldm    pSrc, {x2r, x2i}                @// x5
+        add     pSrc, step1
+        vldm    pSrc, {x3r, x3i}                @// x7
+        add     pSrc, #8
+
+        SUB     pSrc, pSrc, step2               @// net +8 bytes: pSrc -> x0 of the next set
+
+        vadd.f32     x0r,x0r,x2r                @// u2
+        vadd.f32     x0i,x0i,x2i
+
+        vadd.f32         sr, x2r, x2r
+        vadd.f32         si, x2i, x2i
+        vsub.f32     x2r,x0r,sr                 @// u3
+        vsub.f32     x2i,x0i,si
+
+        M_VSTM   x2r,x2i, pU3
+
+        vadd.f32     x1r,x1r,x3r                @// u6
+        vadd.f32     x1i,x1i,x3i
+
+        vadd.f32         sr, x3r, x3r
+        vadd.f32         si, x3i, x3i
+        vsub.f32     x3r,x1r,sr                 @// u7
+        vsub.f32     x3i,x1i,si
+
+        @// finish second and third stage of 8 point FFT
+
+        M_VSTM  x3r,x3i, pU7
+        M_VLDM  x2r,x2i, pU0
+
+        @// Decrement setCount; the BGT at the end of the loop consumes these flags
+        SUBS    setCount,setCount,#1
+        M_VLDM  x3r,x3i, pU4
+
+        vadd.f32     x0r,x0r,x1r                @// v4
+        vadd.f32     x0i,x0i,x1i
+
+        vadd.f32     sr, x1r, x1r
+        vadd.f32     si, x1i, x1i
+        vsub.f32     x1r,x0r,sr                 @// v6
+        vsub.f32     x1i,x0i,si
+
+        vadd.f32     x2r,x2r,x3r                @// v0
+        vadd.f32     x2i,x2i,x3i
+
+        vadd.f32     sr, x3r, x3r
+        vadd.f32     si, x3i, x3i
+        vsub.f32     x3r,x2r,sr                 @// v2
+        vsub.f32     x3i,x2i,si
+
+
+
+        vadd.f32     x2r,x2r,x0r                @// y0
+        vadd.f32     x2i,x2i,x0i
+
+        vadd.f32     sr, x0r, x0r
+        vadd.f32     si, x0i, x0i
+        vsub.f32     x0r,x2r,sr                 @// y4
+        vsub.f32     x0i,x2i,si
+
+        vstm    pDst, {x2r, x2i}                @// store y0
+        add     pDst, step1
+
+        vadd.f32     x3r,x3r,x1i                @// v2 - j*v6: y2 (forward) / y6 (inverse)
+        vsub.f32     x3i,x3i,x1r
+
+        vadd.f32     sr, x1r, x1r
+        vadd.f32     si, x1i, x1i
+        vsub.f32     t1r,x3r,si                 @// t1r=x2r reg;t1i=x2i reg
+        vadd.f32     t1i,x3i,sr                 @// v2 + j*v6: y6 (forward) / y2 (inverse)
+
+        .ifeqs  "\inverse", "TRUE"
+            vstm        pDst, {t1r, t1i}        @// store y2
+            add pDst, step1
+            vstm        pDst, {x0r, x0i}        @// store y4
+            add pDst, step1
+            vstm        pDst, {x3r, x3i}        @// store y6
+            add pDst, step1
+        .else
+            vstm        pDst, {x3r, x3i}        @// store y2
+            add pDst, step1
+            vstm        pDst, {x0r, x0i}        @// store y4
+            add pDst, step1
+            vstm        pDst, {t1r, t1i}        @// store y6
+            add pDst, step1
+        .endif
+
+        SUB     pDst, pDst, step2               @// set pDst to y1
+
+
+        M_VLDM  x0r,x0i,pU1                     @// Load u1,u3,u5,u7
+        M_VLDM  x1r,x1i,pU5
+        M_VLDM  x3r,x3i,pU7
+
+        vsub.f32     x0r,x0r,x1i                @// v1
+        vadd.f32     x0i,x0i,x1r
+        vadd.f32     sr, x1r, x1r
+        vadd.f32     si, x1i, x1i
+        vadd.f32     t1r,x0r,si                 @// t1r=x2r reg;t1i=x2i reg
+        vsub.f32     t1i,x0i,sr                 @// v3
+
+        M_VLDM  x1r,x1i,pU3
+
+        vsub.f32     x1r,x1r,x3i                @// v5
+        vadd.f32     x1i,x1i,x3r
+
+        vadd.f32     sr, x3r, x3r
+        vadd.f32     si, x3i, x3i
+        vadd.f32     t3r,x1r,si                 @// v7 held in t3r/t3i
+        vsub.f32     t3i,x1i,sr                 @// v7
+
+        @// store v5  as (v5.r - v5.i,v5.r + v5.i)
+        @// store v7  as (v7.i + v7.r,v7.i - v7.r)
+
+        vadd.f32     x3r,t3i,t3r                @// v7
+        vsub.f32     x3i,t3i,t3r
+
+        vsub.f32     x1r,x1r,x1i                @// v5
+        vadd.f32     x1i, x1i                   @// x1i = 2*v5.i (two-operand form)
+        vadd.f32     x1i,x1r,x1i                @// x1i = v5.r + v5.i
+
+        vmul.f32  x3r, x3r, roothalf            @// (v7.i + v7.r)*(1/sqrt(2))
+        vmul.f32  x3i, x3i, roothalf            @// (v7.i - v7.r)*(1/sqrt(2))
+        vmul.f32  x1r, x1r, roothalf            @// (v5.r - v5.i)*(1/sqrt(2))
+        vmul.f32  x1i, x1i, roothalf            @// (v5.r + v5.i)*(1/sqrt(2))
+
+        vadd.f32     x2r,x2r,x3r                @// y7
+        vadd.f32     x2i,x2i,x3i
+
+        vadd.f32     sr, x3r, x3r
+        vadd.f32     si, x3i, x3i
+        vsub.f32     x3r,x2r,sr                 @// y3
+        vsub.f32     x3i,x2i,si
+
+
+        vsub.f32     x0r,x0r,x1r                @// y5
+        vsub.f32     x0i,x0i,x1i
+
+        vadd.f32     sr, x1r, x1r
+        vadd.f32     si, x1i, x1i
+        vadd.f32     x1r,x0r,sr                 @// y1
+        vadd.f32     x1i,x0i,si
+
+        .ifeqs  "\inverse", "TRUE"
+            vstm    pDst, {x1r, x1i}            @// store y1
+            add pDst, step1
+            vstm    pDst, {x3r, x3i}            @// store y3
+            add pDst, step1
+            vstm    pDst, {x0r, x0i}            @// store y5
+            add pDst, step1
+            vstm    pDst, {x2r, x2i}            @// store y7
+            add pDst, #8
+        .else
+            vstm    pDst, {x2r, x2i}            @// store y1
+            add pDst, step1
+            vstm    pDst, {x0r, x0i}            @// store y3
+            add pDst, step1
+            vstm    pDst, {x3r, x3i}            @// store y5
+            add pDst, step1
+            vstm    pDst, {x1r, x1i}            @// store y7
+            add pDst, #8
+        .endif
+
+        SUB     pDst, pDst, step2               @// update pDst for the next set
+
+
+        BGT     grpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep             @// pDst moved 8 bytes per set: pSrc = base of this stage output
+        mov     pDst, pPingPongBuf              @// pDst = pPingPongBuf (r5)
+
+
+        .ENDM
+
+
+
+
+
+        @// Allocate stack memory required by the function:
+        @// six 8-byte slots for the intermediates u0,u1,u3,u4,u5,u7 that FFTSTAGE spills
+        @// Ensure 8 byte alignment to use M_VLDM
+        M_ALLOC8    pU0, 8
+        M_ALLOC8    pU1, 8
+        M_ALLOC8    pU3, 8
+        M_ALLOC8    pU4, 8
+        M_ALLOC8    pU5, 8
+        M_ALLOC8    pU7, 8
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4  @// r4 = t0, the scratch reg of M_VSTM/M_VLDM
+            FFTSTAGE "FALSE","FALSE",FWD        @// inverse = "FALSE": forward store order
+        M_END
+
+        @// Allocate stack memory required by the function
+        @// NOTE(review): slots are declared again; presumably the allocation state is per function - confirm in armCOMM_s.h
+        @// Ensure 8 byte alignment to use M_VLDM
+        M_ALLOC8    pU0, 8
+        M_ALLOC8    pU1, 8
+        M_ALLOC8    pU3, 8
+        M_ALLOC8    pU4, 8
+        M_ALLOC8    pU5, 8
+        M_ALLOC8    pU7, 8
+
+        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4  @// r4 = t0, the scratch reg of M_VSTM/M_VLDM
+            FFTSTAGE "FALSE","TRUE",INV         @// inverse = "TRUE": inverse store order
+        M_END
+
+@//    ENDIF        @//ARM1136JS
+
+
+
+@// Guarding implementation by the processor name
+
+
+    .END
diff --git a/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S b/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
new file mode 100644
index 0000000..25b4976
--- /dev/null
+++ b/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
@@ -0,0 +1,161 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of omxSP_FFTFwd_CToC_SC32_Sfs_s.S
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a forward FFT for a complex signal
+@// 
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+        
+@//        M_VARIANTS ARM1136JS
+        
+@// Import symbols required from other files
+@// (For example tables)
+        
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+@//    IF  ARM1136JS
+    
+@//Input Registers
+
+#define pSrc		r0
+#define pDst		r1
+#define pFFTSpec	r2
+
+
+@// Output registers
+#define result		r0
+
+@//Local Scratch Registers
+
+#define argTwiddle	r1
+#define argDst		r2
+#define argScale	r4
+#define pTwiddle	r4
+#define pOut		r5
+#define subFFTSize	r7     
+#define subFFTNum	r6
+#define N		r6
+#define order		r14
+#define diff		r9
+#define count		r8
+#define diffMinusOne	r2
+#define round		r3
+
+#define x0r s0    
+#define x0i s1
+
+
+
+
+    @// Allocate stack memory required by the function
+
+    @// Write function header
+        M_START     omxSP_FFTFwd_CToC_FC32_Sfs_vfp,r11
+        @// In: pSrc=r0, pDst=r1, pFFTSpec=r2. Out: result=r0, always OMX_Sts_NoErr.
+@ Structure offsets for FFTSpec	
+	.set	ARMsFFTSpec_N, 0
+	.set	ARMsFFTSpec_pBitRev, 4
+	.set	ARMsFFTSpec_pTwiddle, 8
+	.set	ARMsFFTSpec_pBuf, 12
+        
+        @// Define stack arguments
+        
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+                
+        CLZ     order,N			@// N = 2^order 
+        RSB     order,order,#31                 @// order = 31 - clz(N)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+        
+        
+        CMP     order,#1
+        BGT     orderGreaterthan1	@// order > 1
+	@// order = 0 (LT: copy the single complex point) or order = 1 (EQ)
+	vldmlt.f32 pSrc, {x0r, x0i}
+	vstmlt.f32 pDst, {x0r, x0i}
+	
+        MOVLT   pSrc,pDst
+        BLT     End
+
+	@// Handle order = 1
+        MOV     argDst,pDst             @// must come first: argTwiddle and pDst are both r1
+        MOV     argTwiddle,pTwiddle
+        BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+        B     End  
+        
+
+
+orderGreaterthan1:       
+        @// NE: first stage writes pDst. EQ: first stage writes pOut (pBuf) and pOut becomes pDst.
+        TST     order, #2               @// Set input args to fft stages
+        MOVNE   argDst,pDst        
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst               @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        @//check for even or odd order
+	
+        @// NOTE: The following combination of BL's would work fine
+	@// even though the first BL would corrupt the flags. This is
+	@// because the end of the "grpZeroSetLoop" loop inside
+        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+	@// sets the Z flag to EQ
+        
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp 
+        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp 
+        @// Run radix-4 stages until subFFTNum reaches 1 (the stages presumably update it - confirm)
+unscaledRadix4Loop:
+        CMP        subFFTNum,#1
+         BEQ        End
+         BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+         B        unscaledRadix4Loop
+
+       
+End:                        
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr       
+
+        @// Write function tail
+        M_END
+        
+@//    ENDIF                                           @//ARM1136JS    
+    
+    
+    @// Guarding implementation by the processor name
+    
+    
+    
+    .end
diff --git a/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S b/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
new file mode 100644
index 0000000..dd1690a
--- /dev/null
+++ b/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
@@ -0,0 +1,328 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute FFT for a real signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@//        M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@//    IF  ARM1136JS
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+
+
+@// Output registers
+#define result          r0
+
+@//Local Scratch Registers
+
+@// N=1 case
+#define scaleMinusOne   r2
+#define rnd             r2
+#define zero            r8
+#define Zero            r9
+
+
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+#define count           r8
+#define diffMinusOne    r10
+#define round           r3
+
+#define step            r3
+#define step1           r6
+#define twStep          r12
+#define pTwiddleTmp     r14
+#define t0              r12
+#define t1              r14              /*@// pTwiddleTmp*/
+#define t2              r0
+#define t3              r1               /*@// pSrc,argTwiddle*/
+#define t4              r6
+#define t5              r7               /*@// step1,subFFTSize*/
+
+#define x0r     s0
+#define x0i     s1
+#define y0r     s2
+#define y0i     s3
+#define x1r     s4
+#define x1i     s5
+#define w1r     s2
+#define w1i     s3
+#define w0r     s6
+#define w0i     s7
+#define y1r     s2              /*@// w1r,w1i*/
+#define y1i     s3
+#define st0     s8
+#define st1     s9
+#define st2     s10
+#define st3     s11
+#define st4     s12
+#define st5     s13
+#define half    s15
+
+
+
+
+    @// Allocate stack memory required by the function
+
+
+
+    @// Write function header
+        M_START     omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11
+        @// In: pSrc=r0, pDst=r1, pFFTSpec=r2. Out: result=r0 = OMX_Sts_NoErr.
+@ Structure offsets for FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+
+        @// Setup half value
+        movw    N, #0                   @// Use N as a temp.
+        movt    N, #0x3f00              @// 0x3f000000 = 0.5f
+        vmov.f32 half, N
+
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        @//  N<=1: nothing is computed, the function just returns
+        CMP     N,#1
+        BGT     sizeGreaterThanOne
+        @// N<=1 is not supported
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+        B       FunctionEnd
+
+sizeGreaterThanOne:
+        @// Do a N/2 point complex FFT
+
+        MOV     N,N,ASR #1              @// N/2 point complex FFT
+        CLZ     order,N                 @// N = 2^order
+        RSB     order,order,#31
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+
+
+        CMP     order,#1
+        BGT     orderGreaterthan1       @// order > 1
+        vldmlt.f32 pSrc, {x0r, x0i}     @// order = 0 (LT): copy the single complex point to pOut
+        vstmlt.f32 pOut, {x0r, x0i}
+        MOVLT   pSrc,pOut
+        MOVLT   argDst,pDst
+        BLT     FFTEnd
+        @// order = 1 (EQ): one radix-2 first stage that writes to the pBuf scratch buffer
+        MOV     argDst,pOut             @// Set input args to fft stages
+        MOV     pOut,pDst               @// Set input args to fft stages
+        MOV     argTwiddle,pTwiddle
+
+        BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+        B     finalComplexToRealFixup
+
+orderGreaterthan1:
+        @// EQ: first stage writes pDst. NE: first stage writes pOut (pBuf) and pOut becomes pDst.
+        TST     order, #2               @// Set input args to fft stages
+        MOVEQ   argDst,pDst
+        MOVNE   argDst,pOut
+        MOVNE   pOut,pDst               @// Pass the first stage dest in RN5
+        MOV     argTwiddle,pTwiddle
+
+        @//check for even or odd order
+
+        @// NOTE: The following combination of BL's would work fine
+        @// even though the first BL would corrupt the flags. This is
+        @// because the end of the "grpZeroSetLoop" loop inside
+        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
+        @// the Z flag to EQ
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+
+unscaledRadix4Loop:
+        CMP        subFFTNum,#1
+         BEQ        FFTEnd
+         BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+         B        unscaledRadix4Loop
+
+FFTEnd:
+finalComplexToRealFixup:
+
+        @// step = N/2 * 8 bytes
+        MOV     step,subFFTSize,LSL #3
+        @// twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,subFFTSize,LSL #1
+        @// step1 = N/4 * 8 = N/2*4 bytes
+        MOV     step1,subFFTSize,LSL #2
+        @// (N/4-1)*8 bytes
+        SUB     step1,step1,#8
+
+        @// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+        @// 1/2 [(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
+        @// 1/2 [2a+j0] - j [0+j2b]
+        @// (a+b, 0)
+
+        @// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+        @// 1/2 [(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
+        @// 1/2 [2a+j0] + j [0+j2b]
+        @// (a-b, 0)
+
+        @// F(0) and F(N/2)
+        vldm.f32 pSrc!, {x0r, x0i}
+        vadd.f32 y0r,x0r,x0i            @// F(0) = (Z0.r+Z0.i , 0)
+        vsub.f32 x0r,x0r,x0i            @// F(N/2) = (Z0.r-Z0.i , 0)
+        mov      pTwiddleTmp, #0        @// r14 is dead here (rewritten below); bit pattern 0 is +0.0f
+        vmov.f32 y0i, pTwiddleTmp       @// Im F(0) = 0.0; y0i - y0i gave NaN when the stale reg held NaN/Inf
+        vmov.f32 x0i, pTwiddleTmp       @// Im F(N/2) = 0.0
+        add      argDst, step
+        vstm.f32 argDst, {x0r, x0i}     @// {x0r,x0i}->[argDst, step]
+        sub      argDst, step
+        vstm.f32 argDst!, {y0r, y0i}
+
+        SUBS    subFFTSize,subFFTSize,#2
+
+        ADD     pTwiddleTmp,argTwiddle,#8       @// W^2
+        ADD     argTwiddle,argTwiddle,twStep    @// W^1
+        BLT     End
+        BEQ     lastElement
+
+
+        @// F(k) = 1/2 [Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
+        @// Process 2 elements at a time. E.g: F(1) and F(N/2-1) since
+        @// both of them require Z(1) and Z(N/2-1)
+
+        ASR     subFFTSize,subFFTSize,#1
+evenOddButterflyLoop:
+
+        SUB     step,step,#16           @// (N/2-2)*8 bytes
+
+        add      pSrc, step
+        vldm.f32 pSrc, {x1r, x1i}       @// {x1r, x1i} = [pSrc, step]
+        sub      pSrc, step
+        vldm.f32 pSrc!, {x0r, x0i}
+        add      argTwiddle, step1
+        vldm.f32 argTwiddle, {w1r, w1i}  @// {w1r, w1i} = [argTwiddle, step1]
+        sub      argTwiddle, step1
+        vldm.f32 argTwiddle!, {w0r, w0i} @// {w0r, w0i} = [argTwiddle], #8
+
+        SUB     step1,step1,#8
+        SUBS    subFFTSize,subFFTSize,#1
+
+        vsub.f32 st2,x0r,x1r            @// a-c
+        vadd.f32 st3,x0i,x1i            @// b+d
+        vadd.f32 st0,x0r,x1r            @// a+c
+        vsub.f32 st1,x0i,x1i            @// b-d
+
+        vmul.f32 x1r,w1r,st2
+        vmul.f32 x1i,w1r,st3
+        vmla.f32 x1r,w1i,st3            @// x1r = w1r*st2 + w1i*st3
+        @//RSB     x1r,x1r,#0
+        vmls.f32 x1i,w1i,st2            @// x1i = w1r*st3 - w1i*st2
+
+        vsub.f32 y1r, st0, x1i          @// mirrored output, stored at [argDst, step]
+        vadd.f32 y1i, x1r, st1
+        vneg.f32 y1i, y1i
+
+        vmul.f32  x0r,w0r,st2
+        vmul.f32  x0i,w0r,st3
+        vmls.f32  x0r,w0i,st3           @// x0r = w0r*st2 - w0i*st3
+        vmla.f32  x0i,w0i,st2           @// x0i = w0r*st3 + w0i*st2
+
+        vsub.f32   st4,st0,x0i          @// F(1)
+        vadd.f32   st5,x0r,st1
+
+
+        vmul.f32 y1r, half              @// apply the 1/2 factor of the F(k) formula
+        vmul.f32 y1i, half
+        vmul.f32 st4, half
+        vmul.f32 st5, half
+
+        add      argDst, step
+        vstm.f32 argDst, {y1r, y1i}     @// {y1r,y1i} -> [argDst,step]
+        sub      argDst, step
+        vstm.f32 argDst!, {st4, st5}
+
+
+        MOV     t0,argTwiddle           @// swap ptr for even and odd twiddles
+        MOV     argTwiddle,pTwiddleTmp
+        MOV     pTwiddleTmp,t0
+
+        BGT     evenOddButterflyLoop    @// flags from the SUBS subFFTSize above
+
+        @// Last element can be expanded as follows
+        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
+        @// (a-bc, -bd)
+
+lastElement:
+        vldm.f32 pSrc, {x0r, x0i}
+        vneg.f32 x0i, x0i               @// the code stores (a, -b) for the middle element
+        vstm.f32 argDst, {x0r, x0i}
+
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+FunctionEnd:
+        @// Write function tail
+        M_END
+
+@//    ENDIF                                           @//ARM1136JS
+
+
+    @// Guarding implementation by the processor name
+
+
+
+    .end
diff --git a/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S b/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
new file mode 100644
index 0000000..d6a4765
--- /dev/null
+++ b/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
@@ -0,0 +1,227 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of omxSP_FFTInv_CCSToR_S32_Sfs_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@//        M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@//    IF  ARM1136JS
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+
+
+@// Output registers
+#define result          r0
+
+@//Local Scratch Registers
+
+
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total num of radix stages required to complete the FFT
+#define count           r8
+
+#define round           r3
+
+#define x0r     s0
+#define x0i     s1
+#define y0r     s2
+#define y0i     s3
+#define x1r     s4
+#define x1i     s5
+#define w1r     s2
+#define w1i     s3
+#define w0r     s6
+#define w0i     s7
+#define y1r     s2              /*@// w1r,w1i*/
+#define y1i     s3
+#define st0     s8
+#define st1     s9
+#define st2     s10
+#define st3     s11
+#define st4     s12
+#define st5     s13
+#define fscale  s2
+#define fone    s3
+
+
+
+    @// Allocate stack memory required by the function
+        M_ALLOC4        pDstOnStack, 4          @// holds pDst across the preTwiddle call
+        M_ALLOC4        pFFTSpecOnStack, 4      @// holds pFFTSpec across the preTwiddle call
+
+    @// Write function header
+        M_START     omxSP_FFTInv_CCSToR_F32_Sfs_vfp,r11
+        @// In: pSrc=r0, pDst=r1, pFFTSpec=r2. Out: result=r0 = OMX_Sts_NoErr.
+@ Structure offsets for FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+
+
+        @//  N<=1: copy one float from pSrc to pDst and return
+        CMP     N,#1
+        BGT     sizeGreaterThanOne
+        vldr.f32 x0r, [pSrc]
+        vstr.f32 x0r, [pDst]
+
+        B       End
+
+sizeGreaterThanOne:
+        M_STR   pDst,pDstOnStack                    @// store all the pointers
+        M_STR   pFFTSpec,pFFTSpecOnStack
+
+
+        @// Call the preTwiddle Radix2 stage before doing the complex IFFT
+
+        BL    armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp
+
+
+complexIFFT:
+
+        M_LDR   pFFTSpec,pFFTSpecOnStack
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        ASR     N,N,#1                  @// N/2 point complex IFFT
+        ADD     pSrc,pOut,N,LSL #3      @// set pSrc as pOut1 = pOut + 8*(N/2) bytes
+        M_LDR   pDst,pDstOnStack
+
+        CLZ     order,N                 @// N = 2^order
+        RSB     order,order,#31
+        MOV     subFFTSize,#1
+
+        CMP     order,#1
+        BGT     orderGreaterthan1       @// order > 1
+        vldmlt.f32 pSrc, {x0r, x0i}     @// order = 0 (LT): copy the single complex point
+        vstmlt.f32 pDst, {x0r, x0i}
+
+        MOVLT   pSrc,pDst
+        BLT     FFTEnd
+
+        MOV     argDst,pDst             @// must come first: argTwiddle and pDst are both r1
+        MOV     argTwiddle,pTwiddle
+
+        BL      armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+        B       FFTEnd
+
+
+orderGreaterthan1:
+        @// NE: first stage writes pDst. EQ: first stage writes pOut (pBuf) and pOut becomes pDst.
+        TST     order, #2               @// Set input args to fft stages
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst               @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+
+
+        @//check for even or odd order
+
+        @// NOTE: The following combination of BL's would work fine
+        @// even though the first BL would corrupt the flags. This is
+        @// because the end of the "grpZeroSetLoop" loop inside
+        @// armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
+        @// the Z flag to EQ
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+        BLNE    armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+
+unscaledRadix4Loop:
+        CMP        subFFTNum,#1
+         BEQ        FFTEnd
+         BL        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+         B        unscaledRadix4Loop
+
+FFTEnd:
+
+        vldm.f32 pSrc, {x0r, x0i}
+
+        vmov.f32     fscale, subFFTSize
+        vcvt.f32.s32 fscale, fscale             @// fscale = (float)subFFTSize
+        mov          round, #1
+        vmov.f32     fone, round
+        vcvt.f32.s32 fone, fone                 @// fone = 1.0f
+        vdiv.f32     fscale, fone, fscale       @// fscale = 1.0/subFFTSize
+
+scaleFFTData:                                   @// scale subFFTSize complex points in place at pSrc
+        SUBS    subFFTSize,subFFTSize,#1
+        vmul.f32 x0r, x0r, fscale
+        vmul.f32 x0i, x0i, fscale
+        vstm.f32 pSrc!, {x0r, x0i}
+        vldmgt.f32 pSrc, {x0r, x0i}             @// preload the next point only if one remains
+
+        BGT     scaleFFTData
+
+
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
+        M_END
+
+@//    ENDIF                                           @//ARM1136JS
+
+
+      @// Guarding implementation by the processor name
+
+
+
+    .end
diff --git a/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S b/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S
new file mode 100644
index 0000000..64aa5da
--- /dev/null
+++ b/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S
@@ -0,0 +1,180 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of omxSP_FFTInv_CToC_SC32_Sfs_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@//        M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTInv_CToC_FC32_Sfs_Radix2_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@//    IF  ARM1136JS
+
+@// Input registers: the three arguments as they arrive on entry.
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+
+
+@// Output register: result reuses r0, the same register as pSrc.
+#define result          r0
+
+@// Local scratch registers. Names sharing one core register: r1 = pDst, argTwiddle;
+@// r2 = pFFTSpec, argDst, diffMinusOne; r4 = argScale, pTwiddle; r6 = subFFTNum, N. order is r14 (lr).
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+#define count           r8
+#define diffMinusOne    r2
+#define round           r3
+@// Single-precision VFP registers used when copying and scaling the data.
+#define x0r     s0
+#define x0i     s1
+#define fone    s2
+#define fscale  s3
+
+
+    @// omxSP_FFTInv_CToC_FC32_Sfs_vfp: inverse FFT of a complex float signal,
+    @// scaled by 1/N at FFTEnd. In: r0 = pSrc, r1 = pDst, r2 = pFFTSpec.
+    @// Out: r0 = OMX_Sts_NoErr (result). The function header follows.
+        M_START     omxSP_FFTInv_CToC_FC32_Sfs_vfp,r11
+
+@ Structure offsets for FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// No stack arguments are defined for this function
+
+        @// Read the size N from the structure; its log2 is taken with CLZ below
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read other structure parameters (before r2 is reused as argDst)
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        CLZ     order,N                 @// N = 2^order
+        RSB     order,order,#31         @// order = 31 - clz(N) = log2(N)
+        MOV     subFFTSize,#1           @// order < 1 reaches FFTEnd with this value
+        @//MOV     subFFTNum,N          @// (would be a no-op: both names are r6)
+
+        CMP     order,#1
+        BGT     orderGreaterthan1       @// order > 1
+        @// order <= 1: LT handles order < 1 (copy one value), EQ handles order = 1
+        vldmlt.f32 pSrc, {x0r, x0i}     @// order < 1: copy one pair of floats ...
+        vstmlt.f32 pDst, {x0r, x0i}     @// ... from pSrc to pDst
+
+        MOVLT   pSrc,pDst               @// FFTEnd scales in place through pSrc
+        BLT     FFTEnd
+
+        @// Handle order = 1
+        MOV     argDst,pDst             @// r2 = pDst; done first because r1 is overwritten next
+        MOV     argTwiddle,pTwiddle     @// r1 = pTwiddle (r4)
+
+        BL      armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+        B       FFTEnd
+
+orderGreaterthan1:
+
+        TST     order, #2               @// Set input args to fft stages
+        MOVNE   argDst,pDst             @// bit 1 of order set: first stage dest = pDst
+        MOVEQ   argDst,pOut             @// bit 1 clear: first stage dest = pBuf from the spec
+        MOVEQ   pOut,pDst               @// ... and r5 (pOut) then holds pDst
+        MOV     argTwiddle,pTwiddle     @// r1 = pTwiddle; pDst (also r1) is gone after this
+
+
+        @// Check for even or odd order: even takes the Radix4 first stage, odd the Radix8 one.
+        @// NOTE: The following pair of conditional BLs works even though
+        @// the first BL corrupts the flags. Per the original note, this is
+        @// because the end of the "grpZeroSetLoop" loop inside
+        @// armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp leaves
+        @// Z set (EQ), so the BLNE is not taken after the BLEQ returns.
+
+        TST     order,#0x00000001       @// order is r14 (lr); the BLs below overwrite it
+        BLEQ    armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+        BLNE    armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+
+unscaledRadix4Loop:
+        CMP        subFFTNum,#1
+         BEQ        FFTEnd              @// leave the loop once subFFTNum is 1
+         BL        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+         B        unscaledRadix4Loop    @// NOTE(review): callee presumably updates subFFTNum - confirm
+
+
+FFTEnd:
+
+        vldm.f32 pSrc, {x0r, x0i}       @// preload the first pair of floats
+
+        vmov.f32     fscale, subFFTSize         @// integer bits of subFFTSize into s3
+        vcvt.f32.s32 fscale, fscale             @// fscale = N as a float
+        movw         round, #0                  @// low halfword of 0x3f800000
+        movt         round, #0x3f80             @// round = 0x3f800000 = bits of 1.0f
+        vmov.f32     fone, round                @// fone = 1.0
+        vdiv.f32     fscale, fone, fscale       @// fscale = 1/N
+scaleFFTData:                                   @// N = subFFTSize
+        SUBS    subFFTSize,subFFTSize,#1        @// flags consumed by vldmgt/bgt below
+        vmul.f32 x0r, x0r, fscale
+        vmul.f32 x0i, x0i, fscale
+        vstm.f32 pSrc, {x0r, x0i}
+        add      pSrc, #8               @// next pair of floats; add (no S) keeps the flags
+        vldmgt.f32 pSrc, {x0r, x0i}     @// load the next pair only if any remain
+
+        bgt     scaleFFTData
+
+
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
+        M_END
+
+@//    ENDIF                                           @//ARM1136JS
+
+
+      @// Guarding implementation by the processor name
+
+
+
+    .end