Add non-NEON float FFT implementation.
BUG=
R=aedla@chromium.org, andrew@webrtc.org
Review URL: https://webrtc-codereview.appspot.com/2028004
git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@5051 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/api/arm/armCOMM_s.h b/dl/api/arm/armCOMM_s.h
index 6b0d2be..6ce1e2f 100644
--- a/dl/api/arm/armCOMM_s.h
+++ b/dl/api/arm/armCOMM_s.h
@@ -371,6 +371,17 @@
.endm
+ @// Reserve |size| bytes of stack workspace under the name |name|, starting
+ @// at the next 8-byte aligned offset. The offset is recorded as |name|_F.
+ .macro M_ALLOC8 name, size
+ @// Round the running byte count up to a multiple of 8 (no-op if aligned).
+ .set _SBytes, (_SBytes + 7) & ~7
+ @// Record where this area starts, then account for its size.
+ .set \name\()_F, _SBytes
+ .set _SBytes, _SBytes + \size
+
+ .endm
+
@ Load word from stack
.macro M_LDR r, a0, a1, a2, a3
_M_DATA "ldr", 4, \r, \a0, \a1, \a2, \a3
@@ -381,6 +392,16 @@
_M_DATA "str", 4, \r, \a0, \a1, \a2, \a3
.endm
+ @ Load the register pair r0,r1 (a double word) from the stack slot named a0
+ .macro M_LDRD r0, r1, a0, a1, a2, a3
+ _M_DATA2 "ldrd", 8, \r0, \r1, \a0, \a1, \a2, \a3
+ .endm
+
+ @ Store the register pair r0,r1 (a double word) to the stack slot named a0
+ .macro M_STRD r0, r1, a0, a1, a2, a3
+ _M_DATA2 "strd", 8, \r0, \r1, \a0, \a1, \a2, \a3
+ .endm
+
@ Macro to perform a data access operation
@ Such as LDR or STR
@ The addressing mode is modified such that
@@ -407,3 +428,31 @@
.set _Offset, _Workspace + \a0\()_F
\i\a1 \r, [sp, #_Offset]
.endm
+
+ @ Macro to perform a two-register data access operation
+ @ such as LDRD or STRD
+ @ The addressing mode is modified such that
+ @ 1. If no address is given then the name is taken
+ @ as a stack offset
+ @ 2. If the addressing mode is not available for the
+ @ state being assembled for (eg Thumb) then a suitable
+ @ addressing mode is substituted.
+ @
+ @ On Entry:
+ @ $i = Instruction to perform (eg "LDRD")
+ @ $a = Required byte alignment (accepted but not used by the body below)
+ @ $r0,$r1 = Register pair to transfer (eg "r2", "r3")
+ @ $a0,$a1,$a2,$a3. Addressing mode and condition. One of:
+ @ label {,cc}
+ @ [base] {,,,cc}
+ @ [base, offset]{!} {,,cc}
+ @ [base, offset, shift]{!} {,cc}
+ @ [base], offset {,,cc}
+ @ [base], offset, shift {,cc}
+ @ WARNING: Only the first case is implemented. $a0 must name a stack slot
+ @ (address sp + _Workspace + $a0_F); $a1 is appended to $i; $a2,$a3 are ignored.
+ .macro _M_DATA2 i, a, r0, r1, a0, a1, a2, a3
+ .set _Offset, _Workspace + \a0\()_F
+ \i\a1 \r0, \r1, [sp, #_Offset]
+ .endm
+
\ No newline at end of file
diff --git a/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S b/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
new file mode 100644
index 0000000..75d6711
--- /dev/null
+++ b/dl/sp/src/arm/armv7/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
@@ -0,0 +1,260 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of
+@// armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S to support float
+@// instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
+@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+@// It implements the "scaled"(by 1/2) version of the above formula.
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+
+
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+#define count r8
+#define diffMinusOne r2
+#define round r3
+
+#define pOut1 r2
+#define size r7
+#define step r3
+#define step1 r6
+#define twStep r12
+#define pTwiddleTmp r14
+#define t0 r12
+
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define w0r s4
+#define w0i s5
+#define y0r s6
+#define y0i s7
+#define w1r s6
+#define w1i s7
+#define y1r s6 /*@// w1r,w1i*/
+#define y1i s7
+#define st0 s8
+#define st1 s9
+#define st2 s10
+#define st3 s11
+#define st4 s12
+#define st5 s13
+@// half = 0.5
+#define half s15
+
+
+
+
+
+ .MACRO FFTSTAGE scaled, inverse,name
+
+ @// Build 0.5f (bit pattern 0x3f000000) in N and copy it to half.
+ movw N, #0x0000
+ movt N, #0x3f00
+ vmov.f32 half, N @// half = 0.5
+
+ @// Read the FFT size N from the spec structure
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @// Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ @// From here on pFFTSpec is not read again; its register is reused as pOut1.
+ MOV size,N,ASR #1 @// size = N/2; N itself is not modified here
+
+ MOV step,size,LSL #3 @// step = N/2 * 8 bytes
+ ADD pTwiddleTmp,pTwiddle,#8 @// W^2
+
+ ADD pOut1,pOut,step @// pOut1 = pOut+ N/2*8 bytes
+ @// twStep = 3N/8 * 8 bytes pointing to W^1
+ SUB twStep,step,size,LSL #1
+ MOV step1,size,LSL #2 @// step1 = N/4 * 8 = N/2*4 bytes (reuses the register of N)
+ SUB step1,step1,#8 @// (N/4-1)*8 bytes
+ ADD argTwiddle,pTwiddle,twStep @// W^1
+
+ @// Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
+ @// Note: W^(k) is stored as negated value and also need to
+ @// conjugate the values from the table
+
+ @// Z(0) : no need of twiddle multiply
+ @// Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
+
+
+ add pSrc, step @// step = N/2*8 bytes
+ vldm.f32 pSrc, {x1r, x1i} @// {x1r, x1i} = [pSrc, step]
+ sub pSrc, step
+ vldm.f32 pSrc!, {x0r, x0i}
+
+ SUBS size,size,#2 @// flags are consumed by BLT/BEQ below
+
+ vadd.f32 st0, x0r, x1r @// a+c
+ vsub.f32 st1, x0r, x1r @// a-c
+ vmov.f32 x0r, st0
+ vmov.f32 x1r, st1
+ vsub.f32 st0, x0i, x1i @// b-d
+ vadd.f32 x1i, x0i, x1i @// b+d
+ vmov.f32 x0i, st0
+
+
+ vsub.f32 x0r,x0r,x1i @// Z(0).r
+ vadd.f32 x0i,x0i,x1r @// Z(0).i
+
+ vmul.f32 x0r, half
+ vmul.f32 x0i, half
+ vstm.f32 pOut1!, {x0r, x0i} @// pOut1 = pOut+ N/2*8 bytes
+
+ @// N/2 < 2: nothing left to do. N/2 == 2: only the last element remains.
+ BLT end\name
+ BEQ lastElement\name
+
+ ASR size,size,#1 @// each loop pass below produces two outputs
+evenOddButterflyLoop\name:
+
+ SUB step,step,#16 @// (N/2-2)*8 bytes
+
+ add pSrc, step @// (N/2-1)*8 bytes
+ vldm.f32 pSrc, {x1r, x1i} @// {x1r, x1i} = [pSrc, step]
+ sub pSrc, step
+ vldm.f32 pSrc!, {x0r, x0i}
+ add argTwiddle, step1
+ vldm.f32 argTwiddle, {w1r, w1i} @// {w1r, w1i} = [argTwiddle, step1]
+ sub argTwiddle, step1
+ vldm.f32 argTwiddle!, {w0r, w0i}
+
+ SUB step1,step1,#8
+ SUBS size,size,#1 @// loop counter; flags are consumed by BGT at loop end
+
+
+ vsub.f32 st2,x0r,x1r @// a-c
+ vadd.f32 st3,x0i,x1i @// b+d
+ vadd.f32 st0,x0r,x1r @// a+c
+ vsub.f32 st1,x0i,x1i @// b-d
+
+ vmul.f32 x1r,w1r,st2
+ vmul.f32 x1i,w1r,st3
+ vmls.f32 x1r,w1i,st3
+ vmla.f32 x1i,w1i,st2
+
+ vadd.f32 y1r,st0,x1i @// F(N/2 -1)
+ vsub.f32 y1i,x1r,st1 @// y1r,y1i same as w1r, w1i
+
+
+ vmul.f32 x0r,w0r,st2
+ vmul.f32 x0i,w0r,st3
+ vmla.f32 x0r,w0i,st3
+ vmls.f32 x0i,w0i,st2
+
+
+ vadd.f32 st4,st0,x0i @// F(1)
+ vsub.f32 st5,st1,x0r
+
+
+ vmul.f32 y1r, half
+ vmul.f32 y1i, half
+ vmul.f32 st4, half
+ vmul.f32 st5, half
+ add pOut1, step @// (N/2-1)*8 bytes
+ vstm.f32 pOut1, {y1r, y1i} @// {y1r,y1i} = [pOut1, step]
+ sub pOut1, step
+ vstm.f32 pOut1!, {st4, st5}
+
+ MOV t0,argTwiddle @// swap ptr for even and odd twiddles
+ MOV argTwiddle,pTwiddleTmp
+ MOV pTwiddleTmp,t0
+
+ BGT evenOddButterflyLoop\name
+
+
+ @// Last element can be expanded as follows
+ @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)]
+ @// (since W^k is stored as -ve)
+ @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+ @// 1/2[2a+j0] + j (c-jd) [0+j2b]
+ @// (a+bc, -bd)
+ @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name:
+ vldm.f32 pSrc, {x0r, x0i}
+
+ vneg.f32 x0i, x0i
+ vstm.f32 pOut1, {x0r, x0i}
+end\name:
+
+
+ .endm
+
+
+@ Structure offsets for FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Entry point; the body is the FFTSTAGE expansion, which ignores its first two arguments.
+ M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp,r4
+ FFTSTAGE "FALSE","TRUE",Inv
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+ @// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
new file mode 100644
index 0000000..c2feb0b
--- /dev/null
+++ b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
@@ -0,0 +1,145 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define pPingPongBuf r5
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define pDstBuf r3 /*@// Temporarily hold pingpong buffer ptr*/
+#define grpSize r14
+#define outPointStep r12
+#define setCount r14
+#define pointStep r12
+
+@// Real and Imaginary parts
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define y1r s4
+#define y1i s5
+#define y0r s6
+#define y0i s7
+
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ @// Update subFFTSize, subFFTNum and grpSize right away so that their
+ @// registers hold the values for this stage
+
+ mov subFFTSize, #2
+ lsr grpSize, subFFTNum, #1
+ mov subFFTNum, grpSize
+
+ @// Each complex float point occupies 8 bytes
+ @// pointStep = grpSize * 8 bytes = distance between the two butterfly inputs
+ @// Note: outPointStep = pointStep for the first stage (same register)
+ @// Note: setCount shares its register with grpSize: the loop runs grpSize times
+ MOV pointStep,grpSize,LSL #3
+
+
+
+ @// Loop on the sets for grp zero
+
+grpZeroSetLoop\name:
+
+ add pSrc, pSrc, pointStep
+ @// {x1r,x1i} = [pSrc, pointStep]
+ vldm.f32 pSrc, {x1r, x1i}
+ sub pSrc, pSrc, pointStep
+ vldm.f32 pSrc!, {x0r, x0i}
+
+ SUBS setCount,setCount,#1 @// decrement the loop counter
+
+
+
+ vsub.f32 y1r,x0r,x1r
+ vsub.f32 y1i,x0i,x1i
+
+ vadd.f32 y0r,x0r,x1r
+ vadd.f32 y0i,x0i,x1i
+
+ add pDst, pDst, outPointStep
+ @// {y1r,y1i} -> [pDst, outPointStep]
+ vstm pDst, {y1r, y1i}
+ sub pDst, pDst, outPointStep
+ vstm pDst!, {y0r, y0i}
+
+ BGT grpZeroSetLoop\name
+
+
+ @// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep @// pSrc = start of the data just written
+ mov pDst, pPingPongBuf
+
+ .endm
+
+ @// Forward entry point. This FFTSTAGE body does not use its scaled/inverse arguments.
+ M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+ @// Inverse entry point. Same instructions as the forward one; only the label suffix differs.
+ M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+@// ENDIF @//ARM1136JS
+
+
+@// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
new file mode 100644
index 0000000..3bd4725
--- /dev/null
+++ b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
@@ -0,0 +1,213 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a first stage Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define pPingPongBuf r5
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize r14
+#define outPointStep r12
+#define setStep r3
+#define setCount r14 /*@// Reuse grpSize as setCount*/
+#define pointStep r12
+
+@// Real and Imaginary parts
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define x2r s4
+#define x2i s5
+#define x3r s6
+#define x3i s7
+#define t3r s0 /*@// Temporarily hold x3r and x3i*/
+#define t3i s1
+#define sr s8
+#define si s9
+
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ @// Define stack arguments (none are used by this stage)
+
+
+ @// Update subFFTSize, subFFTNum and grpSize right away so that their
+ @// registers hold the values for this stage
+ mov subFFTSize, #4
+ lsr grpSize, subFFTNum, #2
+ mov subFFTNum, grpSize
+
+
+ @// Each complex float point occupies 8 bytes
+ @// pointStep = grpSize * 8 bytes = distance between consecutive butterfly inputs
+ @// Note: outPointStep = pointStep for the first stage (same register)
+ @// Note: setCount shares its register with grpSize: the loop runs grpSize times
+ MOV pointStep,grpSize,LSL #3
+
+
+ @// Calculate the step of input data for the next set
+ @//MOV setStep,pointStep,LSL #1
+ MOV setStep,grpSize,LSL #4
+ @// setStep = 3*pointStep
+ ADD setStep,setStep,pointStep
+ @// setStep = - 3*pointStep+8
+ RSB setStep,setStep,#8
+
+ @// grp = 0 a special case since all the twiddle factors are 1
+ @// Loop on the sets
+
+grpZeroSetLoop\name:
+
+ vldm.f32 pSrc, {x0r, x0i}
+ add pSrc, pSrc, pointStep
+ vldm.f32 pSrc, {x1r, x1i}
+ add pSrc, pSrc, pointStep
+ vldm.f32 pSrc, {x2r, x2i}
+ add pSrc, pSrc, pointStep
+ vldm.f32 pSrc, {x3r, x3i}
+ add pSrc, pSrc, setStep
+
+
+ @// Decrement setcount (flags are consumed by BGT at the end of the loop)
+ SUBS setCount,setCount,#1
+
+
+
+ @// finish first stage of 4 point FFT
+
+ vadd.f32 x0r,x0r,x2r @// x0 = x0 + x2
+ vadd.f32 x0i,x0i,x2i
+
+ vadd.f32 sr, x2r, x2r
+ vadd.f32 si, x2i, x2i
+ vsub.f32 x2r,x0r,sr @// x2 = x0 - x2
+ vsub.f32 x2i,x0i,si
+
+ vadd.f32 x1r,x1r,x3r @// x1 = x1 + x3
+ vadd.f32 x1i,x1i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x1r,sr @// x3 = x1 - x3
+ vsub.f32 x3i,x1i,si
+
+
+ @// finish second stage of 4 point FFT
+
+
+ vadd.f32 x0r,x0r,x1r @// x0 = x0 + x1
+ vadd.f32 x0i,x0i,x1i
+
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vsub.f32 x1r,x0r,sr @// x1 = x0 - x1
+ vsub.f32 x1i,x0i,si
+
+ vstm.f32 pDst, {x0r, x0i}
+ add pDst, pDst, outPointStep
+
+ vadd.f32 x2r,x2r,x3i @// x2 = x2 - j*x3
+ vsub.f32 x2i,x2i,x3r
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 t3r, x2r, si @// t3 = (old x2) + j*x3; t3 reuses the x0 registers,
+ vadd.f32 t3i, x2i, sr @// which were already stored above
+
+ .ifeqs "\inverse", "TRUE"
+ vstm.f32 pDst, {t3r, t3i}
+ add pDst, pDst, outPointStep
+ vstm.f32 pDst, {x1r, x1i}
+ add pDst, pDst, outPointStep
+ vstm.f32 pDst, {x2r, x2i}
+ add pDst, pDst, setStep
+ .else
+ vstm.f32 pDst, {x2r, x2i}
+ add pDst, pDst, outPointStep
+ vstm.f32 pDst, {x1r, x1i}
+ add pDst, pDst, outPointStep
+ vstm.f32 pDst, {t3r, t3i}
+ add pDst, pDst, setStep
+ .endif
+
+
+ BGT grpZeroSetLoop\name
+
+
+ @// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep @// pSrc = start of the data just written
+ mov pDst, pPingPongBuf
+
+ .endm
+
+ @// Forward entry point: FFTSTAGE is expanded with inverse = "FALSE".
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+ @// Inverse entry point: FFTSTAGE is expanded with inverse = "TRUE".
+ M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+@// ENDIF @//ARM1136JS
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+ .end
diff --git a/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
new file mode 100644
index 0000000..00e48d1
--- /dev/null
+++ b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
@@ -0,0 +1,310 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpCount r12
+#define step r12 /*@// Reuse grpCount*/
+#define outPointStep r3
+#define setCount r8
+#define diff r9
+#define pointStep r14
+
+#define t1 r3 /*@// Reuse outPointStep*/
+
+@// Real and Imaginary parts used in the inner grp loop
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define x2r s4
+#define x2i s5
+#define x3r s6
+#define x3i s7
+
+@// Temporary reg to hold the twiddle multiplies
+
+#define t0r s8
+#define t0i s9
+#define t2r s10
+#define t2i s11
+#define sr s12
+#define si s13
+
+
+
+
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ @// Define stack arguments (none are used by this stage)
+
+
+ @// Update grpCount, subFFTSize and subFFTNum right away so that their
+ @// registers hold the values for this stage
+
+ LSL grpCount,subFFTSize,#2
+ lsr subFFTNum, subFFTNum, #2
+ mov subFFTSize, grpCount
+
+
+ @// Each complex float point occupies 8 bytes
+ @// pointStep = 2*subFFTNum for now; it is scaled to bytes after the multiply below
+ mov pointStep, subFFTNum, lsl #1
+
+
+ @// pOut0+1 increments pOut0 by 8 bytes
+ @// outPointStep = grpCount * 2 * subFFTNum bytes: the byte distance between
+ @// the four outputs of one butterfly
+
+ @// Use setCount as dummy for the high word. It is set correctly below.
+ smull outPointStep, setCount, grpCount, pointStep
+
+ LSL pointStep,pointStep,#2 @// pointStep = subFFTNum * 8 bytes
+
+
+ MOV setCount,pointStep,LSR #3
+
+ @// Interchange grpLoop and setLoop
+
+setLoop\name:
+
+ MOV step,#0
+ @// Set pSrc and pDst for the grpLoop
+
+ SUB diff,outPointStep,pointStep
+
+ @// NOTE: setCount is not spilled to the stack here; it keeps its own register
+
+ ADD pSrc,pSrc,diff,LSL #2 @// pSrc += (grpCount-1)*grpStep
+ ADD pDst,pDst,diff @// pDst += (grpCount-1)*setCount
+ ADD step,step,diff @// step += (grpCount-1)*setCount
+
+
+
+ @// Loop on the grps
+
+grpLoop\name:
+
+
+
+ @// butterfly loop
+ add pSrc, pointStep
+ vldm.f32 pSrc, {x3r, x3i} @// data[1]
+ add pTwiddle, step
+ vldm.f32 pTwiddle, {x1r, x1i} @// coef[1]
+ add pTwiddle, step
+ vldm.f32 pTwiddle, {x2r, x2i} @// coef[2]
+ add pSrc, pointStep
+ vldm.f32 pSrc, {x0r, x0i} @// data[2]
+
+ @// do first complex multiply
+ vmul.f32 t0r, x3r, x1r
+ vmul.f32 t0i, x3i, x1r
+
+ .ifeqs "\inverse", "TRUE"
+ vmla.f32 t0r, x3i, x1i
+ vmls.f32 t0i, x3r, x1i
+ vmov.f32 x1r, t0r
+ vmov.f32 x1i, t0i
+ .else
+ vmls.f32 t0r, x3i, x1i
+ vmla.f32 t0i, x3r, x1i
+ vmov.f32 x1r, t0r
+ vmov.f32 x1i, t0i
+ .endif
+
+ add pTwiddle, pTwiddle, step
+ vldm pTwiddle, {x3r, x3i} @// coef[3]
+ sub pTwiddle, pTwiddle, step
+
+ @// do second complex multiply
+ vmul.f32 t0r, x0r, x2r
+ vmul.f32 t0i, x0i, x2r
+
+ .ifeqs "\inverse", "TRUE"
+ vmla.f32 t0r, x0i, x2i
+ vmls.f32 t0i, x0r, x2i
+ vmov.f32 x2r, t0r
+ vmov.f32 x2i, t0i
+ .else
+ vmls.f32 t0r, x0i, x2i
+ vmla.f32 t0i, x0r, x2i
+ vmov.f32 x2r, t0r
+ vmov.f32 x2i, t0i
+ .endif
+
+ add pSrc, pointStep
+ vldm pSrc, {x0r, x0i} @// data[3]
+ sub pSrc, pointStep
+
+ SUB pTwiddle,pTwiddle,step,LSL #1 @// reset pTwiddle
+ SUBS step,step,pointStep @// decrement loop counter (flags used by SUBGE/BGE below)
+
+ @// do third complex multiply
+ SUB pSrc,pSrc,pointStep,LSL #1 @// reset pSrc to data[0]
+ vmul.f32 t0r, x0r, x3r
+ vmul.f32 t0i, x0i, x3r
+
+ .ifeqs "\inverse", "TRUE"
+ vmla.f32 t0r, x0i, x3i
+ vmls.f32 t0i, x0r, x3i
+ vmov.f32 x3r, t0r
+ vmov.f32 x3i, t0i
+ .else
+ vmls.f32 t0r, x0i, x3i
+ vmla.f32 t0i, x0r, x3i
+ vmov.f32 x3r, t0r
+ vmov.f32 x3i, t0i
+ .endif
+
+ vldm pSrc, {x0r, x0i} @// data[0]
+
+ @// finish first stage of 4 point FFT
+ vadd.f32 x0r,x0r,x2r @// x0 = x0 + x2 (u0)
+ vadd.f32 x0i,x0i,x2i
+
+ vadd.f32 sr, x2r, x2r
+ vadd.f32 si, x2i, x2i
+ vsub.f32 x2r,x0r,sr @// x2 = x0 - x2 (u1)
+ vsub.f32 x2i,x0i,si
+
+ vadd.f32 x1r,x1r,x3r @// x1 = x1 + x3 (u2)
+ vadd.f32 x1i,x1i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x1r,sr @// x3 = x1 - x3 (u3)
+ vsub.f32 x3i,x1i,si
+
+
+ @// finish second stage of 4 point FFT
+
+ @// y0 = u1-u2 since twiddle's are stored as -ve values
+ vsub.f32 x2r,x2r,x1r
+ vsub.f32 x2i,x2i,x1i
+
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vadd.f32 x1r,x2r,sr @// y2 = u1+u2
+ vadd.f32 x1i,x2i,si
+ vstm pDst, {x2r, x2i} @// store y0
+
+ vsub.f32 x0r,x0r,x3i @// y3 = u0+ju3
+ vadd.f32 x0i,x0i,x3r
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vadd.f32 t2r,x0r,si @// y1 = u0-ju3
+ vsub.f32 t2i,x0i,sr @// t2 lives in its own registers (not shared with x2)
+
+ .ifeqs "\inverse", "TRUE"
+ add pDst, outPointStep
+ vstm pDst, {t2r, t2i} @// store y1
+ add pDst, outPointStep
+ vstm pDst, {x1r, x1i} @// store y2
+ add pDst, outPointStep
+ vstm pDst, {x0r, x0i} @// store y3
+ sub pDst, outPointStep
+ .else
+ add pDst, outPointStep
+ vstm pDst, {x0r, x0i} @// store y1
+ add pDst, outPointStep
+ vstm pDst, {x1r, x1i} @// store y2
+ add pDst, outPointStep
+ vstm pDst, {t2r, t2i} @// store y3
+ sub pDst, outPointStep
+ .endif
+
+ SUB pDst,pDst,outPointStep, LSL #1 @// reset pDst
+ @// update the pDst for the next grp
+ SUBGE pDst,pDst,pointStep
+ @// update the pSrc for the next grp
+ SUBGE pSrc,pSrc,pointStep,LSL #2
+
+
+ BGE grpLoop\name
+
+ ADD pSrc,pSrc,#8 @// pSrc += 1; for the next set
+ ADD pDst,pDst,#8 @// pDst += 1; for the next set
+
+ SUBS setCount,setCount,#1 @// decrement loop counter
+
+
+ BGT setLoop\name
+
+ @// Reset and Swap pSrc and pDst for the next stage
+ MOV t1,pDst
+ SUB pDst,pSrc,subFFTNum,LSL #3
+ SUB pSrc,t1,subFFTNum,LSL #3
+
+ .endm
+
+ @// Forward entry point: FFTSTAGE is expanded with inverse = "FALSE".
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+ @// Inverse entry point: FFTSTAGE is expanded with inverse = "TRUE".
+ M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+@// ENDIF @//ARM1136JS
+
+
+
+@// Guarding implementation by the processor name
+
+ .end
diff --git a/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
new file mode 100644
index 0000000..4ac2da4
--- /dev/null
+++ b/dl/sp/src/arm/armv7/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
@@ -0,0 +1,386 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a first stage Radix 8 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define subFFTNum r6
+#define subFFTSize r7
+#define pPingPongBuf r5
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize r14
+#define step1 r3
+#define step2 r8
+#define setCount r14 /*@// Reuse grpSize as setCount*/
+#define pointStep r12
+
+#define t0 r4
+@// Real and Imaginary parts
+
+#define x0r s0
+#define x0i s1
+#define x1r s2
+#define x1i s3
+#define x2r s4
+#define x2i s5
+#define x3r s6
+#define x3i s7
+#define t3r s8 /*@// Temporarily hold x3r and x3i*/
+#define t3i s9
+#define t1r s4
+#define t1i s5
+#define sr s10
+#define si s11
+#define roothalf s12
+
+@// Macros to load/store two float regs from/to a named stack slot. Both clobber t0.
+ .macro M_VSTM r0, r1, p
+ .set _Offset, _Workspace + \p\()_F
+ add t0, sp, #_Offset
+ vstm.f32 t0, {\r0, \r1}
+ .endm
+ @// M_VLDM: load float regs r0,r1 from the stack slot named p (address built in t0).
+ .macro M_VLDM r0, r1, p
+ .set _Offset, _Workspace + \p\()_F
+ add t0, sp, #_Offset
+ vldm.f32 t0, {\r0, \r1}
+ .endm
+
+@// Define constants
+
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ @// Define stack arguments (the pU* scratch slots are allocated by the callers of this macro)
+
+
+ @// Update subFFTSize, subFFTNum and grpSize right away so that their
+ @// registers hold the values for this stage
+
+ mov subFFTSize, #8
+ lsr grpSize, subFFTNum, #3
+ mov subFFTNum, grpSize
+
+
+ @// Each complex float point occupies 8 bytes
+ @// pointStep = grpSize * 8 bytes = distance between consecutive butterfly inputs
+ @// Note: setCount shares its register with grpSize, so the loop
+ @// runs grpSize times
+ MOV pointStep,grpSize,LSL #3
+
+
+ @// Calculate the step of input data for the next set (step1 = 2*pointStep)
+ MOV step1,grpSize,LSL #4
+ MOV step2,pointStep,LSL #3
+ SUB step2,step2,pointStep @// step2 = 7*pointStep
+
+
+ @// grp = 0 a special case since all the twiddle factors are 1
+ @// Loop on the sets
+
+ movw t0,#0x04f3
+ movt t0,#0x3f35
+ vmov.f32 roothalf, t0 @// roothalf = sqrt(1/2)
+
+grpZeroSetLoop\name:
+
+ vldm.f32 pSrc, {x0r, x0i} @// x0
+ add pSrc, step1
+ vldm.f32 pSrc, {x1r, x1i} @// x2
+ add pSrc, step1
+ vldm.f32 pSrc, {x2r, x2i} @// x4
+ add pSrc, step1
+ vldm.f32 pSrc, {x3r, x3i} @// x6
+ add pSrc, step1
+
+ SUB pSrc, pSrc, step2
+
+ @// finish first stage of 8 point FFT and save on stack
+
+ vadd.f32 x0r,x0r,x2r @// u0
+ vadd.f32 x0i,x0i,x2i
+
+ vadd.f32 sr, x2r, x2r
+ vadd.f32 si, x2i, x2i
+ vsub.f32 x2r,x0r,sr @// u1
+ vsub.f32 x2i,x0i,si
+
+ M_VSTM x0r,x0i, pU0
+ M_VSTM x2r,x2i, pU1
+
+ vadd.f32 x1r,x1r,x3r @// u4
+ vadd.f32 x1i,x1i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x1r,sr @// u5
+ vsub.f32 x3i,x1i,si
+
+ M_VSTM x1r,x1i, pU4
+ M_VSTM x3r,x3i, pU5
+
+
+ vldm pSrc, {x0r, x0i} @// x1
+ add pSrc, step1
+ vldm pSrc, {x1r, x1i} @// x3
+ add pSrc, step1
+ vldm pSrc, {x2r, x2i} @// x5
+ add pSrc, step1
+ vldm pSrc, {x3r, x3i} @// x7
+ add pSrc, #8
+
+ SUB pSrc, pSrc, step2
+
+ vadd.f32 x0r,x0r,x2r @// u2
+ vadd.f32 x0i,x0i,x2i
+
+ vadd.f32 sr, x2r, x2r
+ vadd.f32 si, x2i, x2i
+ vsub.f32 x2r,x0r,sr @// u3
+ vsub.f32 x2i,x0i,si
+
+ M_VSTM x2r,x2i, pU3
+
+ vadd.f32 x1r,x1r,x3r @// u6
+ vadd.f32 x1i,x1i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x1r,sr @// u7
+ vsub.f32 x3i,x1i,si
+
+ @// finish second and third stage of 8 point FFT
+
+ M_VSTM x3r,x3i, pU7
+ M_VLDM x2r,x2i, pU0
+
+ @// Decrement setcount (flags are consumed by BGT at the end of the loop)
+ SUBS setCount,setCount,#1
+ M_VLDM x3r,x3i, pU4
+
+ vadd.f32 x0r,x0r,x1r @// v4
+ vadd.f32 x0i,x0i,x1i
+
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vsub.f32 x1r,x0r,sr @// v6
+ vsub.f32 x1i,x0i,si
+
+ vadd.f32 x2r,x2r,x3r @// v0
+ vadd.f32 x2i,x2i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x2r,sr @// v2
+ vsub.f32 x3i,x2i,si
+
+
+
+ vadd.f32 x2r,x2r,x0r @// y0
+ vadd.f32 x2i,x2i,x0i
+
+ vadd.f32 sr, x0r, x0r
+ vadd.f32 si, x0i, x0i
+ vsub.f32 x0r,x2r,sr @// y4
+ vsub.f32 x0i,x2i,si
+
+ vstm pDst, {x2r, x2i} @// store y0
+ add pDst, step1
+
+ vadd.f32 x3r,x3r,x1i @// y6
+ vsub.f32 x3i,x3i,x1r
+
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vsub.f32 t1r,x3r,si @// t1r=x2r reg;t1i=x2i reg
+ vadd.f32 t1i,x3i,sr @// y2
+
+ .ifeqs "\inverse", "TRUE"
+ vstm pDst, {t1r, t1i} @// store y2
+ add pDst, step1
+ vstm pDst, {x0r, x0i} @// store y4
+ add pDst, step1
+ vstm pDst, {x3r, x3i} @// store y6
+ add pDst, step1
+ .else
+ vstm pDst, {x3r, x3i} @// store y2
+ add pDst, step1
+ vstm pDst, {x0r, x0i} @// store y4
+ add pDst, step1
+ vstm pDst, {t1r, t1i} @// store y6
+ add pDst, step1
+ .endif
+
+ SUB pDst, pDst, step2 @// set pDst to y1
+
+
+ M_VLDM x0r,x0i,pU1 @// Load u1,u3,u5,u7
+ M_VLDM x1r,x1i,pU5
+ M_VLDM x3r,x3i,pU7
+
+ vsub.f32 x0r,x0r,x1i @// v1
+ vadd.f32 x0i,x0i,x1r
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vadd.f32 t1r,x0r,si @// t1r=x2r reg;t1i=x2i reg
+ vsub.f32 t1i,x0i,sr @// v3
+
+ M_VLDM x1r,x1i,pU3
+
+ vsub.f32 x1r,x1r,x3i @// v5
+ vadd.f32 x1i,x1i,x3r
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vadd.f32 t3r,x1r,si @// t3 lives in its own registers (not shared with x3)
+ vsub.f32 t3i,x1i,sr @// v7
+
+ @// store v5 as (v5.r - v5.i,v5.r + v5.i)
+ @// store v7 as (v7.i + v7.r,v7.i - v7.r)
+
+ vadd.f32 x3r,t3i,t3r @// v7
+ vsub.f32 x3i,t3i,t3r
+
+ vsub.f32 x1r,x1r,x1i @// v5
+ vadd.f32 x1i, x1i
+ vadd.f32 x1i,x1r,x1i
+
+ vmul.f32 x3r, x3r, roothalf @// (v7.i + v7.r)*(1/sqrt(2))
+ vmul.f32 x3i, x3i, roothalf @// (v7.i - v7.r)*(1/sqrt(2))
+ vmul.f32 x1r, x1r, roothalf @// (v5.r - v5.i)*(1/sqrt(2))
+ vmul.f32 x1i, x1i, roothalf @// (v5.r + v5.i)*(1/sqrt(2))
+
+ vadd.f32 x2r,x2r,x3r @// y7
+ vadd.f32 x2i,x2i,x3i
+
+ vadd.f32 sr, x3r, x3r
+ vadd.f32 si, x3i, x3i
+ vsub.f32 x3r,x2r,sr @// y3
+ vsub.f32 x3i,x2i,si
+
+
+ vsub.f32 x0r,x0r,x1r @// y5
+ vsub.f32 x0i,x0i,x1i
+
+ vadd.f32 sr, x1r, x1r
+ vadd.f32 si, x1i, x1i
+ vadd.f32 x1r,x0r,sr @// y1
+ vadd.f32 x1i,x0i,si
+
+ .ifeqs "\inverse", "TRUE"
+ vstm pDst, {x1r, x1i} @// store y1
+ add pDst, step1
+ vstm pDst, {x3r, x3i} @// store y3
+ add pDst, step1
+ vstm pDst, {x0r, x0i} @// store y5
+ add pDst, step1
+ vstm pDst, {x2r, x2i} @// store y7
+ add pDst, #8
+ .else
+ vstm pDst, {x2r, x2i} @// store y1
+ add pDst, step1
+ vstm pDst, {x0r, x0i} @// store y3
+ add pDst, step1
+ vstm pDst, {x3r, x3i} @// store y5
+ add pDst, step1
+ vstm pDst, {x1r, x1i} @// store y7
+ add pDst, #8
+ .endif
+
+ SUB pDst, pDst, step2 @// update pDst for the next set
+
+
+ BGT grpZeroSetLoop\name
+
+
+ @// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep @// pSrc = start of the data just written
+ mov pDst, pPingPongBuf
+
+
+ .ENDM
+
+
+
+
+
+ @// Allocate stack memory required by the function:
+ @// six 8-byte scratch slots for the intermediate values u0,u1,u3,u4,u5,u7.
+ @// Ensure 8 byte alignment to use M_VLDM
+ M_ALLOC8 pU0, 8
+ M_ALLOC8 pU1, 8
+ M_ALLOC8 pU3, 8
+ M_ALLOC8 pU4, 8
+ M_ALLOC8 pU5, 8
+ M_ALLOC8 pU7, 8
+ @// Forward entry point: FFTSTAGE is expanded with inverse = "FALSE".
+ M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+ @// Allocate stack memory required by the function:
+ @// six 8-byte scratch slots for the intermediate values u0,u1,u3,u4,u5,u7.
+ @// Ensure 8 byte alignment to use M_VLDM
+ M_ALLOC8 pU0, 8
+ M_ALLOC8 pU1, 8
+ M_ALLOC8 pU3, 8
+ M_ALLOC8 pU4, 8
+ M_ALLOC8 pU5, 8
+ M_ALLOC8 pU7, 8
+ @// Inverse entry point: FFTSTAGE is expanded with inverse = "TRUE".
+ M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+
+@// Guarding implementation by the processor name
+
+
+ .END
diff --git a/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S b/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
new file mode 100644
index 0000000..25b4976
--- /dev/null
+++ b/dl/sp/src/arm/armv7/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
@@ -0,0 +1,161 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of omxSP_FFTFwd_CToC_SC32_Sfs_s.S
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a forward FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+ .extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers (the three arguments arrive in r0-r2)
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+@// NOTE: argTwiddle shares r1 with pDst, argDst/diffMinusOne share r2 with pFFTSpec, argScale shares r4 with pTwiddle, and N shares r6 with subFFTNum.
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+#define count r8
+#define diffMinusOne r2
+#define round r3
+
+#define x0r s0
+#define x0i s1
+
+@// omxSP_FFTFwd_CToC_FC32_Sfs_vfp: forward complex FFT driver (float, VFP).
+@// Reads N, pTwiddle and pBuf from the spec at pFFTSpec, derives order = 31 - CLZ(N),
+@// then dispatches to the external radix-2/4/8 stage routines. Every path ends at End, which returns OMX_Sts_NoErr in r0.
+ @// Allocate stack memory required by the function (no M_ALLOC slots are declared here)
+
+ @// Write function header
+ M_START omxSP_FFTFwd_CToC_FC32_Sfs_vfp,r11
+
+@ Structure offsets for FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Define stack arguments
+
+ @// Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @// Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ CLZ order,N @// N = 2^order
+ RSB order,order,#31 @// order = 31 - CLZ(N)
+ MOV subFFTSize,#1
+ @//MOV subFFTNum,N
+
+ @// The flags from this CMP drive the BGT and every LT-conditional instruction up to BLT End.
+ CMP order,#1
+ BGT orderGreaterthan1 @// order > 1
+ @// order = 0, 1
+ vldmlt.f32 pSrc, {x0r, x0i} @// order = 0 only: copy the single complex value to pDst
+ vstmlt.f32 pDst, {x0r, x0i}
+
+ MOVLT pSrc,pDst
+ BLT End
+
+ @// Handle order = 1
+ MOV argDst,pDst @// Set input args to fft stages
+ MOV argTwiddle,pTwiddle @// overwrites r1 (pDst), so pDst was copied to argDst first
+ BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ B End
+
+
+
+orderGreaterthan1:
+ @// Bit 1 of order picks the first-stage destination: pDst when set, else the spec buffer (pOut), with pDst then kept in pOut.
+ TST order, #2 @// Set input args to fft stages
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst @// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ @//check for even or odd order
+
+ @// NOTE: The following combination of BL's would work fine
+ @// even though the first BL would corrupt the flags. This is
+ @// because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ @// sets the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ @// Repeat the general radix-4 stage until subFFTNum is 1 (presumably updated by the stage routines -- TODO confirm).
+unscaledRadix4Loop:
+ CMP subFFTNum,#1
+ BEQ End
+ BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+ B unscaledRadix4Loop
+
+
+End:
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @// Write function tail
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+ @// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S b/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
new file mode 100644
index 0000000..dd1690a
--- /dev/null
+++ b/dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
@@ -0,0 +1,328 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute FFT for a real signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+ .extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers (the three arguments arrive in r0-r2)
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+@// Many aliases below share one physical register (e.g. N, subFFTNum, step1 and t4 are all r6), so liveness matters.
+@// N=1 case
+#define scaleMinusOne r2
+#define rnd r2
+#define zero r8
+#define Zero r9
+
+
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+#define count r8
+#define diffMinusOne r10
+#define round r3
+
+#define step r3
+#define step1 r6
+#define twStep r12
+#define pTwiddleTmp r14
+#define t0 r12
+#define t1 r14 /*@// pTwiddleTmp*/
+#define t2 r0
+#define t3 r1 /*@// pSrc,argTwiddle*/
+#define t4 r6
+#define t5 r7 /*@// step1,subFFTSize*/
+
+#define x0r s0
+#define x0i s1
+#define y0r s2
+#define y0i s3
+#define x1r s4
+#define x1i s5
+#define w1r s2
+#define w1i s3
+#define w0r s6
+#define w0i s7
+#define y1r s2 /*@// w1r,w1i*/
+#define y1i s3
+#define st0 s8
+#define st1 s9
+#define st2 s10
+#define st3 s11
+#define st4 s12
+#define st5 s13
+#define half s15
+
+@// omxSP_FFTFwd_RToCCS_F32_Sfs_vfp: forward FFT of a real float signal (VFP).
+@// Runs an N/2-point complex FFT through the external radix stage routines, then a fix-up pass
+@// that writes F(0), F(N/2), the paired F(k)/F(N/2-k) bins and the middle bin. Returns OMX_Sts_NoErr in r0.
+ @// Allocate stack memory required by the function (no M_ALLOC slots are declared here)
+
+
+
+ @// Write function header
+ M_START omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11
+
+@ Structure offsets for FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Define stack arguments
+
+ @// Setup half value (0x3f000000 is the IEEE-754 single-precision encoding of 0.5)
+ movw N, #0 @// Use N as a temp.
+ movt N, #0x3f00
+ vmov.f32 half, N
+
+ @// Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @// Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ @// N=1 Treat separately
+ CMP N,#1
+ BGT sizeGreaterThanOne
+ @// N<=1 is not supported: nothing is written to pDst
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+ B FunctionEnd
+
+sizeGreaterThanOne:
+ @// Do a N/2 point complex FFT including the scaling
+
+ MOV N,N,ASR #1 @// N/2 point complex FFT
+ CLZ order,N @// N = 2^order
+ RSB order,order,#31 @// order = 31 - CLZ(N)
+ MOV subFFTSize,#1
+ @//MOV subFFTNum,N
+
+ @// The flags from this CMP drive the BGT and the LT-conditional instructions through BLT FFTEnd.
+ CMP order,#1
+ BGT orderGreaterthan1 @// order > 1
+ vldmlt.f32 pSrc, {x0r, x0i} @// order = 0 only: copy the single complex value into the spec buffer
+ vstmlt.f32 pOut, {x0r, x0i}
+ MOVLT pSrc,pOut
+ MOVLT argDst,pDst
+ BLT FFTEnd
+ @// order = 1: the radix-2 stage targets the spec buffer (pOut) and pDst is handed over in pOut
+ MOV argDst,pOut @// Set input args to fft stages
+ MOV pOut,pDst @// Set input args to fft stages
+ MOV argTwiddle,pTwiddle
+
+ BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ B finalComplexToRealFixup
+
+orderGreaterthan1:
+ @// Bit 1 of order picks the first-stage destination: pDst when clear, else the spec buffer (pOut), with pDst then kept in pOut.
+ TST order, #2 @// Set input args to fft stages
+ MOVEQ argDst,pDst
+ MOVNE argDst,pOut
+ MOVNE pOut,pDst @// Pass the first stage dest in RN5
+ MOV argTwiddle,pTwiddle
+
+ @//check for even or odd order
+
+ @// NOTE: The following combination of BL's would work fine
+ @// even though the first BL would corrupt the flags. This is
+ @// because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
+ @// the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+
+unscaledRadix4Loop:
+ CMP subFFTNum,#1
+ BEQ FFTEnd
+ BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+ B unscaledRadix4Loop
+
+FFTEnd:
+finalComplexToRealFixup:
+ @// subFFTSize is 1 on the order = 0 path; on the BL paths it is presumably N/2 (complex points) -- TODO confirm.
+ @// step = N/2 * 8 bytes
+ MOV step,subFFTSize,LSL #3
+ @// twStep = 3N/8 * 8 bytes pointing to W^1
+ SUB twStep,step,subFFTSize,LSL #1
+ @// step1 = N/4 * 8 = N/2*4 bytes
+ MOV step1,subFFTSize,LSL #2
+ @// (N/4-1)*8 bytes
+ SUB step1,step1,#8
+
+ @// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+ @// 1/2 [(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
+ @// 1/2 [2a+j0] - j [0+j2b]
+ @// (a+b, 0)
+
+ @// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+ @// 1/2 [(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
+ @// 1/2 [2a+j0] + j [0+j2b]
+ @// (a-b, 0)
+
+ @// F(0) and F(N/2)
+ vldm.f32 pSrc!, {x0r, x0i}
+ vadd.f32 y0r,x0r,x0i @// F(0) = (Z0.r+Z0.i, 0)
+ vsub.f32 x0r,x0r,x0i @// F(N/2) = (Z0.r-Z0.i, 0)
+ mov t1, #0 @// t1 (r14) is dead here: it is rewritten as pTwiddleTmp below
+ vmov.f32 y0i, t1 @// exact +0.0 bit pattern; "x - x" would give NaN for a NaN/Inf x
+ vmov.f32 x0i, t1
+ add argDst, step
+ vstm.f32 argDst, {x0r, x0i} @// {x0r,x0i}->[argDst, step]
+ sub argDst, step
+ vstm.f32 argDst!, {y0r, y0i}
+
+ SUBS subFFTSize,subFFTSize,#2 @// flags feed BLT End / BEQ lastElement; the two ADDs below have no S suffix
+
+ ADD pTwiddleTmp,argTwiddle,#8 @// W^2
+ ADD argTwiddle,argTwiddle,twStep @// W^1
+ BLT End
+ BEQ lastElement
+
+
+ @// F(k) = 1/2 [Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
+ @// Process 2 elements at a time. E.g: F(1) and F(N/2-1) since
+ @// both of them require Z(1) and Z(N/2-1)
+
+ ASR subFFTSize,subFFTSize,#1
+evenOddButterflyLoop:
+
+ SUB step,step,#16 @// (N/2-2)*8 bytes
+
+ add pSrc, step
+ vldm.f32 pSrc, {x1r, x1i} @// {x1r, x1i} = [pSrc, step]
+ sub pSrc, step
+ vldm.f32 pSrc!, {x0r, x0i}
+ add argTwiddle, step1
+ vldm.f32 argTwiddle, {w1r, w1i} @// {w1r, w1i} = [argTwiddle, step1]
+ sub argTwiddle, step1
+ vldm.f32 argTwiddle!, {w0r, w0i} @// {w0r, w0i} = [argTwiddle], #8
+
+ SUB step1,step1,#8
+ SUBS subFFTSize,subFFTSize,#1
+ @// The SUBS above sets the flags for the BGT at the loop tail; the instructions in between carry no S suffix.
+ vsub.f32 st2,x0r,x1r @// a-c
+ vadd.f32 st3,x0i,x1i @// b+d
+ vadd.f32 st0,x0r,x1r @// a+c
+ vsub.f32 st1,x0i,x1i @// b-d
+
+ vmul.f32 x1r,w1r,st2
+ vmul.f32 x1i,w1r,st3
+ vmla.f32 x1r,w1i,st3 @// x1r = w1r*st2 + w1i*st3
+ @//RSB x1r,x1r,#0
+ vmls.f32 x1i,w1i,st2 @// x1i = w1r*st3 - w1i*st2
+ @// y1r/y1i share s2/s3 with w1r/w1i, so w1 must not be read after this point
+ vsub.f32 y1r, st0, x1i
+ vadd.f32 y1i, x1r, st1
+ vneg.f32 y1i, y1i
+
+ vmul.f32 x0r,w0r,st2
+ vmul.f32 x0i,w0r,st3
+ vmls.f32 x0r,w0i,st3 @// x0r = w0r*st2 - w0i*st3
+ vmla.f32 x0i,w0i,st2 @// x0i = w0r*st3 + w0i*st2
+
+ vsub.f32 st4,st0,x0i @// F(1)
+ vadd.f32 st5,x0r,st1
+
+ @// Apply the 1/2 factor to both output bins
+ vmul.f32 y1r, half
+ vmul.f32 y1i, half
+ vmul.f32 st4, half
+ vmul.f32 st5, half
+
+ add argDst, step
+ vstm.f32 argDst, {y1r, y1i} @// {y1r,y1i} -> [argDst,step]
+ sub argDst, step
+ vstm.f32 argDst!, {st4, st5}
+
+
+ MOV t0,argTwiddle @// swap ptr for even and odd twiddles
+ MOV argTwiddle,pTwiddleTmp
+ MOV pTwiddleTmp,t0
+
+ BGT evenOddButterflyLoop
+
+ @// Last element can be expanded as follows
+ @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+ @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+ @// 1/2[2a+j0] + j (c+jd) [0+j2b]
+ @// (a-bc, -bd)
+ @// The code below stores (x0r, -x0i) for this bin.
+lastElement:
+ vldm.f32 pSrc, {x0r, x0i}
+ vneg.f32 x0i, x0i
+ vstm.f32 argDst, {x0r, x0i}
+
+End:
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+FunctionEnd:
+ @// Write function tail
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+ @// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S b/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
new file mode 100644
index 0000000..d6a4765
--- /dev/null
+++ b/dl/sp/src/arm/armv7/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
@@ -0,0 +1,227 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of omxSP_FFTInv_CCSToR_S32_Sfs_s.s
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute an inverse FFT of a CCS-format complex signal, producing a real signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+ .extern armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers (the three arguments arrive in r0-r2)
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+
+@// NOTE: argTwiddle shares r1 with pDst, argDst shares r2 with pFFTSpec, argScale shares r4 with pTwiddle, and N shares r6 with subFFTNum.
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+@// Total num of radix stages required to complete the FFT
+#define count r8
+
+#define round r3
+
+#define x0r s0
+#define x0i s1
+#define y0r s2
+#define y0i s3
+#define x1r s4
+#define x1i s5
+#define w1r s2
+#define w1i s3
+#define w0r s6
+#define w0i s7
+#define y1r s2 /*@// w1r,w1i*/
+#define y1i s3
+#define st0 s8
+#define st1 s9
+#define st2 s10
+#define st3 s11
+#define st4 s12
+#define st5 s13
+#define fscale s2
+#define fone s3
+
+@// omxSP_FFTInv_CCSToR_F32_Sfs_vfp: inverse FFT driver (float, VFP).
+@// Calls the external pre-twiddle routine, runs an N/2-point complex IFFT through the external
+@// radix stage routines, then multiplies every complex value by 1/subFFTSize. Returns OMX_Sts_NoErr in r0.
+ @// Allocate stack memory required by the function (slots that keep pDst and pFFTSpec across the pre-twiddle call)
+ M_ALLOC4 pDstOnStack, 4
+ M_ALLOC4 pFFTSpecOnStack, 4
+
+ @// Write function header
+ M_START omxSP_FFTInv_CCSToR_F32_Sfs_vfp,r11
+
+@ Structure offsets for FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Define stack arguments
+
+ @// Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+
+
+ @// N=1 Treat separately (N <= 1 copies a single float from pSrc to pDst)
+ CMP N,#1
+ BGT sizeGreaterThanOne
+ vldr.f32 x0r, [pSrc]
+ vstr.f32 x0r, [pDst]
+
+ B End
+
+sizeGreaterThanOne:
+ M_STR pDst,pDstOnStack @// store all the pointers
+ M_STR pFFTSpec,pFFTSpecOnStack
+
+
+ @// Call the preTwiddle Radix2 stage before doing the complex IFFT
+
+ BL armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_vfp
+
+
+complexIFFT:
+ @// Reload everything from the spec: no register value is relied on across the call above.
+ M_LDR pFFTSpec,pFFTSpecOnStack
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ ASR N,N,#1 @// N/2 point complex IFFT
+ ADD pSrc,pOut,N,LSL #3 @// set pSrc as pOut1 = pBuf + (N/2)*8 bytes
+ M_LDR pDst,pDstOnStack
+
+ CLZ order,N @// N = 2^order
+ RSB order,order,#31 @// order = 31 - CLZ(N)
+ MOV subFFTSize,#1
+ @// The flags from this CMP drive the BGT and every LT-conditional instruction up to BLT FFTEnd.
+ CMP order,#1
+ BGT orderGreaterthan1 @// order > 1
+ vldmlt.f32 pSrc, {x0r, x0i} @// order = 0 only: copy the single complex value to pDst
+ vstmlt.f32 pDst, {x0r, x0i}
+
+ MOVLT pSrc,pDst
+ BLT FFTEnd
+
+ MOV argDst,pDst @// Set input args to fft stages
+ MOV argTwiddle,pTwiddle @// overwrites r1 (pDst), so pDst was copied to argDst first
+
+ BL armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ B FFTEnd
+
+
+orderGreaterthan1:
+ @// Bit 1 of order picks the first-stage destination: pDst when set, else the spec buffer (pOut), with pDst then kept in pOut.
+ TST order, #2 @// Set input args to fft stages
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst @// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+
+ @//check for even or odd order
+
+ @// NOTE: The following combination of BL's would work fine
+ @// even though the first BL would corrupt the flags. This is
+ @// because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
+ @// the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ BLNE armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ @// Repeat the general radix-4 stage until subFFTNum is 1 (presumably updated by the stage routines -- TODO confirm).
+unscaledRadix4Loop:
+ CMP subFFTNum,#1
+ BEQ FFTEnd
+ BL armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+ B unscaledRadix4Loop
+
+FFTEnd:
+ @// Scale pass: walks subFFTSize complex values in place starting at pSrc.
+ vldm.f32 pSrc, {x0r, x0i}
+
+ vmov.f32 fscale, subFFTSize
+ vcvt.f32.s32 fscale, fscale @// fscale = (float)subFFTSize
+ mov round, #1
+ vmov.f32 fone, round
+ vcvt.f32.s32 fone, fone @// fone = 1.0, built by converting integer 1
+ vdiv.f32 fscale, fone, fscale @// fscale = 1.0 / subFFTSize
+
+scaleFFTData: @// N = subFFTSize
+ SUBS subFFTSize,subFFTSize,#1 @// flags feed both vldmgt and BGT below
+ vmul.f32 x0r, x0r, fscale
+ vmul.f32 x0i, x0i, fscale
+ vstm.f32 pSrc!, {x0r, x0i}
+ vldmgt.f32 pSrc, {x0r, x0i} @// fetch the next value only while elements remain
+
+ BGT scaleFFTData
+
+
+End:
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @// Write function tail
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+ @// Guarding implementation by the processor name
+
+
+
+ .end
diff --git a/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S b/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S
new file mode 100644
index 0000000..64aa5da
--- /dev/null
+++ b/dl/sp/src/arm/armv7/omxSP_FFTInv_CToC_FC32_Sfs_s.S
@@ -0,0 +1,180 @@
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This is a modification of omxSP_FFTInv_CToC_SC32_Sfs_s.s
+@// to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/arm/armCOMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+@// M_VARIANTS ARM1136JS
+
+@// Import symbols required from other files
+@// (For example tables)
+
+ .extern armSP_FFTInv_CToC_FC32_Sfs_Radix2_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ .extern armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+@// IF ARM1136JS
+
+@//Input Registers (the three arguments arrive in r0-r2)
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+@// NOTE: argTwiddle shares r1 with pDst, argDst/diffMinusOne share r2 with pFFTSpec, argScale shares r4 with pTwiddle, and N shares r6 with subFFTNum.
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+#define count r8
+#define diffMinusOne r2
+#define round r3
+
+#define x0r s0
+#define x0i s1
+#define fone s2
+#define fscale s3
+@// omxSP_FFTInv_CToC_FC32_Sfs_vfp: inverse complex FFT driver (float, VFP). Dispatches to the external
+@// radix stage routines, then multiplies every complex value by 1/subFFTSize. Returns OMX_Sts_NoErr in r0.
+ @// Allocate stack memory required by the function (no M_ALLOC slots are declared here)
+
+ @// Write function header
+ M_START omxSP_FFTInv_CToC_FC32_Sfs_vfp,r11
+
+@ Structure offsets for FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Define stack arguments
+
+ @// Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @// Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ CLZ order,N @// N = 2^order
+ RSB order,order,#31 @// order = 31 - CLZ(N)
+ MOV subFFTSize,#1
+ @//MOV subFFTNum,N
+ @// The flags from this CMP drive the BGT and every LT-conditional instruction up to BLT FFTEnd.
+ CMP order,#1
+ BGT orderGreaterthan1 @// order > 1
+ @// Order = 0 or 1
+ vldmlt.f32 pSrc, {x0r, x0i} @// order = 0 only: copy the single complex value to pDst
+ vstmlt.f32 pDst, {x0r, x0i}
+
+ MOVLT pSrc,pDst
+ BLT FFTEnd
+
+ @// Handle order = 1
+ MOV argDst,pDst
+ MOV argTwiddle,pTwiddle @// overwrites r1 (pDst), so pDst was copied to argDst first
+
+ BL armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
+ B FFTEnd
+
+orderGreaterthan1:
+ @// Bit 1 of order picks the first-stage destination: pDst when set, else the spec buffer (pOut), with pDst then kept in pOut.
+ TST order, #2 @// Set input args to fft stages
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst @// Pass the first stage dest in RN5
+ MOV argTwiddle,pTwiddle
+
+
+ @//check for even or odd order
+ @// NOTE: The following combination of BL's would work fine
+ @// even though the first BL would corrupt the flags. This is
+ @// because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
+ @// the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
+ BLNE armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
+ @// Repeat the general radix-4 stage until subFFTNum is 1 (presumably updated by the stage routines -- TODO confirm).
+unscaledRadix4Loop:
+ CMP subFFTNum,#1
+ BEQ FFTEnd
+ BL armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
+ B unscaledRadix4Loop
+
+
+FFTEnd:
+ @// Scale pass: walks subFFTSize complex values in place starting at pSrc.
+ vldm.f32 pSrc, {x0r, x0i}
+
+ vmov.f32 fscale, subFFTSize
+ vcvt.f32.s32 fscale, fscale @// fscale = (float)subFFTSize
+ movw round, #0
+ movt round, #0x3f80 @// round = 1.0 (0x3f800000 as a single-precision bit pattern)
+ vmov.f32 fone, round
+ vdiv.f32 fscale, fone, fscale @// fscale = 1.0 / subFFTSize
+scaleFFTData: @// N = subFFTSize
+ SUBS subFFTSize,subFFTSize,#1 @// flags feed both vldmgt and bgt below; the add in between has no S suffix
+ vmul.f32 x0r, x0r, fscale
+ vmul.f32 x0i, x0i, fscale
+ vstm.f32 pSrc, {x0r, x0i}
+ add pSrc, #8
+ vldmgt.f32 pSrc, {x0r, x0i} @// fetch the next value only while elements remain
+
+ bgt scaleFFTData
+
+
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @// Write function tail
+ M_END
+
+@// ENDIF @//ARM1136JS
+
+
+ @// Guarding implementation by the processor name
+
+
+
+ .end