Add 16-bit FFT routines as-is from OpenMAX DL.
These are the assembly routines for the 16-bit complex FFT, unchanged
from OpenMAX DL.
Review URL: https://webrtc-codereview.appspot.com/1101004
git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@3481 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
new file mode 100644
index 0000000..f321502
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
@@ -0,0 +1,162 @@
+;//
+;//
+;// File Name: armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision: 6693
+;// Last Modified Date: Tue, 10 Jul 2007
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Compute a Radix 2 FFT stage for a N point complex signal
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+
+
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+;//Input Registers
+
+pSrc RN 0
+pDst RN 2
+pTwiddle RN 1
+pPingPongBuf RN 5
+subFFTNum RN 6
+subFFTSize RN 7
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+pointStep RN 3
+outPointStep RN 3
+grpSize RN 4
+setCount RN 4
+step RN 8
+dstStep RN 8
+
+;// Neon Registers
+
+dX0 DN D0.S16
+dX1 DN D1.S16
+dY0 DN D2.S16
+dY1 DN D3.S16
+dX0S32 DN D0.S32
+dX1S32 DN D1.S32
+dY0S32 DN D2.S32
+dY1S32 DN D3.S32
+
+
+ MACRO
+ FFTSTAGE $scaled, $inverse, $name
+
+ ;// Define stack arguments
+
+
+ ;// update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
+
+
+ MOV subFFTSize,#2
+ LSR grpSize,subFFTNum,#1
+ MOV subFFTNum,grpSize
+
+
+ ;// pT0+1 increments pT0 by 8 bytes
+ ;// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
+ ;// Note: outPointStep = pointStep for the first stage
+ ;// Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
+
+ MOV pointStep,grpSize,LSL #2
+ RSB step,pointStep,#4
+
+
+ ;// Loop on the sets for grp zero: 1 set at a time
+
+grpZeroSetLoop$name
+
+ VLD1 {dX0S32[0]},[pSrc],pointStep
+ VLD1 {dX1S32[0]},[pSrc],step ;// step = -pointStep + 4
+ SUBS setCount,setCount,#1 ;// decrement the loop counter
+
+ IF $scaled
+
+ VHADD dY0,dX0,dX1 ;// y0 = (x0 + x1) >> 1 (scaled butterfly)
+ VHSUB dY1,dX0,dX1 ;// y1 = (x0 - x1) >> 1
+
+ ELSE
+
+ VADD dY0,dX0,dX1 ;// y0 = x0 + x1 (unscaled butterfly)
+ VSUB dY1,dX0,dX1 ;// y1 = x0 - x1
+
+
+ ENDIF
+
+ VST1 {dY0S32[0]},[pDst],outPointStep
+ VST1 {dY1S32[0]},[pDst],dstStep ;// dstStep = step = -pointStep + 4
+
+ BGT grpZeroSetLoop$name
+
+
+ ;// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep ;// pSrc = pDst - 2*grpSize bytes
+ MOV pDst,pPingPongBuf
+
+ MEND
+
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{FALSE},FWD
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{TRUE},INV
+ M_END
+
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{FALSE},FWDSFS
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{TRUE},INVSFS
+ M_END
+
+
+ ENDIF ;//CORTEXA8
+
+
+ END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
new file mode 100644
index 0000000..0932099
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
@@ -0,0 +1,202 @@
+;//
+;//
+;// File Name: armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision: 6741
+;// Last Modified Date: Wed, 18 Jul 2007
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Compute a Radix 2 FFT stage for a N point complex signal
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+;// Guarding implementation by the processor name
+
+
+
+
+
+
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+;//Input Registers
+
+pSrc RN 0
+pDst RN 2
+pTwiddle RN 1
+subFFTNum RN 6
+subFFTSize RN 7
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+
+outPointStep RN 3
+grpCount RN 4
+dstStep RN 5
+pTmp RN 4
+step RN 8
+
+;// Neon Registers
+
+dWr DN D0.S16
+dWi DN D1.S16
+dXr0 DN D2.S16
+dXi0 DN D3.S16
+dXr1 DN D4.S16
+dXi1 DN D5.S16
+dYr0 DN D6.S16
+dYi0 DN D7.S16
+dYr1 DN D8.S16
+dYi1 DN D9.S16
+qT0 QN Q5.S32
+qT1 QN Q6.S32
+
+
+ MACRO
+ FFTSTAGE $scaled, $inverse, $name
+
+
+ MOV outPointStep,subFFTSize,LSL #2
+ ;// Update grpCount and grpSize right away
+
+ MOV subFFTNum,#1 ;//after the last stage
+ LSL grpCount,subFFTSize,#1
+
+ ;// update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+
+ SUB step,outPointStep,#4 ;// step = -4+outPointStep
+ RSB dstStep,step,#0 ;// dstStep = -4-outPointStep+8 = -step
+ ;//RSB dstStep,outPointStep,#16
+
+
+ ;// Loop on 2 grps at a time for the last stage
+
+grpLoop$name
+ VLD2 {dWr[0],dWi[0]},[pTwiddle]! ;// grp 0
+ VLD2 {dWr[1],dWi[1]},[pTwiddle]! ;// grp 1
+
+ ;//VLD2 {dWr,dWi},[pTwiddle],#16
+
+ VLD4 {dXr0[0],dXi0[0],dXr1[0],dXi1[0]},[pSrc]! ;// grp 0
+ VLD4 {dXr0[1],dXi0[1],dXr1[1],dXi1[1]},[pSrc]! ;// grp 1
+
+
+ ;//VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc],#32
+ SUBS grpCount,grpCount,#4 ;// grpCount is multiplied by 2
+
+ IF $inverse
+ VMULL qT0,dXr1,dWr
+ VMLAL qT0,dXi1,dWi ;// real part
+ VMULL qT1,dXi1,dWr
+ VMLSL qT1,dXr1,dWi ;// imag part
+
+ ELSE
+ VMULL qT0,dXr1,dWr
+ VMLSL qT0,dXi1,dWi ;// real part
+ VMULL qT1,dXi1,dWr
+ VMLAL qT1,dXr1,dWi ;// imag part
+
+ ENDIF
+
+ VRSHRN dXr1,qT0,#15
+ VRSHRN dXi1,qT1,#15
+
+
+ IF $scaled
+
+ VHSUB dYr0,dXr0,dXr1
+ VHSUB dYi0,dXi0,dXi1
+ VHADD dYr1,dXr0,dXr1
+ VHADD dYi1,dXi0,dXi1
+
+ ELSE
+
+ VSUB dYr0,dXr0,dXr1
+ VSUB dYi0,dXi0,dXi1
+ VADD dYr1,dXr0,dXr1
+ VADD dYi1,dXi0,dXi1
+
+
+ ENDIF
+
+ VST2 {dYr0[0],dYi0[0]},[pDst]!
+ VST2 {dYr0[1],dYi0[1]},[pDst],step ;// step = -4+outPointStep
+
+ VST2 {dYr1[0],dYi1[0]},[pDst]!
+ VST2 {dYr1[1],dYi1[1]},[pDst],dstStep ;// dstStep = -4-outPointStep+8 = -step
+
+ ;//VST2 {dYr0,dYi0},[pDst],outPointStep
+ ;//VST2 {dYr1,dYi1},[pDst],dstStep ;// dstStep = step = -outPointStep + 16
+
+ BGT grpLoop$name
+
+
+ ;// Reset and Swap pSrc and pDst for the next stage
+ MOV pTmp,pDst
+ SUB pDst,pSrc,outPointStep,LSL #1 ;// pDst -= 2*size; pSrc -= 4*size bytes
+ SUB pSrc,pTmp,outPointStep
+
+ ;// Reset pTwiddle for the next stage
+ SUB pTwiddle,pTwiddle,outPointStep ;// pTwiddle -= 2*size bytes
+
+ MEND
+
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{FALSE},FWD
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{TRUE},INV
+ M_END
+
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{FALSE},FWDSFS
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{TRUE},INVSFS
+ M_END
+
+
+ ENDIF ;//CORTEXA8
+
+
+ END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
new file mode 100644
index 0000000..49bf607
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
@@ -0,0 +1,209 @@
+;//
+;//
+;// File Name: armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision: 6740
+;// Last Modified Date: Wed, 18 Jul 2007
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Compute a Radix 2 FFT stage for a N point complex signal
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+;//Input Registers
+
+pSrc RN 0
+pDst RN 2
+pTwiddle RN 1
+subFFTNum RN 6
+subFFTSize RN 7
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+outPointStep RN 3
+grpCount RN 4
+dstStep RN 5
+twStep RN 8
+pTmp RN 4
+
+;// Neon Registers
+
+dW1S32 DN D0.S32
+dW2S32 DN D1.S32
+dW1 DN D0.S16
+dW2 DN D1.S16
+
+dX0 DN D2.S16
+dX1 DN D3.S16
+dX2 DN D4.S16
+dX3 DN D5.S16
+dY0 DN D6.S16
+dY1 DN D7.S16
+dY2 DN D8.S16
+dY3 DN D9.S16
+qT0 QN Q5.S32
+qT1 QN Q6.S32
+
+
+ MACRO
+ FFTSTAGE $scaled, $inverse, $name
+
+ ;// Define stack arguments
+
+
+ ;// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+
+
+ LSL grpCount,subFFTSize,#1
+
+
+ ;// update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+
+ ;// pOut0+1 increments pOut0 by 8 bytes
+ ;// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
+ SMULBB outPointStep,grpCount,subFFTNum
+ MOV twStep,subFFTNum,LSL #1
+ LSR subFFTNum,subFFTNum,#1 ;//grpSize
+
+
+ RSB dstStep,outPointStep,#8
+
+
+ ;// Note: pointStep is 8 in this case: so no need of an extra reg
+ ;// Loop on the groups: 2 groups at a time
+
+grpLoop$name
+
+ VLD1 dW1S32[],[pTwiddle],twStep ;//[wi | wr]
+ VLD1 dW2S32[],[pTwiddle],twStep
+
+ ;// Process the sets for each grp: 2 sets at a time (no set looping required)
+
+ VLD1 dX0,[pSrc]! ;// point0: of set0,set1 of grp0
+ VLD1 dX1,[pSrc]! ;// point1: of set0,set1 of grp0
+ VLD1 dX2,[pSrc]! ;// point0: of set0,set1 of grp1
+ VLD1 dX3,[pSrc]! ;// point1: of set0,set1 of grp1
+
+ SUBS grpCount,grpCount,#4 ;// decrement the loop counter
+ VUZP dW1,dW2
+ VUZP dX1,dX3
+
+ IF $inverse
+ VMULL qT0,dX1,dW1
+ VMLAL qT0,dX3,dW2 ;// real part
+ VMULL qT1,dX3,dW1
+ VMLSL qT1,dX1,dW2 ;// imag part
+
+ ELSE
+ VMULL qT0,dX1,dW1
+ VMLSL qT0,dX3,dW2 ;// real part
+ VMULL qT1,dX3,dW1
+ VMLAL qT1,dX1,dW2 ;// imag part
+
+ ENDIF
+
+ VRSHRN dX1,qT0,#15
+ VRSHRN dX3,qT1,#15
+
+ VZIP dX1,dX3
+
+
+ IF $scaled
+
+ VHSUB dY0,dX0,dX1
+ VHADD dY1,dX0,dX1
+ VHSUB dY2,dX2,dX3
+ VHADD dY3,dX2,dX3
+
+ ELSE
+
+ VSUB dY0,dX0,dX1
+ VADD dY1,dX0,dX1
+ VSUB dY2,dX2,dX3
+ VADD dY3,dX2,dX3
+
+
+
+ ENDIF
+
+ VST1 dY0,[pDst],outPointStep ;// point0: of set0,set1 of grp0
+ VST1 dY1,[pDst],dstStep ;// dstStep = -outPointStep + 8
+ VST1 dY2,[pDst],outPointStep ;// point0: of set0,set1 of grp1
+ VST1 dY3,[pDst],dstStep ;// point1: of set0,set1 of grp1
+
+
+ BGT grpLoop$name
+
+
+ ;// Reset and Swap pSrc and pDst for the next stage
+ MOV pTmp,pDst
+ SUB pDst,pSrc,outPointStep,LSL #1 ;// pDst -= 2*size; pSrc -= 4*size bytes
+ SUB pSrc,pTmp,outPointStep
+
+ ;// Reset pTwiddle for the next stage
+ SUB pTwiddle,pTwiddle,outPointStep ;// pTwiddle -= 2*size bytes
+
+ MEND
+
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{FALSE},FWD
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{TRUE},INV
+ M_END
+
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{FALSE},FWDSFS
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{TRUE},INVSFS
+ M_END
+
+
+ ENDIF ;//CORTEXA8
+
+
+ END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
new file mode 100644
index 0000000..133b137
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
@@ -0,0 +1,214 @@
+;//
+;//
+;// File Name: armSP_FFT_CToC_SC16_Radix2_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision: 5892
+;// Last Modified Date: Thu, 07 Jun 2007
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Compute a Radix 2 FFT stage for a N point complex signal
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+
+
+
+
+ ;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+
+;//Input Registers
+
+pSrc RN 0
+pDst RN 2
+pTwiddle RN 1
+subFFTNum RN 6
+subFFTSize RN 7
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+outPointStep RN 3
+pointStep RN 4
+grpCount RN 5
+setCount RN 8
+step RN 10
+dstStep RN 11
+pTmp RN 9
+
+;// Neon Registers
+
+dW DN D0.S16
+dX0 DN D2.S16
+dX1 DN D3.S16
+dX2 DN D4.S16
+dX3 DN D5.S16
+dY0 DN D6.S16
+dY1 DN D7.S16
+dY2 DN D8.S16
+dY3 DN D9.S16
+qT0 QN Q3.S32
+qT1 QN Q4.S32
+
+
+
+ MACRO
+ FFTSTAGE $scaled, $inverse, $name
+
+ ;// Define stack arguments
+
+
+ ;// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+
+ LSR subFFTNum,subFFTNum,#1 ;//grpSize
+ LSL grpCount,subFFTSize,#1
+
+
+ ;// pT0+1 increments pT0 by 8 bytes
+ ;// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
+ MOV pointStep,subFFTNum,LSL #1
+
+ ;// update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+
+ ;// pOut0+1 increments pOut0 by 8 bytes
+ ;// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
+ SMULBB outPointStep,grpCount,pointStep
+ LSL pointStep,pointStep,#1
+
+
+ RSB step,pointStep,#16
+ RSB dstStep,outPointStep,#16
+
+ ;// Loop on the groups
+
+grpLoop$name
+
+ VLD1 dW,[pTwiddle],pointStep ;//[wi | wr]
+ MOV setCount,pointStep,LSR #2
+
+
+ ;// Loop on the sets: 4 at a time
+
+
+setLoop$name
+
+
+ VLD2 {dX0,dX1},[pSrc],pointStep ;// point0: dX0-real part dX1-img part
+ VLD2 {dX2,dX3},[pSrc],step ;// point1: dX2-real part dX3-img part
+
+ SUBS setCount,setCount,#4
+
+ IF $inverse
+ VMULL qT0,dX2,dW[0]
+ VMLAL qT0,dX3,dW[1] ;// real part
+ VMULL qT1,dX3,dW[0]
+ VMLSL qT1,dX2,dW[1] ;// imag part
+
+ ELSE
+
+ VMULL qT0,dX2,dW[0]
+ VMLSL qT0,dX3,dW[1] ;// real part
+ VMULL qT1,dX3,dW[0]
+ VMLAL qT1,dX2,dW[1] ;// imag part
+
+ ENDIF
+
+ VRSHRN dX2,qT0,#15
+ VRSHRN dX3,qT1,#15
+
+ IF $scaled
+ VHSUB dY0,dX0,dX2
+ VHSUB dY1,dX1,dX3
+ VHADD dY2,dX0,dX2
+ VHADD dY3,dX1,dX3
+
+ ELSE
+ VSUB dY0,dX0,dX2
+ VSUB dY1,dX1,dX3
+ VADD dY2,dX0,dX2
+ VADD dY3,dX1,dX3
+
+ ENDIF
+
+ VST2 {dY0,dY1},[pDst],outPointStep
+ VST2 {dY2,dY3},[pDst],dstStep ;// dstStep = -outPointStep + 16
+
+ BGT setLoop$name
+
+ SUBS grpCount,grpCount,#2
+ ADD pSrc,pSrc,pointStep
+ BGT grpLoop$name
+
+
+ ;// Reset and Swap pSrc and pDst for the next stage
+ MOV pTmp,pDst
+ SUB pDst,pSrc,outPointStep,LSL #1 ;// pDst -= 2*size; pSrc -= 4*size bytes
+ SUB pSrc,pTmp,outPointStep
+
+ ;// Reset pTwiddle for the next stage
+ SUB pTwiddle,pTwiddle,outPointStep ;// pTwiddle -= 2*size bytes
+
+
+ MEND
+
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{FALSE},FWD
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{TRUE},INV
+ M_END
+
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{FALSE},FWDSFS
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{TRUE},INVSFS
+ M_END
+
+
+
+ ENDIF ;//CORTEXA8
+
+
+
+ END
+
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
new file mode 100644
index 0000000..82662e6
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
@@ -0,0 +1,306 @@
+;//
+;//
+;// File Name: armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision: 7761
+;// Last Modified Date: Wed, 26 Sep 2007
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Compute a first stage Radix 4 FFT stage for a N point complex signal
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+
+
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+;//Input Registers
+
+pSrc RN 0
+pDst RN 2
+pTwiddle RN 1
+pPingPongBuf RN 5
+subFFTNum RN 6
+subFFTSize RN 7
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+grpSize RN 3
+setCount RN 3 ;// Reuse grpSize as setCount
+pointStep RN 4
+outPointStep RN 4
+setStep RN 8
+step1 RN 9
+step3 RN 10
+
+;// Neon Registers
+
+dXr0 DN D0.S16
+dXi0 DN D1.S16
+dXr1 DN D2.S16
+dXi1 DN D3.S16
+dXr2 DN D4.S16
+dXi2 DN D5.S16
+dXr3 DN D6.S16
+dXi3 DN D7.S16
+dYr0 DN D8.S16
+dYi0 DN D9.S16
+dYr1 DN D10.S16
+dYi1 DN D11.S16
+dYr2 DN D12.S16
+dYi2 DN D13.S16
+dYr3 DN D14.S16
+dYi3 DN D15.S16
+dZr0 DN D16.S16
+dZi0 DN D17.S16
+dZr1 DN D18.S16
+dZi1 DN D19.S16
+dZr2 DN D20.S16
+dZi2 DN D21.S16
+dZr3 DN D22.S16
+dZi3 DN D23.S16
+qY0 QN Q4.S16
+qY2 QN Q6.S16
+qX0 QN Q0.S16
+qX2 QN Q2.S16
+
+qY1 QN Q5.S16
+qY3 QN Q7.S16
+qX1 QN Q1.S16
+qX3 QN Q3.S16
+qZ0 QN Q8.S16
+qZ1 QN Q9.S16
+
+
+ MACRO
+ FFTSTAGE $scaled, $inverse, $name
+
+ ;// Define stack arguments
+
+ MOV pointStep,subFFTNum
+ ;// Update pSubFFTSize and pSubFFTNum regs
+
+
+ VLD2 {dXr0,dXi0},[pSrc@128],pointStep ;// data[0]
+ ;// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+ LSR grpSize,subFFTNum,#2
+ MOV subFFTNum,grpSize
+
+
+ ;// pT0+1 increments pT0 by 4 bytes
+ ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
+ ;// Note: outPointStep = pointStep for the first stage
+ VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
+
+
+ ;// Calculate the step of input data for the next set
+ ;//MOV setStep,pointStep,LSL #1
+ MOV setStep,grpSize,LSL #3
+ VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
+ MOV step1,setStep
+ ADD setStep,setStep,pointStep ;// setStep = 3*pointStep
+ RSB setStep,setStep,#16 ;// setStep = - 3*pointStep+16
+
+
+ VLD2 {dXr3,dXi3},[pSrc@128],setStep ;// data[3]
+ MOV subFFTSize,#4 ;// subFFTSize = 4 for the first stage
+
+
+ IF $scaled
+ VHADD qY0,qX0,qX2 ;// u0
+ ELSE
+ VADD qY0,qX0,qX2 ;// u0
+ ENDIF
+ RSB step3,pointStep,#0
+
+ ;// grp = 0 a special case since all the twiddle factors are 1
+ ;// Loop on the sets: 4 sets at a time
+
+grpZeroSetLoop$name
+
+
+ IF $scaled
+
+ ;// finish first stage of 4 point FFT
+
+ VHSUB qY2,qX0,qX2 ;// u1
+ SUBS setCount,setCount,#4 ;// decrement the set loop counter
+
+ VLD2 {dXr0,dXi0},[pSrc@128],step1 ;// data[0]
+ VHADD qY1,qX1,qX3 ;// u2
+ VLD2 {dXr2,dXi2},[pSrc@128],step3
+ VHSUB qY3,qX1,qX3 ;// u3
+
+
+
+ ;// finish second stage of 4 point FFT
+
+ VLD2 {dXr1,dXi1},[pSrc@128],step1 ;// data[1]
+ VHADD qZ0,qY0,qY1 ;// y0
+
+ VLD2 {dXr3,dXi3},[pSrc@128],setStep
+
+
+ IF $inverse
+
+ VHSUB dZr3,dYr2,dYi3 ;// y3
+ VHADD dZi3,dYi2,dYr3
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+
+ VHSUB qZ1,qY0,qY1 ;// y2
+ VST2 {dZr3,dZi3},[pDst@128],outPointStep
+
+ VHADD dZr2,dYr2,dYi3 ;// y1
+ VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VHSUB dZi2,dYi2,dYr3
+
+ VHADD qY0,qX0,qX2 ;// u0 (next loop)
+ VST2 {dZr2,dZi2},[pDst@128],setStep
+
+
+ ELSE
+
+ VHADD dZr2,dYr2,dYi3 ;// y1
+ VHSUB dZi2,dYi2,dYr3
+
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VHSUB qZ1,qY0,qY1 ;// y2
+
+ VST2 {dZr2,dZi2},[pDst@128],outPointStep
+ VHSUB dZr3,dYr2,dYi3 ;// y3
+ VHADD dZi3,dYi2,dYr3
+ VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VHADD qY0,qX0,qX2 ;// u0 (next loop)
+ VST2 {dZr3,dZi3},[pDst@128],setStep
+
+ ENDIF
+
+
+ ELSE
+
+ ;// finish first stage of 4 point FFT
+
+ VSUB qY2,qX0,qX2 ;// u1
+ SUBS setCount,setCount,#4 ;// decrement the set loop counter
+
+ VLD2 {dXr0,dXi0},[pSrc@128],step1 ;// data[0]
+ VADD qY1,qX1,qX3 ;// u2
+ VLD2 {dXr2,dXi2},[pSrc@128],step3
+ VSUB qY3,qX1,qX3 ;// u3
+
+
+
+ ;// finish second stage of 4 point FFT
+
+ VLD2 {dXr1,dXi1},[pSrc@128],step1 ;// data[1]
+ VADD qZ0,qY0,qY1 ;// y0
+
+ VLD2 {dXr3,dXi3},[pSrc@128],setStep
+
+
+ IF $inverse
+
+ VSUB dZr3,dYr2,dYi3 ;// y3
+ VADD dZi3,dYi2,dYr3
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+
+ VSUB qZ1,qY0,qY1 ;// y2
+ VST2 {dZr3,dZi3},[pDst@128],outPointStep
+
+ VADD dZr2,dYr2,dYi3 ;// y1
+ VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VSUB dZi2,dYi2,dYr3
+
+ VADD qY0,qX0,qX2 ;// u0 (next loop)
+ VST2 {dZr2,dZi2},[pDst@128],setStep
+
+
+ ELSE
+
+ VADD dZr2,dYr2,dYi3 ;// y1
+ VSUB dZi2,dYi2,dYr3
+
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VSUB qZ1,qY0,qY1 ;// y2
+
+ VST2 {dZr2,dZi2},[pDst@128],outPointStep
+ VSUB dZr3,dYr2,dYi3 ;// y3
+ VADD dZi3,dYi2,dYr3
+ VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VADD qY0,qX0,qX2 ;// u0 (next loop)
+ VST2 {dZr3,dZi3},[pDst@128],setStep
+
+ ENDIF
+
+
+ ENDIF
+
+ BGT grpZeroSetLoop$name
+
+
+ ;// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep ;// pSrc = pDst - grpSize bytes
+ MOV pDst,pPingPongBuf
+
+
+ MEND
+
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{FALSE},FWD
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{TRUE},INV
+ M_END
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{FALSE},FWDSFS
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{TRUE},INVSFS
+ M_END
+
+
+ ENDIF ;//CortexA8
+
+
+
+ END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
new file mode 100644
index 0000000..ce324f5
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
@@ -0,0 +1,403 @@
+;//
+;//
+;// File Name: armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision: 7765
+;// Last Modified Date: Thu, 27 Sep 2007
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Compute a Radix 4 FFT stage for a N point complex signal
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+ INCLUDE armSP_FFT_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+;// Guarding implementation by the processor name
+
+
+
+
+
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+ ;//IMPORT armAAC_constTable
+
+;//Input Registers
+
+pSrc RN 0
+pDst RN 2
+pTwiddle RN 1
+subFFTNum RN 6
+subFFTSize RN 7
+
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+outPointStep RN 3
+grpCount RN 4
+dstStep RN 5
+pw1 RN 8
+pw2 RN 9
+pw3 RN 10
+pTmp RN 4
+
+
+;// Neon Registers
+
+dButterfly1Real02 DN D0.S16
+dButterfly1Imag02 DN D1.S16
+dButterfly1Real13 DN D2.S16
+dButterfly1Imag13 DN D3.S16
+dButterfly2Real02 DN D4.S16
+dButterfly2Imag02 DN D5.S16
+dButterfly2Real13 DN D6.S16
+dButterfly2Imag13 DN D7.S16
+dXr0 DN D0.S16
+dXi0 DN D1.S16
+dXr1 DN D2.S16
+dXi1 DN D3.S16
+dXr2 DN D4.S16
+dXi2 DN D5.S16
+dXr3 DN D6.S16
+dXi3 DN D7.S16
+
+dW1rS32 DN D8.S32
+dW1iS32 DN D9.S32
+dW2rS32 DN D10.S32
+dW2iS32 DN D11.S32
+dW3rS32 DN D12.S32
+dW3iS32 DN D13.S32
+
+dW1r DN D8.S16
+dW1i DN D9.S16
+dW2r DN D10.S16
+dW2i DN D11.S16
+dW3r DN D12.S16
+dW3i DN D13.S16
+
+dTmp0 DN D12.S16
+dTmp1 DN D13.S16
+dTmp1S32 DN D13.S32
+dTmp2S32 DN D14.S32
+dTmp3S32 DN D15.S32
+
+dYr0 DN D18.S16
+dYi0 DN D19.S16
+dYr1 DN D16.S16
+dYi1 DN D17.S16
+dYr2 DN D20.S16
+dYi2 DN D21.S16
+dYr3 DN D14.S16
+dYi3 DN D15.S16
+qY0 QN Q9.S16
+qY1 QN Q8.S16
+qY2 QN Q10.S16
+qY3 QN Q7.S16
+
+qX0 QN Q0.S16
+qX1 QN Q1.S16
+qX2 QN Q2.S16
+qX3 QN Q3.S16
+
+qT0 QN Q9.S32
+qT1 QN Q10.S32
+qT2 QN Q7.S32
+qT3 QN Q8.S32
+
+dZr0 DN D22.S16
+dZi0 DN D23.S16
+dZr1 DN D24.S16
+dZi1 DN D25.S16
+dZr2 DN D26.S16
+dZi2 DN D27.S16
+dZr3 DN D28.S16
+dZi3 DN D29.S16
+
+qZ0 QN Q11.S16
+qZ1 QN Q12.S16
+qZ2 QN Q13.S16
+qZ3 QN Q14.S16
+
+
+ MACRO
+ FFTSTAGE $scaled, $inverse , $name
+
+ ;// Define stack arguments
+
+ MOV pw2,pTwiddle
+ VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2@256]!
+
+ MOV pw3,pTwiddle
+ MOV pw1,pTwiddle
+ ;// pOut0+1 increments pOut0 by 8 bytes
+ ;// pOut0+outPointStep == increment of 4*outPointStep bytes
+ MOV outPointStep,subFFTSize,LSL #2
+
+ VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3@64]!
+ MOV subFFTNum,#1 ;//after the last stage
+ LSL grpCount,subFFTSize,#2
+
+
+ ;// Update grpCount and grpSize rightaway
+ VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3@64]!
+
+ ;// update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+ MOV dstStep,outPointStep,LSL #1
+
+ VLD2 {dW1r,dW1i}, [pw1@128]!
+
+
+ ADD dstStep,dstStep,outPointStep ;// dstStep = 3*outPointStep
+ RSB dstStep,dstStep,#16 ;// dstStep = - 3*outPointStep+16
+
+ VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+ VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+
+ ;// Process 4 groups at a time
+
+grpLoop$name
+
+
+ ;// Rearrange the third twiddle
+ VUZP dW3r,dW3i
+ SUBS grpCount,grpCount,#16 ;// grpCount is multiplied by 4
+
+
+ VUZP dButterfly1Real13, dButterfly2Real13 ;// B.r D.r
+ VUZP dButterfly1Imag13, dButterfly2Imag13 ;// B.i D.i
+ VUZP dButterfly1Real02, dButterfly2Real02 ;// A.r C.r
+ VUZP dButterfly1Imag02, dButterfly2Imag02 ;// A.i C.i
+
+
+ IF $inverse
+ VMULL qT0,dXr1,dW1r
+ VMLAL qT0,dXi1,dW1i ;// real part
+ VMULL qT1,dXi1,dW1r
+ VMLSL qT1,dXr1,dW1i ;// imag part
+
+ ELSE
+ VMULL qT0,dXr1,dW1r
+ VMLSL qT0,dXi1,dW1i ;// real part
+ VMULL qT1,dXi1,dW1r
+ VMLAL qT1,dXr1,dW1i ;// imag part
+
+ ENDIF
+
+ ;// Load the first twiddle for 4 groups : w^1
+ ;// w^1 twiddle (i+0,i+1,i+2,i+3) for group 0,1,2,3
+
+ VLD2 {dW1r,dW1i}, [pw1@128]!
+
+ IF $inverse
+ VMULL qT2,dXr2,dW2r
+ VMLAL qT2,dXi2,dW2i ;// real part
+ VMULL qT3,dXi2,dW2r
+ VMLSL qT3,dXr2,dW2i ;// imag part
+
+ ELSE
+ VMULL qT2,dXr2,dW2r
+ VMLSL qT2,dXi2,dW2i ;// real part
+ VMULL qT3,dXi2,dW2r
+ VMLAL qT3,dXr2,dW2i ;// imag part
+
+ ENDIF
+
+ VRSHRN dZr1,qT0,#15
+ VRSHRN dZi1,qT1,#15
+
+
+
+ IF $inverse
+ VMULL qT0,dXr3,dW3r
+ VMLAL qT0,dXi3,dW3i ;// real part
+ VMULL qT1,dXi3,dW3r
+ VMLSL qT1,dXr3,dW3i ;// imag part
+
+ ELSE
+ VMULL qT0,dXr3,dW3r
+ VMLSL qT0,dXi3,dW3i ;// real part
+ VMULL qT1,dXi3,dW3r
+ VMLAL qT1,dXr3,dW3i ;// imag part
+
+ ENDIF
+
+ ;// Load the second twiddle for 4 groups : w^2
+ ;// w^2 twiddle (2i+0,2i+2,2i+4,2i+6) for group 0,1,2,3
+ VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2@256]!
+
+
+ VRSHRN dZr2,qT2,#15
+ VRSHRN dZi2,qT3,#15
+
+ ;// Load the third twiddle for 4 groups : w^3
+ ;// w^3 twiddle (3i+0,3i+3,3i+6,3i+9) for group 0,1,2,3
+
+ VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3@64]!
+
+ VRSHRN dZr3,qT0,#15
+ VRSHRN dZi3,qT1,#15
+
+ VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3@64]!
+
+ IF $scaled
+
+ ;// finish first stage of 4 point FFT
+
+ VHADD qY0,qX0,qZ2
+ VHSUB qY2,qX0,qZ2
+ VHADD qY1,qZ1,qZ3
+ VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+
+ VHSUB qY3,qZ1,qZ3
+
+ ;// finish second stage of 4 point FFT
+
+ VHSUB qZ0,qY2,qY1
+ VHADD qZ2,qY2,qY1
+ VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+
+
+ IF $inverse
+
+ VHADD dZr3,dYr0,dYi3 ;// y3 = u0-ju3
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VHSUB dZi3,dYi0,dYr3
+
+ VHSUB dZr1,dYr0,dYi3 ;// y1 = u0+ju3
+ VHADD dZi1,dYi0,dYr3
+ VST2 {dZr3,dZi3},[pDst@128],outPointStep
+ VST2 {dZr2,dZi2},[pDst@128],outPointStep
+ VST2 {dZr1,dZi1},[pDst@128],dstStep ;// dstStep = -3*outPointStep + 16
+
+ ELSE
+
+ VHSUB dZr1,dYr0,dYi3 ;// y1 = u0+ju3
+ VHADD dZi1,dYi0,dYr3
+
+ VHADD dZr3,dYr0,dYi3 ;// y3 = u0-ju3
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VHSUB dZi3,dYi0,dYr3
+ VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VST2 {dZr2,dZi2},[pDst@128],outPointStep
+ VST2 {dZr3,dZi3},[pDst@128],dstStep ;// dstStep = -3*outPointStep + 16
+
+ ENDIF
+
+ ELSE
+
+ ;// finish first stage of 4 point FFT
+
+ VADD qY0,qX0,qZ2
+ VSUB qY2,qX0,qZ2
+ VADD qY1,qZ1,qZ3
+ VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+
+ VSUB qY3,qZ1,qZ3
+
+ ;// finish second stage of 4 point FFT
+
+ VSUB qZ0,qY2,qY1
+ VADD qZ2,qY2,qY1
+ VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+
+
+ IF $inverse
+
+ VADD dZr3,dYr0,dYi3 ;// y3 = u0-ju3
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VSUB dZi3,dYi0,dYr3
+
+ VSUB dZr1,dYr0,dYi3 ;// y1 = u0+ju3
+ VADD dZi1,dYi0,dYr3
+ VST2 {dZr3,dZi3},[pDst@128],outPointStep
+ VST2 {dZr2,dZi2},[pDst@128],outPointStep
+ VST2 {dZr1,dZi1},[pDst@128],dstStep ;// dstStep = -3*outPointStep + 16
+
+ ELSE
+
+ VSUB dZr1,dYr0,dYi3 ;// y1 = u0+ju3
+ VADD dZi1,dYi0,dYr3
+
+ VADD dZr3,dYr0,dYi3 ;// y3 = u0-ju3
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VSUB dZi3,dYi0,dYr3
+ VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VST2 {dZr2,dZi2},[pDst@128],outPointStep
+ VST2 {dZr3,dZi3},[pDst@128],dstStep ;// dstStep = -3*outPointStep + 16
+
+ ENDIF
+
+
+
+
+ ENDIF
+
+ BGT grpLoop$name
+
+
+ ;// Reset and Swap pSrc and pDst for the next stage
+ MOV pTmp,pDst
+ SUB pSrc,pSrc,#64 ;// Extra increment currently done in the loop
+ SUB pDst,pSrc,outPointStep,LSL #2 ;// pDst -= size; pSrc -= 4*size bytes
+ SUB pSrc,pTmp,outPointStep
+
+ MEND
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{FALSE},FWD
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{TRUE},INV
+ M_END
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{FALSE},FWDSFS
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{TRUE},INVSFS
+ M_END
+
+
+ ENDIF ;//CortexA8
+
+
+
+
+ END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
new file mode 100644
index 0000000..c13df04
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
@@ -0,0 +1,392 @@
+;//
+;//
+;// File Name: armSP_FFT_CToC_SC16_Radix4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision: 7761
+;// Last Modified Date: Wed, 26 Sep 2007
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Compute a Radix 4 FFT stage for a N point complex signal
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+ INCLUDE armSP_FFT_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+;// Guarding implementation by the processor name
+
+
+
+ ;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+;//Input Registers
+
+pSrc RN 0
+pDst RN 2
+pTwiddle RN 1
+subFFTNum RN 6
+subFFTSize RN 7
+
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+grpCount RN 3
+pointStep RN 4
+outPointStep RN 5
+stepTwiddle RN 12
+setCount RN 14
+srcStep RN 8
+setStep RN 9
+dstStep RN 10
+twStep RN 11
+t1 RN 3
+
+;// Neon Registers
+
+dW1 DN D0.S16
+dW2 DN D1.S16
+dW3 DN D2.S16
+
+dXr0 DN D4.S16
+dXi0 DN D5.S16
+dXr1 DN D6.S16
+dXi1 DN D7.S16
+dXr2 DN D8.S16
+dXi2 DN D9.S16
+dXr3 DN D10.S16
+dXi3 DN D11.S16
+dYr0 DN D12.S16
+dYi0 DN D13.S16
+dYr1 DN D14.S16
+dYi1 DN D15.S16
+dYr2 DN D16.S16
+dYi2 DN D17.S16
+dYr3 DN D18.S16
+dYi3 DN D19.S16
+qT0 QN Q8.S32
+qT1 QN Q9.S32
+qT2 QN Q6.S32
+qT3 QN Q7.S32
+
+dZr0 DN D20.S16
+dZi0 DN D21.S16
+dZr1 DN D22.S16
+dZi1 DN D23.S16
+dZr2 DN D24.S16
+dZi2 DN D25.S16
+dZr3 DN D26.S16
+dZi3 DN D27.S16
+qY0 QN Q6.S16
+qY1 QN Q7.S16
+qY2 QN Q8.S16
+qY3 QN Q9.S16
+qX0 QN Q2.S16
+qZ0 QN Q10.S16
+qZ1 QN Q11.S16
+qZ2 QN Q12.S16
+qZ3 QN Q13.S16
+
+
+ MACRO
+ FFTSTAGE $scaled, $inverse , $name
+
+ ;// Define stack arguments
+
+
+ ;// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+
+ LSL grpCount,subFFTSize,#2
+ LSR subFFTNum,subFFTNum,#2
+ MOV subFFTSize,grpCount
+
+
+ ;// pOut0+1 increments pOut0 by 4 bytes
+ ;// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes
+
+ MOV stepTwiddle,#0
+ SMULBB outPointStep,grpCount,subFFTNum
+
+ ;// pT0+1 increments pT0 by 4 bytes
+ ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
+
+ LSL pointStep,subFFTNum,#2 ;// 2*grpSize
+
+ VLD1 dW1,[pTwiddle@64] ;//[wi | wr]
+ MOV srcStep,pointStep,LSL #1 ;// srcStep = 2*pointStep
+ VLD1 dW2,[pTwiddle@64] ;//[wi | wr]
+ ADD setStep,srcStep,pointStep ;// setStep = 3*pointStep
+ SUB srcStep,srcStep,#16 ;// srcStep = 2*pointStep-16
+ VLD1 dW3,[pTwiddle@64]
+ ;//RSB setStep,setStep,#16 ;// setStep = - 3*pointStep+16
+ RSB setStep,setStep,#0 ;// setStep = - 3*pointStep
+
+ MOV dstStep,outPointStep,LSL #1
+ ADD dstStep,dstStep,outPointStep ;// dstStep = 3*outPointStep
+ RSB dstStep,dstStep,#16 ;// dstStep = - 3*outPointStep+16
+
+
+
+grpLoop$name
+
+ VLD2 {dXr0,dXi0},[pSrc@128],pointStep ;// data[0]
+ ADD stepTwiddle,stepTwiddle,pointStep
+ VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
+ ADD pTwiddle,pTwiddle,stepTwiddle ;// set pTwiddle to the first point
+ VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
+ MOV twStep,stepTwiddle,LSL #2
+ VLD2 {dXr3,dXi3},[pSrc@128],setStep ;// data[3] & reset pSrc
+
+ SUB twStep,stepTwiddle,twStep ;// twStep = -3*stepTwiddle
+
+
+ MOV setCount,pointStep,LSR #2
+ ADD pSrc,pSrc,#16 ;// set pSrc to data[0] of the next set
+ ADD pSrc,pSrc,pointStep ;// increment to data[1] of the next set
+
+ ;// Loop on the sets : 4 at a time
+
+setLoop$name
+
+ SUBS setCount,setCount,#4 ;// decrement the loop counter
+
+ IF $inverse
+ VMULL qT0,dXr1,dW1[0]
+ VMLAL qT0,dXi1,dW1[1] ;// real part
+ VMULL qT1,dXi1,dW1[0]
+ VMLSL qT1,dXr1,dW1[1] ;// imag part
+
+ ELSE
+ VMULL qT0,dXr1,dW1[0]
+ VMLSL qT0,dXi1,dW1[1] ;// real part
+ VMULL qT1,dXi1,dW1[0]
+ VMLAL qT1,dXr1,dW1[1] ;// imag part
+
+ ENDIF
+
+ VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
+
+ IF $inverse
+ VMULL qT2,dXr2,dW2[0]
+ VMLAL qT2,dXi2,dW2[1] ;// real part
+ VMULL qT3,dXi2,dW2[0]
+ VMLSL qT3,dXr2,dW2[1] ;// imag part
+
+ ELSE
+ VMULL qT2,dXr2,dW2[0]
+ VMLSL qT2,dXi2,dW2[1] ;// real part
+ VMULL qT3,dXi2,dW2[0]
+ VMLAL qT3,dXr2,dW2[1] ;// imag part
+
+ ENDIF
+
+ VRSHRN dZr1,qT0,#15
+ VRSHRN dZi1,qT1,#15
+
+
+ VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
+
+ IF $inverse
+ VMULL qT0,dXr3,dW3[0]
+ VMLAL qT0,dXi3,dW3[1] ;// real part
+ VMULL qT1,dXi3,dW3[0]
+ VMLSL qT1,dXr3,dW3[1] ;// imag part
+
+ ELSE
+ VMULL qT0,dXr3,dW3[0]
+ VMLSL qT0,dXi3,dW3[1] ;// real part
+ VMULL qT1,dXi3,dW3[0]
+ VMLAL qT1,dXr3,dW3[1] ;// imag part
+
+ ENDIF
+
+ VRSHRN dZr2,qT2,#15
+ VRSHRN dZi2,qT3,#15
+
+
+ VRSHRN dZr3,qT0,#15
+ VRSHRN dZi3,qT1,#15
+ VLD2 {dXr3,dXi3},[pSrc@128],setStep ;// data[3] & update pSrc for the next set
+
+
+ IF $scaled
+
+ ;// finish first stage of 4 point FFT
+ VHADD qY0,qX0,qZ2
+ VHSUB qY2,qX0,qZ2
+
+ VLD2 {dXr0,dXi0},[pSrc@128]! ;// data[0]
+ VHADD qY1,qZ1,qZ3
+ VHSUB qY3,qZ1,qZ3
+
+
+ ;// finish second stage of 4 point FFT
+
+ IF $inverse
+
+ VHSUB qZ0,qY2,qY1
+
+ VHADD dZr2,dYr0,dYi3
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VHSUB dZi2,dYi0,dYr3
+
+ VHADD qZ1,qY2,qY1
+ VST2 {dZr2,dZi2},[pDst@128],outPointStep
+
+ VHSUB dZr3,dYr0,dYi3
+ VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VHADD dZi3,dYi0,dYr3
+ VST2 {dZr3,dZi3},[pDst@128],dstStep
+
+
+ ELSE
+
+ VHSUB qZ0,qY2,qY1
+
+ VHSUB dZr3,dYr0,dYi3
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VHADD dZi3,dYi0,dYr3
+
+ VHADD qZ1,qY2,qY1
+ VST2 {dZr3,dZi3},[pDst@128],outPointStep
+
+ VHADD dZr2,dYr0,dYi3
+ VHSUB dZi2,dYi0,dYr3
+ VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VST2 {dZr2,dZi2},[pDst@128],dstStep
+
+
+ ENDIF
+
+
+ ELSE
+
+ ;// finish first stage of 4 point FFT
+ VADD qY0,qX0,qZ2
+ VSUB qY2,qX0,qZ2
+
+ VLD2 {dXr0,dXi0},[pSrc]! ;// data[0]
+ VADD qY1,qZ1,qZ3
+ VSUB qY3,qZ1,qZ3
+
+
+ ;// finish second stage of 4 point FFT
+
+
+ IF $inverse
+
+ VSUB qZ0,qY2,qY1
+
+ VADD dZr2,dYr0,dYi3
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VSUB dZi2,dYi0,dYr3
+
+ VADD qZ1,qY2,qY1
+ VST2 {dZr2,dZi2},[pDst@128],outPointStep
+
+ VSUB dZr3,dYr0,dYi3
+ VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VADD dZi3,dYi0,dYr3
+ VST2 {dZr3,dZi3},[pDst@128],dstStep
+
+
+ ELSE
+
+ VSUB qZ0,qY2,qY1
+
+ VSUB dZr3,dYr0,dYi3
+ VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VADD dZi3,dYi0,dYr3
+
+ VADD qZ1,qY2,qY1
+ VST2 {dZr3,dZi3},[pDst@128],outPointStep
+
+ VADD dZr2,dYr0,dYi3
+ VSUB dZi2,dYi0,dYr3
+ VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VST2 {dZr2,dZi2},[pDst@128],dstStep
+
+
+ ENDIF
+
+
+
+ ENDIF
+
+ ADD pSrc,pSrc,pointStep ;// increment to data[1] of the next set
+ BGT setLoop$name
+
+ VLD1 dW1,[pTwiddle@64],stepTwiddle ;//[wi | wr]
+ SUBS grpCount,grpCount,#4 ;// subtract 4 since grpCount multiplied by 4
+ VLD1 dW2,[pTwiddle@64],stepTwiddle ;//[wi | wr]
+ ADD pSrc,pSrc,srcStep ;// increment pSrc for the next grp
+ VLD1 dW3,[pTwiddle@64],twStep ;//[wi | wr]
+
+
+
+ BGT grpLoop$name
+
+
+ ;// Reset and Swap pSrc and pDst for the next stage
+ MOV t1,pDst
+ SUB pDst,pSrc,outPointStep,LSL #2 ;// pDst -= size; pSrc -= 4*size bytes
+ SUB pSrc,t1,outPointStep
+
+
+ MEND
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{FALSE},FWD
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{TRUE},INV
+ M_END
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{FALSE},FWDSFS
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{TRUE},INVSFS
+ M_END
+
+
+ ENDIF ;//CortexA8
+
+
+
+ END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
new file mode 100644
index 0000000..741681f
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
@@ -0,0 +1,591 @@
+;//
+;//
+;// File Name: armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision: 7766
+;// Last Modified Date: Thu, 27 Sep 2007
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Compute a first stage Radix 8 FFT stage for a N point complex signal
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+
+
+
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+;//Input Registers
+
+pSrc RN 0
+pDst RN 2
+pTwiddle RN 1
+subFFTNum RN 6
+subFFTSize RN 7
+pPingPongBuf RN 5 ;// dest buffer for the next stage (not pSrc for first stage)
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+grpSize RN 3
+setCount RN 3 ;// Reuse grpSize as setCount
+pointStep RN 4
+outPointStep RN 4
+setStep RN 8
+step1 RN 9
+step2 RN 10
+t0 RN 11
+
+
+;// Neon Registers
+
+dXr0 DN D14.S16
+dXi0 DN D15.S16
+dXr1 DN D2.S16
+dXi1 DN D3.S16
+dXr2 DN D4.S16
+dXi2 DN D5.S16
+dXr3 DN D6.S16
+dXi3 DN D7.S16
+dXr4 DN D8.S16
+dXi4 DN D9.S16
+dXr5 DN D10.S16
+dXi5 DN D11.S16
+dXr6 DN D12.S16
+dXi6 DN D13.S16
+dXr7 DN D0.S16
+dXi7 DN D1.S16
+qX0 QN Q7.S16
+qX1 QN Q1.S16
+qX2 QN Q2.S16
+qX3 QN Q3.S16
+qX4 QN Q4.S16
+qX5 QN Q5.S16
+qX6 QN Q6.S16
+qX7 QN Q0.S16
+
+dUr0 DN D16.S16
+dUi0 DN D17.S16
+dUr2 DN D18.S16
+dUi2 DN D19.S16
+dUr4 DN D20.S16
+dUi4 DN D21.S16
+dUr6 DN D22.S16
+dUi6 DN D23.S16
+dUr1 DN D24.S16
+dUi1 DN D25.S16
+dUr3 DN D26.S16
+dUi3 DN D27.S16
+dUr5 DN D28.S16
+dUi5 DN D29.S16
+dUr7 DN D30.S16 ;// reuse dXr7 and dXi7
+dUi7 DN D31.S16
+qU0 QN Q8.S16
+qU1 QN Q12.S16
+qU2 QN Q9.S16
+qU3 QN Q13.S16
+qU4 QN Q10.S16
+qU5 QN Q14.S16
+qU6 QN Q11.S16
+qU7 QN Q15.S16
+
+
+
+dVr0 DN D24.S16
+dVi0 DN D25.S16
+dVr2 DN D26.S16
+dVi2 DN D27.S16
+dVr4 DN D28.S16
+dVi4 DN D29.S16
+dVr6 DN D30.S16
+dVi6 DN D31.S16
+dVr1 DN D16.S16
+dVi1 DN D17.S16
+dVr3 DN D18.S16
+dVi3 DN D19.S16
+dVr5 DN D20.S16
+dVi5 DN D21.S16
+dVr7 DN D22.S16 ;// reuse dUi7
+dVi7 DN D23.S16 ;// reuse dUr7
+qV0 QN Q12.S16
+qV1 QN Q8.S16
+qV2 QN Q13.S16
+qV3 QN Q9.S16
+qV4 QN Q14.S16
+qV5 QN Q10.S16
+qV6 QN Q15.S16
+qV7 QN Q11.S16
+
+
+
+dYr0 DN D16.S16
+dYi0 DN D17.S16
+dYr2 DN D18.S16
+dYi2 DN D19.S16
+dYr4 DN D20.S16
+dYi4 DN D21.S16
+dYr6 DN D22.S16
+dYi6 DN D23.S16
+dYr1 DN D24.S16
+dYi1 DN D25.S16
+dYr3 DN D26.S16
+dYi3 DN D27.S16
+dYr5 DN D28.S16
+dYi5 DN D29.S16
+dYr7 DN D30.S16 ;// reuse dYr4 and dYi4
+dYi7 DN D31.S16
+qY0 QN Q8.S16
+qY1 QN Q12.S16
+qY2 QN Q9.S16
+qY3 QN Q13.S16
+qY4 QN Q10.S16
+qY5 QN Q14.S16
+qY6 QN Q11.S16
+qY7 QN Q15.S16
+
+
+dT0 DN D0.S16
+dT1 DN D1.S16
+
+
+;// Define constants
+ONEBYSQRT2 EQU 0x00005A82 ;// Q15 format
+
+
+ MACRO
+ FFTSTAGE $scaled, $inverse , $name
+
+ ;// Define stack arguments
+
+ ;// Update pSubFFTSize and pSubFFTNum regs
+ MOV subFFTSize,#8 ;// subFFTSize = 8 after the radix-8 first stage
+ LDR t0,=ONEBYSQRT2 ;// t0=(1/sqrt(2)) as Q15 format
+
+ ;// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+ LSR grpSize,subFFTNum,#3
+ MOV subFFTNum,grpSize
+
+
+ ;// pT0+1 increments pT0 by 4 bytes
+ ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize/2 bytes
+ ;// Note: outPointStep = pointStep for firststage
+
+ MOV pointStep,grpSize,LSL #2
+
+
+ ;// Calculate the step of input data for the next set
+ ;//MOV step1,pointStep,LSL #1 ;// step1 = 2*pointStep
+ VLD2 {dXr0,dXi0},[pSrc@128],pointStep ;// data[0]
+ MOV step1,grpSize,LSL #3
+
+ MOV step2,pointStep,LSL #3
+ VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
+ SUB step2,step2,pointStep ;// step2 = 7*pointStep
+ RSB setStep,step2,#16 ;// setStep = - 7*pointStep+16
+
+
+
+ VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
+ VLD2 {dXr3,dXi3},[pSrc@128],pointStep ;// data[3]
+ VLD2 {dXr4,dXi4},[pSrc@128],pointStep ;// data[4]
+ VLD2 {dXr5,dXi5},[pSrc@128],pointStep ;// data[5]
+ VLD2 {dXr6,dXi6},[pSrc@128],pointStep ;// data[6]
+ VLD2 {dXr7,dXi7},[pSrc@128],setStep ;// data[7] & update pSrc for the next set
+ ;// setStep = -7*pointStep + 16
+ ;// grp = 0 a special case since all the twiddle factors are 1
+ ;// Loop on the sets : 4 sets at a time
+
+grpZeroSetLoop$name
+
+ ;// Decrement setcount
+ SUBS setCount,setCount,#4 ;// decrement the set loop counter
+
+
+ IF $scaled
+ ;// finish first stage of 8 point FFT
+
+ VHADD qU0,qX0,qX4
+ VHADD qU2,qX1,qX5
+ VHADD qU4,qX2,qX6
+ VHADD qU6,qX3,qX7
+
+ ;// finish second stage of 8 point FFT
+
+ VHADD qV0,qU0,qU4
+ VHSUB qV2,qU0,qU4
+ VHADD qV4,qU2,qU6
+ VHSUB qV6,qU2,qU6
+
+ ;// finish third stage of 8 point FFT
+
+ VHADD qY0,qV0,qV4
+ VHSUB qY4,qV0,qV4
+ VST2 {dYr0,dYi0},[pDst@128],step1 ;// store y0
+
+ IF $inverse
+
+ VHSUB dYr2,dVr2,dVi6
+ VHADD dYi2,dVi2,dVr6
+
+ VHADD dYr6,dVr2,dVi6
+ VST2 {dYr2,dYi2},[pDst@128],step1 ;// store y2
+ VHSUB dYi6,dVi2,dVr6
+
+ VHSUB qU1,qX0,qX4
+ VST2 {dYr4,dYi4},[pDst@128],step1 ;// store y4
+
+ VHSUB qU3,qX1,qX5
+ VHSUB qU5,qX2,qX6
+ VST2 {dYr6,dYi6},[pDst@128],step1 ;// store y6
+
+ ELSE
+
+ VHADD dYr6,dVr2,dVi6
+ VHSUB dYi6,dVi2,dVr6
+
+ VHSUB dYr2,dVr2,dVi6
+ VST2 {dYr6,dYi6},[pDst@128],step1 ;// store y2
+ VHADD dYi2,dVi2,dVr6
+
+
+ VHSUB qU1,qX0,qX4
+ VST2 {dYr4,dYi4},[pDst@128],step1 ;// store y4
+ VHSUB qU3,qX1,qX5
+ VHSUB qU5,qX2,qX6
+ VST2 {dYr2,dYi2},[pDst@128],step1 ;// store y6
+
+
+ ENDIF
+
+ ;// finish first stage of 8 point FFT
+
+ VHSUB qU7,qX3,qX7
+ VMOV dT0[0],t0
+
+ ;// finish second stage of 8 point FFT
+
+ VHSUB dVr1,dUr1,dUi5
+ VLD2 {dXr0,dXi0},[pSrc@128],pointStep ;// data[0] for next iteration
+ VHADD dVi1,dUi1,dUr5
+ VHADD dVr3,dUr1,dUi5
+ VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
+ VHSUB dVi3,dUi1,dUr5
+
+ VHSUB dVr5,dUr3,dUi7
+ VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
+ VHADD dVi5,dUi3,dUr7
+ VHADD dVr7,dUr3,dUi7
+ VLD2 {dXr3,dXi3},[pSrc@128],pointStep ;// data[3]
+ VHSUB dVi7,dUi3,dUr7
+
+ ;// finish third stage of 8 point FFT
+
+ IF $inverse
+
+ ;// calculate a*v5
+ VQRDMULH dT1,dVr5,dT0[0] ;// use dVi0 for dT1
+ VLD2 {dXr4,dXi4},[pSrc@128],pointStep ;// data[4]
+ VQRDMULH dVi5,dVi5,dT0[0]
+
+ VLD2 {dXr5,dXi5},[pSrc@128],pointStep ;// data[5]
+ VSUB dVr5,dT1,dVi5 ;// a * V5
+ VADD dVi5,dT1,dVi5
+
+ VLD2 {dXr6,dXi6},[pSrc@128],pointStep ;// data[6]
+
+ ;// calculate b*v7
+ VQRDMULH dT1,dVr7,dT0[0]
+ VQRDMULH dVi7,dVi7,dT0[0]
+
+ VHADD qY1,qV1,qV5
+ VHSUB qY5,qV1,qV5
+
+
+ VADD dVr7,dT1,dVi7 ;// b * V7
+ VSUB dVi7,dVi7,dT1
+ SUB pDst, pDst, step2 ;// set pDst to y1
+
+ VLD2 {dXr7,dXi7},[pSrc@128],setStep ;// data[7]
+
+
+ VHSUB dYr3,dVr3,dVr7
+ VHSUB dYi3,dVi3,dVi7
+ VST2 {dYr1,dYi1},[pDst@128],step1 ;// store y1
+ VHADD dYr7,dVr3,dVr7
+ VHADD dYi7,dVi3,dVi7
+
+
+ VST2 {dYr3,dYi3},[pDst@128],step1 ;// store y3
+ VST2 {dYr5,dYi5},[pDst@128],step1 ;// store y5
+ VST2 {dYr7,dYi7},[pDst@128],#16 ;// store y7
+ ELSE
+
+ ;// calculate b*v7
+ VQRDMULH dT1,dVr7,dT0[0]
+ VLD2 {dXr4,dXi4},[pSrc@128],pointStep ;// data[4]
+ VQRDMULH dVi7,dVi7,dT0[0]
+
+ VLD2 {dXr5,dXi5},[pSrc@128],pointStep ;// data[5]
+ VADD dVr7,dT1,dVi7 ;// b * V7
+ VSUB dVi7,dVi7,dT1
+
+ VLD2 {dXr6,dXi6},[pSrc@128],pointStep ;// data[6]
+
+ ;// calculate a*v5
+ VQRDMULH dT1,dVr5,dT0[0] ;// use dVi0 for dT1
+ VQRDMULH dVi5,dVi5,dT0[0]
+
+ VHADD dYr7,dVr3,dVr7
+ VHADD dYi7,dVi3,dVi7
+ SUB pDst, pDst, step2 ;// set pDst to y1
+
+ VSUB dVr5,dT1,dVi5 ;// a * V5
+ VADD dVi5,dT1,dVi5
+ VLD2 {dXr7,dXi7},[pSrc@128],setStep ;// data[7]
+
+ VHSUB qY5,qV1,qV5
+
+ VHSUB dYr3,dVr3,dVr7
+ VST2 {dYr7,dYi7},[pDst@128],step1 ;// store y1
+ VHSUB dYi3,dVi3,dVi7
+ VHADD qY1,qV1,qV5
+
+
+ VST2 {dYr5,dYi5},[pDst@128],step1 ;// store y3
+ VST2 {dYr3,dYi3},[pDst@128],step1 ;// store y5
+ VST2 {dYr1,dYi1},[pDst@128],#16 ;// store y7
+
+
+ ENDIF
+
+
+
+ ELSE
+ ;// finish first stage of 8 point FFT
+
+ VADD qU0,qX0,qX4
+ VADD qU2,qX1,qX5
+ VADD qU4,qX2,qX6
+ VADD qU6,qX3,qX7
+
+ ;// finish second stage of 8 point FFT
+
+ VADD qV0,qU0,qU4
+ VSUB qV2,qU0,qU4
+ VADD qV4,qU2,qU6
+ VSUB qV6,qU2,qU6
+
+ ;// finish third stage of 8 point FFT
+
+ VADD qY0,qV0,qV4
+ VSUB qY4,qV0,qV4
+ VST2 {dYr0,dYi0},[pDst@128],step1 ;// store y0
+
+ IF $inverse
+
+ VSUB dYr2,dVr2,dVi6
+ VADD dYi2,dVi2,dVr6
+
+ VADD dYr6,dVr2,dVi6
+ VST2 {dYr2,dYi2},[pDst@128],step1 ;// store y2
+ VSUB dYi6,dVi2,dVr6
+
+ VSUB qU1,qX0,qX4
+ VST2 {dYr4,dYi4},[pDst@128],step1 ;// store y4
+
+ VSUB qU3,qX1,qX5
+ VSUB qU5,qX2,qX6
+ VST2 {dYr6,dYi6},[pDst@128],step1 ;// store y6
+
+ ELSE
+
+ VADD dYr6,dVr2,dVi6
+ VSUB dYi6,dVi2,dVr6
+
+ VSUB dYr2,dVr2,dVi6
+ VST2 {dYr6,dYi6},[pDst@128],step1 ;// store y2
+ VADD dYi2,dVi2,dVr6
+
+
+ VSUB qU1,qX0,qX4
+ VST2 {dYr4,dYi4},[pDst@128],step1 ;// store y4
+ VSUB qU3,qX1,qX5
+ VSUB qU5,qX2,qX6
+ VST2 {dYr2,dYi2},[pDst@128],step1 ;// store y6
+
+
+ ENDIF
+
+ ;// finish first stage of 8 point FFT
+
+ VSUB qU7,qX3,qX7
+ VMOV dT0[0],t0
+
+ ;// finish second stage of 8 point FFT
+
+ VSUB dVr1,dUr1,dUi5
+ VLD2 {dXr0,dXi0},[pSrc@128],pointStep ;// data[0] for next iteration
+ VADD dVi1,dUi1,dUr5
+ VADD dVr3,dUr1,dUi5
+ VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
+ VSUB dVi3,dUi1,dUr5
+
+ VSUB dVr5,dUr3,dUi7
+ VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
+ VADD dVi5,dUi3,dUr7
+ VADD dVr7,dUr3,dUi7
+ VLD2 {dXr3,dXi3},[pSrc@128],pointStep ;// data[3]
+ VSUB dVi7,dUi3,dUr7
+
+ ;// finish third stage of 8 point FFT
+
+ IF $inverse
+
+ ;// calculate a*v5
+ VQRDMULH dT1,dVr5,dT0[0] ;// use dVi0 for dT1
+ VLD2 {dXr4,dXi4},[pSrc@128],pointStep ;// data[4]
+ VQRDMULH dVi5,dVi5,dT0[0]
+
+ VLD2 {dXr5,dXi5},[pSrc@128],pointStep ;// data[5]
+ VSUB dVr5,dT1,dVi5 ;// a * V5
+ VADD dVi5,dT1,dVi5
+
+ VLD2 {dXr6,dXi6},[pSrc@128],pointStep ;// data[6]
+
+ ;// calculate b*v7
+ VQRDMULH dT1,dVr7,dT0[0]
+ VQRDMULH dVi7,dVi7,dT0[0]
+
+ VADD qY1,qV1,qV5
+ VSUB qY5,qV1,qV5
+
+
+ VADD dVr7,dT1,dVi7 ;// b * V7
+ VSUB dVi7,dVi7,dT1
+ SUB pDst, pDst, step2 ;// set pDst to y1
+
+ VLD2 {dXr7,dXi7},[pSrc@128],setStep ;// data[7]
+
+
+ VSUB dYr3,dVr3,dVr7
+ VSUB dYi3,dVi3,dVi7
+ VST2 {dYr1,dYi1},[pDst@128],step1 ;// store y1
+ VADD dYr7,dVr3,dVr7
+ VADD dYi7,dVi3,dVi7
+
+
+ VST2 {dYr3,dYi3},[pDst@128],step1 ;// store y3
+ VST2 {dYr5,dYi5},[pDst@128],step1 ;// store y5
+ VST2 {dYr7,dYi7},[pDst@128],#16 ;// store y7
+ ELSE
+
+ ;// calculate b*v7
+ VQRDMULH dT1,dVr7,dT0[0]
+ VLD2 {dXr4,dXi4},[pSrc@128],pointStep ;// data[4]
+ VQRDMULH dVi7,dVi7,dT0[0]
+
+ VLD2 {dXr5,dXi5},[pSrc@128],pointStep ;// data[5]
+ VADD dVr7,dT1,dVi7 ;// b * V7
+ VSUB dVi7,dVi7,dT1
+
+ VLD2 {dXr6,dXi6},[pSrc@128],pointStep ;// data[6]
+
+ ;// calculate a*v5
+ VQRDMULH dT1,dVr5,dT0[0] ;// use dVi0 for dT1
+ VQRDMULH dVi5,dVi5,dT0[0]
+
+ VADD dYr7,dVr3,dVr7
+ VADD dYi7,dVi3,dVi7
+ SUB pDst, pDst, step2 ;// set pDst to y1
+
+ VSUB dVr5,dT1,dVi5 ;// a * V5
+ VADD dVi5,dT1,dVi5
+ VLD2 {dXr7,dXi7},[pSrc@128],setStep ;// data[7]
+
+ VSUB qY5,qV1,qV5
+
+ VSUB dYr3,dVr3,dVr7
+ VST2 {dYr7,dYi7},[pDst@128],step1 ;// store y1
+ VSUB dYi3,dVi3,dVi7
+ VADD qY1,qV1,qV5
+
+
+ VST2 {dYr5,dYi5},[pDst@128],step1 ;// store y3
+ VST2 {dYr3,dYi3},[pDst@128],step1 ;// store y5
+ VST2 {dYr1,dYi1},[pDst@128],#16 ;// store y7
+
+
+ ENDIF
+
+
+ ENDIF
+
+ SUB pDst, pDst, step2 ;// update pDst for the next set
+ BGT grpZeroSetLoop$name
+
+
+ ;// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep ;// pSrc = pDst - pointStep (rewind to start of the output just written)
+ MOV pDst,pPingPongBuf
+
+
+
+ MEND
+
+
+ ;// Allocate stack memory required by the function
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{FALSE},FWD
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {FALSE},{TRUE},INV
+ M_END
+
+
+ M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{FALSE},FWDSFS
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE {TRUE},{TRUE},INVSFS
+ M_END
+
+
+ ENDIF ;//CortexA8
+
+
+
+ END
\ No newline at end of file
diff --git a/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S b/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
new file mode 100644
index 0000000..399037c
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
@@ -0,0 +1,353 @@
+;//
+;//
+;// File Name: omxSP_FFTFwd_CToC_SC16_Sfs_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision: 6729
+;// Last Modified Date: Tue, 17 Jul 2007
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Compute a forward FFT for a complex signal
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+ IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+
+
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+ IMPORT armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+ IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+;//Input Registers
+
+pSrc RN 0
+pDst RN 1
+pFFTSpec RN 2
+scale RN 3
+
+
+;// Output registers
+result RN 0
+
+;//Local Scratch Registers
+
+argTwiddle RN 1
+argDst RN 2
+argScale RN 4
+pTwiddle RN 4
+tmpOrder RN 4
+pOut RN 5
+subFFTSize RN 7
+subFFTNum RN 6
+N RN 6
+order RN 14
+diff RN 9
+count RN 8 ;// Total num of radix stages required to complete the FFT
+x0r RN 4
+x0i RN 5
+diffMinusOne RN 2
+round RN 3
+
+;// Neon registers
+
+dX0 DN D0.S16
+dShift DN D1.S16
+dX0S32 DN D0.S32
+
+
+
+ ;// Allocate stack memory required by the function
+ M_ALLOC4 diffOnStack, 4
+
+ ;// Write function header
+ M_START omxSP_FFTFwd_CToC_SC16_Sfs,r11,d15
+
+ M_STRUCT ARMsFFTSpec
+ M_FIELD N, 4
+ M_FIELD pBitRev, 4
+ M_FIELD pTwiddle, 4
+ M_FIELD pBuf, 4
+ M_ENDSTRUCT
+
+ ;// Define stack arguments
+
+ ;// Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ ;// Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ CLZ order,N ;// N = 2^order
+ RSB order,order,#31
+ MOV subFFTSize,#1
+ ;//MOV subFFTNum,N
+
+ CMP order,#3
+ BGT orderGreaterthan3 ;// order > 3
+
+ CMP order,#1
+ BGE orderGreaterthan0 ;// order > 0
+ M_STR scale, diffOnStack,LT ;// order = 0
+ LDRLT x0r,[pSrc]
+ STRLT x0r,[pDst]
+ MOVLT pSrc,pDst
+ BLT FFTEnd
+
+orderGreaterthan0
+ ;// set the buffers appropriately for various orders
+ CMP order,#2
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ SUBS diff,scale,order
+ M_STR diff,diffOnStack
+ MOVGT scale,order
+ ;// Now scale <= order
+
+ CMP order,#1
+ BGT orderGreaterthan1
+ SUBS scale,scale,#1
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// order = 1
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe ;// order = 1
+ B FFTEnd
+
+orderGreaterthan1
+ CMP order,#2
+ MOV argScale,scale
+ BGT orderGreaterthan2
+ SUBS argScale,argScale,#1
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// order =2
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+orderGreaterthan2 ;// order =3
+ SUBS argScale,argScale,#1
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+
+orderGreaterthan3
+ ;// check scale = 0 or scale = order
+ SUBS diff, scale, order ;// scale > order
+ MOVGT scale,order
+ BGE specialScaleCase ;// scale = 0 or scale = order
+ CMP scale,#0
+ BEQ specialScaleCase
+ B generalScaleCase
+
+specialScaleCase ;// scale = 0 or scale = order and order > 3
+
+ TST order, #2 ;// Set input args to fft stages
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ CMP diff,#0
+ M_STR diff, diffOnStack
+ BGE scaleEqualsOrder
+
+ ;//check for even or odd order
+ ;// NOTE: The following combination of BL's would work fine even though the first
+ ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ ;// armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+unscaledRadix4Loop
+ BEQ lastStageUnscaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B unscaledRadix4Loop
+
+lastStageUnscaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+scaleEqualsOrder
+ ;//check for even or odd order
+ ;// NOTE: The following combination of BL's would work fine even though the first
+ ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ ;// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+scaledRadix4Loop
+ BEQ lastStageScaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B scaledRadix4Loop
+
+lastStageScaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+
+
+generalScaleCase ;// 0 < scale < order and order > 3
+ ;// Determine the correct destination buffer
+ SUB diff,order,scale
+ TST diff,#0x01
+ ADDEQ count,scale,diff,LSR #1 ;// count = scale + (order - scale)/2
+ MOVNE count,order
+ TST count,#0x01 ;// Is count even or odd ?
+
+ MOVNE argDst,pDst ;// Set input args to fft stages
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ CMP diff,#1
+ M_STR diff, diffOnStack
+ BEQ scaleps ;// scaling including a radix2_ps stage
+
+ MOV argScale,scale ;// Put scale in RN4 so as to save and restore
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// scaled first stage
+ SUBS argScale,argScale,#1
+
+scaledRadix2Loop
+ BLGT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1 ;// save and restore scale (RN4) in the scaled stages
+ BGT scaledRadix2Loop
+ B outScale
+
+scaleps
+ SUB argScale,scale,#1 ;// order>3 and diff=1 => scale >= 3
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// scaled first stage
+ SUBS argScale,argScale,#1
+
+scaledRadix2psLoop
+ BEQ scaledRadix2psStage
+ BLGT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1 ;// save and restore scale (RN4) in the scaled stages
+ BGE scaledRadix2psLoop
+
+scaledRadix2psStage
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ B generalLastStageUnscaledRadix2
+
+
+outScale
+ M_LDR diff, diffOnStack
+ ;//check for even or odd order
+ TST diff,#0x00000001
+ BEQ generalUnscaledRadix4Loop
+ B unscaledRadix2Loop
+
+generalUnscaledRadix4Loop
+ CMP subFFTNum,#4
+ BEQ generalLastStageUnscaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+ B generalUnscaledRadix4Loop
+
+generalLastStageUnscaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B End
+
+unscaledRadix2Loop
+ CMP subFFTNum,#4
+ BEQ generalLastTwoStagesUnscaledRadix2
+ BL armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+ B unscaledRadix2Loop
+
+generalLastTwoStagesUnscaledRadix2
+ BL armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2
+ BL armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ B End
+
+
+FFTEnd ;// Does only the scaling
+
+ M_LDR diff, diffOnStack
+ CMP diff,#0
+ BLE End
+
+ RSB diff,diff,#0 ;// to use VRSHL for right shift by a variable
+ VDUP dShift,diff
+
+scaleFFTData ;// N = subFFTSize ; dataptr = pDst ; scale = diff
+ VLD1 {dX0S32[0]},[pSrc] ;// pSrc contains pDst pointer
+ SUBS subFFTSize,subFFTSize,#1
+ VRSHL dX0,dShift
+ VST1 {dX0S32[0]},[pSrc]!
+
+ BGT scaleFFTData
+
+
+
+End
+ ;// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ ;// Write function tail
+ M_END
+
+ ENDIF ;//CortexA8
+
+
+
+
+
+ END
\ No newline at end of file
diff --git a/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S b/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
new file mode 100644
index 0000000..f1a8d03
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
@@ -0,0 +1,334 @@
+;//
+;//
+;// File Name: omxSP_FFTInv_CToC_SC16_Sfs_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision: 6729
+;// Last Modified Date: Tue, 17 Jul 2007
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Compute an inverse FFT for a complex signal
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+ IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+
+
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+ IMPORT armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+ IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+;//Input Registers
+;// (OMX calling convention: R0..R3 carry the four function arguments)
+
+pSrc RN 0 ;// pointer to the complex 16-bit input signal
+pDst RN 1 ;// pointer to the complex 16-bit output buffer
+pFFTSpec RN 2 ;// pointer to the ARMsFFTSpec structure (see M_STRUCT below)
+scale RN 3 ;// requested output right-shift (saturation scale factor)
+
+
+;// Output registers
+result RN 0 ;// OMX status code returned to the caller
+
+;//Local Scratch Registers
+;// NOTE: several symbolic names alias one physical register (R1, R2, R4,
+;// R5, R6 each have multiple names); each alias is live only in a
+;// distinct phase of the function, never simultaneously.
+
+argTwiddle RN 1 ;// twiddle-table pointer handed to the stage routines (aliases pDst)
+argDst RN 2 ;// destination pointer handed to the stage routines (aliases pFFTSpec)
+argScale RN 4 ;// count of scaled radix-2 stages still to run
+pTwiddle RN 4 ;// twiddle-table pointer loaded from the spec (aliases argScale)
+tmpOrder RN 4 ;// copy of 'order' kept live across the first-stage BL
+pOut RN 5 ;// ping-pong work buffer (pBuf field of the spec)
+subFFTSize RN 7 ;// current sub-FFT size; updated by the unsafe stage routines
+subFFTNum RN 6 ;// number of sub-FFTs remaining; updated by the stage routines
+N RN 6 ;// number of complex points, N = 2^order (aliases subFFTNum)
+order RN 14 ;// log2(N), computed via CLZ
+diff RN 9 ;// scale - order: residual shift applied at FFTEnd when > 0
+count RN 8 ;// Total num of radix stages required to complete the FFT
+x0r RN 4 ;// scratch for the order-0 single-point copy
+x0i RN 5 ;// declared but not referenced in this function
+diffMinusOne RN 2 ;// declared but not referenced in this function
+round RN 3 ;// declared but not referenced in this function
+
+;// Neon registers
+
+dX0 DN D0.S16 ;// data viewed as 16-bit lanes for the rounding shift
+dShift DN D1.S16 ;// per-lane shift amount (negative value => right shift via VRSHL)
+dX0S32 DN D0.S32 ;// same D0 viewed as 32-bit lanes: one complex (re,im) point
+
+ ;// Allocate stack memory required by the function.
+ ;// diffOnStack holds (scale - order); a positive value at FFTEnd means
+ ;// that many bits of rounding right-shift are still owed on the output.
+ M_ALLOC4 diffOnStack, 4
+
+ ;// Write function header
+ ;//
+ ;// omxSP_FFTInv_CToC_SC16_Sfs
+ ;// Inverse complex-to-complex 16-bit FFT with scaling.
+ ;// In: pSrc (R0), pDst (R1), pFFTSpec (R2), scale (R3)
+ ;// Out: result (R0) = OMX_Sts_NoErr
+ ;// The work is dispatched to out-of-place "unsafe" stage routines
+ ;// (radix 2/4/8, scaled and unscaled) that ping-pong between pDst and
+ ;// the spec's pBuf; buffer selection below ensures the LAST stage
+ ;// always lands in pDst.
+ M_START omxSP_FFTInv_CToC_SC16_Sfs,r11,d15
+
+ ;// Field offsets of the FFT specification structure read below.
+ M_STRUCT ARMsFFTSpec
+ M_FIELD N, 4
+ M_FIELD pBitRev, 4
+ M_FIELD pTwiddle, 4
+ M_FIELD pBuf, 4
+ M_ENDSTRUCT
+
+ ;// Define stack arguments
+
+ ;// Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ ;// Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ ;// order = 31 - CLZ(N); N is assumed to be a power of two.
+ CLZ order,N ;// N = 2^order
+ RSB order,order,#31
+ MOV subFFTSize,#1
+ ;//MOV subFFTNum,N
+
+ ;// The inverse FFT carries a 1/N factor, i.e. 'order' extra bits of
+ ;// right shift on top of the caller's requested scale.
+ ADD scale,scale,order ;// FFTInverse has a final scaling factor by N
+
+ CMP order,#3
+ BGT orderGreaterthan3 ;// order > 3
+
+ CMP order,#1
+ BGE orderGreaterthan0 ;// order > 0
+ ;// order = 0: a 1-point FFT is the identity; copy the single complex
+ ;// point (32 bits) and fall into the scaling-only tail.
+ M_STR scale, diffOnStack,LT ;// order = 0
+ LDRLT x0r,[pSrc]
+ STRLT x0r,[pDst]
+ MOVLT pSrc,pDst
+ BLT FFTEnd
+
+orderGreaterthan0
+ ;// Orders 1..3. Set the buffers so an even stage count still ends in
+ ;// pDst: for order = 2 the first stage must write to the scratch buffer.
+ CMP order,#2
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+ ;// Store the residual scale factor; the shift itself happens at FFTEnd.
+ SUB diff,scale,order
+ M_STR diff, diffOnStack
+ ;// Flags here are still those of "CMP order,#2": GE => order is 2 or 3.
+ BGE orderGreaterthan1
+ BLLT armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// order = 1
+ B FFTEnd
+
+
+orderGreaterthan1
+ ;// order = 2 or 3: scaled radix-2 first stage, an extra penultimate
+ ;// stage only when order = 3, then the radix-2 last stage.
+ MOV tmpOrder,order ;// tmpOrder = RN 4
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ CMP tmpOrder,#2
+ BLGT armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+
+
+
+orderGreaterthan3
+ ;// check scale = 0 or scale = order
+ ;// scale > order is clamped to scale = order (cannot shift out more
+ ;// than the available bits of growth).
+ SUBS diff, scale, order ;// scale > order
+ MOVGT scale,order
+ BGE specialScaleCase ;// scale = 0 or scale = order
+ CMP scale,#0
+ BEQ specialScaleCase
+ B generalScaleCase
+
+specialScaleCase ;// scale = 0 or scale = order and order > 3
+
+ ;// Stage count is ceil(order/2) radix-4/8 stages; order bit 1 decides
+ ;// whether that count is even or odd, hence which buffer to start in
+ ;// so the final stage writes pDst.
+ TST order, #2 ;// Set input args to fft stages
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ CMP diff,#0
+ M_STR diff, diffOnStack
+ BGE scaleEqualsOrder
+
+ ;// scale = 0: fully unscaled radix-4/8 decomposition.
+ ;//check for even or odd order
+ ;// NOTE: The following combination of BL's would work fine even though the first
+ ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ ;// armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+unscaledRadix4Loop
+ ;// BEQ consumes the flags of the preceding "CMP subFFTNum,#4"
+ ;// (set just above on entry, or at the bottom of this loop).
+ BEQ lastStageUnscaledRadix4
+ BL armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B unscaledRadix4Loop
+
+lastStageUnscaledRadix4
+ BL armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+scaleEqualsOrder
+ ;// scale = order: every stage uses the scaled (Sfs) variant.
+ ;//check for even or odd order
+ ;// NOTE: The following combination of BL's would work fine even though the first
+ ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ ;// armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+scaledRadix4Loop
+ ;// Same flag discipline as unscaledRadix4Loop above.
+ BEQ lastStageScaledRadix4
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B scaledRadix4Loop
+
+lastStageScaledRadix4
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+
+
+generalScaleCase ;// 0 < scale < order and order > 3
+ ;// Plan: 'scale' scaled radix-2 stages, then unscaled radix-4 stages
+ ;// if (order - scale) is even, else unscaled radix-2 stages.
+ ;// Determine the correct destination buffer from the total stage count.
+ SUB diff,order,scale
+ TST diff,#0x01
+ ADDEQ count,scale,diff,LSR #1 ;// count = scale + (order - scale)/2
+ MOVNE count,order
+ TST count,#0x01 ;// Is count even or odd ?
+
+ MOVNE argDst,pDst ;// Set input args to fft stages
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ CMP diff,#1
+ M_STR diff, diffOnStack
+ BEQ scaleps ;// scaling including a radix2_ps stage
+
+ MOV argScale,scale ;// Put scale in RN4 so as to save and restore
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// scaled first stage
+ SUBS argScale,argScale,#1
+
+scaledRadix2Loop
+ ;// Run the remaining (scale - 1) scaled radix-2 stages; flags come
+ ;// from the SUBS on argScale each iteration.
+ BLGT armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1 ;// save and restore scale (RN4) in the scaled stages
+ BGT scaledRadix2Loop
+ B outScale
+
+scaleps
+ ;// diff = 1: the penultimate stage must be a radix2_ps, so run one
+ ;// fewer generic scaled stage here.
+ SUB argScale,scale,#1 ;// order>3 and diff=1 => scale >= 3
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// scaled first stage
+ SUBS argScale,argScale,#1
+
+scaledRadix2psLoop
+ BEQ scaledRadix2psStage
+ BLGT armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1 ;// save and restore scale (RN4) in the scaled stages
+ BGE scaledRadix2psLoop
+
+scaledRadix2psStage
+ ;// Scaled penultimate stage, then fall through to the unscaled last stage.
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ B generalLastStageUnscaledRadix2
+
+
+outScale
+ ;// Finish with unscaled stages: radix-4 when the remaining
+ ;// (order - scale) is even, radix-2 otherwise.
+ M_LDR diff, diffOnStack
+ ;//check for even or odd order
+ TST diff,#0x00000001
+ BEQ generalUnscaledRadix4Loop
+ B unscaledRadix2Loop
+
+generalUnscaledRadix4Loop
+ CMP subFFTNum,#4
+ BEQ generalLastStageUnscaledRadix4
+ BL armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+ B generalUnscaledRadix4Loop
+
+generalLastStageUnscaledRadix4
+ BL armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B End
+
+unscaledRadix2Loop
+ CMP subFFTNum,#4
+ BEQ generalLastTwoStagesUnscaledRadix2
+ BL armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
+ B unscaledRadix2Loop
+
+generalLastTwoStagesUnscaledRadix2
+ BL armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2
+ BL armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ B End
+
+
+FFTEnd ;// Does only the scaling
+ ;// Apply the residual right shift (diff = scale - order at entry,
+ ;// or the full scale for the order = 0/1..3 paths) to every point.
+
+ M_LDR diff, diffOnStack
+ CMP diff,#0
+ BLE End
+
+ ;// VRSHL shifts left by a signed per-lane amount, so negate diff to
+ ;// obtain a rounding right shift.
+ RSB diff,diff,#0 ;// to use VRSHL for right shift by a variable
+ VDUP dShift,diff
+
+scaleFFTData ;// N = subFFTSize ; dataptr = pDst ; scale = diff
+ ;// One complex point (two S16 lanes = 32 bits) per iteration.
+ VLD1 {dX0S32[0]},[pSrc] ;// pSrc contains pDst pointer
+ SUBS subFFTSize,subFFTSize,#1
+ VRSHL dX0,dShift
+ VST1 {dX0S32[0]},[pSrc]!
+
+ BGT scaleFFTData
+
+
+End
+ ;// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ ;// Write function tail
+ M_END
+
+ ENDIF ;//CortexA8
+
+
+
+
+
+ END
\ No newline at end of file