Add 16-bit FFT routines as is from OpenMAX DL.

These are the assembly routines for the 16-bit complex FFT unchanged
from OpenMAX DL.

Review URL: https://webrtc-codereview.appspot.com/1101004

git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@3481 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
new file mode 100644
index 0000000..f321502
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
@@ -0,0 +1,162 @@
+;//
+;// 
+;// File Name:  armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision:   6693
+;// Last Modified Date:       Tue, 10 Jul 2007
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Compute a Radix 2 FFT stage for a N point complex signal
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        
+        
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+    
+    
+            
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8 
+    
+;//Input Registers
+
+pSrc            RN  0
+pDst            RN  2
+pTwiddle        RN  1
+pPingPongBuf    RN  5
+subFFTNum       RN  6
+subFFTSize      RN  7
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+pointStep        RN  3
+outPointStep     RN  3
+grpSize          RN  4
+setCount         RN  4
+step             RN  8
+dstStep          RN  8
+
+;// Neon Registers
+
+dX0             DN  D0.S16
+dX1             DN  D1.S16
+dY0             DN  D2.S16
+dY1             DN  D3.S16
+dX0S32          DN  D0.S32
+dX1S32          DN  D1.S32
+dY0S32          DN  D2.S32
+dY1S32          DN  D3.S32
+
+
+        MACRO
+        FFTSTAGE $scaled, $inverse, $name
+        
+        ;// Define stack arguments
+        
+        
+        ;// update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
+        
+        
+        MOV        subFFTSize,#2                      ;// subFFTSize = 2 after this radix-2 first stage
+        LSR        grpSize,subFFTNum,#1               ;// grpSize = N/2
+        MOV        subFFTNum,grpSize 
+        
+        
+        ;// pT0+1 increments pT0 by 8 bytes
+        ;// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
+        ;// Note: outPointStep = pointStep for the first stage
+        ;// Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
+        
+        MOV        pointStep,grpSize,LSL #2           ;// pointStep = 4*grpSize bytes
+        RSB        step,pointStep,#4                  ;// step = -pointStep + 4 (rewind after the second load)
+        
+        
+        ;// Loop on the sets for grp zero: 1 set at a time
+
+grpZeroSetLoop$name        
+        
+        VLD1    {dX0S32[0]},[pSrc],pointStep          ;// load point 0 (one re/im SC16 pair = 32 bits)
+        VLD1    {dX1S32[0]},[pSrc],step                   ;// step = -pointStep + 4
+        SUBS    setCount,setCount,#1              ;// decrement the loop counter
+        
+        IF $scaled
+        
+            VHADD    dY0,dX0,dX1                      ;// y0 = (x0 + x1) >> 1 (halving add scales by 1/2)
+            VHSUB    dY1,dX0,dX1                      ;// y1 = (x0 - x1) >> 1
+        
+        ELSE
+        
+            VADD    dY0,dX0,dX1                       ;// y0 = x0 + x1 (unscaled butterfly)
+            VSUB    dY1,dX0,dX1                       ;// y1 = x0 - x1
+        
+         
+        ENDIF
+        
+        VST1    {dY0S32[0]},[pDst],outPointStep
+        VST1    {dY1S32[0]},[pDst],dstStep                  ;// dstStep =  step = -pointStep + 4
+               
+        BGT     grpZeroSetLoop$name
+        
+        
+        ;// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     ;// pDst -= 2*grpSize 
+        MOV     pDst,pPingPongBuf
+                
+        MEND
+        
+        
+                
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{FALSE},FWD                    ;// unscaled, forward
+        M_END
+
+        
+        
+        M_START armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{TRUE},INV                     ;// unscaled, inverse
+        M_END
+ 
+        
+        
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{FALSE},FWDSFS                  ;// scaled (>>1 per stage), forward
+        M_END
+
+        
+        
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{TRUE},INVSFS                   ;// scaled, inverse
+        M_END
+
+        
+    ENDIF                                                           ;//CORTEXA8
+    
+     
+    END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
new file mode 100644
index 0000000..0932099
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
@@ -0,0 +1,202 @@
+;//
+;// 
+;// File Name:  armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision:   6741
+;// Last Modified Date:       Wed, 18 Jul 2007
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Compute a Radix 2 FFT stage for a N point complex signal
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        
+        
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+;// Guarding implementation by the processor name
+    
+    
+
+
+
+
+            
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8 
+    
+;//Input Registers
+
+pSrc            RN  0
+pDst            RN  2
+pTwiddle        RN  1
+subFFTNum       RN  6
+subFFTSize      RN  7
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+
+outPointStep     RN  3
+grpCount         RN  4
+dstStep          RN  5
+pTmp             RN  4
+step             RN  8
+
+;// Neon Registers
+
+dWr             DN  D0.S16
+dWi             DN  D1.S16
+dXr0            DN  D2.S16
+dXi0            DN  D3.S16
+dXr1            DN  D4.S16
+dXi1            DN  D5.S16
+dYr0            DN  D6.S16
+dYi0            DN  D7.S16
+dYr1            DN  D8.S16
+dYi1            DN  D9.S16
+qT0             QN  Q5.S32
+qT1             QN  Q6.S32
+
+
+        MACRO
+        FFTSTAGE $scaled, $inverse, $name
+        
+        
+        MOV     outPointStep,subFFTSize,LSL #2          ;// outPointStep = 4*subFFTSize bytes
+        ;// Update grpCount and grpSize right away 
+        
+        MOV     subFFTNum,#1                            ;//after the last stage
+        LSL     grpCount,subFFTSize,#1
+        
+        ;// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+                               
+        SUB      step,outPointStep,#4                   ;// step = -4+outPointStep
+        RSB      dstStep,step,#0                        ;// dstStep = -4-outPointStep+8 = -step
+        ;//RSB      dstStep,outPointStep,#16
+        
+        
+        ;// Loop on 2 grps at a time for the last stage
+
+grpLoop$name
+        VLD2    {dWr[0],dWi[0]},[pTwiddle]!             ;// grp 0
+        VLD2    {dWr[1],dWi[1]},[pTwiddle]!             ;// grp 1
+        
+        ;//VLD2    {dWr,dWi},[pTwiddle],#16        
+        
+        VLD4    {dXr0[0],dXi0[0],dXr1[0],dXi1[0]},[pSrc]!   ;// grp 0
+        VLD4    {dXr0[1],dXi0[1],dXr1[1],dXi1[1]},[pSrc]!   ;// grp 1
+        
+        
+        ;//VLD4    {dXr0,dXi0,dXr1,dXi1},[pSrc],#32
+        SUBS    grpCount,grpCount,#4                   ;// grpCount is multiplied by 2 
+        
+        IF  $inverse
+            VMULL   qT0,dXr1,dWr
+            VMLAL   qT0,dXi1,dWi                       ;// real part
+            VMULL   qT1,dXi1,dWr
+            VMLSL   qT1,dXr1,dWi                       ;// imag part
+            
+        ELSE
+            VMULL   qT0,dXr1,dWr
+            VMLSL   qT0,dXi1,dWi                       ;// real part
+            VMULL   qT1,dXi1,dWr
+            VMLAL   qT1,dXr1,dWi                       ;// imag part
+        
+        ENDIF
+        
+        VRSHRN  dXr1,qT0,#15                           ;// round+narrow Q15 product back to 16 bits
+        VRSHRN  dXi1,qT1,#15
+        
+               
+        IF $scaled
+        
+            VHSUB    dYr0,dXr0,dXr1                    ;// halving ops scale the butterfly by 1/2
+            VHSUB    dYi0,dXi0,dXi1
+            VHADD    dYr1,dXr0,dXr1
+            VHADD    dYi1,dXi0,dXi1
+            
+        ELSE
+        
+            VSUB    dYr0,dXr0,dXr1
+            VSUB    dYi0,dXi0,dXi1
+            VADD    dYr1,dXr0,dXr1
+            VADD    dYi1,dXi0,dXi1
+            
+         
+        ENDIF
+        
+        VST2    {dYr0[0],dYi0[0]},[pDst]!
+        VST2    {dYr0[1],dYi0[1]},[pDst],step               ;// step = -4+outPointStep
+        
+        VST2    {dYr1[0],dYi1[0]},[pDst]!
+        VST2    {dYr1[1],dYi1[1]},[pDst],dstStep            ;// dstStep = -4-outPointStep+8 = -step
+        
+        ;//VST2    {dYr0,dYi0},[pDst],outPointStep
+        ;//VST2    {dYr1,dYi1},[pDst],dstStep                  ;// dstStep =  step = -outPointStep + 16
+               
+        BGT     grpLoop$name
+        
+        
+        ;// Reset and Swap pSrc and pDst for the next stage     
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       ;// pDst -= 2*size; pSrc -= 4*size bytes           
+        SUB     pSrc,pTmp,outPointStep
+        
+        ;// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      ;// pTwiddle -= 2*size bytes
+                
+        MEND
+        
+        
+                
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{FALSE},FWD                    ;// unscaled, forward
+        M_END
+
+        
+        
+        M_START armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{TRUE},INV                     ;// unscaled, inverse
+        M_END
+ 
+        
+        
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{FALSE},FWDSFS                  ;// scaled (>>1 per stage), forward
+        M_END
+
+        
+        
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{TRUE},INVSFS                   ;// scaled, inverse
+        M_END
+
+        
+    ENDIF                                                           ;//CORTEXA8
+    
+     
+    END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
new file mode 100644
index 0000000..49bf607
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
@@ -0,0 +1,209 @@
+;//
+;// 
+;// File Name:  armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision:   6740
+;// Last Modified Date:       Wed, 18 Jul 2007
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Compute a Radix 2 FFT stage for a N point complex signal
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        
+        
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+
+            
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8 
+    
+;//Input Registers
+
+pSrc            RN  0
+pDst            RN  2
+pTwiddle        RN  1
+subFFTNum       RN  6
+subFFTSize      RN  7
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+outPointStep     RN  3
+grpCount         RN  4
+dstStep          RN  5
+twStep           RN  8
+pTmp             RN  4
+
+;// Neon Registers
+
+dW1S32          DN  D0.S32
+dW2S32          DN  D1.S32
+dW1             DN  D0.S16
+dW2             DN  D1.S16
+
+dX0             DN  D2.S16
+dX1             DN  D3.S16
+dX2             DN  D4.S16
+dX3             DN  D5.S16
+dY0             DN  D6.S16
+dY1             DN  D7.S16
+dY2             DN  D8.S16
+dY3             DN  D9.S16
+qT0             QN  Q5.S32
+qT1             QN  Q6.S32
+
+
+        MACRO
+        FFTSTAGE $scaled, $inverse, $name
+        
+        ;// Define stack arguments
+        
+        
+        ;// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+        
+        
+        LSL     grpCount,subFFTSize,#1
+        
+        
+        ;// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        
+        ;// pOut0+1 increments pOut0 by 8 bytes
+        ;// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
+        SMULBB  outPointStep,grpCount,subFFTNum       ;// outPointStep = grpCount * subFFTNum (16x16 bottom-half multiply)
+        MOV     twStep,subFFTNum,LSL #1
+        LSR     subFFTNum,subFFTNum,#1                      ;//grpSize
+                                       
+                
+        RSB      dstStep,outPointStep,#8
+                
+        
+        ;// Note: pointStep is 8 in this case: so need of extra reg
+        ;// Loop on the groups: 2 groups at a time
+
+grpLoop$name        
+        
+        VLD1     dW1S32[],[pTwiddle],twStep                ;//[wi | wr] 
+        VLD1     dW2S32[],[pTwiddle],twStep
+        
+        ;// Process the sets for each grp:  2 sets at a time (no set looping required)     
+        
+        VLD1    dX0,[pSrc]!            ;// point0: of set0,set1 of grp0
+        VLD1    dX1,[pSrc]!            ;// point1: of set0,set1 of grp0
+        VLD1    dX2,[pSrc]!            ;// point0: of set0,set1 of grp1
+        VLD1    dX3,[pSrc]!            ;// point1: of set0,set1 of grp1
+        
+        SUBS    grpCount,grpCount,#4              ;// decrement the loop counter
+        VUZP    dW1,dW2                           ;// de-interleave: even lanes -> dW1, odd lanes -> dW2
+        VUZP    dX1,dX3                           ;// de-interleave the point-1 data the same way
+        
+        IF  $inverse
+            VMULL   qT0,dX1,dW1
+            VMLAL   qT0,dX3,dW2                       ;// real part
+            VMULL   qT1,dX3,dW1
+            VMLSL   qT1,dX1,dW2                       ;// imag part
+            
+        ELSE
+            VMULL   qT0,dX1,dW1
+            VMLSL   qT0,dX3,dW2                       ;// real part
+            VMULL   qT1,dX3,dW1
+            VMLAL   qT1,dX1,dW2                       ;// imag part
+        
+        ENDIF
+        
+        VRSHRN  dX1,qT0,#15                       ;// round+narrow Q15 product back to 16 bits
+        VRSHRN  dX3,qT1,#15
+        
+        VZIP    dX1,dX3                           ;// re-interleave lanes after the twiddle multiply
+        
+        
+        IF $scaled
+        
+            VHSUB    dY0,dX0,dX1                  ;// halving ops scale the butterfly by 1/2
+            VHADD    dY1,dX0,dX1
+            VHSUB    dY2,dX2,dX3
+            VHADD    dY3,dX2,dX3
+            
+        ELSE
+        
+            VSUB    dY0,dX0,dX1
+            VADD    dY1,dX0,dX1
+            VSUB    dY2,dX2,dX3
+            VADD    dY3,dX2,dX3
+            
+        
+         
+        ENDIF
+        
+        VST1    dY0,[pDst],outPointStep             ;// point0: of set0,set1 of grp0
+        VST1    dY1,[pDst],dstStep                  ;// dstStep = -outPointStep + 8
+        VST1    dY2,[pDst],outPointStep             ;// point0: of set0,set1 of grp1
+        VST1    dY3,[pDst],dstStep                  ;// point1: of set0,set1 of grp1
+        
+               
+        BGT     grpLoop$name
+        
+        
+        ;// Reset and Swap pSrc and pDst for the next stage     
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       ;// pDst -= 2*size; pSrc -= 4*size bytes           
+        SUB     pSrc,pTmp,outPointStep
+        
+        ;// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      ;// pTwiddle -= 2*size bytes
+                
+        MEND
+        
+        
+                
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{FALSE},FWD                    ;// unscaled, forward
+        M_END
+
+        
+        
+        M_START armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{TRUE},INV                     ;// unscaled, inverse
+        M_END
+ 
+        
+        
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{FALSE},FWDSFS                  ;// scaled (>>1 per stage), forward
+        M_END
+
+        
+        
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{TRUE},INVSFS                   ;// scaled, inverse
+        M_END
+
+        
+    ENDIF                                                           ;//CORTEXA8
+    
+     
+    END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
new file mode 100644
index 0000000..133b137
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
@@ -0,0 +1,214 @@
+;//
+;// 
+;// File Name:  armSP_FFT_CToC_SC16_Radix2_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision:   5892
+;// Last Modified Date:       Thu, 07 Jun 2007
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Compute a Radix 2 FFT stage for a N point complex signal
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+
+           
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+    
+    
+    
+    
+    ;// Guarding implementation by the processor name
+    
+    IF  CortexA8 
+    
+    
+;//Input Registers
+
+pSrc            RN  0
+pDst            RN  2
+pTwiddle        RN  1
+subFFTNum       RN  6
+subFFTSize      RN  7
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+outPointStep    RN  3
+pointStep       RN  4
+grpCount        RN  5
+setCount        RN  8
+step            RN  10
+dstStep         RN  11
+pTmp            RN  9    
+
+;// Neon Registers
+
+dW              DN  D0.S16
+dX0             DN  D2.S16
+dX1                DN  D3.S16
+dX2             DN  D4.S16
+dX3                DN  D5.S16
+dY0             DN  D6.S16
+dY1               DN  D7.S16
+dY2             DN  D8.S16
+dY3               DN  D9.S16
+qT0             QN  Q3.S32
+qT1             QN  Q4.S32
+
+    
+    
+        MACRO
+        FFTSTAGE $scaled, $inverse, $name
+        
+        ;// Define stack arguments
+        
+        
+        ;// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+        
+        LSR     subFFTNum,subFFTNum,#1                      ;//grpSize
+        LSL     grpCount,subFFTSize,#1
+        
+        
+        ;// pT0+1 increments pT0 by 8 bytes
+        ;// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
+        MOV     pointStep,subFFTNum,LSL #1
+        
+        ;// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        
+        ;// pOut0+1 increments pOut0 by 8 bytes
+        ;// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
+        SMULBB  outPointStep,grpCount,pointStep       ;// outPointStep = grpCount * pointStep (16x16 bottom-half multiply)
+        LSL     pointStep,pointStep,#1    
+                               
+        
+        RSB      step,pointStep,#16
+        RSB      dstStep,outPointStep,#16
+        
+        ;// Loop on the groups
+
+grpLoop$name        
+        
+        VLD1     dW,[pTwiddle],pointStep                ;//[wi | wr]
+        MOV      setCount,pointStep,LSR #2              ;// setCount = pointStep/4
+        
+        
+        ;// Loop on the sets: 4 at a time
+        
+        
+setLoop$name        
+        
+        
+        VLD2    {dX0,dX1},[pSrc],pointStep            ;// point0: dX0-real part dX1-img part
+        VLD2    {dX2,dX3},[pSrc],step                 ;// point1: dX2-real part dX3-img part
+        
+        SUBS    setCount,setCount,#4                  ;// 4 sets consumed per iteration
+        
+        IF  $inverse
+            VMULL   qT0,dX2,dW[0]
+            VMLAL   qT0,dX3,dW[1]                       ;// real part
+            VMULL   qT1,dX3,dW[0]
+            VMLSL   qT1,dX2,dW[1]                       ;// imag part
+                
+        ELSE
+        
+            VMULL   qT0,dX2,dW[0]
+            VMLSL   qT0,dX3,dW[1]                       ;// real part
+            VMULL   qT1,dX3,dW[0]
+            VMLAL   qT1,dX2,dW[1]                       ;// imag part
+                    
+        ENDIF
+        
+        VRSHRN  dX2,qT0,#15                           ;// round+narrow Q15 product back to 16 bits
+        VRSHRN  dX3,qT1,#15
+        
+        IF $scaled
+            VHSUB    dY0,dX0,dX2                      ;// halving ops scale the butterfly by 1/2
+            VHSUB    dY1,dX1,dX3
+            VHADD    dY2,dX0,dX2
+            VHADD    dY3,dX1,dX3
+                
+        ELSE
+            VSUB    dY0,dX0,dX2
+            VSUB    dY1,dX1,dX3
+            VADD    dY2,dX0,dX2
+            VADD    dY3,dX1,dX3
+        
+        ENDIF
+        
+        VST2    {dY0,dY1},[pDst],outPointStep
+        VST2    {dY2,dY3},[pDst],dstStep              ;// dstStep = -outPointStep + 16
+        
+        BGT     setLoop$name
+        
+        SUBS    grpCount,grpCount,#2               
+        ADD     pSrc,pSrc,pointStep
+        BGT     grpLoop$name    
+        
+        
+        ;// Reset and Swap pSrc and pDst for the next stage     
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       ;// pDst -= 2*size; pSrc -= 4*size bytes           
+        SUB     pSrc,pTmp,outPointStep
+        
+        ;// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      ;// pTwiddle -= 2*size bytes
+        
+                
+        MEND
+        
+        
+        
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{FALSE},FWD                    ;// unscaled, forward
+        M_END
+
+        
+        
+        M_START armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{TRUE},INV                     ;// unscaled, inverse
+        M_END
+ 
+        
+        
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{FALSE},FWDSFS                  ;// scaled (>>1 per stage), forward
+        M_END
+
+        
+        
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{TRUE},INVSFS                   ;// scaled, inverse
+        M_END
+
+        
+
+    ENDIF                 ;//CORTEXA8
+        
+    
+     
+    END    
+     
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
new file mode 100644
index 0000000..82662e6
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
@@ -0,0 +1,306 @@
+;//
+;// 
+;// File Name:  armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision:   7761
+;// Last Modified Date:       Wed, 26 Sep 2007
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Compute a first stage Radix 4 FFT stage for a N point complex signal
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        
+        
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+    
+    
+    
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8
+    
+;//Input Registers
+
+pSrc            RN  0
+pDst            RN  2
+pTwiddle        RN  1
+pPingPongBuf    RN  5
+subFFTNum       RN  6
+subFFTSize      RN  7
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+grpSize         RN  3
+setCount        RN  3                  ;// Reuse grpSize as setCount
+pointStep       RN  4
+outPointStep    RN  4
+setStep         RN  8
+step1           RN  9
+step3           RN  10
+
+;// Neon Registers
+
+dXr0             DN  D0.S16
+dXi0             DN  D1.S16
+dXr1             DN  D2.S16
+dXi1             DN  D3.S16
+dXr2             DN  D4.S16
+dXi2             DN  D5.S16
+dXr3             DN  D6.S16
+dXi3             DN  D7.S16
+dYr0             DN  D8.S16
+dYi0             DN  D9.S16
+dYr1             DN  D10.S16
+dYi1             DN  D11.S16
+dYr2             DN  D12.S16
+dYi2             DN  D13.S16
+dYr3             DN  D14.S16
+dYi3             DN  D15.S16
+dZr0             DN  D16.S16
+dZi0             DN  D17.S16
+dZr1             DN  D18.S16
+dZi1             DN  D19.S16
+dZr2             DN  D20.S16
+dZi2             DN  D21.S16
+dZr3             DN  D22.S16
+dZi3             DN  D23.S16
+qY0              QN  Q4.S16
+qY2              QN  Q6.S16
+qX0              QN  Q0.S16
+qX2              QN  Q2.S16
+
+qY1              QN  Q5.S16
+qY3              QN  Q7.S16
+qX1              QN  Q1.S16
+qX3              QN  Q3.S16
+qZ0              QN  Q8.S16
+qZ1              QN  Q9.S16
+
+    
+        MACRO
+        FFTSTAGE $scaled, $inverse, $name
+        
+        ;// Define stack arguments
+        
+        MOV     pointStep,subFFTNum
+        ;// Update pSubFFTSize and pSubFFTNum regs
+        
+        
+        VLD2    {dXr0,dXi0},[pSrc@128],pointStep          ;//  data[0]
+        ;// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2  
+        MOV     subFFTNum,grpSize
+        
+               
+        ;// pT0+1 increments pT0 by 4 bytes
+        ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
+        ;// Note: outPointStep = pointStep for the first stage
+        VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
+        
+                
+        ;// Calculate the step of input data for the next set
+        ;//MOV     setStep,pointStep,LSL #1
+        MOV     setStep,grpSize,LSL #3
+        VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
+        MOV     step1,setStep
+        ADD     setStep,setStep,pointStep             ;// setStep = 3*pointStep
+        RSB     setStep,setStep,#16                   ;// setStep = - 3*pointStep+16
+        
+                
+        VLD2    {dXr3,dXi3},[pSrc@128],setStep            ;//  data[3]
+        MOV     subFFTSize,#4                         ;// subFFTSize = 4 after this radix-4 first stage
+        
+        
+        IF  $scaled 
+            VHADD    qY0,qX0,qX2             ;// u0 (halving ops scale each butterfly by 1/2)
+        ELSE
+            VADD   qY0,qX0,qX2               ;// u0
+        ENDIF
+        RSB     step3,pointStep,#0
+        
+        ;// grp = 0 a special case since all the twiddle factors are 1
+        ;// Loop on the sets: 4 sets at a time
+
+grpZeroSetLoop$name        
+        
+        
+        IF $scaled
+        
+            ;// finish first stage of 4 point FFT 
+            
+            VHSUB    qY2,qX0,qX2             ;// u1
+            SUBS    setCount,setCount,#4                    ;// decrement the set loop counter 
+            
+            VLD2    {dXr0,dXi0},[pSrc@128],step1          ;//  data[0]
+            VHADD    qY1,qX1,qX3             ;// u2
+            VLD2    {dXr2,dXi2},[pSrc@128],step3
+            VHSUB    qY3,qX1,qX3             ;// u3
+            
+                        
+            
+            ;// finish second stage of 4 point FFT 
+            
+            VLD2    {dXr1,dXi1},[pSrc@128],step1          ;//  data[1]
+            VHADD    qZ0,qY0,qY1             ;// y0
+            
+            VLD2    {dXr3,dXi3},[pSrc@128],setStep 
+                        
+            
+            IF  $inverse 
+
+                VHSUB    dZr3,dYr2,dYi3                  ;// y3
+                VHADD    dZi3,dYi2,dYr3
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                
+                VHSUB    qZ1,qY0,qY1                     ;// y2
+                VST2    {dZr3,dZi3},[pDst@128],outPointStep            
+                
+                VHADD    dZr2,dYr2,dYi3                  ;// y1
+                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VHSUB    dZi2,dYi2,dYr3
+                
+                VHADD    qY0,qX0,qX2                     ;// u0 (next loop)
+                VST2    {dZr2,dZi2},[pDst@128],setStep     
+                
+                
+            ELSE
+            
+                VHADD    dZr2,dYr2,dYi3                  ;// y1
+                VHSUB    dZi2,dYi2,dYr3
+            
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VHSUB    qZ1,qY0,qY1                     ;// y2
+                        
+                VST2    {dZr2,dZi2},[pDst@128],outPointStep            
+                VHSUB    dZr3,dYr2,dYi3                  ;// y3
+                VHADD    dZi3,dYi2,dYr3
+                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VHADD    qY0,qX0,qX2                     ;// u0 (next loop)
+                VST2    {dZr3,dZi3},[pDst@128],setStep
+                                
+            ENDIF
+        
+        
+        ELSE
+        
+            ;// finish first stage of 4 point FFT 
+            
+            VSUB    qY2,qX0,qX2             ;// u1
+            SUBS    setCount,setCount,#4                    ;// decrement the set loop counter 
+            
+            VLD2    {dXr0,dXi0},[pSrc@128],step1          ;//  data[0]
+            VADD    qY1,qX1,qX3             ;// u2
+            VLD2    {dXr2,dXi2},[pSrc@128],step3
+            VSUB    qY3,qX1,qX3             ;// u3
+            
+                        
+            
+            ;// finish second stage of 4 point FFT 
+            
+            VLD2    {dXr1,dXi1},[pSrc@128],step1          ;//  data[1]
+            VADD    qZ0,qY0,qY1             ;// y0
+            
+            VLD2    {dXr3,dXi3},[pSrc@128],setStep 
+                        
+            
+            IF  $inverse 
+
+                VSUB    dZr3,dYr2,dYi3                  ;// y3
+                VADD    dZi3,dYi2,dYr3
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                
+                VSUB    qZ1,qY0,qY1                     ;// y2
+                VST2    {dZr3,dZi3},[pDst@128],outPointStep            
+                
+                VADD    dZr2,dYr2,dYi3                  ;// y1
+                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VSUB    dZi2,dYi2,dYr3
+                
+                VADD    qY0,qX0,qX2                     ;// u0 (next loop)
+                VST2    {dZr2,dZi2},[pDst@128],setStep     
+                
+                
+            ELSE
+            
+                VADD    dZr2,dYr2,dYi3                  ;// y1
+                VSUB    dZi2,dYi2,dYr3
+            
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VSUB    qZ1,qY0,qY1                     ;// y2
+                        
+                VST2    {dZr2,dZi2},[pDst@128],outPointStep            
+                VSUB    dZr3,dYr2,dYi3                  ;// y3
+                VADD    dZi3,dYi2,dYr3
+                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VADD    qY0,qX0,qX2                     ;// u0 (next loop)
+                VST2    {dZr3,dZi3},[pDst@128],setStep
+                                
+            ENDIF
+                        
+                       
+        ENDIF
+        
+        BGT     grpZeroSetLoop$name
+        
+        
+        ;// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     ;// pSrc = pDst - pointStep (rewind to stage output)
+        MOV     pDst,pPingPongBuf
+        
+        
+        MEND
+
+                
+        
+        M_START armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{FALSE},FWD                    ;// unscaled, forward
+        M_END
+
+        
+        
+        M_START armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{TRUE},INV                     ;// unscaled, inverse
+        M_END
+ 
+                
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{FALSE},FWDSFS                  ;// scaled (>>1 per butterfly), forward
+        M_END
+
+                
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{TRUE},INVSFS                   ;// scaled, inverse
+        M_END
+    
+            
+    ENDIF                                                           ;//CortexA8
+    
+    
+     
+    END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
new file mode 100644
index 0000000..ce324f5
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
@@ -0,0 +1,403 @@
+;//
+;// 
+;// File Name:  armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision:   7765
+;// Last Modified Date:       Thu, 27 Sep 2007
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Compute a Radix 4 FFT stage for an N-point complex signal
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        INCLUDE armSP_FFT_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        
+        
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+;// Guarding implementation by the processor name
+    
+    
+
+
+
+    
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8
+    
+;// Import symbols required from other files
+;// (For example tables)
+    ;//IMPORT  armAAC_constTable    
+    
+;//Input Registers
+
+pSrc            RN  0
+pDst            RN  2
+pTwiddle        RN  1
+subFFTNum       RN  6
+subFFTSize      RN  7
+
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+outPointStep     RN  3
+grpCount         RN  4
+dstStep          RN  5
+pw1              RN  8
+pw2              RN  9
+pw3              RN  10   
+pTmp             RN  4
+
+
+;// Neon Registers
+
+dButterfly1Real02   DN  D0.S16
+dButterfly1Imag02   DN  D1.S16
+dButterfly1Real13   DN  D2.S16
+dButterfly1Imag13   DN  D3.S16
+dButterfly2Real02   DN  D4.S16
+dButterfly2Imag02   DN  D5.S16
+dButterfly2Real13   DN  D6.S16
+dButterfly2Imag13   DN  D7.S16
+dXr0             DN  D0.S16
+dXi0             DN  D1.S16
+dXr1             DN  D2.S16
+dXi1             DN  D3.S16
+dXr2             DN  D4.S16
+dXi2             DN  D5.S16
+dXr3             DN  D6.S16
+dXi3             DN  D7.S16
+
+dW1rS32          DN  D8.S32         
+dW1iS32             DN  D9.S32
+dW2rS32             DN  D10.S32
+dW2iS32             DN  D11.S32
+dW3rS32             DN  D12.S32
+dW3iS32             DN  D13.S32
+
+dW1r             DN  D8.S16
+dW1i             DN  D9.S16
+dW2r             DN  D10.S16
+dW2i             DN  D11.S16
+dW3r             DN  D12.S16
+dW3i             DN  D13.S16
+
+dTmp0            DN  D12.S16
+dTmp1             DN  D13.S16
+dTmp1S32         DN  D13.S32
+dTmp2S32         DN  D14.S32
+dTmp3S32         DN  D15.S32
+
+dYr0             DN  D18.S16
+dYi0             DN  D19.S16
+dYr1             DN  D16.S16
+dYi1             DN  D17.S16
+dYr2             DN  D20.S16
+dYi2             DN  D21.S16
+dYr3             DN  D14.S16
+dYi3             DN  D15.S16
+qY0              QN  Q9.S16
+qY1              QN  Q8.S16
+qY2              QN  Q10.S16
+qY3              QN  Q7.S16
+
+qX0              QN  Q0.S16
+qX1              QN  Q1.S16
+qX2              QN  Q2.S16
+qX3              QN  Q3.S16
+
+qT0              QN  Q9.S32
+qT1              QN  Q10.S32
+qT2              QN  Q7.S32
+qT3              QN  Q8.S32
+
+dZr0             DN  D22.S16
+dZi0             DN  D23.S16
+dZr1             DN  D24.S16
+dZi1             DN  D25.S16
+dZr2             DN  D26.S16
+dZi2             DN  D27.S16
+dZr3             DN  D28.S16
+dZi3             DN  D29.S16
+
+qZ0              QN  Q11.S16
+qZ1              QN  Q12.S16
+qZ2              QN  Q13.S16
+qZ3              QN  Q14.S16
+
+        
+        MACRO
+        FFTSTAGE $scaled, $inverse , $name
+
+        ;// Radix-4 last stage: 4 groups per iteration; twiddle multiplies feed two radix-2 passes
+
+        MOV     pw2,pTwiddle                    ;// pw2 walks the w^2 twiddles
+        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2@256]!
+
+        MOV     pw3,pTwiddle                    ;// pw3 walks the w^3 twiddles
+        MOV     pw1,pTwiddle                    ;// pw1 walks the w^1 twiddles
+        ;// pOut0+1 increments pOut0 by 8 bytes
+        ;// pOut0+outPointStep == increment of 4*outPointStep bytes
+        MOV     outPointStep,subFFTSize,LSL #2
+
+        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3@64]!
+        MOV     subFFTNum,#1                            ;//after the last stage
+        LSL     grpCount,subFFTSize,#2
+
+
+        ;// Update grpCount and grpSize right away
+        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3@64]!
+
+        ;// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        MOV     dstStep,outPointStep,LSL #1
+
+        VLD2 {dW1r,dW1i}, [pw1@128]!
+
+
+        ADD     dstStep,dstStep,outPointStep                ;// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                         ;// dstStep = - 3*outPointStep+16
+
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+
+        ;// Process 4 groups at a time
+
+grpLoop$name                                    ;// one iteration = 4 groups
+
+
+        ;// Rearrange the third twiddle
+        VUZP    dW3r,dW3i
+        SUBS    grpCount,grpCount,#16                    ;// grpCount is multiplied by 4
+
+
+        VUZP     dButterfly1Real13, dButterfly2Real13        ;// B.r D.r
+        VUZP     dButterfly1Imag13, dButterfly2Imag13        ;// B.i D.i
+        VUZP     dButterfly1Real02, dButterfly2Real02        ;// A.r C.r
+        VUZP     dButterfly1Imag02, dButterfly2Imag02        ;// A.i C.i
+
+
+        IF  $inverse
+            VMULL   qT0,dXr1,dW1r
+            VMLAL   qT0,dXi1,dW1i                       ;// real part
+            VMULL   qT1,dXi1,dW1r
+            VMLSL   qT1,dXr1,dW1i                       ;// imag part
+
+        ELSE
+            VMULL   qT0,dXr1,dW1r
+            VMLSL   qT0,dXi1,dW1i                       ;// real part
+            VMULL   qT1,dXi1,dW1r
+            VMLAL   qT1,dXr1,dW1i                       ;// imag part
+
+        ENDIF
+
+        ;// Load the first twiddle for 4 groups : w^1
+        ;// w^1 twiddle (i+0,i+1,i+2,i+3)       for group 0,1,2,3
+
+        VLD2 {dW1r,dW1i}, [pw1@128]!
+
+        IF  $inverse
+            VMULL   qT2,dXr2,dW2r
+            VMLAL   qT2,dXi2,dW2i                       ;// real part
+            VMULL   qT3,dXi2,dW2r
+            VMLSL   qT3,dXr2,dW2i                       ;// imag part
+
+        ELSE
+            VMULL   qT2,dXr2,dW2r
+            VMLSL   qT2,dXi2,dW2i                       ;// real part
+            VMULL   qT3,dXi2,dW2r
+            VMLAL   qT3,dXr2,dW2i                       ;// imag part
+
+        ENDIF
+
+        VRSHRN  dZr1,qT0,#15                    ;// round & narrow Q15 product: z1 = x1*w1
+        VRSHRN  dZi1,qT1,#15
+
+
+
+        IF  $inverse
+            VMULL   qT0,dXr3,dW3r
+            VMLAL   qT0,dXi3,dW3i                       ;// real part
+            VMULL   qT1,dXi3,dW3r
+            VMLSL   qT1,dXr3,dW3i                       ;// imag part
+
+        ELSE
+            VMULL   qT0,dXr3,dW3r
+            VMLSL   qT0,dXi3,dW3i                       ;// real part
+            VMULL   qT1,dXi3,dW3r
+            VMLAL   qT1,dXr3,dW3i                       ;// imag part
+
+        ENDIF
+
+        ;// Load the second twiddle for 4 groups : w^2
+        ;// w^2 twiddle (2i+0,2i+2,2i+4,2i+6)   for group 0,1,2,3
+        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2@256]!
+
+
+        VRSHRN  dZr2,qT2,#15                    ;// round & narrow Q15 product: z2 = x2*w2
+        VRSHRN  dZi2,qT3,#15
+
+        ;// Load the third twiddle for 4 groups : w^3
+        ;// w^3 twiddle (3i+0,3i+3,3i+6,3i+9)   for group 0,1,2,3
+
+        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3@64]!
+
+        VRSHRN  dZr3,qT0,#15                    ;// round & narrow Q15 product: z3 = x3*w3
+        VRSHRN  dZi3,qT1,#15
+
+        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3@64]!
+
+        IF $scaled
+
+            ;// finish first stage of 4 point FFT (halving add/sub gives the Sfs scaling)
+
+            VHADD    qY0,qX0,qZ2
+            VHSUB    qY2,qX0,qZ2
+            VHADD    qY1,qZ1,qZ3
+            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+
+            VHSUB    qY3,qZ1,qZ3
+
+            ;// finish second stage of 4 point FFT
+
+            VHSUB    qZ0,qY2,qY1
+            VHADD    qZ2,qY2,qY1
+            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+
+
+            IF $inverse
+
+                VHADD    dZr3,dYr0,dYi3                          ;// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+
+                VHSUB    dZr1,dYr0,dYi3                          ;// y1 = u0+ju3
+                VHADD    dZi1,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst@128],outPointStep
+                VST2    {dZr2,dZi2},[pDst@128],outPointStep
+                VST2    {dZr1,dZi1},[pDst@128],dstStep              ;// dstStep = -3*outPointStep + 16
+
+            ELSE
+
+                VHSUB    dZr1,dYr0,dYi3                          ;// y1 = u0+ju3
+                VHADD    dZi1,dYi0,dYr3
+
+                VHADD    dZr3,dYr0,dYi3                          ;// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VST2    {dZr2,dZi2},[pDst@128],outPointStep
+                VST2    {dZr3,dZi3},[pDst@128],dstStep              ;// dstStep = -3*outPointStep + 16
+
+            ENDIF
+
+        ELSE
+
+            ;// finish first stage of 4 point FFT (full-width add/sub, no scaling)
+
+            VADD    qY0,qX0,qZ2
+            VSUB    qY2,qX0,qZ2
+            VADD    qY1,qZ1,qZ3
+            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+
+            VSUB    qY3,qZ1,qZ3
+
+            ;// finish second stage of 4 point FFT
+
+            VSUB    qZ0,qY2,qY1
+            VADD    qZ2,qY2,qY1
+            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
+
+
+            IF $inverse
+
+                VADD    dZr3,dYr0,dYi3                          ;// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+
+                VSUB    dZr1,dYr0,dYi3                          ;// y1 = u0+ju3
+                VADD    dZi1,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst@128],outPointStep
+                VST2    {dZr2,dZi2},[pDst@128],outPointStep
+                VST2    {dZr1,dZi1},[pDst@128],dstStep              ;// dstStep = -3*outPointStep + 16
+
+            ELSE
+
+                VSUB    dZr1,dYr0,dYi3                          ;// y1 = u0+ju3
+                VADD    dZi1,dYi0,dYr3
+
+                VADD    dZr3,dYr0,dYi3                          ;// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VST2    {dZr2,dZi2},[pDst@128],outPointStep
+                VST2    {dZr3,dZi3},[pDst@128],dstStep              ;// dstStep = -3*outPointStep + 16
+
+            ENDIF
+
+
+
+
+        ENDIF
+
+        BGT     grpLoop$name
+
+
+        ;// Reset and Swap pSrc and pDst for the next stage
+        MOV     pTmp,pDst
+        SUB     pSrc,pSrc,#64                       ;// Extra increment currently done in the loop
+        SUB     pDst,pSrc,outPointStep,LSL #2       ;// pDst = pSrc - 4*outPointStep
+        SUB     pSrc,pTmp,outPointStep              ;// pSrc = old pDst - outPointStep
+
+        MEND
+        
+        
+        M_START armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{FALSE},FWD            ;// scaled=FALSE, inverse=FALSE
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE {FALSE},{TRUE},INV             ;// scaled=FALSE, inverse=TRUE
+        M_END
+
+
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{FALSE},FWDSFS          ;// scaled=TRUE, inverse=FALSE
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE {TRUE},{TRUE},INVSFS           ;// scaled=TRUE, inverse=TRUE
+        M_END
+
+        
+    ENDIF                                                           ;//CortexA8
+    
+
+    
+     
+    END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
new file mode 100644
index 0000000..c13df04
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
@@ -0,0 +1,392 @@
+;//
+;// 
+;// File Name:  armSP_FFT_CToC_SC16_Radix4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision:   7761
+;// Last Modified Date:       Wed, 26 Sep 2007
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Compute a Radix 4 FFT stage for an N-point complex signal
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        INCLUDE armSP_FFT_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        
+        
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+;// Guarding implementation by the processor name
+    
+
+    
+    ;// Guarding implementation by the processor name
+    
+    IF  CortexA8
+    
+;// Import symbols required from other files
+;// (For example tables)
+    
+    
+;//Input Registers
+
+pSrc            RN  0
+pDst            RN  2
+pTwiddle        RN  1
+subFFTNum       RN  6
+subFFTSize      RN  7
+
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+grpCount        RN  3
+pointStep       RN  4
+outPointStep    RN  5
+stepTwiddle     RN  12
+setCount        RN  14
+srcStep         RN  8
+setStep         RN  9
+dstStep         RN  10
+twStep          RN  11
+t1              RN  3
+
+;// Neon Registers
+
+dW1              DN  D0.S16
+dW2              DN  D1.S16
+dW3              DN  D2.S16   
+
+dXr0             DN  D4.S16
+dXi0             DN  D5.S16
+dXr1             DN  D6.S16
+dXi1             DN  D7.S16
+dXr2             DN  D8.S16
+dXi2             DN  D9.S16
+dXr3             DN  D10.S16
+dXi3             DN  D11.S16
+dYr0             DN  D12.S16
+dYi0             DN  D13.S16
+dYr1             DN  D14.S16
+dYi1             DN  D15.S16
+dYr2             DN  D16.S16
+dYi2             DN  D17.S16
+dYr3             DN  D18.S16
+dYi3             DN  D19.S16
+qT0              QN  Q8.S32   
+qT1              QN  Q9.S32
+qT2              QN  Q6.S32
+qT3              QN  Q7.S32
+
+dZr0             DN  D20.S16
+dZi0             DN  D21.S16
+dZr1             DN  D22.S16
+dZi1             DN  D23.S16
+dZr2             DN  D24.S16
+dZi2             DN  D25.S16
+dZr3             DN  D26.S16
+dZi3             DN  D27.S16
+qY0              QN  Q6.S16
+qY1              QN  Q7.S16
+qY2              QN  Q8.S16
+qY3              QN  Q9.S16   
+qX0              QN  Q2.S16
+qZ0              QN  Q10.S16
+qZ1              QN  Q11.S16
+qZ2              QN  Q12.S16
+qZ3              QN  Q13.S16
+
+        
+        MACRO
+        FFTSTAGE $scaled, $inverse , $name
+
+        ;// General radix-4 stage: loops over groups, 4 sets at a time, twiddles applied via scalar lanes
+
+
+        ;// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+
+        LSL     grpCount,subFFTSize,#2
+        LSR     subFFTNum,subFFTNum,#2
+        MOV     subFFTSize,grpCount
+
+
+        ;// pOut0+1 increments pOut0 by 4 bytes
+        ;// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes
+
+        MOV     stepTwiddle,#0
+        SMULBB  outPointStep,grpCount,subFFTNum
+
+        ;// pT0+1 increments pT0 by 4 bytes
+        ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
+
+        LSL     pointStep,subFFTNum,#2                      ;// 2*grpSize
+
+        VLD1     dW1,[pTwiddle@64]                             ;//[wi | wr]
+        MOV     srcStep,pointStep,LSL #1                    ;// srcStep = 2*pointStep
+        VLD1     dW2,[pTwiddle@64]                             ;//[wi | wr]
+        ADD     setStep,srcStep,pointStep                   ;// setStep = 3*pointStep
+        SUB     srcStep,srcStep,#16                         ;// srcStep = 2*pointStep-16
+        VLD1     dW3,[pTwiddle@64]
+        ;//RSB     setStep,setStep,#16                      ;// setStep = - 3*pointStep+16
+        RSB     setStep,setStep,#0                          ;// setStep = - 3*pointStep
+
+        MOV     dstStep,outPointStep,LSL #1
+        ADD     dstStep,dstStep,outPointStep                ;// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                         ;// dstStep = - 3*outPointStep+16
+
+
+
+grpLoop$name                                    ;// loop over groups
+
+        VLD2    {dXr0,dXi0},[pSrc@128],pointStep          ;//  data[0]
+        ADD      stepTwiddle,stepTwiddle,pointStep
+        VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
+        ADD      pTwiddle,pTwiddle,stepTwiddle               ;// set pTwiddle to the first point
+        VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
+        MOV      twStep,stepTwiddle,LSL #2
+        VLD2    {dXr3,dXi3},[pSrc@128],setStep            ;//  data[3] & reset pSrc
+
+        SUB      twStep,stepTwiddle,twStep                   ;// twStep = -3*stepTwiddle
+
+
+        MOV      setCount,pointStep,LSR #2
+        ADD     pSrc,pSrc,#16                         ;// set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,pointStep                   ;// increment to data[1] of the next set
+
+        ;// Loop on the sets : 4 at a time
+
+setLoop$name
+
+        SUBS    setCount,setCount,#4                    ;// decrement the loop counter
+
+        IF  $inverse
+            VMULL   qT0,dXr1,dW1[0]
+            VMLAL   qT0,dXi1,dW1[1]                       ;// real part
+            VMULL   qT1,dXi1,dW1[0]
+            VMLSL   qT1,dXr1,dW1[1]                       ;// imag part
+
+        ELSE
+            VMULL   qT0,dXr1,dW1[0]
+            VMLSL   qT0,dXi1,dW1[1]                       ;// real part
+            VMULL   qT1,dXi1,dW1[0]
+            VMLAL   qT1,dXr1,dW1[1]                       ;// imag part
+
+        ENDIF
+
+        VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
+
+        IF  $inverse
+            VMULL   qT2,dXr2,dW2[0]
+            VMLAL   qT2,dXi2,dW2[1]                       ;// real part
+            VMULL   qT3,dXi2,dW2[0]
+            VMLSL   qT3,dXr2,dW2[1]                       ;// imag part
+
+        ELSE
+            VMULL   qT2,dXr2,dW2[0]
+            VMLSL   qT2,dXi2,dW2[1]                       ;// real part
+            VMULL   qT3,dXi2,dW2[0]
+            VMLAL   qT3,dXr2,dW2[1]                       ;// imag part
+
+        ENDIF
+
+        VRSHRN  dZr1,qT0,#15                    ;// round & narrow Q15 product: z1 = x1*w1
+        VRSHRN  dZi1,qT1,#15
+
+
+        VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
+
+        IF  $inverse
+            VMULL   qT0,dXr3,dW3[0]
+            VMLAL   qT0,dXi3,dW3[1]                       ;// real part
+            VMULL   qT1,dXi3,dW3[0]
+            VMLSL   qT1,dXr3,dW3[1]                       ;// imag part
+
+        ELSE
+            VMULL   qT0,dXr3,dW3[0]
+            VMLSL   qT0,dXi3,dW3[1]                       ;// real part
+            VMULL   qT1,dXi3,dW3[0]
+            VMLAL   qT1,dXr3,dW3[1]                       ;// imag part
+
+        ENDIF
+
+        VRSHRN  dZr2,qT2,#15                    ;// round & narrow Q15 product: z2 = x2*w2
+        VRSHRN  dZi2,qT3,#15
+
+
+        VRSHRN  dZr3,qT0,#15                    ;// round & narrow Q15 product: z3 = x3*w3
+        VRSHRN  dZi3,qT1,#15
+        VLD2    {dXr3,dXi3},[pSrc@128],setStep            ;//  data[3] & update pSrc for the next set
+
+
+        IF $scaled
+
+            ;// finish first stage of 4 point FFT (halving add/sub gives the Sfs scaling)
+            VHADD    qY0,qX0,qZ2
+            VHSUB    qY2,qX0,qZ2
+
+            VLD2    {dXr0,dXi0},[pSrc@128]!          ;//  data[0]
+            VHADD    qY1,qZ1,qZ3
+            VHSUB    qY3,qZ1,qZ3
+
+
+            ;// finish second stage of 4 point FFT
+
+            IF  $inverse
+
+                VHSUB    qZ0,qY2,qY1
+
+                VHADD    dZr2,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VHSUB    dZi2,dYi0,dYr3
+
+                VHADD    qZ1,qY2,qY1
+                VST2    {dZr2,dZi2},[pDst@128],outPointStep
+
+                VHSUB    dZr3,dYr0,dYi3
+                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VHADD    dZi3,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst@128],dstStep
+
+
+            ELSE
+
+                VHSUB    qZ0,qY2,qY1
+
+                VHSUB    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VHADD    dZi3,dYi0,dYr3
+
+                VHADD    qZ1,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst@128],outPointStep
+
+                VHADD    dZr2,dYr0,dYi3
+                VHSUB    dZi2,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VST2    {dZr2,dZi2},[pDst@128],dstStep
+
+
+            ENDIF
+
+
+        ELSE
+
+            ;// finish first stage of 4 point FFT (full-width add/sub, no scaling)
+            VADD    qY0,qX0,qZ2
+            VSUB    qY2,qX0,qZ2
+
+            VLD2    {dXr0,dXi0},[pSrc]!          ;//  data[0]  NOTE(review): no @128 hint here unlike the scaled path at the parallel load - confirm intentional
+            VADD    qY1,qZ1,qZ3
+            VSUB    qY3,qZ1,qZ3
+
+
+            ;// finish second stage of 4 point FFT
+
+
+            IF  $inverse
+
+                VSUB    qZ0,qY2,qY1
+
+                VADD    dZr2,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VSUB    dZi2,dYi0,dYr3
+
+                VADD    qZ1,qY2,qY1
+                VST2    {dZr2,dZi2},[pDst@128],outPointStep
+
+                VSUB    dZr3,dYr0,dYi3
+                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VADD    dZi3,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst@128],dstStep
+
+
+            ELSE
+
+                VSUB    qZ0,qY2,qY1
+
+                VSUB    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VADD    dZi3,dYi0,dYr3
+
+                VADD    qZ1,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst@128],outPointStep
+
+                VADD    dZr2,dYr0,dYi3
+                VSUB    dZi2,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VST2    {dZr2,dZi2},[pDst@128],dstStep
+
+
+            ENDIF
+
+
+
+        ENDIF
+
+        ADD     pSrc,pSrc,pointStep                         ;// increment to data[1] of the next set
+        BGT     setLoop$name
+
+        VLD1     dW1,[pTwiddle@64],stepTwiddle                 ;//[wi | wr]
+        SUBS    grpCount,grpCount,#4                        ;// subtract 4 since grpCount multiplied by 4
+        VLD1     dW2,[pTwiddle@64],stepTwiddle                 ;//[wi | wr]
+        ADD     pSrc,pSrc,srcStep                           ;// increment pSrc for the next grp
+        VLD1     dW3,[pTwiddle@64],twStep                      ;//[wi | wr]
+
+
+
+        BGT     grpLoop$name
+
+
+        ;// Reset and Swap pSrc and pDst for the next stage
+        MOV     t1,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #2           ;// pDst = pSrc - 4*outPointStep
+        SUB     pSrc,t1,outPointStep                    ;// pSrc = old pDst - outPointStep
+
+
+        MEND
+        
+        
+        M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE {FALSE},{FALSE},FWD        ;// scaled=FALSE, inverse=FALSE
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE {FALSE},{TRUE},INV         ;// scaled=FALSE, inverse=TRUE
+        M_END
+
+
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE {TRUE},{FALSE},FWDSFS      ;// scaled=TRUE, inverse=FALSE
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE {TRUE},{TRUE},INVSFS       ;// scaled=TRUE, inverse=TRUE
+        M_END
+
+        
+    ENDIF                                                           ;//CortexA8
+    
+ 
+    
+    END
\ No newline at end of file
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
new file mode 100644
index 0000000..741681f
--- /dev/null
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
@@ -0,0 +1,591 @@
+;//
+;// 
+;// File Name:  armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision:   7766
+;// Last Modified Date:       Thu, 27 Sep 2007
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Compute a first-stage Radix 8 FFT for an N-point complex signal
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+    
+    
+    
+    
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8
+    
+;//Input Registers
+
+pSrc            RN  0
+pDst            RN  2
+pTwiddle        RN  1
+subFFTNum       RN  6
+subFFTSize      RN  7
+pPingPongBuf    RN  5                  ;// dest buffer for the next stage (not pSrc for first stage) 
+
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+grpSize         RN  3
+setCount        RN  3                  ;// Reuse grpSize as setCount
+pointStep       RN  4
+outPointStep    RN  4
+setStep         RN  8
+step1           RN  9
+step2           RN  10
+t0              RN  11
+  
+
+;// Neon Registers
+
+dXr0             DN  D14.S16
+dXi0             DN  D15.S16
+dXr1             DN  D2.S16
+dXi1             DN  D3.S16
+dXr2             DN  D4.S16
+dXi2             DN  D5.S16
+dXr3             DN  D6.S16
+dXi3             DN  D7.S16
+dXr4             DN  D8.S16
+dXi4             DN  D9.S16
+dXr5             DN  D10.S16
+dXi5             DN  D11.S16
+dXr6             DN  D12.S16
+dXi6             DN  D13.S16
+dXr7             DN  D0.S16
+dXi7             DN  D1.S16
+qX0              QN  Q7.S16
+qX1              QN  Q1.S16
+qX2              QN  Q2.S16
+qX3              QN  Q3.S16   
+qX4              QN  Q4.S16
+qX5              QN  Q5.S16
+qX6              QN  Q6.S16
+qX7              QN  Q0.S16
+
+dUr0             DN  D16.S16
+dUi0             DN  D17.S16
+dUr2             DN  D18.S16
+dUi2             DN  D19.S16
+dUr4             DN  D20.S16
+dUi4             DN  D21.S16
+dUr6             DN  D22.S16
+dUi6             DN  D23.S16
+dUr1             DN  D24.S16
+dUi1             DN  D25.S16
+dUr3             DN  D26.S16
+dUi3             DN  D27.S16
+dUr5             DN  D28.S16
+dUi5             DN  D29.S16
+dUr7             DN  D30.S16                ;// reuse dXr7 and dXi7
+dUi7             DN  D31.S16
+qU0              QN   Q8.S16
+qU1              QN   Q12.S16
+qU2              QN   Q9.S16
+qU3              QN   Q13.S16   
+qU4              QN   Q10.S16
+qU5              QN   Q14.S16
+qU6              QN   Q11.S16
+qU7              QN   Q15.S16
+
+
+
+dVr0             DN  D24.S16
+dVi0             DN  D25.S16
+dVr2             DN  D26.S16
+dVi2             DN  D27.S16
+dVr4             DN  D28.S16
+dVi4             DN  D29.S16
+dVr6             DN  D30.S16
+dVi6             DN  D31.S16
+dVr1             DN  D16.S16
+dVi1             DN  D17.S16
+dVr3             DN  D18.S16
+dVi3             DN  D19.S16
+dVr5             DN  D20.S16
+dVi5             DN  D21.S16
+dVr7             DN  D22.S16              ;// reuse dUi7 
+dVi7             DN  D23.S16              ;// reuse dUr7 
+qV0              QN  Q12.S16
+qV1              QN  Q8.S16
+qV2              QN  Q13.S16
+qV3              QN  Q9.S16   
+qV4              QN  Q14.S16
+qV5              QN  Q10.S16
+qV6              QN  Q15.S16
+qV7              QN  Q11.S16
+
+
+
+dYr0             DN  D16.S16
+dYi0             DN  D17.S16
+dYr2             DN  D18.S16
+dYi2             DN  D19.S16
+dYr4             DN  D20.S16
+dYi4             DN  D21.S16
+dYr6             DN  D22.S16
+dYi6             DN  D23.S16
+dYr1             DN  D24.S16
+dYi1             DN  D25.S16
+dYr3             DN  D26.S16
+dYi3             DN  D27.S16
+dYr5             DN  D28.S16
+dYi5             DN  D29.S16
+dYr7             DN  D30.S16                 ;// reuse dYr4 and dYi4
+dYi7             DN  D31.S16
+qY0              QN   Q8.S16
+qY1              QN   Q12.S16
+qY2              QN   Q9.S16
+qY3              QN   Q13.S16   
+qY4              QN   Q10.S16
+qY5              QN   Q14.S16
+qY6              QN   Q11.S16
+qY7              QN   Q15.S16
+
+
+dT0              DN  D0.S16             
+dT1              DN  D1.S16
+
+
+;// Define constants
+ONEBYSQRT2      EQU   0x00005A82        ;// Q15 format
+    
+
+        MACRO
+        FFTSTAGE $scaled, $inverse , $name
+        
+        ;// Define stack arguments
+        
+        ;// Update pSubFFTSize and pSubFFTNum regs
+        MOV     subFFTSize,#8                               ;// subFFTSize = 1 for the first stage
+        LDR     t0,=ONEBYSQRT2                              ;// t0=(1/sqrt(2)) as Q15 format
+        
+        ;// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#3  
+        MOV     subFFTNum,grpSize
+        
+                
+        ;// pT0+1 increments pT0 by 4 bytes
+        ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize/2 bytes
+        ;// Note: outPointStep = pointStep for firststage
+        
+        MOV     pointStep,grpSize,LSL #2
+        
+                                       
+        ;// Calculate the step of input data for the next set
+        ;//MOV     step1,pointStep,LSL #1                      ;// step1 = 2*pointStep
+        VLD2    {dXr0,dXi0},[pSrc@128],pointStep          ;//  data[0]
+        MOV     step1,grpSize,LSL #3
+        
+        MOV     step2,pointStep,LSL #3
+        VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
+        SUB     step2,step2,pointStep                          ;// step2 = 7*pointStep
+        RSB     setStep,step2,#16                              ;// setStep = - 7*pointStep+16
+        
+        
+        
+        VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
+        VLD2    {dXr3,dXi3},[pSrc@128],pointStep          ;//  data[3] 
+        VLD2    {dXr4,dXi4},[pSrc@128],pointStep          ;//  data[4]
+        VLD2    {dXr5,dXi5},[pSrc@128],pointStep          ;//  data[5]
+        VLD2    {dXr6,dXi6},[pSrc@128],pointStep          ;//  data[6]
+        VLD2    {dXr7,dXi7},[pSrc@128],setStep            ;//  data[7] & update pSrc for the next set
+                                                      ;//  setStep = -7*pointStep + 16  
+        ;// grp = 0 a special case since all the twiddle factors are 1
+        ;// Loop on the sets : 4 sets at a time
+
+grpZeroSetLoop$name
+                                                      
+        ;// Decrement setcount
+        SUBS    setCount,setCount,#4                    ;// decrement the set loop counter           
+                                                                         
+        
+        IF $scaled
+            ;// finish first stage of 8 point FFT 
+            
+            VHADD    qU0,qX0,qX4
+            VHADD    qU2,qX1,qX5
+            VHADD    qU4,qX2,qX6
+            VHADD    qU6,qX3,qX7
+            
+            ;// finish second stage of 8 point FFT 
+            
+            VHADD    qV0,qU0,qU4
+            VHSUB    qV2,qU0,qU4
+            VHADD    qV4,qU2,qU6
+            VHSUB    qV6,qU2,qU6
+            
+            ;// finish third stage of 8 point FFT 
+            
+            VHADD    qY0,qV0,qV4
+            VHSUB    qY4,qV0,qV4
+            VST2    {dYr0,dYi0},[pDst@128],step1                    ;// store y0
+            
+            IF  $inverse
+                
+                VHSUB    dYr2,dVr2,dVi6
+                VHADD    dYi2,dVi2,dVr6
+                
+                VHADD    dYr6,dVr2,dVi6
+                VST2    {dYr2,dYi2},[pDst@128],step1                    ;// store y2
+                VHSUB    dYi6,dVi2,dVr6
+            
+                VHSUB    qU1,qX0,qX4                    
+                VST2    {dYr4,dYi4},[pDst@128],step1                    ;// store y4
+            
+                VHSUB    qU3,qX1,qX5
+                VHSUB    qU5,qX2,qX6
+                VST2    {dYr6,dYi6},[pDst@128],step1                    ;// store y6
+            
+            ELSE
+            
+                VHADD    dYr6,dVr2,dVi6
+                VHSUB    dYi6,dVi2,dVr6
+                
+                VHSUB    dYr2,dVr2,dVi6
+                VST2    {dYr6,dYi6},[pDst@128],step1                    ;// store y2
+                VHADD    dYi2,dVi2,dVr6
+                
+                                
+                VHSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst@128],step1                    ;// store y4
+                VHSUB    qU3,qX1,qX5
+                VHSUB    qU5,qX2,qX6
+                VST2    {dYr2,dYi2},[pDst@128],step1                    ;// store y6
+
+            
+            ENDIF
+            
+            ;// finish first stage of 8 point FFT 
+            
+            VHSUB    qU7,qX3,qX7
+            VMOV    dT0[0],t0                                   
+            
+            ;// finish second stage of 8 point FFT 
+            
+            VHSUB    dVr1,dUr1,dUi5
+            VLD2    {dXr0,dXi0},[pSrc@128],pointStep          ;//  data[0] for next iteration
+            VHADD    dVi1,dUi1,dUr5
+            VHADD    dVr3,dUr1,dUi5
+            VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
+            VHSUB    dVi3,dUi1,dUr5
+                        
+            VHSUB    dVr5,dUr3,dUi7
+            VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
+            VHADD    dVi5,dUi3,dUr7
+            VHADD    dVr7,dUr3,dUi7
+            VLD2    {dXr3,dXi3},[pSrc@128],pointStep          ;//  data[3]
+            VHSUB    dVi7,dUi3,dUr7
+            
+            ;// finish third stage of 8 point FFT 
+            
+            IF  $inverse
+            
+                ;// calculate a*v5 
+                VQRDMULH    dT1,dVr5,dT0[0]                         ;// use dVi0 for dT1
+                VLD2    {dXr4,dXi4},[pSrc@128],pointStep          ;//  data[4]
+                VQRDMULH    dVi5,dVi5,dT0[0]
+                            
+                VLD2    {dXr5,dXi5},[pSrc@128],pointStep          ;//  data[5]
+                VSUB    dVr5,dT1,dVi5                               ;// a * V5
+                VADD    dVi5,dT1,dVi5
+                
+                VLD2    {dXr6,dXi6},[pSrc@128],pointStep          ;//  data[6]
+                
+                ;// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+                
+                VHADD    qY1,qV1,qV5
+                VHSUB    qY5,qV1,qV5
+                
+                            
+                VADD    dVr7,dT1,dVi7                               ;// b * V7
+                VSUB    dVi7,dVi7,dT1
+                SUB     pDst, pDst, step2                           ;// set pDst to y1
+                
+                VLD2    {dXr7,dXi7},[pSrc@128],setStep            ;//  data[7]            
+                
+                
+                VHSUB    dYr3,dVr3,dVr7
+                VHSUB    dYi3,dVi3,dVi7
+                VST2    {dYr1,dYi1},[pDst@128],step1                    ;// store y1
+                VHADD    dYr7,dVr3,dVr7
+                VHADD    dYi7,dVi3,dVi7
+
+                
+                VST2    {dYr3,dYi3},[pDst@128],step1                    ;// store y3
+                VST2    {dYr5,dYi5},[pDst@128],step1                    ;// store y5
+                VST2    {dYr7,dYi7},[pDst@128],#16                      ;// store y7
+            ELSE
+            
+                ;// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VLD2    {dXr4,dXi4},[pSrc@128],pointStep          ;//  data[4]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+                
+                VLD2    {dXr5,dXi5},[pSrc@128],pointStep          ;//  data[5]
+                VADD    dVr7,dT1,dVi7                               ;// b * V7
+                VSUB    dVi7,dVi7,dT1
+                
+                VLD2    {dXr6,dXi6},[pSrc@128],pointStep          ;//  data[6]
+                
+                ;// calculate a*v5 
+                VQRDMULH    dT1,dVr5,dT0[0]                         ;// use dVi0 for dT1
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VHADD    dYr7,dVr3,dVr7
+                VHADD    dYi7,dVi3,dVi7
+                SUB     pDst, pDst, step2                           ;// set pDst to y1
+            
+                VSUB    dVr5,dT1,dVi5                               ;// a * V5
+                VADD    dVi5,dT1,dVi5
+                VLD2    {dXr7,dXi7},[pSrc@128],setStep            ;//  data[7]            
+                
+                VHSUB    qY5,qV1,qV5
+                
+                VHSUB    dYr3,dVr3,dVr7
+                VST2    {dYr7,dYi7},[pDst@128],step1                    ;// store y1
+                VHSUB    dYi3,dVi3,dVi7
+                VHADD    qY1,qV1,qV5
+                
+                
+                VST2    {dYr5,dYi5},[pDst@128],step1                    ;// store y3
+                VST2    {dYr3,dYi3},[pDst@128],step1                    ;// store y5
+                VST2    {dYr1,dYi1},[pDst@128],#16                      ;// store y7
+
+            
+            ENDIF
+            
+            
+           
+        ELSE
+            ;// finish first stage of 8 point FFT 
+            
+            VADD    qU0,qX0,qX4
+            VADD    qU2,qX1,qX5
+            VADD    qU4,qX2,qX6
+            VADD    qU6,qX3,qX7
+            
+            ;// finish second stage of 8 point FFT 
+            
+            VADD    qV0,qU0,qU4
+            VSUB    qV2,qU0,qU4
+            VADD    qV4,qU2,qU6
+            VSUB    qV6,qU2,qU6
+            
+            ;// finish third stage of 8 point FFT 
+            
+            VADD    qY0,qV0,qV4
+            VSUB    qY4,qV0,qV4
+            VST2    {dYr0,dYi0},[pDst@128],step1                    ;// store y0
+            
+            IF  $inverse
+                
+                VSUB    dYr2,dVr2,dVi6
+                VADD    dYi2,dVi2,dVr6
+                
+                VADD    dYr6,dVr2,dVi6
+                VST2    {dYr2,dYi2},[pDst@128],step1                    ;// store y2
+                VSUB    dYi6,dVi2,dVr6
+            
+                VSUB    qU1,qX0,qX4                    
+                VST2    {dYr4,dYi4},[pDst@128],step1                    ;// store y4
+            
+                VSUB    qU3,qX1,qX5
+                VSUB    qU5,qX2,qX6
+                VST2    {dYr6,dYi6},[pDst@128],step1                    ;// store y6
+            
+            ELSE
+            
+                VADD    dYr6,dVr2,dVi6
+                VSUB    dYi6,dVi2,dVr6
+                
+                VSUB    dYr2,dVr2,dVi6
+                VST2    {dYr6,dYi6},[pDst@128],step1                    ;// store y2
+                VADD    dYi2,dVi2,dVr6
+                
+                                
+                VSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst@128],step1                    ;// store y4
+                VSUB    qU3,qX1,qX5
+                VSUB    qU5,qX2,qX6
+                VST2    {dYr2,dYi2},[pDst@128],step1                    ;// store y6
+
+            
+            ENDIF
+            
+            ;// finish first stage of 8 point FFT 
+            
+            VSUB    qU7,qX3,qX7
+            VMOV    dT0[0],t0                                   
+            
+            ;// finish second stage of 8 point FFT 
+            
+            VSUB    dVr1,dUr1,dUi5
+            VLD2    {dXr0,dXi0},[pSrc@128],pointStep          ;//  data[0] for next iteration
+            VADD    dVi1,dUi1,dUr5
+            VADD    dVr3,dUr1,dUi5
+            VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
+            VSUB    dVi3,dUi1,dUr5
+                        
+            VSUB    dVr5,dUr3,dUi7
+            VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
+            VADD    dVi5,dUi3,dUr7
+            VADD    dVr7,dUr3,dUi7
+            VLD2    {dXr3,dXi3},[pSrc@128],pointStep          ;//  data[3]
+            VSUB    dVi7,dUi3,dUr7
+            
+            ;// finish third stage of 8 point FFT 
+            
+            IF  $inverse
+            
+                ;// calculate a*v5 
+                VQRDMULH    dT1,dVr5,dT0[0]                         ;// use dVi0 for dT1
+                VLD2    {dXr4,dXi4},[pSrc@128],pointStep          ;//  data[4]
+                VQRDMULH    dVi5,dVi5,dT0[0]
+                            
+                VLD2    {dXr5,dXi5},[pSrc@128],pointStep          ;//  data[5]
+                VSUB    dVr5,dT1,dVi5                               ;// a * V5
+                VADD    dVi5,dT1,dVi5
+                
+                VLD2    {dXr6,dXi6},[pSrc@128],pointStep          ;//  data[6]
+                
+                ;// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+                
+                VADD    qY1,qV1,qV5
+                VSUB    qY5,qV1,qV5
+                
+                            
+                VADD    dVr7,dT1,dVi7                               ;// b * V7
+                VSUB    dVi7,dVi7,dT1
+                SUB     pDst, pDst, step2                           ;// set pDst to y1
+                
+                VLD2    {dXr7,dXi7},[pSrc@128],setStep            ;//  data[7]            
+                
+                
+                VSUB    dYr3,dVr3,dVr7
+                VSUB    dYi3,dVi3,dVi7
+                VST2    {dYr1,dYi1},[pDst@128],step1                    ;// store y1
+                VADD    dYr7,dVr3,dVr7
+                VADD    dYi7,dVi3,dVi7
+
+                
+                VST2    {dYr3,dYi3},[pDst@128],step1                    ;// store y3
+                VST2    {dYr5,dYi5},[pDst@128],step1                    ;// store y5
+                VST2    {dYr7,dYi7},[pDst@128],#16                      ;// store y7
+            ELSE
+            
+                ;// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VLD2    {dXr4,dXi4},[pSrc@128],pointStep          ;//  data[4]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+                
+                VLD2    {dXr5,dXi5},[pSrc@128],pointStep          ;//  data[5]
+                VADD    dVr7,dT1,dVi7                               ;// b * V7
+                VSUB    dVi7,dVi7,dT1
+                
+                VLD2    {dXr6,dXi6},[pSrc@128],pointStep          ;//  data[6]
+                
+                ;// calculate a*v5 
+                VQRDMULH    dT1,dVr5,dT0[0]                         ;// use dVi0 for dT1
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VADD    dYr7,dVr3,dVr7
+                VADD    dYi7,dVi3,dVi7
+                SUB     pDst, pDst, step2                           ;// set pDst to y1
+            
+                VSUB    dVr5,dT1,dVi5                               ;// a * V5
+                VADD    dVi5,dT1,dVi5
+                VLD2    {dXr7,dXi7},[pSrc@128],setStep            ;//  data[7]            
+                
+                VSUB    qY5,qV1,qV5
+                
+                VSUB    dYr3,dVr3,dVr7
+                VST2    {dYr7,dYi7},[pDst@128],step1                    ;// store y1
+                VSUB    dYi3,dVi3,dVi7
+                VADD    qY1,qV1,qV5
+                
+                
+                VST2    {dYr5,dYi5},[pDst@128],step1                    ;// store y3
+                VST2    {dYr3,dYi3},[pDst@128],step1                    ;// store y5
+                VST2    {dYr1,dYi1},[pDst@128],#16                      ;// store y7
+
+            
+            ENDIF
+            
+            
+        ENDIF
+        
+        SUB     pDst, pDst, step2                               ;// update pDst for the next set
+        BGT     grpZeroSetLoop$name
+        
+        
+        ;// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                             ;// pSrc = pDst - pointStep (rewind to buffer start)
+        MOV     pDst,pPingPongBuf 
+        
+        
+        
+        MEND
+        
+
+        ;// Allocate stack memory required by the function
+        
+        
+        M_START armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE {FALSE},{FALSE},FWD
+        M_END
+
+        
+        M_START armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE {FALSE},{TRUE},INV
+        M_END
+ 
+        
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE {TRUE},{FALSE},FWDSFS
+        M_END
+
+        
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE {TRUE},{TRUE},INVSFS
+        M_END
+
+    
+    ENDIF                                                           ;//CortexA8
+        
+    
+     
+    END
\ No newline at end of file
diff --git a/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S b/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
new file mode 100644
index 0000000..399037c
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
@@ -0,0 +1,353 @@
+;//
+;// 
+;// File Name:  omxSP_FFTFwd_CToC_SC16_Sfs_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision:   6729
+;// Last Modified Date:       Tue, 17 Jul 2007
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Compute a forward FFT for a complex signal
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+        
+        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe 
+        IMPORT  armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        IMPORT  armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        IMPORT  armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
+        IMPORT  armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe 
+        IMPORT  armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe  
+        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        IMPORT  armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe  
+        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe 
+        IMPORT  armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe 
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+    
+    
+
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8
+    
+    IMPORT  armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe 
+    IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe 
+    
+;//Input Registers
+
+pSrc            RN  0
+pDst            RN  1
+pFFTSpec        RN  2
+scale           RN  3
+
+
+;// Output registers
+result          RN  0
+
+;//Local Scratch Registers
+
+argTwiddle      RN  1
+argDst          RN  2
+argScale        RN  4
+pTwiddle        RN  4
+tmpOrder        RN  4
+pOut            RN  5
+subFFTSize      RN  7     
+subFFTNum       RN  6
+N               RN  6
+order           RN  14
+diff            RN  9
+count           RN  8                   ;// Total num of radix stages required to complete the FFT
+x0r             RN  4    
+x0i             RN  5
+diffMinusOne    RN  2
+round           RN  3
+
+;// Neon registers
+
+dX0             DN  D0.S16
+dShift          DN  D1.S16
+dX0S32          DN  D0.S32
+
+
+
+    ;// Allocate stack memory required by the function
+        M_ALLOC4        diffOnStack, 4
+
+    ;// Write function header
+        M_START     omxSP_FFTFwd_CToC_SC16_Sfs,r11,d15
+        
+        M_STRUCT     ARMsFFTSpec
+        M_FIELD      N, 4
+        M_FIELD      pBitRev, 4
+        M_FIELD      pTwiddle, 4
+        M_FIELD      pBuf, 4
+        M_ENDSTRUCT
+        
+        ;// Define stack arguments
+        
+        ;// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        ;// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+                
+        CLZ     order,N                             ;// N = 2^order 
+        RSB     order,order,#31     
+        MOV     subFFTSize,#1
+        ;//MOV     subFFTNum,N
+        
+        CMP     order,#3
+        BGT     orderGreaterthan3                   ;// order > 3
+        
+        CMP     order,#1
+        BGE     orderGreaterthan0                   ;// order > 0
+        M_STR   scale, diffOnStack,LT               ;// order = 0
+        LDRLT   x0r,[pSrc]
+        STRLT   x0r,[pDst]
+        MOVLT   pSrc,pDst
+        BLT     FFTEnd
+        
+orderGreaterthan0
+        ;// set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVNE   argDst,pDst        
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           ;// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+        
+        SUBS     diff,scale,order
+        M_STR   diff,diffOnStack
+        MOVGT   scale,order
+        ;// Now scale <= order
+        
+        CMP     order,#1
+        BGT     orderGreaterthan1
+        SUBS    scale,scale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  ;// order = 1
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe      ;// order = 1
+        B       FFTEnd
+
+orderGreaterthan1
+        CMP     order,#2
+        MOV     argScale,scale
+        BGT     orderGreaterthan2
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe      ;// order =2          
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe  
+        SUBS    argScale,argScale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe  
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe  
+        B       FFTEnd
+        
+orderGreaterthan2                                                               ;// order =3        
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe      
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe  
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe  
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe      
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe    
+        B       FFTEnd
+        
+
+orderGreaterthan3       
+        ;// check scale = 0 or scale = order
+        SUBS    diff, scale, order                 ;// scale > order 
+        MOVGT   scale,order     
+        BGE     specialScaleCase                   ;// scale = 0 or scale = order 
+        CMP     scale,#0
+        BEQ     specialScaleCase
+        B       generalScaleCase
+        
+specialScaleCase                                    ;//  scale = 0 or scale = order  and order > 3     
+        
+        TST     order, #2                           ;// Set input args to fft stages
+        MOVNE   argDst,pDst        
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           ;// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        CMP      diff,#0
+        M_STR    diff, diffOnStack
+        BGE      scaleEqualsOrder  
+       
+        ;//check for even or odd order
+        ;// NOTE: The following combination of BL's would work fine even though the first
+        ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
+        ;// armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+        
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe 
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+unscaledRadix4Loop
+        BEQ        lastStageUnscaledRadix4
+        BL        armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+
+lastStageUnscaledRadix4
+        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd         
+
+scaleEqualsOrder         
+        ;//check for even or odd order
+        ;// NOTE: The following combination of BL's would work fine even though the first
+        ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
+        ;// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+                
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+scaledRadix4Loop
+        BEQ        lastStageScaledRadix4
+        BL        armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        scaledRadix4Loop
+         
+lastStageScaledRadix4
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd                    
+         
+        
+        
+generalScaleCase                                        ;// 0 < scale < order and order > 3
+        ;// Determine the correct destination buffer
+        SUB     diff,order,scale
+        TST     diff,#0x01
+        ADDEQ   count,scale,diff,LSR #1         ;// count = scale + (order - scale)/2
+        MOVNE   count,order
+        TST     count,#0x01                     ;// Is count even or odd ?
+        
+        MOVNE   argDst,pDst                     ;// Set input args to fft stages
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                       ;// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        CMP     diff,#1
+        M_STR   diff, diffOnStack    
+        BEQ     scaleps                         ;// scaling including a radix2_ps stage
+        
+        MOV     argScale,scale                  ;// Put scale in RN4 so as to save and restore
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     ;// scaled first stage
+        SUBS    argScale,argScale,#1
+        
+scaledRadix2Loop        
+        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            ;// save and restore scale (RN4) in the scaled stages
+        BGT     scaledRadix2Loop
+        B       outScale
+
+scaleps
+        SUB     argScale,scale,#1                   ;// order>3 and diff=1 => scale >= 3
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     ;// scaled first stage
+        SUBS    argScale,argScale,#1
+        
+scaledRadix2psLoop
+        BEQ     scaledRadix2psStage        
+        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            ;// save and restore scale (RN4) in the scaled stages
+        BGE     scaledRadix2psLoop
+
+scaledRadix2psStage
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+        B       generalLastStageUnscaledRadix2         
+        
+        
+outScale        
+        M_LDR   diff, diffOnStack  
+        ;//check for even or odd order
+        TST     diff,#0x00000001
+        BEQ     generalUnscaledRadix4Loop
+        B       unscaledRadix2Loop
+
+generalUnscaledRadix4Loop
+        CMP        subFFTNum,#4
+         BEQ        generalLastStageUnscaledRadix4
+         BL        armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+         B        generalUnscaledRadix4Loop
+         
+generalLastStageUnscaledRadix4
+        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
+        B        End              
+
+unscaledRadix2Loop
+        CMP        subFFTNum,#4
+         BEQ        generalLastTwoStagesUnscaledRadix2
+         BL        armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+         B        unscaledRadix2Loop        
+
+generalLastTwoStagesUnscaledRadix2
+        BL      armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2                  
+        BL      armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe 
+        B        End
+
+
+FFTEnd                                              ;// Does only the scaling
+        
+        M_LDR   diff, diffOnStack  
+        CMP     diff,#0
+        BLE     End
+        
+        RSB     diff,diff,#0                        ;// to use VRSHL for right shift by a variable
+        VDUP    dShift,diff     
+        
+scaleFFTData                                        ;// N = subFFTSize  ; dataptr = pDst  ; scale = diff
+        VLD1    {dX0S32[0]},[pSrc]                        ;// pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#1
+        VRSHL   dX0,dShift
+        VST1    {dX0S32[0]},[pSrc]!
+                
+        BGT     scaleFFTData
+        
+                
+       
+End                        
+        ;// Set return value
+        MOV     result, #OMX_Sts_NoErr       
+
+        ;// Write function tail
+        M_END
+        
+    ENDIF                                           ;//CortexA8    
+
+    
+    
+    
+    
+    END
\ No newline at end of file
diff --git a/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S b/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
new file mode 100644
index 0000000..f1a8d03
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
@@ -0,0 +1,334 @@
+;//
+;// 
+;// File Name:  omxSP_FFTInv_CToC_SC16_Sfs_s.s
+;// OpenMAX DL: v1.0.2
+;// Last Modified Revision:   6729
+;// Last Modified Date:       Tue, 17 Jul 2007
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Compute an inverse FFT for a complex signal
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+        
+        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe 
+        IMPORT  armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        IMPORT  armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        IMPORT  armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
+        IMPORT  armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe 
+        IMPORT  armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe  
+        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        IMPORT  armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe  
+        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe 
+        IMPORT  armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe 
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+    
+    
+
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8 
+    
+    IMPORT  armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe 
+    IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe 
+    
+;//Input Registers
+
+pSrc            RN  0                   ;// r0: pointer to input complex SC16 buffer
+pDst            RN  1                   ;// r1: pointer to output complex SC16 buffer
+pFFTSpec        RN  2                   ;// r2: pointer to FFT specification structure
+scale           RN  3                   ;// r3: requested scale factor (bits of right shift)
+
+
+;// Output registers
+result          RN  0                   ;// r0: OMXResult return value
+
+;//Local Scratch Registers
+;// NOTE: several aliases share a physical register (e.g. r4 is argScale,
+;// pTwiddle, tmpOrder and x0r); the aliases are live at disjoint times.
+
+argTwiddle      RN  1                   ;// twiddle-table argument passed to stage routines
+argDst          RN  2                   ;// destination argument passed to stage routines
+argScale        RN  4                   ;// remaining scaled-stage count (saved/restored by stages)
+pTwiddle        RN  4                   ;// twiddle table base read from the spec structure
+tmpOrder        RN  4                   ;// copy of 'order' preserved across the first stage call
+pOut            RN  5                   ;// ping-pong buffer pointer (spec pBuf)
+subFFTSize      RN  7     
+subFFTNum       RN  6                   ;// remaining sub-FFT count, updated by the stage routines
+N               RN  6                   ;// FFT length read from the spec (N = 2^order)
+order           RN  14
+diff            RN  9                   ;// scale - order: residual shift applied at FFTEnd
+count           RN  8                   ;// Total num of radix stages required to complete the FFT
+x0r             RN  4    
+x0i             RN  5
+diffMinusOne    RN  2
+round           RN  3
+
+;// Neon registers
+
+dX0             DN  D0.S16              ;// one (re,im) SC16 pair during final scaling
+dShift          DN  D1.S16              ;// negated shift amount for VRSHL (right shift)
+dX0S32          DN  D0.S32              ;// 32-bit view of dX0 for single-lane load/store
+
+
+    ;// Allocate stack memory required by the function
+    ;// diffOnStack holds the residual scale (count of right-shift bits) that
+    ;// the final scaling loop at FFTEnd applies to the transformed data.
+        M_ALLOC4        diffOnStack, 4
+
+    ;// Write function header
+    ;// omxSP_FFTInv_CToC_SC16_Sfs: inverse complex FFT on 16-bit data.
+    ;// Inputs:  pSrc (r0), pDst (r1), pFFTSpec (r2), scale (r3).
+    ;// Output:  result (r0) = OMX_Sts_NoErr.
+    ;// Strategy: out-of-place radix-2/4/8 stages ping-pong between pDst and
+    ;// the spec's pBuf so the last stage always writes pDst, then FFTEnd
+    ;// applies any residual rounding right shift in place.
+        M_START     omxSP_FFTInv_CToC_SC16_Sfs,r11,d15
+        
+        ;// Layout of the FFT specification structure read below
+        M_STRUCT     ARMsFFTSpec
+        M_FIELD      N, 4
+        M_FIELD      pBitRev, 4
+        M_FIELD      pTwiddle, 4
+        M_FIELD      pBuf, 4
+        M_ENDSTRUCT
+        
+        ;// Define stack arguments
+        
+        ;// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        ;// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+                
+        CLZ     order,N                             ;// N = 2^order 
+        RSB     order,order,#31     
+        MOV     subFFTSize,#1
+        ;//MOV     subFFTNum,N
+        
+        ADD     scale,scale,order                   ;// FFTInverse has a final scaling factor by N
+        
+        CMP     order,#3
+        BGT     orderGreaterthan3                   ;// order > 3
+        
+        ;// order = 0: single-point FFT is a copy; only scaling remains.
+        CMP     order,#1
+        BGE     orderGreaterthan0                   ;// order > 0
+        M_STR   scale, diffOnStack,LT               ;// order = 0
+        LDRLT   x0r,[pSrc]                          ;// copy the one (re,im) pair
+        STRLT   x0r,[pDst]
+        MOVLT   pSrc,pDst                           ;// FFTEnd scales via pSrc
+        BLT     FFTEnd
+        
+orderGreaterthan0
+        ;// set the buffers appropriately for various orders
+        ;// (even stage count -> start in pOut so the last stage ends in pDst)
+        CMP     order,#2
+        MOVNE   argDst,pDst        
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           ;// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+        ;// Store the scale factor and scale at the end
+        SUB     diff,scale,order
+        M_STR   diff, diffOnStack
+        ;// Flags still hold CMP order,#2 from above: GE -> order = 2 or 3.
+        BGE     orderGreaterthan1
+        BLLT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  ;// order = 1
+        B       FFTEnd
+        
+        
+orderGreaterthan1
+        ;// order = 2 or 3: first stage, optional middle (ps) stage, last stage.
+        MOV     tmpOrder,order                          ;// tmpOrder = RN 4
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe        
+        CMP     tmpOrder,#2
+        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+        
+               
+
+
+orderGreaterthan3       
+        ;// check scale = 0 or scale = order
+        ;// (scale is clamped to order; remember the overflow in diff)
+        SUBS    diff, scale, order                 ;// scale > order 
+        MOVGT   scale,order     
+        BGE     specialScaleCase                   ;// scale = 0 or scale = order 
+        CMP     scale,#0
+        BEQ     specialScaleCase
+        B       generalScaleCase
+        
+specialScaleCase                                    ;//  scale = 0 or scale = order  and order > 3     
+        
+        TST     order, #2                           ;// Set input args to fft stages
+        MOVNE   argDst,pDst        
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           ;// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        CMP      diff,#0
+        M_STR    diff, diffOnStack
+        BGE      scaleEqualsOrder  
+       
+        ;//check for even or odd order
+        ;// NOTE: The following combination of BL's would work fine even though the first
+        ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
+        ;// armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+        
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe 
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+unscaledRadix4Loop
+        ;// Entered with flags from CMP subFFTNum,#4 (here or end of loop body).
+        BEQ        lastStageUnscaledRadix4
+        BL        armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+
+lastStageUnscaledRadix4
+        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd         
+
+scaleEqualsOrder         
+        ;//check for even or odd order
+        ;// NOTE: The following combination of BL's would work fine even though the first
+        ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
+        ;// armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+        ;// (original comment named the SC32 routine; the SC16 routine is what is
+        ;// called below - presumably the same loop structure. TODO confirm.)
+                
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+scaledRadix4Loop
+        ;// Entered with flags from CMP subFFTNum,#4 (here or end of loop body).
+        BEQ        lastStageScaledRadix4
+        BL        armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        scaledRadix4Loop
+         
+lastStageScaledRadix4
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd                    
+         
+        
+        
+generalScaleCase                                        ;// 0 < scale < order and order > 3
+        ;// Determine the correct destination buffer
+        ;// count = total stages: 'scale' radix-2 scaled stages followed by
+        ;// radix-4 (even diff) or radix-2 (odd diff) unscaled stages; its
+        ;// parity decides which buffer the first stage writes.
+        SUB     diff,order,scale
+        TST     diff,#0x01
+        ADDEQ   count,scale,diff,LSR #1         ;// count = scale + (order - scale)/2
+        MOVNE   count,order
+        TST     count,#0x01                     ;// Is count even or odd ?
+        
+        MOVNE   argDst,pDst                     ;// Set input args to fft stages
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                       ;// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        CMP     diff,#1
+        M_STR   diff, diffOnStack    
+        BEQ     scaleps                         ;// scaling including a radix2_ps stage
+        
+        MOV     argScale,scale                  ;// Put scale in RN4 so as to save and restore
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     ;// scaled first stage
+        SUBS    argScale,argScale,#1
+        
+scaledRadix2Loop        
+        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            ;// save and restore scale (RN4) in the scaled stages
+        BGT     scaledRadix2Loop
+        B       outScale
+
+scaleps
+        ;// diff = 1: the penultimate stage is both scaled and a "ps" stage.
+        SUB     argScale,scale,#1                   ;// order>3 and diff=1 => scale >= 3
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     ;// scaled first stage
+        SUBS    argScale,argScale,#1
+        
+scaledRadix2psLoop
+        BEQ     scaledRadix2psStage        
+        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            ;// save and restore scale (RN4) in the scaled stages
+        BGE     scaledRadix2psLoop
+
+scaledRadix2psStage
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+        B       generalLastStageUnscaledRadix2         
+        
+        
+outScale        
+        ;// Scaled stages done; finish with unscaled radix-4 (even diff) or
+        ;// radix-2 (odd diff) stages.
+        M_LDR   diff, diffOnStack  
+        ;//check for even or odd order
+        TST     diff,#0x00000001
+        BEQ     generalUnscaledRadix4Loop
+        B       unscaledRadix2Loop
+
+generalUnscaledRadix4Loop
+        CMP        subFFTNum,#4
+         BEQ        generalLastStageUnscaledRadix4
+         BL        armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+         B        generalUnscaledRadix4Loop
+         
+generalLastStageUnscaledRadix4
+        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
+        B        End              
+
+unscaledRadix2Loop
+        CMP        subFFTNum,#4
+         BEQ        generalLastTwoStagesUnscaledRadix2
+         BL        armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
+         B        unscaledRadix2Loop        
+
+generalLastTwoStagesUnscaledRadix2
+        BL      armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2                  
+        ;// Also the join point from scaledRadix2psStage above.
+        BL      armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe 
+        B        End
+
+
+FFTEnd                                              ;// Does only the scaling
+        
+        M_LDR   diff, diffOnStack  
+        CMP     diff,#0
+        BLE     End                                 ;// no residual scaling needed
+        
+        RSB     diff,diff,#0                        ;// to use VRSHL for right shift by a variable
+        VDUP    dShift,diff     
+        
+scaleFFTData                                        ;// N = subFFTSize  ; dataptr = pDst  ; scale = diff
+        ;// One iteration per complex sample: load a 32-bit (re,im) pair,
+        ;// rounding-right-shift both halves by 'diff' bits, store back.
+        VLD1    {dX0S32[0]},[pSrc]                        ;// pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#1
+        VRSHL   dX0,dShift                          ;// negative shift => rounding right shift
+        VST1    {dX0S32[0]},[pSrc]!
+                
+        BGT     scaleFFTData
+        
+       
+End                        
+        ;// Set return value
+        MOV     result, #OMX_Sts_NoErr       
+
+        ;// Write function tail
+        M_END
+        
+    ENDIF                                           ;//CortexA8    
+
+    
+    
+    
+    
+    END
\ No newline at end of file