Implementation of real value 16 bit FFT with 16 bit complex FFT routines, for ARM Neon platforms.
Verified with SNR testing code in Openmax folder.

R=aedla@chromium.org, rtoy@google.com

Review URL: https://webrtc-codereview.appspot.com/1323010

git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@4362 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/dl.gyp b/dl/dl.gyp
index 0573ce2..40cbdc4 100644
--- a/dl/dl.gyp
+++ b/dl/dl.gyp
@@ -54,6 +54,7 @@
         'sp/src/omxSP_FFTInit_R_S32.c',
         'sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
         # Complex 16-bit fixed-point FFT
+        'sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S',
         'sp/src/omxSP_FFTInit_C_SC16.c',
         'sp/src/omxSP_FFTGetBufSize_C_SC16.c',
         'sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
@@ -67,6 +68,10 @@
         'sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
         'sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
         # Real 16-bit fixed-point FFT
+        'sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S',
+        'sp/src/omxSP_FFTGetBufSize_R_S16.c',
+        'sp/src/omxSP_FFTInit_R_S16.c',
+        'sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S',
         'sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
         'sp/src/omxSP_FFTGetBufSize_R_S16S32.c',
         'sp/src/omxSP_FFTInit_R_S16S32.c',
diff --git a/dl/sp/api/armSP.h b/dl/sp/api/armSP.h
index f615a87..4972f09 100644
--- a/dl/sp/api/armSP.h
+++ b/dl/sp/api/armSP.h
@@ -64,6 +64,14 @@
     OMX_S32     *pBuf;
 }ARMsFFTSpec_R_SC32;
 
+typedef struct  ARMsFFTSpec_R_SC16_Tag 
+{
+    OMX_U32     N;
+    OMX_U16     *pBitRev;    
+    OMX_SC16    *pTwiddle;
+    OMX_S16     *pBuf;
+} ARMsFFTSpec_R_SC16;
+
 typedef struct ARMsFFTSpec_R_FC32_Tag
 {
     OMX_U32 N;
diff --git a/dl/sp/api/omxSP.h b/dl/sp/api/omxSP.h
index 695fa90..c910363 100644
--- a/dl/sp/api/omxSP.h
+++ b/dl/sp/api/omxSP.h
@@ -44,6 +44,7 @@
  typedef void OMXFFTSpec_C_SC16;
  typedef void OMXFFTSpec_C_SC32;
  typedef void OMXFFTSpec_R_S16S32;
+ typedef void OMXFFTSpec_R_S16;
  typedef void OMXFFTSpec_R_S32;
  typedef void OMXFFTSpec_R_F32;
  typedef void OMXFFTSpec_C_FC32;
@@ -1487,6 +1488,45 @@
 
 
 /**
+ * Function:  omxSP_FFTInit_R_S16
+ *
+ * Description:
+ * These functions initialize specification structures required for the real 
+ * FFT and IFFT functions. The function <FFTInit_R_S16> is used 
+ * to initialize the specification structures for functions
+ * <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>.
+ *
+ * Memory for *pFFTFwdSpec must be allocated before calling these functions
+ * and should be 8-byte aligned. 
+ *
+ * The number of bytes required for *pFFTFwdSpec can be 
+ * determined using <FFTGetBufSize_R_S16>. 
+ *
+ * Input Arguments:
+ *   
+ *   order - base-2 logarithm of the desired block length; valid in the range 
+ *            [1,12] 
+ *
+ * Output Arguments:
+ *   
+ *   pFFTFwdSpec - pointer to the initialized specification structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if one or more of the 
+ *              following is true: 
+ *    -   pFFTFwdSpec is either NULL or violates the 8-byte alignment 
+ *              restrictions 
+ *    -   order < 1 or order > 12 
+ *
+ */
+OMXResult omxSP_FFTInit_R_S16 (
+    OMXFFTSpec_R_S32*pFFTFwdSpec,
+    OMX_INT order
+);
+
+/**
  * Function:  omxSP_FFTInit_R_S32   (2.2.4.1.4)
  *
  * Description:
@@ -1699,6 +1739,38 @@
 );
 
 
+/**
+ * Function:  omxSP_FFTGetBufSize_R_S16
+ *
+ * Description:
+ * These functions compute the size of the specification structure 
+ * required for the length 2^order real FFT and IFFT functions.  The function 
+ * <FFTGetBufSize_R_S16> is used in conjunction with the 16-bit 
+ * functions <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>. 
+ *
+ * Input Arguments:
+ *   
+ *   order - base-2 logarithm of the length; valid in the range
+ *   [1,12]
+ *
+ * Output Arguments:
+ *   
+ *   pSize - pointer to the number of bytes required for the specification 
+ *            structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments The function returns 
+ *              OMX_Sts_BadArgErr if one or more of the following is true: 
+ *    pSize is NULL 
+ *    order < 1 or order > 12 
+ *
+ */
+OMXResult omxSP_FFTGetBufSize_R_S16 (
+    OMX_INT order,
+    OMX_INT *pSize
+);
 
 /**
  * Function:  omxSP_FFTGetBufSize_R_S32   (2.2.4.1.8)
@@ -2023,6 +2095,59 @@
 );
 
 
+/**
+ * Function:  omxSP_FFTFwd_RToCCS_S16_Sfs
+ *
+ * Description:
+ * These functions compute an FFT for a real-valued signal of length of 2^order,
+ * where 0 < order <= 12. Transform length is determined by the
+ * specification structure, which must be initialized prior to calling the FFT
+ * function using the appropriate helper, i.e., <FFTInit_R_S16>.
+ * The relationship between the input and output sequences can
+ * be expressed in terms of the DFT, i.e.:
+ *
+ *     x[n] = (2^(-scalefactor)/N)  . SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N)
+ *     n=0,1,2,...N-1
+ *     N=2^order.
+ *
+ * The conjugate-symmetric output sequence is represented using a CCS vector,
+ * which is of length N+2, and is organized as follows:
+ *
+ *   Index:      0  1  2  3  4  5   . . .   N-2       N-1       N       N+1
+ *   Component:  R0 0  R1 I1 R2 I2  . . .   R[N/2-1]  I[N/2-1]  R[N/2]  0
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary components
+ * for FFT bin 'n'. Bins  are numbered from 0 to N/2, where N is the FFT length.
+ * Bin index 0 corresponds to the DC component, and bin index N/2 corresponds to
+ * the foldover frequency.
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the real-valued input sequence, of length 2^order;
+ *          must be aligned on a 32-byte boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized specification
+ *            structure
+ *   scaleFactor - output scale factor; valid range is [0, 16]
+ *
+ * Output Arguments:
+ *   pDst - pointer to output sequence, represented using CCS format, of
+ *            length (2^order)+2; must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments, if one or more of followings is true:
+ *    -    one of the pointers pSrc, pDst, or pFFTSpec is NULL
+ *    -    pSrc or pDst is not aligned on a 32-byte boundary
+ *    -    scaleFactor<0 or scaleFactor >16
+ *
+ */
+OMXResult omxSP_FFTFwd_RToCCS_S16_Sfs (
+    const OMX_S16* pSrc,
+    OMX_S16* pDst,
+    const OMXFFTSpec_R_S16* pFFTSpec,
+    OMX_INT scaleFactor
+);
+
 
 /**
  * Function:  omxSP_FFTFwd_RToCCS_S32_Sfs   (2.2.4.4.2)
@@ -2178,6 +2303,53 @@
 );
 
 
+/**
+ * Function:  omxSP_FFTInv_CCSToR_S16_Sfs
+ *
+ * Description:
+ * These functions compute the inverse FFT for a conjugate-symmetric input
+ * sequence.  Transform length is determined by the specification structure,
+ * which must be initialized prior to calling the FFT function using
+ * <FFTInit_R_S16>. For a transform of length M, the input
+ * sequence is represented using a packed CCS vector of length
+ * M+2, and is organized as follows:
+ *
+ *   Index:     0    1  2    3    4    5    . . .  M-2       M-1      M      M+1
+ *   Component  R[0] 0  R[1] I[1] R[2] I[2] . . .  R[M/2-1]  I[M/2-1] R[M/2] 0
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary components
+ * for FFT bin n.
+ * Bins are numbered from 0 to M/2, where M is the FFT length.  Bin index 0
+ * corresponds to the DC component, and bin index M/2 corresponds to the
+ * foldover frequency.
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input sequence represented using
+ *            CCS format, of length (2^order) + 2; must be aligned on a 32-byte
+ *            boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized specification
+ *            structure
+ *   scaleFactor - output scalefactor; range is [0,16]
+ *
+ * Output Arguments:
+ *   pDst - pointer to the real-valued output sequence, of length 2^order ; must
+ *            be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - bad arguments if one or more of the following is true:
+ *    -    pSrc, pDst, or pFFTSpec is NULL
+ *    -    pSrc or pDst is not aligned on a 32-byte boundary
+ *    -    scaleFactor<0 or scaleFactor >16
+ *
+ */
+OMXResult omxSP_FFTInv_CCSToR_S16_Sfs (
+    const OMX_S16* pSrc,
+    OMX_S16* pDst,
+    const OMXFFTSpec_R_S16* pFFTSpec,
+    OMX_INT scaleFactor
+);
 
 /**
  * Function:  omxSP_FFTInv_CCSToR_S32_Sfs   (2.2.4.4.4)
diff --git a/dl/sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S b/dl/sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S
new file mode 100644
index 0000000..31638dd
--- /dev/null
+++ b/dl/sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S
@@ -0,0 +1,409 @@
+@
+@  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@
+@  Use of this source code is governed by a BSD-style license
+@  that can be found in the LICENSE file in the root of the source
+@  tree. An additional intellectual property rights grant can be found
+@  in the file PATENTS.  All contributing project authors may
+@  be found in the AUTHORS file in the root of the source tree.
+@
+@ Some code in this file was originally from file
+@ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
+@ follows. It has been relicensed with permission from the copyright holders.
+@
+
+@
+@ OpenMAX DL: v1.0.2
+@ Last Modified Revision:   7485
+@ Last Modified Date:       Fri, 21 Sep 2007
+@ 
+@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@
+
+@
+@ Description:
+@ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
+@ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation.
+@ It implements both "scaled"(by 1/2) and "unscaled" versions of the above
+@ formula.
+@ 
+        
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+@//Input Registers
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+@ Output registers
+#define result          r0
+
+@//Local Scratch Registers
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7     
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@ Total num of radix stages to comple the FFT.
+#define count           r8
+#define x0r             r4    
+#define x0i             r5
+#define diffMinusOne    r2
+#define round           r3
+#define pOut1           r2
+#define size            r7
+#define step            r8            
+#define step1           r9
+#define step2           r10
+#define twStep          r10
+#define pTwiddleTmp     r11
+#define argTwiddle1     r12
+#define zero            r14
+
+@ Neon registers
+#define dX0             D0.S16
+#define dX0S32          D0.S32
+#define dShift          D1.S16
+#define dX1             D1.S16
+#define dX1S32          D1.S32
+#define dY0             D2.S16
+#define dY1             D3.S16
+#define dX0r            D0.S16            
+#define dX0rS32         D0.S32
+#define dX0i            D1.S16
+#define dX1r            D2.S16
+#define dX1i            D3.S16
+#define qX1             Q1.S16
+#define dW0r            D4.S16
+#define dW0i            D5.S16
+#define dW1r            D6.S16
+#define dW1i            D7.S16
+#define dW0rS32         D4.S32
+#define dW0iS32         D5.S32
+#define dW1rS32         D6.S32
+#define dW1iS32         D7.S32
+#define dT0             D8.S16
+#define dT1             D9.S16
+#define dT2             D10.S16
+#define dT3             D11.S16
+#define qT0             Q6.S32
+#define qT1             Q7.S32
+#define qT2             Q8.S32
+#define qT3             Q9.S32
+#define dY0r            D4.S16
+#define dY0i            D5.S16
+#define dY1r            D6.S16
+#define dY1i            D7.S16
+#define qY1             Q3.S16
+#define dY2             D4.S16
+#define dY3             D5.S16
+#define dW0             D6.S16
+#define dW1             D7.S16
+#define dW0Tmp          D10.S16
+#define dW1Neg          D11.S16
+
+        @ Structure offsets for the FFTSpec             
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        
+        @ Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        @ Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+        
+        MOV     size,N,ASR #1        @ preserve the contents of N
+        MOV     step,N,LSL #1        @ step = N/2 * 4 bytes
+        
+        @ Process different FFT sizes with different loops.
+        CMP    size,#4
+        BLE    smallFFTSize\name
+        
+        @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        @ Note: W^(k) is stored as negated value and also need to
+        @ conjugate the values from the table.
+        
+        @ Z(0) : no need of twiddle multiply
+        @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+        
+        VLD1    dX0S32[0],[pSrc],step
+        ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes 
+                
+        VLD1    dX1S32[0],[pSrc]!
+        SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
+        
+        MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
+        SUB     step1,step1,#4       @ (N/4-1)*4 bytes
+        
+        VHADD    dY0,dX0,dX1         @ [b+d | a+c]
+        VHSUB    dY1,dX0,dX1         @ [b-d | a-c] 
+        VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d] 
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHSUB   dX0,dY0,dY1
+            SUBS    size,size,#2
+            VHADD   dX1,dY0,dY1
+        .else
+            VSUB   dX0,dY0,dY1
+            SUBS    size,size,#2
+            VADD   dX1,dY0,dY1
+        .endif
+                    
+        SUB     pSrc,pSrc,step
+        VST1    dX0[0],[pOut1]!
+        ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
+        VST1    dX1[1],[pOut1]!
+        ADD     argTwiddle1,pTwiddle,twStep            @ W^1 
+        
+        BLT     decrementScale\name
+        BEQ     lastElement\name
+                        
+        SUB     step,step,#20
+        SUB     step1,step1,#4                         @ (N/4-1)*8 bytes
+        SUB     step2, step1, #4
+                        
+        @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
+        @ Note: W^k is stored as negative values in the table and also need to
+        @ conjugate the values from the table.
+        @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+        @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
+
+evenOddButterflyLoop\name:     
+        VLD2    {dX0r,dX0i},[pSrc],step
+        VLD2    {dX1r,dX1i},[pSrc]!
+        SUB     pSrc, pSrc, step
+
+        VLD1    dW0r,[argTwiddle1],step1
+        VREV64  qX1,qX1
+        VLD1    dW1r,[argTwiddle1]!
+        VHSUB   dT2,dX0r,dX1r                          @ a-c
+        SUB     argTwiddle1, argTwiddle1, step1
+        SUB     step1,step1,#16
+
+        VLD1    dW0i,[pTwiddleTmp],step2
+        VHADD   dT3,dX0i,dX1i                          @ b+d
+        VLD1    dW1i,[pTwiddleTmp]!
+        VHADD   dT0,dX0r,dX1r                          @ a+c
+        VHSUB   dT1,dX0i,dX1i                          @ b-d
+        SUB     pTwiddleTmp, pTwiddleTmp, step2
+        SUB     step2,step2,#16
+
+        SUBS    size,size,#8
+        
+        VZIP    dW1r,dW1i
+        VTRN    dW0r,dW0i
+        VZIP    dW1iS32, dW1rS32
+                                
+        VMULL   qT0,dW1i,dT2
+        VMLSL   qT0,dW1r,dT3
+        VMULL   qT1,dW1i,dT3
+        VMLAL   qT1,dW1r,dT2
+        VMULL   qT2,dW0r,dT2
+        VMLAL   qT2,dW0i,dT3
+        VMULL   qT3,dW0r,dT3
+        VMLSL   qT3,dW0i,dT2
+        
+        VRSHRN  dX1r,qT0,#15
+        VRSHRN  dX1i,qT1,#15
+        VRSHRN  dX0r,qT2,#15
+        VRSHRN  dX0i,qT3,#15
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
+            VHSUB    dY1i,dX1r,dT1
+        .else
+            VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
+            VSUB    dY1i,dX1r,dT1
+        .endif
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    dY0r,dT0,dX0i                     @ F(1)
+            VHSUB    dY0i,dT1,dX0r
+        .else
+            VADD    dY0r,dT0,dX0i                      @ F(1)
+            VSUB    dY0i,dT1,dX0r
+        .endif
+        
+        VREV64  qY1,qY1
+
+        VST2    {dY0r,dY0i},[pOut1],step
+        VST2    {dY1r,dY1i},[pOut1]
+        ADD     pOut1,pOut1,#16
+        SUB     pOut1, pOut1, step
+        SUB     step,step,#32
+       
+        BGT     evenOddButterflyLoop\name
+
+        SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
+        SUB     pOut1,pOut1,#4
+        B       lastElement\name
+        
+smallFFTSize\name:
+        @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        @ Note: W^(k) is stored as negated value and also need to
+        @ conjugate the values from the table.
+        
+        @ Z(0) : no need of twiddle multiply
+        @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+        
+        VLD1    dX0S32[0],[pSrc],step
+        ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes 
+                
+        VLD1    dX1S32[0],[pSrc]!
+        SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
+        
+        MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
+        SUB     step1,step1,#4       @ (N/4-1)*4 bytes
+        
+        VHADD    dY0,dX0,dX1         @ [b+d | a+c]
+        VHSUB    dY1,dX0,dX1         @ [b-d | a-c] 
+        VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d] 
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHSUB   dX0,dY0,dY1
+            SUBS    size,size,#2
+            VHADD   dX1,dY0,dY1
+        .else
+            VSUB   dX0,dY0,dY1
+            SUBS    size,size,#2
+            VADD   dX1,dY0,dY1
+        .endif
+                    
+        SUB     pSrc,pSrc,step
+        VST1    dX0[0],[pOut1]!
+        ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
+        VST1    dX1[1],[pOut1]!
+        ADD     argTwiddle1,pTwiddle,twStep            @ W^1 
+        
+        BLT     decrementScale\name
+        BEQ     lastElement\name
+                        
+        @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
+        @ Note: W^k is stored as negative values in the table and also need to
+        @ conjugate the values from the table.
+        @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+        @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
+
+        SUB     step,step,#12
+
+evenOddButterflyLoopSize4\name:     
+        VLD1    dW0rS32[0],[argTwiddle1],step1
+        VLD1    dW1rS32[0],[argTwiddle1]!
+        
+        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
+        VLD2    {dX0r[1],dX0i[1]},[pSrc],step
+        SUB     pSrc,pSrc,#4
+        SUB     argTwiddle1,argTwiddle1,step1
+        VLD2    {dX1r[0],dX1i[0]},[pSrc]!
+        VLD2    {dX1r[1],dX1i[1]},[pSrc]!
+        
+        SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
+        VLD1    dW0iS32[0],[pTwiddleTmp],step1
+        VLD1    dW1iS32[0],[pTwiddleTmp]!
+        SUB     pSrc,pSrc,step
+        
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        VREV32  dX1r,dX1r
+        VREV32  dX1i,dX1i
+        SUBS    size,size,#4
+                        
+        VHSUB   dT2,dX0r,dX1r                          @ a-c
+        VHADD   dT3,dX0i,dX1i                          @ b+d
+        SUB     step1,step1,#4
+        VHADD   dT0,dX0r,dX1r                          @ a+c
+        VHSUB   dT1,dX0i,dX1i                          @ b-d
+        
+        VTRN    dW1r,dW1i
+        VTRN    dW0r,dW0i
+                                
+        VMULL   qT0,dW1r,dT2
+        VMLSL   qT0,dW1i,dT3
+        VMULL   qT1,dW1r,dT3
+        VMLAL   qT1,dW1i,dT2
+        VMULL   qT2,dW0r,dT2
+        VMLAL   qT2,dW0i,dT3
+        VMULL   qT3,dW0r,dT3
+        VMLSL   qT3,dW0i,dT2
+        
+        VRSHRN  dX1r,qT0,#15
+        VRSHRN  dX1i,qT1,#15
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
+            VHSUB    dY1i,dX1r,dT1
+        .else
+            VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
+            VSUB    dY1i,dX1r,dT1
+        .endif
+        
+        VREV32  dY1r,dY1r
+        VREV32  dY1i,dY1i
+                            
+        VRSHRN  dX0r,qT2,#15
+        VRSHRN  dX0i,qT3,#15
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    dY0r,dT0,dX0i                     @ F(1)
+            VHSUB    dY0i,dT1,dX0r
+        .else
+            VADD    dY0r,dT0,dX0i                      @ F(1)
+            VSUB    dY0i,dT1,dX0r
+        .endif
+        
+        VST2    {dY0r[0],dY0i[0]},[pOut1]!
+        VST2    {dY0r[1],dY0i[1]},[pOut1],step
+        SUB     pOut1, #4
+        VST2    {dY1r[0],dY1i[0]},[pOut1]!
+        VST2    {dY1r[1],dY1i[1]},[pOut1]!
+        SUB     pOut1,pOut1,step
+        SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
+        SUB     pOut1,pOut1,#4
+        
+        @ Last element can be expanded as follows
+        @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
+        @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+        @ 1/2[2a+j0] - j (c-jd) [0+j2b]
+        @ (a+bc, -bd)
+        @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
+        
+lastElement\name:      
+        VLD1    dX0rS32[0],[pSrc]
+        
+        .ifeqs  "\scaled", "TRUE"
+            VSHR    dX0r,dX0r,#1
+        .endif
+        
+        VST1    dX0r[0],[pOut1]!
+        VNEG    dX0r,dX0r
+        VST1    dX0r[1],[pOut1]
+
+decrementScale\name:          
+        .ifeqs  "\scaled", "TRUE"
+            SUB scale,scale,#1
+        .endif
+        
+        .endm
+        
+        M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",Inv
+        M_END
+        
+        M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",InvSfs
+        M_END
+
+        
+        .end
diff --git a/dl/sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S b/dl/sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S
new file mode 100644
index 0000000..f97fe5a
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S
@@ -0,0 +1,639 @@
+@
+@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Some code in this file was originally from file
+@ omxSP_FFTFwd_RToCCS_S32_Sfs_s.S which was licensed as follows.
+@ It has been relicensed with permission from the copyright holders.
+@
+
+@
+@ OpenMAX DL: v1.0.2
+@ Last Modified Revision:   7810
+@ Last Modified Date:       Thu, 04 Oct 2007
+@
+@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@
+
+@
+@ Description:
+@ Compute a forward FFT for a real signal, using 16 bit complex FFT routines.
+@
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@Input Registers
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+@ Output registers
+#define result          r0
+
+@Local Scratch Registers
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define pTwiddle        r4
+#define tmpOrder        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@ Total num of radix stages to comple the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne    r2
+#define round           r3
+#define subFFTSizeTmp   r6
+#define step            r3
+#define stepr           r11
+#define step1           r10
+#define step1r          r6
+#define step2           r8
+#define step2r          r9
+#define twStep          r8
+#define zero            r9
+#define pTwiddleTmp     r5
+#define t0              r10
+
+@ Neon registers
+#define dX0             d0.s16
+#define dX0S32          d0.s32
+#define dzero           d1.s16
+#define dZero           d2.s16
+#define dShift          d3.s16
+#define qShift          q1.s16
+#define dX0r            d2.s16
+#define dX0i            d3.s16
+#define dX1r            d4.s16
+#define dX1i            d5.s16
+#define qX1             q2.s16
+#define dX0rS32         d2.s32
+#define dX0iS32         d3.s32
+#define dX1rS32         d4.s32
+#define dX1iS32         d5.s32
+#define dT0             d6.s16
+#define dT1             d7.s16
+#define dT2             d8.s16
+#define dT3             d9.s16
+#define qT0             q5.s32
+#define qT1             q6.s32
+#define qT0s            q5.s16
+#define qT1s            q6.s16
+#define dW0r            d14.s16
+#define dW0i            d15.s16
+#define dW1r            d16.s16
+#define dW1i            d17.s16
+#define dW0rS32         d14.s32
+#define dW0iS32         d15.s32
+#define dW1rS32         d16.s32
+#define dW1iS32         d17.s32
+#define dY0r            d14.s16
+#define dY0i            d15.s16
+#define dY0rS32         d14.s32
+#define dY0iS32         d15.s32
+#define dY1r            d16.s16
+#define dY1i            d17.s16
+#define qY1             q8.s16
+#define dY1rS32         d16.s32
+#define dY1iS32         d17.s32
+#define dY0rS64         d14.s32
+#define dY0iS64         d15.s32
+#define qT2             q9.s32
+#define qT3             q10.s32
+#define d18s16          d18.s16
+#define d19s16          d19.s16
+#define d20s16          d20.s16
+#define d21s16          d21.s16
+@ lastThreeelements
+#define dX1             d3.s16
+#define dW0             d4.s16
+#define dW1             d5.s16
+#define dY0             d10.s16
+#define dY1             d11.s16
+#define dY2             d12.s16
+#define dY3             d13.s16
+
+        @ Allocate stack memory required by the function
+        M_ALLOC4        diffOnStack, 4
+
+        @ Write function header
+        M_START     omxSP_FFTFwd_RToCCS_S16_Sfs,r11,d15
+
+        @ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @ Define stack arguments
+
+        @ Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @ Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        @ Do a N/2 point complex FFT including the scaling
+
+        MOV     N,N,ASR #1                    @ N/2 point complex FFT
+
+        CLZ     order,N                       @ N = 2^order
+        RSB     order,order,#31
+        MOV     subFFTSize,#1
+
+        CMP     order,#3
+        BGT     orderGreaterthan3             @ order > 3
+
+        CMP     order,#1
+        BGE     orderGreaterthan0             @ order > 0
+        M_STR   scale, diffOnStack,LT         @ order = 0
+        LDR     x0r,[pSrc]
+        STR     x0r,[pOut]
+        MOV     pSrc,pOut
+        MOV     argDst,pDst
+        B       FFTEnd
+
+orderGreaterthan0:
+        @ set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVEQ   argDst,pDst
+        MOVNE   argDst,pOut
+        MOVNE   pOut,pDst                  @ Pass 1st stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+
+        SUBS    diff,scale,order
+        M_STR   diff,diffOnStack
+        MOVGT   scale,order
+        @ Now scale <= order
+
+        CMP     order,#1
+        BGT     orderGreaterthan1
+        @ order = 1:
+        SUBS    scale,scale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan1:
+        CMP     order,#2
+        MOV     argScale,scale
+        BGT     orderGreaterthan2
+        @ order = 2:
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan2:   @ order = 3
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+
+orderGreaterthan3:
+        @ check scale = 0 or scale = order
+        SUBS    diff, scale, order   @ scale > order
+        MOVGT   scale,order
+        BGE     specialScaleCase     @ scale = 0 or scale = order
+        CMP     scale,#0
+        BEQ     specialScaleCase
+        B       generalScaleCase
+
+specialScaleCase:   @ scale = 0, or, scale = order && order > 3
+        TST     order, #2            @ Set input args to fft stages
+        MOVEQ   argDst,pDst
+        MOVNE   argDst,pOut
+        MOVNE   pOut,pDst            @ Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+
+        CMP     diff,#0
+        M_STR   diff, diffOnStack
+        BGE     scaleEqualsOrder
+
+        @ check for even or odd order.
+        @ NOTE: The following combination of BL's would work fine even though
+        @ the first BL would corrupt the flags. This is because the end of the
+        @ "grpZeroSetLoop" loop inside
+        @ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
+        CMP     subFFTNum,#4
+        BLT     FFTEnd
+
+unscaledRadix4Loop:
+        BEQ     lastStageUnscaledRadix4
+        BL      armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+        CMP     subFFTNum,#4
+        B       unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+scaleEqualsOrder:
+        @ check for even or odd order
+        @ NOTE: The following combination of BL's would work fine even though
+        @ the first BL would corrupt the flags. This is because the end of the
+        @ "grpZeroSetLoop" loop inside
+        @ armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
+        CMP     subFFTNum,#4
+        BLT     FFTEnd
+
+scaledRadix4Loop:
+        BEQ     lastStageScaledRadix4
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+        CMP     subFFTNum,#4
+        B       scaledRadix4Loop
+
+lastStageScaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+generalScaleCase:                        @ 0 < scale < order and order > 3
+        @ Determine the correct destination buffer
+        SUB     diff,order,scale
+        TST     diff,#0x01
+        ADDEQ   count,scale,diff,LSR #1  @ count = scale + (order - scale)/2
+        MOVNE   count,order
+        TST     count,#0x01              @ Is count even or odd ?
+
+        MOVEQ   argDst,pDst              @ Set input args to fft stages
+        MOVNE   argDst,pOut
+        MOVNE   pOut,pDst                @ Pass 1st stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+
+        CMP     diff,#1
+        M_STR   diff, diffOnStack
+        BEQ     scaleps                  @ scaling including a radix2_ps stage
+
+        MOV     argScale,scale           @ Put scale in RN4 to save and restore
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+
+scaledRadix2Loop:
+        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1     @ save, restore scale in scaled stages
+        BGT     scaledRadix2Loop
+        B       outScale
+
+scaleps:
+        SUB     argScale,scale,#1        @ order>3 and diff=1 => scale >= 3
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+
+scaledRadix2psLoop:
+        BEQ     scaledRadix2psStage
+        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1     @ save, restore scale in scaled stages
+        BGE     scaledRadix2psLoop
+
+scaledRadix2psStage:
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+        B       generalLastStageUnscaledRadix2
+
+outScale:
+        M_LDR   diff, diffOnStack
+        @check for even or odd order
+        TST     diff,#0x00000001
+        BEQ     generalUnscaledRadix4Loop
+        B       unscaledRadix2Loop
+
+generalUnscaledRadix4Loop:
+        CMP     subFFTNum,#4
+        BEQ     generalLastStageUnscaledRadix4
+        BL      armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+        B       generalUnscaledRadix4Loop
+
+generalLastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B       End
+
+unscaledRadix2Loop:
+        CMP     subFFTNum,#4
+        BEQ     generalLastTwoStagesUnscaledRadix2
+        BL      armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+        B       unscaledRadix2Loop
+
+generalLastTwoStagesUnscaledRadix2:
+        BL      armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2:
+        BL      armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+        B       End
+
+FFTEnd:     @ Does only the scaling
+        M_LDR   diff, diffOnStack
+        CMP     diff,#0
+        BLE     finalComplexToRealFixup
+
+        RSB     diff,diff,#0               @ for right shift by a variable
+        VDUP    qShift,diff
+
+        @ save subFFTSize and use subFFTSizeTmp in the following loop
+        MOV     subFFTSizeTmp,subFFTSize   @ subFFTSizeTmp same reg as subFFTNum
+
+        @ Use parallel loads for bigger FFT size.
+        CMP     subFFTSizeTmp, #8
+        BLT     scaleLessFFTData
+
+scaleFFTData:
+        VLD1    {qT0s, qT1s},[pSrc:256]    @ pSrc contains pDst pointer
+        SUBS    subFFTSizeTmp,subFFTSizeTmp,#8
+        VSHL    qT0s,qShift
+        VSHL    qT1s,qShift
+        VST1    {qT0s, qT1s},[pSrc:256]!
+        BGT     scaleFFTData
+        B       afterScaling
+
+scaleLessFFTData:
+        VLD1    {dX0S32[0]},[pSrc]         @ pSrc contains pDst pointer
+        SUBS    subFFTSizeTmp,subFFTSizeTmp,#1
+        VSHL    dX0,dShift
+        VST1    {dX0S32[0]},[pSrc]!
+        BGT     scaleLessFFTData
+
+afterScaling:
+        SUB     pSrc,pSrc,subFFTSize,LSL #2 @ reset pSrc for final fixup
+
+        @  change the logic so that output after scaling is in pOut and not in pDst
+        @  finally store from pOut to pDst
+        @  change branch "End" to branch "finalComplexToRealFixup" in the above
+        @  chk the code below for multiplication by j factor
+
+finalComplexToRealFixup:
+        @ F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+        @ 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
+        @ 1/2[2a+j0] - j [0+j2b]
+        @ (a+b, 0)
+
+        @ F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+        @ 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
+        @ 1/2[2a+j0] + j [0+j2b]
+        @ (a-b, 0)
+
+        CMP    subFFTSize,#4
+        BLE    smallFFTSize
+
+@ SubSize > 3:
+        @ F(0) and F(N/2)
+        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
+        MOV     zero,#0
+        VMOV    dX0r[1],zero
+        MOV     step,subFFTSize,LSL #2        @ step = N/2 * 4 bytes
+        VMOV    dX0i[1],zero
+        SUB     twStep,step,subFFTSize        @ twStep = 3N/8 * 8 bytes
+
+        VADD    dY0r,dX0r,dX0i                @ F(0) = ((Z0.r+Z0.i) , 0)
+        MOV     step1,subFFTSize,LSL #1       @ step1 = N/2 * 2 bytes
+        VSUB    dY0i,dX0r,dX0i                @ F(N/2) = ((Z0.r-Z0.i) , 0)
+        SUBS    subFFTSize,subFFTSize,#2
+
+        VST1    dY0rS32[0],[argDst], step
+        ADD     pTwiddleTmp,argTwiddle,#4     @ W^2
+        VST1    dY0iS32[0],[argDst]!
+        ADD     argTwiddle,argTwiddle,twStep  @ W^1
+
+        VDUP    dzero,zero
+        SUB     argDst,argDst,step
+        SUB     step,step,#20
+        RSB     stepr, step, #16
+        SUB     step1,step1,#8                @ (N/4-1)*8 bytes
+        RSB     step1r,step1,#8
+
+        SUB     step2, step1, #4
+        RSB     step2r, step2, #8
+
+        @ F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
+        @ Note: W^k is stored as negative values in the table.
+        @ Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
+        @ since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1).
+
+evenOddButterflyLoop:
+        VLD2    {dX0r,dX0i},[pSrc],step
+        VLD2    {dX1r,dX1i},[pSrc],stepr
+
+        VLD1    dW0r,[argTwiddle],step1
+        SUB     step1, step1, #16
+        VREV64  qX1,qX1
+
+        VLD1    dW1r,[argTwiddle],step1r
+        ADD     step1r, step1r, #16
+        VSUB    dT2,dX0r,dX1r                 @ a-c
+
+        VLD1    dW0i,[pTwiddleTmp],step2
+        SUB     step2, step2, #16
+        VADD    dT3,dX0i,dX1i                 @ b+d
+
+        VLD1    dW1i,[pTwiddleTmp],step2r
+        ADD     step2r, step2r, #16
+
+        VTRN    dW0r,dW0i
+        VZIP    dW1r, dW1i
+
+        SUBS    subFFTSize,subFFTSize,#8
+
+        VHADD   dT0,dX0r,dX1r                 @ (a+c)/2
+        VZIP    dW1iS32, dW1rS32
+        VHSUB   dT1,dX0i,dX1i                 @ (b-d)/2
+
+        VQDMULH dY0,dW1i,dT2
+        VQDMULH dY1,dW1r,dT3
+        VQDMULH dY2,dW1i,dT3
+        VQDMULH dY3,dW1r,dT2
+
+        VQDMULH d18s16,dW0r,dT2
+        VQDMULH d19s16,dW0i,dT3
+        VQDMULH d20s16,dW0r,dT3
+        VQDMULH d21s16,dW0i,dT2
+
+        VRHADD  dX1r, dY0, dY1
+        VHSUB   dX1i, dY2, dY3
+        VHSUB   dX0r, d18s16, d19s16
+        VADD    dY1i,dT1,dX1r
+        VRHADD  dX0i, d20s16, d21s16
+        VSUB    dY1r,dT0,dX1i                 @ F(N/2 -1)
+        VSUB    dY0r,dT0,dX0i                 @ F(1)
+        VADD    dY0i,dT1,dX0r
+
+        VNEG    dY1i,dY1i
+        VREV64  qY1, qY1
+
+        VST2    {dY0r,dY0i},[argDst],step
+        SUB     step,step,#32                 @ (N/2-4)*4 bytes
+        VST2    {dY1r,dY1i},[argDst],stepr
+        ADD     stepr,stepr,#32
+
+        BGT     evenOddButterflyLoop
+
+        SUB     pSrc,pSrc,#4                  @ points to the last element.
+        SUB     argDst,argDst,#4              @ points to the last element.
+
+        b lastElement
+
+smallFFTSize:
+
+        @ F(0) and F(N/2)
+        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
+        MOV     zero,#0
+        VMOV    dX0r[1],zero
+        MOV     step,subFFTSize,LSL #2        @ step = N/2 * 4 bytes
+        VMOV    dX0i[1],zero
+        SUB     twStep,step,subFFTSize        @ twStep = 3N/8 * 8 bytes
+
+        VADD    dY0r,dX0r,dX0i                @ F(0) = ((Z0.r+Z0.i) , 0)
+        MOV     step1,subFFTSize,LSL #1       @ step1 = N/2 * 2 bytes
+        VSUB    dY0i,dX0r,dX0i                @ F(N/2) = ((Z0.r-Z0.i) , 0)
+        SUBS    subFFTSize,subFFTSize,#2
+
+
+        VST1    dY0rS32[0],[argDst], step
+        ADD     pTwiddleTmp,argTwiddle,#4     @ W^2
+        VST1    dY0iS32[0],[argDst]!
+        ADD     argTwiddle,argTwiddle,twStep  @ W^1
+
+        VDUP    dzero,zero
+        SUB     argDst,argDst,step
+
+        BLT     End
+        BEQ     lastElement
+
+        SUB     step,step,#12
+        SUB     step1,step1,#4                @ (N/4-1)*8 bytes
+
+        @ F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
+
+butterflyLoopSubFFTSize4:
+        VLD1    dW0rS32[0], [argTwiddle],step1
+        VLD1    dW1rS32[0],[argTwiddle]!
+
+        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
+        VLD2    {dX0r[1],dX0i[1]},[pSrc],step
+        SUB     pSrc,pSrc,#4
+        SUB     argTwiddle,argTwiddle,step1
+        VLD2    {dX1r[0],dX1i[0]},[pSrc]!
+        VLD2    {dX1r[1],dX1i[1]},[pSrc]!
+
+        SUB     step1,step1,#4                @ (N/4-2)*4 bytes
+        VLD1    dW0iS32[0],[pTwiddleTmp],step1
+        VLD1    dW1iS32[0],[pTwiddleTmp]!
+        SUB     pSrc,pSrc,step
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        VREV32  dX1r,dX1r
+        VREV32  dX1i,dX1i
+        SUBS    subFFTSize,subFFTSize,#4
+
+        VSUB    dT2,dX0r,dX1r                 @ a-c
+        SUB     step1,step1,#4
+        VADD    dT3,dX0i,dX1i                 @ b+d
+        VADD    dT0,dX0r,dX1r                 @ a+c
+        VSUB    dT1,dX0i,dX1i                 @ b-d
+        VHADD   dT0,dT0,dzero
+        VHADD   dT1,dT1,dzero
+
+        VTRN    dW1r,dW1i
+        VTRN    dW0r,dW0i
+
+        VMULL   qT0,dW1r,dT2
+        VMLAL   qT0,dW1i,dT3
+        VMULL   qT1,dW1r,dT3
+        VMLSL   qT1,dW1i,dT2
+
+        VMULL   qT2,dW0r,dT2
+        VMLSL   qT2,dW0i,dT3
+        VMULL   qT3,dW0r,dT3
+        VMLAL   qT3,dW0i,dT2
+
+        VRSHRN  dX1r,qT0,#16
+        VRSHRN  dX1i,qT1,#16
+
+        VSUB    dY1r,dT0,dX1i                 @ F(N/2 -1)
+        VADD    dY1i,dT1,dX1r
+        VNEG    dY1i,dY1i
+
+        VREV32  dY1r,dY1r
+        VREV32  dY1i,dY1i
+
+        VRSHRN  dX0r,qT2,#16
+        VRSHRN  dX0i,qT3,#16
+
+        VSUB    dY0r,dT0,dX0i                 @ F(1)
+        VADD    dY0i,dT1,dX0r
+
+        VST2    {dY0r[0],dY0i[0]},[argDst]!
+        VST2    {dY0r[1],dY0i[1]},[argDst],step
+        SUB     argDst, #4
+        VST2    {dY1r[0],dY1i[0]},[argDst]!
+        VST2    {dY1r[1],dY1i[1]},[argDst]!
+        SUB     argDst,argDst,step
+        SUB     pSrc,pSrc,#4                  @ points to the last element.
+        SUB     argDst,argDst,#4              @ points to the last element.
+
+lastElement:
+        @ Last element can be expanded as follows
+        @ 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+        @ 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+        @ 1/2[2a+j0] + j (c+jd) [0+j2b]
+        @ (a-bc, -bd)
+        @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+        VLD1    dX0rS32[0],[pSrc]
+        VST1    dX0r[0],[argDst]!
+        VNEG    dX0r,dX0r
+        VST1    dX0r[1],[argDst]!
+
+End:
+        @ Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @ Write function tail
+        M_END
+
+    .END
diff --git a/dl/sp/src/omxSP_FFTGetBufSize_R_S16.c b/dl/sp/src/omxSP_FFTGetBufSize_R_S16.c
new file mode 100644
index 0000000..cc318a3
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTGetBufSize_R_S16.c
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ * Some code in this file was originally from file omxSP_FFTGetBufSize_R_S32.c
+ * which was licensed as follows.
+ * It has been relicensed with permission from the copyright holders.
+ */
+
+/*
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:
+ * Last Modified Date:
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTGetBufSize_R_S16
+ *
+ * Description:
+ * Computes the size of the specification structure required for the length
+ * 2^order real FFT and IFFT functions.
+ *
+ * Remarks:
+ * This function is used in conjunction with the 16-bit functions
+ * <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>.
+ *
+ * Parameters:
+ * [in]  order       base-2 logarithm of the length; valid in the range
+ *			   [1,12].
+ * [out] pSize	   pointer to the number of bytes required for the
+ *			   specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTGetBufSize_R_S16(OMX_INT order, OMX_INT *pSize) {
+  OMX_INT     NBy2,N,twiddleSize;
+
+  /* Order zero not allowed */
+  if (order == 0) {
+    return OMX_Sts_BadArgErr;
+  }
+
+  NBy2 = 1 << (order - 1);
+  N = NBy2 << 1;
+  twiddleSize = 5 * N / 8;  /* 3 / 4 (N / 2) + N / 4 */
+
+  /* 2 pointers to store bitreversed array and twiddle factor array */
+  *pSize = sizeof(ARMsFFTSpec_R_SC16)
+           /* Twiddle factors  */
+           + sizeof(OMX_SC16) * twiddleSize
+           /* Ping Pong buffer for doing the N/2 point complex FFT; */
+           /* extra size 'N' as a temporary buf for FFTInv_CCSToR_S16_Sfs */
+           + sizeof(OMX_S16) * (N << 1)
+           /* Extra bytes to get 32 byte alignment of ptwiddle and pBuf */
+           + 62 ;
+
+
+  return OMX_Sts_NoErr;
+}
+
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
diff --git a/dl/sp/src/omxSP_FFTInit_R_S16.c b/dl/sp/src/omxSP_FFTInit_R_S16.c
new file mode 100644
index 0000000..f44dff3
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTInit_R_S16.c
@@ -0,0 +1,232 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ * Some code in this file was originally from file omxSP_FFTInit_R_S16S32.c
+ * which was licensed as follows.
+ * It has been relicensed with permission from the copyright holders.
+ */
+
+/*
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:
+ * Last Modified Date:
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTInit_R_S16
+ *
+ * Description:
+ * Initialize the real forward-FFT specification information struct.
+ *
+ * Remarks:
+ * This function is used to initialize the specification structures
+ * for functions <ippsFFTFwd_RToCCS_S16_Sfs> and
+ * <ippsFFTInv_CCSToR_S16_Sfs>. Memory for *pFFTSpec must be
+ * allocated prior to calling this function. The number of bytes
+ * required for *pFFTSpec can be determined using
+ * <FFTGetBufSize_R_S16>.
+ *
+ * Parameters:
+ * [in]  order       base-2 logarithm of the desired block length;
+ *			   valid in the range [1,12].
+ * [out] pFFTFwdSpec pointer to the initialized specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTInit_R_S16(OMXFFTSpec_R_S16* pFFTSpec, OMX_INT order) {
+  OMX_INT i = 0, j = 0;
+  OMX_SC16 *pTwiddle = NULL, *pTwiddle1 = NULL, *pTwiddle2 = NULL;
+  OMX_SC16 *pTwiddle3 = NULL, *pTwiddle4 = NULL;
+  OMX_S16 *pBuf = NULL;
+  OMX_U16 *pBitRev = NULL;
+  OMX_U32 pTmp = 0;
+  OMX_INT Nby2 = 0, N = 0, M = 0, diff = 0, step = 0;
+  OMX_S16 x = 0, y = 0, xNeg = 0;
+  OMX_S32 xS32 = 0, yS32 = 0;
+  ARMsFFTSpec_R_SC16 *pFFTStruct = NULL;
+
+  /* Order zero not allowed */
+  if (order == 0) {
+    return OMX_Sts_BadArgErr;
+  }
+
+  /* Do the initializations */
+  pFFTStruct = (ARMsFFTSpec_R_SC16*) pFFTSpec;
+  Nby2 = 1 << (order - 1);
+  N = Nby2 << 1;
+  pBitRev = NULL ;  /* optimized implementations don't use bitreversal */
+  pTwiddle = (OMX_SC16*) (sizeof(ARMsFFTSpec_R_SC16) + (OMX_S8*)pFFTSpec);
+
+  /* Align to 32 byte boundary */
+  pTmp = ((OMX_U32)pTwiddle)&31;  /* (OMX_U32)pTwiddle % 32 */
+  if(pTmp != 0) {
+    pTwiddle = (OMX_SC16*) ((OMX_S8*)pTwiddle + (32 - pTmp));
+  }
+
+  pBuf = (OMX_S16*) (sizeof(OMX_SC16) * (5 * N / 8) + (OMX_S8*)pTwiddle);
+
+  /* Align to 32 byte boundary */
+  pTmp = ((OMX_U32)pBuf)&31;                 /* (OMX_U32)pBuf % 32 */
+  if(pTmp != 0) {
+    pBuf = (OMX_S16*)((OMX_S8*)pBuf + (32 - pTmp));
+  }
+
+  /*
+   * Filling Twiddle factors : exp^(-j*2*PI*k/ (N/2) ) ; k=0,1,2,...,3/4(N/2).
+   * N/2 point complex FFT is used to compute N point real FFT.
+   * The original twiddle table "armSP_FFT_S32TwiddleTable" is of size
+   * (MaxSize/8 + 1). Rest of the values i.e., up to MaxSize are calculated
+   * using the symmetries of sin and cos.
+   * The max size of the twiddle table needed is 3/4(N/2) for a radix-4 stage.
+   *
+   * W = (-2 * PI) / N
+   * N = 1 << order
+   * W = -PI >> (order - 1)
+   * 
+   * Note we use S32 twiddle factor table and round the values to 16 bits.
+   */
+
+  M = Nby2 >> 3;
+  diff = 12 - (order - 1);
+  step = 1 << diff;  /* Step into the twiddle table for the current order */
+
+  xS32 = armSP_FFT_S32TwiddleTable[0];
+  yS32 = armSP_FFT_S32TwiddleTable[1];
+  x = (xS32 + 0x8000) >> 16;
+  y = (yS32 + 0x8000) >> 16;
+  xNeg = 0x7FFF;
+
+  if((order-1) >= 3) {
+    /* i = 0 case */
+    pTwiddle[0].Re = x;
+    pTwiddle[0].Im = y;
+    pTwiddle[2 * M].Re = -y;
+    pTwiddle[2 * M].Im = xNeg;
+    pTwiddle[4 * M].Re = xNeg;
+    pTwiddle[4 * M].Im = y;
+
+    for (i=1; i<=M; i++){
+      OMX_S16 x_neg = 0, y_neg = 0;
+      j = i * step;
+
+      xS32 = armSP_FFT_S32TwiddleTable[2 * j];
+      yS32 = armSP_FFT_S32TwiddleTable[2 * j + 1];
+      x = (xS32 + 0x8000) >> 16;
+      y = (yS32 + 0x8000) >> 16;
+      /* |x_neg = -x| doesn't work when x is 0x8000. */
+      x_neg = (-(xS32 + 0x8000)) >> 16;
+      y_neg = (-(yS32 + 0x8000)) >> 16;
+
+      pTwiddle[i].Re = x;
+      pTwiddle[i].Im = y;
+      pTwiddle[2 * M - i].Re = y_neg;
+      pTwiddle[2 * M - i].Im = x_neg;
+      pTwiddle[2 * M + i].Re = y;
+      pTwiddle[2 * M + i].Im = x_neg;
+      pTwiddle[4 * M - i].Re = x_neg;
+      pTwiddle[4 * M - i].Im = y;
+      pTwiddle[4 * M + i].Re = x_neg;
+      pTwiddle[4 * M + i].Im = y_neg;
+      pTwiddle[6 * M - i].Re = y;
+      pTwiddle[6 * M - i].Im = x;
+    }
+  }
+  else {
+    if ((order - 1) == 2) {
+      pTwiddle[0].Re = x;
+      pTwiddle[0].Im = y;
+      pTwiddle[1].Re = -y;
+      pTwiddle[1].Im = xNeg;
+      pTwiddle[2].Re = xNeg;
+      pTwiddle[2].Im = y;
+    }
+    if ((order-1) == 1) {
+      pTwiddle[0].Re = x;
+      pTwiddle[0].Im = y;
+    }
+  }
+
+  /*
+   * Now fill the last N/4 values : exp^(-j*2*PI*k/N);  k=1,3,5,...,N/2-1.
+   * These are used for the final twiddle fix-up for converting complex to
+   * real FFT.
+   */
+
+  M = N >> 3;
+  diff = 12 - order;
+  step = 1 << diff;
+
+  pTwiddle1 = pTwiddle + 3 * N / 8;
+  pTwiddle4 = pTwiddle1 + (N / 4 - 1);
+  pTwiddle3 = pTwiddle1 + N / 8;
+  pTwiddle2 = pTwiddle1 + (N / 8 - 1);
+
+  xS32 = armSP_FFT_S32TwiddleTable[0];
+  yS32 = armSP_FFT_S32TwiddleTable[1];
+  x = (xS32 + 0x8000) >> 16;
+  y = (yS32 + 0x8000) >> 16;
+  xNeg = 0x7FFF;
+
+  if((order) >= 3) {
+    for (i = 1; i <= M; i += 2 ) {
+      OMX_S16 x_neg = 0, y_neg = 0;
+
+      j = i*step;
+
+      xS32 = armSP_FFT_S32TwiddleTable[2 * j];
+      yS32 = armSP_FFT_S32TwiddleTable[2 * j + 1];
+      x = (xS32 + 0x8000) >> 16;
+      y = (yS32 + 0x8000) >> 16;
+      /* |x_neg = -x| doesn't work when x is 0x8000. */
+      x_neg = (-(xS32 + 0x8000)) >> 16;
+      y_neg = (-(yS32 + 0x8000)) >> 16;
+
+      pTwiddle1[0].Re = x;
+      pTwiddle1[0].Im = y;
+      pTwiddle1 += 1;
+      pTwiddle2[0].Re = y_neg;
+      pTwiddle2[0].Im = x_neg;
+      pTwiddle2 -= 1;
+      pTwiddle3[0].Re = y;
+      pTwiddle3[0].Im = x_neg;
+      pTwiddle3 += 1;
+      pTwiddle4[0].Re = x_neg;
+      pTwiddle4[0].Im = y;
+      pTwiddle4 -= 1;
+    }
+  }
+  else {
+    if (order == 2) {
+      pTwiddle1[0].Re = -y;
+      pTwiddle1[0].Im = xNeg;
+    }
+  }
+
+  /* Update the structure */
+  pFFTStruct->N = N;
+  pFFTStruct->pTwiddle = pTwiddle;
+  pFFTStruct->pBitRev = pBitRev;
+  pFFTStruct->pBuf = pBuf;
+
+  return OMX_Sts_NoErr;
+}
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
diff --git a/dl/sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S b/dl/sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S
new file mode 100644
index 0000000..553f280
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S
@@ -0,0 +1,301 @@
+@
+@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Some code in this file was originally from file
+@ omxSP_FFTInv_CToC_SC16_Sfs_s.S which was licensed as follows.
+@ It has been relicensed with permission from the copyright holders.
+@
+
+@
+@ File Name:  omxSP_FFTInv_CToC_SC16_Sfs_s.s
+@ OpenMAX DL: v1.0.2
+@ Last Modified Revision:   6729
+@ Last Modified Date:       Tue, 17 Jul 2007
+@
+@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@
+
+@
+@ Description:
+@ Compute an inverse FFT for a 16-bit real signal, with complex FFT routines.
+@
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@Input Registers
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+@ Output registers
+#define result  r0
+
+@Local Scratch Registers
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define pTwiddle        r4
+#define tmpOrder        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@ Total num of radix stages to comple the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne    r2
+#define round           r3
+#define pOut1           r2
+#define size            r7
+#define step            r8
+#define step1           r9
+#define twStep          r10
+#define pTwiddleTmp     r11
+#define argTwiddle1     r12
+#define zero            r14
+
+@ Neon registers
+#define dX0             D0.S32
+#define dShift          D1.S32
+#define qShift          Q0.s16
+#define dX1             D1.S32
+#define dY0             D2.S32
+#define dY1             D3.S32
+#define dX0r            D0.S32
+#define dX0i            D1.S32
+#define dX1r            D2.S32
+#define dX1i            D3.S32
+#define dW0r            D4.S32
+#define dW0i            D5.S32
+#define dW1r            D6.S32
+#define dW1i            D7.S32
+#define dT0             D8.S32
+#define dT1             D9.S32
+#define dT2             D10.S32
+#define dT3             D11.S32
+#define qT0             Q6.S64
+#define qT1             Q7.S64
+#define qT0s            Q6.S16
+#define qT1s            Q7.S16
+#define qT2             Q8.S64
+#define qT3             Q9.S64
+#define dY0r            D4.S32
+#define dY0i            D5.S32
+#define dY1r            D6.S32
+#define dY1i            D7.S32
+#define dzero           D20.S32
+#define dY2             D4.S32
+#define dY3             D5.S32
+#define dW0             D6.S32
+#define dW1             D7.S32
+#define dW0Tmp          D10.S32
+#define dW1Neg          D11.S32
+
+
+
+    @ Allocate stack memory required by the function
+        M_ALLOC4        diffOnStack, 4
+
+    @ Write function header
+        M_START     omxSP_FFTInv_CCSToR_S16_Sfs,r11,d15
+
+@ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @ Define stack arguments
+
+        @ Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @ Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        @ Call the preTwiddle Radix2 stage before doing the complex IFFT
+
+        @ The following conditional BL combination would work since
+        @ evenOddButterflyLoop in the first call would set Z flag to zero
+
+        CMP     scale,#0
+        BLEQ    armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe
+        BLGT    armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe
+
+complexIFFT:
+
+        ASR     N,N,#1                              @ N/2 point complex IFFT
+        ADD     pSrc,pOut,N,LSL #2                  @ set pSrc as pOut1
+
+        CLZ     order,N                             @ N = 2^order
+        RSB     order,order,#31
+        MOV     subFFTSize,#1
+
+        ADD     scale,scale,order                   @ FFTInverse has a final scaling factor by N
+
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @ order > 3
+
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @ order > 0
+        M_STR   scale, diffOnStack,LT               @ order = 0
+        LDRLT   x0r,[pSrc]
+        STRLT   x0r,[pDst]
+        MOVLT   pSrc,pDst
+        BLT     FFTEnd
+
+orderGreaterthan0:
+        @ set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @ Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+        @ Store the scale factor and scale at the end
+        SUB     diff,scale,order
+        M_STR   diff, diffOnStack
+        BGE     orderGreaterthan1
+        BLLT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  @ order = 1
+        B       FFTEnd
+
+
+orderGreaterthan1:
+        MOV     tmpOrder,order                      @ tmpOrder = RN 4
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        CMP     tmpOrder,#2
+        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+
+
+
+orderGreaterthan3:
+        @ check scale = 0 or scale = order
+        SUB     diff, scale, order                  @ scale > order
+
+        TST     order, #2                           @ Set input args to fft stages
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @ Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+
+        CMP     diff,#0
+        M_STR   diff, diffOnStack
+        BGE     scaleEqualsOrder
+
+        @check for even or odd order
+        @ NOTE: The following combination of BL's would work fine eventhough the first
+        @ BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+        @ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
+        CMP     subFFTNum,#4
+        BLT     FFTEnd
+
+unscaledRadix4Loop:
+        BEQ     lastStageUnscaledRadix4
+        BL      armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+        CMP     subFFTNum,#4
+        B       unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+scaleEqualsOrder:
+        @check for even or odd order
+        @ NOTE: The following combination of BL's would work fine eventhough the first
+        @ BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+        @ armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
+        CMP     subFFTNum,#4
+        BLT     FFTEnd
+
+scaledRadix4Loop:
+        BEQ     lastStageScaledRadix4
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+        CMP     subFFTNum,#4
+        B       scaledRadix4Loop
+
+lastStageScaledRadix4:
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+
+FFTEnd:                                         @ Does only the scaling
+
+        M_LDR   diff, diffOnStack
+        CMP     diff,#0
+        BLE     End
+
+        RSB     diff,diff,#0                    @ to use VRSHL for right shift by a variable
+        VDUP    qShift,diff
+
+        @ Use parallel loads for bigger FFT size.
+        CMP     subFFTSize, #8
+        BLT     scaleLessFFTData
+
+scaleFFTData:
+        VLD1    {qT0s, qT1s},[pSrc:256]         @ pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#8
+        VSHL    qT0s,qShift
+        VSHL    qT1s,qShift
+        VST1    {qT0s, qT1s},[pSrc:256]!
+        BGT     scaleFFTData
+        B       End
+
+scaleLessFFTData:                               @ N = subFFTSize  ; dataptr = pDst  ; scale = diff
+        VLD1    {dX0[0]},[pSrc]                 @ pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#1
+        VRSHL   dX0,dShift
+        VST1    {dX0[0]},[pSrc]!
+        BGT     scaleLessFFTData
+
+End:
+        @ Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @ Write function tail
+        M_END
+
+
+
+
+
+
+    .END
diff --git a/dl/sp/src/test/test_fft.gyp b/dl/sp/src/test/test_fft.gyp
index 99b3774..3290550 100644
--- a/dl/sp/src/test/test_fft.gyp
+++ b/dl/sp/src/test/test_fft.gyp
@@ -67,11 +67,19 @@
       ],
     },
     {
-      # Test real 16-bit fixed-point FFT
-      'target_name': 'test_rfft16',
+      # Test real 16-bit fixed-point FFT implemented with S32 routines.
+      'target_name': 'test_rfft16_s32',
       'type': 'executable',
       'sources': [
-        'test_rfft16.c',
+        'test_rfft16_s32.c',
+      ],
+    },
+    {
+      # Test real 16-bit fixed-point FFT implemented with S16 routines.
+      'target_name': 'test_rfft16_s16',
+      'type': 'executable',
+      'sources': [
+        'test_rfft16_s16.c',
       ],
     },
     {
@@ -107,7 +115,8 @@
         'test_fft32',
         'test_float_fft',
         'test_float_rfft',
-        'test_rfft16',
+        'test_rfft16_s32',
+        'test_rfft16_s16',
         'test_rfft32',
         'test_fft_time',
       ],
diff --git a/dl/sp/src/test/test_fft16.c b/dl/sp/src/test/test_fft16.c
index 081bf23..bedf278 100644
--- a/dl/sp/src/test/test_fft16.c
+++ b/dl/sp/src/test/test_fft16.c
@@ -24,7 +24,7 @@
 #define MAX_FFT_ORDER   12
 
 int verbose = 0;
-int signal_value = 1024;
+int signal_value = 32767;
 int scale_factor = 0;
 
 struct KnownTestFailures known_failures[] = {
diff --git a/dl/sp/src/test/test_fft_time.c b/dl/sp/src/test/test_fft_time.c
index a401594..05f51d0 100644
--- a/dl/sp/src/test/test_fft_time.c
+++ b/dl/sp/src/test/test_fft_time.c
@@ -20,9 +20,14 @@
 #include "dl/sp/src/test/aligned_ptr.h"
 #include "dl/sp/src/test/gensig.h"
 
-#define MAX_FFT_ORDER   TWIDDLE_TABLE_ORDER
+#define MAX_FFT_ORDER TWIDDLE_TABLE_ORDER
 #define MAX_FFT_ORDER_FIXED_POINT 12
 
+typedef enum {
+  S16,
+  S32,
+} s16_s32;
+
 void TimeOneFloatFFT(int count, int fft_log_size, float signal_value,
                      int signal_type);
 void TimeFloatFFT(int count, float signal_value, int signal_type);
@@ -32,8 +37,11 @@
 void TimeOneSC32FFT(int count, int fft_log_size, float signal_value,
                     int signal_type);
 void TimeSC32FFT(int count, float signal_value, int signal_type);
+void TimeOneSC16FFT(int count, int fft_log_size, float signal_value,
+                    int signal_type);
+void TimeSC16FFT(int count, float signal_value, int signal_type);
 void TimeOneRFFT16(int count, int fft_log_size, float signal_value,
-                   int signal_type);
+                   int signal_type, s16_s32 s16s32);
 void TimeRFFT16(int count, float signal_value, int signal_type);
 void TimeOneRFFT32(int count, int fft_log_size, float signal_value,
                    int signal_type);
@@ -49,57 +57,58 @@
 
 void TimeFFTUsage(const char* prog) {
   fprintf(stderr, 
-	  "%s: [-hTFICA] [-f fft] [-c count] [-n logsize] [-s scale]\n"
-	  "    [-g signal-type] [-S signal value]\n"
-	  "    [-m minFFTsize] [-M maxFFTsize]\n",
+      "%s: [-hTFICA] [-f fft] [-c count] [-n logsize] [-s scale]\n"
+      "    [-g signal-type] [-S signal value]\n"
+      "    [-m minFFTsize] [-M maxFFTsize]\n",
           ProgramName(prog));
   fprintf(stderr, 
-	  "Simple FFT timing tests\n"
-	  "  -h          This help\n"
-	  "  -v level    Verbose output level (default = 1)\n"
-	  "  -F          Skip forward FFT tests\n"
-	  "  -I          Skip inverse FFT tests\n"
-	  "  -C          Include float-to-fixed and fixed-to-float cost for"
-	  " real\n"
-	  "                16-bit FFT (forward and inverse)\n"
-	  "  -c count    Number of FFTs to compute for timing.  This is a"
-	  " lower\n"
-	  "                lower limit; shorter FFTs will do more FFTs such"
-	  " that the\n"
-	  "                elapsed time is very roughly constant, if -A is"
-	  " not given.\n"
-	  "  -A          Don't adapt the count given by -c; use specified"
-	  " value\n"
-	  "  -m min      Mininum FFT order to test\n"
-	  "  -M max      Maximum FFT order to test\n"
-	  "  -T          Run just one FFT timing test\n"
-	  "  -f          FFT type:\n"
-	  "	         0 - Complex Float\n"
-	  "	         1 - Real Float\n"
-	  "	         2 - Complex 32-bit\n"
-	  "	         3 - Real 16-bit\n"
-	  "	         4 - Real 32-bit\n"
-	  "  -n logsize  Log2 of FFT size\n"
-	  "  -s scale    Scale factor for forward FFT (default = 0)\n"
-	  "  -S signal   Base value for the test signal (default = 1024)\n"
-	  "  -g type     Input signal type:\n"
-	  "	         0 - Constant signal S + i*S. (Default value.)\n"
-	  "	         1 - Real ramp starting at S/N, N = FFT size\n"
-	  "	         2 - Sine wave of amplitude S\n"
-	  "	         3 - Complex signal whose transform is a sine wave.\n"
-	  "\n"
-	  "Use -v 0 in combination with -F or -I to get output that can\n"
-	  "be pasted into a spreadsheet.\n"
-	  "\n"
-	  "Most of the options listed after -T above are only applicable\n"
-	  "when -T is given to test just one FFT size and FFT type.\n"
-	  "\n");
+      "Simple FFT timing tests\n"
+      "  -h          This help\n"
+      "  -v level    Verbose output level (default = 1)\n"
+      "  -F          Skip forward FFT tests\n"
+      "  -I          Skip inverse FFT tests\n"
+      "  -C          Include float-to-fixed and fixed-to-float cost for"
+      " real\n"
+      "                16-bit FFT (forward and inverse)\n"
+      "  -c count    Number of FFTs to compute for timing.  This is a"
+      " lower\n"
+      "                lower limit; shorter FFTs will do more FFTs such"
+      " that the\n"
+      "                elapsed time is very roughly constant, if -A is"
+      " not given.\n"
+      "  -A          Don't adapt the count given by -c; use specified"
+      " value\n"
+      "  -m min      Mininum FFT order to test\n"
+      "  -M max      Maximum FFT order to test\n"
+      "  -T          Run just one FFT timing test\n"
+      "  -f          FFT type:\n"
+      "              0 - Complex Float\n"
+      "              1 - Real Float\n"
+      "              2 - Complex 16-bit\n"
+      "              3 - Real 16-bit\n"
+      "              4 - Complex 32-bit\n"
+      "              5 - Real 32-bit\n"
+      "  -n logsize  Log2 of FFT size\n"
+      "  -s scale    Scale factor for forward FFT (default = 0)\n"
+      "  -S signal   Base value for the test signal (default = 1024)\n"
+      "  -g type     Input signal type:\n"
+      "              0 - Constant signal S + i*S. (Default value.)\n"
+      "              1 - Real ramp starting at S/N, N = FFT size\n"
+      "              2 - Sine wave of amplitude S\n"
+      "              3 - Complex signal whose transform is a sine wave.\n"
+      "\n"
+      "Use -v 0 in combination with -F or -I to get output that can\n"
+      "be pasted into a spreadsheet.\n"
+      "\n"
+      "Most of the options listed after -T above are only applicable\n"
+      "when -T is given to test just one FFT size and FFT type.\n"
+      "\n");
   exit(0);
 }
 
 void main(int argc, char* argv[]) {
   int fft_log_size = 4;
-  float signal_value = 1024;
+  float signal_value = 32767;
   int signal_type = 0;
   int test_mode = 1;
   int count = 100;
@@ -175,8 +184,9 @@
   if (test_mode) {
     TimeFloatFFT(count, signal_value, signal_type);
     TimeFloatRFFT(count, signal_value, signal_type);
-    TimeSC32FFT(count, signal_value, signal_type);
+    TimeSC16FFT(count, signal_value, signal_type);
     TimeRFFT16(count, signal_value, signal_type);
+    TimeSC32FFT(count, signal_value, signal_type);
     TimeRFFT32(count, signal_value, signal_type);
   } else {
     switch (fft_type) {
@@ -187,12 +197,16 @@
         TimeOneFloatRFFT(count, fft_log_size, signal_value, signal_type);
         break;
       case 2:
-        TimeOneSC32FFT(count, fft_log_size, signal_value, signal_type);
+        TimeOneSC16FFT(count, fft_log_size, signal_value, signal_type);
         break;
       case 3:
-        TimeOneRFFT16(count, fft_log_size, signal_value, signal_type);
+        TimeOneRFFT16(count, fft_log_size, signal_value, signal_type, S32);
+        TimeOneRFFT16(count, fft_log_size, signal_value, signal_type, S16);
         break;
       case 4:
+        TimeOneSC32FFT(count, fft_log_size, signal_value, signal_type);
+        break;
+      case 5:
         TimeOneRFFT32(count, fft_log_size, signal_value, signal_type);
         break;
       default:
@@ -638,6 +652,189 @@
   }
 }
 
+void generateSC16Signal(OMX_SC16* x, OMX_SC16* fft, int size, int signal_type,
+                        float signal_value) {
+  int k;
+  struct ComplexFloat *test_signal;
+  struct ComplexFloat *true_fft;
+
+  test_signal = (struct ComplexFloat*) malloc(sizeof(*test_signal) * size);
+  true_fft = (struct ComplexFloat*) malloc(sizeof(*true_fft) * size);
+  GenerateTestSignalAndFFT(test_signal, true_fft, size, signal_type,
+                           signal_value, 0);
+
+  /*
+   * Convert the complex result to what we want
+   */
+
+  for (k = 0; k < size; ++k) {
+    x[k].Re = 0.5 + test_signal[k].Re;
+    x[k].Im = 0.5 + test_signal[k].Im;
+    fft[k].Re = 0.5 + true_fft[k].Re;
+    fft[k].Im = 0.5 + true_fft[k].Im;
+  }
+
+  free(test_signal);
+  free(true_fft);
+}
+
+void TimeOneSC16FFT(int count, int fft_log_size, float signal_value,
+                    int signal_type) {
+  OMX_SC16* x;
+  OMX_SC16* y;
+  OMX_SC16* z;
+
+  struct AlignedPtr* x_aligned;
+  struct AlignedPtr* y_aligned;
+  struct AlignedPtr* z_aligned;
+
+  OMX_SC16* y_true;
+  OMX_SC16* temp16a;
+  OMX_SC16* temp16b;
+
+  OMX_INT n, fft_spec_buffer_size;
+  OMXResult status;
+  OMXFFTSpec_C_SC16 * fft_fwd_spec = NULL;
+  OMXFFTSpec_C_SC16 * fft_inv_spec = NULL;
+  int fft_size;
+  struct timeval start_time;
+  struct timeval end_time;
+  double elapsed_time;
+
+  fft_size = 1 << fft_log_size;
+
+  x_aligned = AllocAlignedPointer(32, sizeof(*x) * fft_size);
+  y_aligned = AllocAlignedPointer(32, sizeof(*y) * fft_size);
+  z_aligned = AllocAlignedPointer(32, sizeof(*z) * fft_size);
+  y_true = (OMX_SC16*) malloc(sizeof(*y_true) * fft_size);
+  temp16a = (OMX_SC16*) malloc(sizeof(*temp16a) * fft_size);
+  temp16b = (OMX_SC16*) malloc(sizeof(*temp16b) * fft_size);
+
+  x = x_aligned->aligned_pointer_;
+  y = y_aligned->aligned_pointer_;
+  z = z_aligned->aligned_pointer_;
+
+  generateSC16Signal(x, y_true, fft_size, signal_type, signal_value);
+
+  status = omxSP_FFTGetBufSize_C_SC16(fft_log_size, &fft_spec_buffer_size);
+
+  fft_fwd_spec = (OMXFFTSpec_C_SC16*) malloc(fft_spec_buffer_size);
+  fft_inv_spec = (OMXFFTSpec_C_SC16*) malloc(fft_spec_buffer_size);
+  status = omxSP_FFTInit_C_SC16(fft_fwd_spec, fft_log_size);
+
+  status = omxSP_FFTInit_C_SC16(fft_inv_spec, fft_log_size);
+
+  if (do_forward_test) {
+    if (include_conversion) {
+      int k;
+      float factor = -1;
+
+      GetUserTime(&start_time);
+      for (k = 0; k < count; ++k) {
+        for (n = 0; n < fft_size; ++n) {
+          if (fabs(x[n].Re) > factor) {
+            factor = fabs(x[n].Re);
+          }
+          if (fabs(x[n].Im) > factor) {
+            factor = fabs(x[n].Im);
+          }
+        }
+
+        factor = ((1 << 18) - 1) / factor;
+        for (n = 0; n < fft_size; ++n) {
+          temp16a[n].Re = factor * x[n].Re;
+          temp16a[n].Im = factor * x[n].Im;
+        }
+
+        omxSP_FFTFwd_CToC_SC16_Sfs(x, y, fft_fwd_spec, 0);
+
+        factor = 1 / factor;
+        for (n = 0; n < fft_size; ++n) {
+          temp16b[n].Re = y[n].Re * factor;
+          temp16b[n].Im = y[n].Im * factor;
+        }
+      }
+      GetUserTime(&end_time);
+    } else {
+      GetUserTime(&start_time);
+      for (n = 0; n < count; ++n) {
+        omxSP_FFTFwd_CToC_SC16_Sfs(x, y, fft_fwd_spec, 0);
+      }
+      GetUserTime(&end_time);
+    }
+
+    elapsed_time = TimeDifference(&start_time, &end_time);
+
+    PrintResult("Forward SC16 FFT", fft_log_size, elapsed_time, count);
+  }
+
+  if (do_inverse_test) {
+    if (include_conversion) {
+      int k;
+      float factor = -1;
+
+      GetUserTime(&start_time);
+      for (k = 0; k < count; ++k) {
+        for (n = 0; n < fft_size; ++n) {
+          if (fabs(x[n].Re) > factor) {
+            factor = fabs(x[n].Re);
+          }
+          if (fabs(x[n].Im) > factor) {
+            factor = fabs(x[n].Im);
+          }
+        }
+        factor = ((1 << 18) - 1) / factor;
+        for (n = 0; n < fft_size; ++n) {
+          temp16a[n].Re = factor * x[n].Re;
+          temp16a[n].Im = factor * x[n].Im;
+        }
+
+        status = omxSP_FFTInv_CToC_SC16_Sfs(y, z, fft_inv_spec, 0);
+
+        factor = 1 / factor;
+        for (n = 0; n < fft_size; ++n) {
+          temp16b[n].Re = y[n].Re * factor;
+          temp16b[n].Im = y[n].Im * factor;
+        }
+      }
+      GetUserTime(&end_time);
+    } else {
+      GetUserTime(&start_time);
+      for (n = 0; n < count; ++n) {
+        status = omxSP_FFTInv_CToC_SC16_Sfs(y, z, fft_inv_spec, 0);
+      }
+      GetUserTime(&end_time);
+    }
+
+    elapsed_time = TimeDifference(&start_time, &end_time);
+
+    PrintResult("Inverse SC16 FFT", fft_log_size, elapsed_time, count);
+  }
+
+  FreeAlignedPointer(x_aligned);
+  FreeAlignedPointer(y_aligned);
+  FreeAlignedPointer(z_aligned);
+  free(temp16a);
+  free(temp16b);
+  free(fft_fwd_spec);
+  free(fft_inv_spec);
+}
+
+void TimeSC16FFT(int count, float signal_value, int signal_type) {
+  int k;
+  int max_order = (max_fft_order > MAX_FFT_ORDER_FIXED_POINT)
+      ? MAX_FFT_ORDER_FIXED_POINT : max_fft_order;
+
+  if (verbose == 0)
+    printf("SC16 FFT\n");
+
+  for (k = min_fft_order; k <= max_order; ++k) {
+  //for (k = 7; k <= 8; ++k) {
+    int testCount = ComputeCount(count, k);
+    TimeOneSC16FFT(testCount, k, signal_value, signal_type);
+  }
+}
+
 void GenerateRFFT16Signal(OMX_S16* x, OMX_SC32* fft, int size, int signal_type,
                           float signal_value) {
   int k;
@@ -666,8 +863,12 @@
   free(true_fft);
 }
 
+/* Argument s16s32:
+ *     S32:       Calculate RFFT16 with 32 bit complex FFT;
+ *     otherwise: Calculate RFFT16 with 16 bit complex FFT.
+ */
 void TimeOneRFFT16(int count, int fft_log_size, float signal_value,
-                   int signal_type) {
+                   int signal_type, s16_s32 s16s32) {
   OMX_S16* x;
   OMX_S32* y;
   OMX_S16* z;
@@ -689,8 +890,8 @@
 
   OMX_INT n, fft_spec_buffer_size;
   OMXResult status;
-  OMXFFTSpec_R_S16S32 * fft_fwd_spec = NULL;
-  OMXFFTSpec_R_S16S32 * fft_inv_spec = NULL;
+  OMXFFTSpec_R_S16 * fft_fwd_spec = NULL;
+  OMXFFTSpec_R_S16 * fft_inv_spec = NULL;
   int fft_size;
   struct timeval start_time;
   struct timeval end_time;
@@ -728,13 +929,20 @@
   GenerateRealFloatSignal(xr, (OMX_FC32*) yrTrue, fft_size, signal_type,
                           signal_value);
 
-  status = omxSP_FFTGetBufSize_R_S16S32(fft_log_size, &fft_spec_buffer_size);
-
-  fft_fwd_spec = (OMXFFTSpec_R_S16S32*) malloc(fft_spec_buffer_size);
-  fft_inv_spec = (OMXFFTSpec_R_S16S32*) malloc(fft_spec_buffer_size);
-  status = omxSP_FFTInit_R_S16S32(fft_fwd_spec, fft_log_size);
-
-  status = omxSP_FFTInit_R_S16S32(fft_inv_spec, fft_log_size);
+  if(s16s32 == S32) {
+    status = omxSP_FFTGetBufSize_R_S16S32(fft_log_size, &fft_spec_buffer_size);
+    fft_fwd_spec = malloc(fft_spec_buffer_size);
+    fft_inv_spec = malloc(fft_spec_buffer_size);
+    status = omxSP_FFTInit_R_S16S32(fft_fwd_spec, fft_log_size);
+    status = omxSP_FFTInit_R_S16S32(fft_inv_spec, fft_log_size);
+  }
+  else {
+    status = omxSP_FFTGetBufSize_R_S16(fft_log_size, &fft_spec_buffer_size);
+    fft_fwd_spec = malloc(fft_spec_buffer_size);
+    fft_inv_spec = malloc(fft_spec_buffer_size);
+    status = omxSP_FFTInit_R_S16(fft_fwd_spec, fft_log_size);
+    status = omxSP_FFTInit_R_S16(fft_inv_spec, fft_log_size);
+  }
 
   if (do_forward_test) {
     if (include_conversion) {
@@ -757,9 +965,14 @@
           temp16[n] = factor * xr[n];
         }
 
-        status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y, fft_fwd_spec,
-                                                (OMX_INT) scaleFactor);
-
+        if(s16s32 == S32) {
+          status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y,
+              (OMXFFTSpec_R_S16S32*)fft_fwd_spec, (OMX_INT) scaleFactor);
+        }
+        else {
+          status = omxSP_FFTFwd_RToCCS_S16_Sfs(x,  (OMX_S16*)y,
+              (OMXFFTSpec_R_S16*)fft_fwd_spec, (OMX_INT) scaleFactor);
+        }
         /*
          * Now spend some time converting the fixed-point FFT back to float.
          */
@@ -774,15 +987,26 @@
 
       GetUserTime(&start_time);
       for (n = 0; n < count; ++n) {
-        status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y, fft_fwd_spec,
-                                                (OMX_INT) scaleFactor);
+      if(s16s32 == S32) {
+        status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y, 
+            (OMXFFTSpec_R_S16S32*)fft_fwd_spec, (OMX_INT) scaleFactor);
+      }
+      else {
+        status = omxSP_FFTFwd_RToCCS_S16_Sfs(x, (OMX_S16*)y,
+            (OMXFFTSpec_R_S16*)fft_fwd_spec, (OMX_INT) scaleFactor);
+      }
       }
       GetUserTime(&end_time);
     }
 
     elapsed_time = TimeDifference(&start_time, &end_time);
 
-    PrintResult("Forward RFFT16", fft_log_size, elapsed_time, count);
+    if(s16s32 == S32) {
+      PrintResult("Forward RFFT16 (with S32)", fft_log_size, elapsed_time, count);
+    }
+    else {
+      PrintResult("Forward RFFT16 (with S16)", fft_log_size, elapsed_time, count);
+    }
   }
 
   if (do_inverse_test) {
@@ -804,9 +1028,14 @@
           temp32[n] = factor * yrTrue[n];
         }
 
-        status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y, fft_fwd_spec,
-                                                (OMX_INT) scaleFactor);
-
+        if(s16s32 == S32) {
+          status = omxSP_FFTInv_CCSToR_S32S16_Sfs(y, z,
+              (OMXFFTSpec_R_S16S32*)fft_inv_spec, 0);
+        }
+        else {
+          status = omxSP_FFTInv_CCSToR_S16_Sfs((OMX_S16*)y, z,
+              (OMXFFTSpec_R_S16*)fft_inv_spec, 0);
+        }
         /*
          * Spend some time converting the result back to float
          */
@@ -819,14 +1048,26 @@
     } else {
       GetUserTime(&start_time);
       for (n = 0; n < count; ++n) {
-        status = omxSP_FFTInv_CCSToR_S32S16_Sfs(y, z, fft_inv_spec, 0);
+        if(s16s32 == S32) {
+          status = omxSP_FFTInv_CCSToR_S32S16_Sfs(y, z,
+              (OMXFFTSpec_R_S16S32*)fft_inv_spec, 0);
+        }
+        else {
+          status = omxSP_FFTInv_CCSToR_S16_Sfs((OMX_S16*)y, z,
+              (OMXFFTSpec_R_S16*)fft_inv_spec, 0);
+        }
       }
       GetUserTime(&end_time);
     }
 
     elapsed_time = TimeDifference(&start_time, &end_time);
 
-    PrintResult("Inverse RFFT16", fft_log_size, elapsed_time, count);
+    if(s16s32 == S32) {
+      PrintResult("Inverse RFFT16 (with S32)", fft_log_size, elapsed_time, count);
+    }
+    else {
+      PrintResult("Inverse RFFT16 (with S16)", fft_log_size, elapsed_time, count);
+    }
   }
 
   FreeAlignedPointer(x_aligned);
@@ -845,11 +1086,18 @@
       ? MAX_FFT_ORDER_FIXED_POINT : max_fft_order;
 
   if (verbose == 0)
-    printf("RFFT16\n");
-
+    printf("RFFT16 (with S32)\n");
   for (k = min_fft_order; k <= max_order; ++k) {
     int testCount = ComputeCount(count, k);
-    TimeOneRFFT16(testCount, k, signal_value, signal_type);
+    TimeOneRFFT16(testCount, k, signal_value, signal_type, 1);
+  }
+
+  if (verbose == 0)
+    printf("RFFT16 (with S16)\n");
+  //for (k = min_fft_order; k <= max_order; ++k) {
+  for (k = 7; k <= max_order; ++k) {
+    int testCount = ComputeCount(count, k);
+    TimeOneRFFT16(testCount, k, signal_value, signal_type, 0);
   }
 }
 
diff --git a/dl/sp/src/test/test_float_rfft.c b/dl/sp/src/test/test_float_rfft.c
index cb3262f..20b5e33 100644
--- a/dl/sp/src/test/test_float_rfft.c
+++ b/dl/sp/src/test/test_float_rfft.c
@@ -36,8 +36,6 @@
 
   SetDefaultOptions(&options, 1, MAX_FFT_ORDER);
 
-  options.signal_value_ = 1024;
-
   ProcessCommandLine(&options, argc, argv,
                      "Test forward and inverse real floating-point FFT\n");
 
diff --git a/dl/sp/src/test/test_rfft16_s16.c b/dl/sp/src/test/test_rfft16_s16.c
new file mode 100644
index 0000000..c5520f8
--- /dev/null
+++ b/dl/sp/src/test/test_rfft16_s16.c
@@ -0,0 +1,302 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/src/test/aligned_ptr.h"
+#include "dl/sp/src/test/compare.h"
+#include "dl/sp/src/test/gensig.h"
+#include "dl/sp/src/test/test_util.h"
+
+#define MAX_FFT_ORDER   12
+
+int verbose = 0;
+int signal_value = 32767;
+int scale_factor = 0;
+
+void TestFFT(int fftLogSize, int scale_factor, int signalType);
+
+void main(int argc, char* argv[]) {
+  struct Options options;
+
+  SetDefaultOptions(&options, 1, MAX_FFT_ORDER);
+
+  options.signal_value_ = signal_value;
+  options.scale_factor_ = scale_factor;
+
+  ProcessCommandLine(&options, argc, argv, "Test forward and inverse real 16 \
+                     -bit fixed-point FFT, with 16-bit complex FFT routines\n");
+
+  verbose = options.verbose_;
+  signal_value = options.signal_value_;
+  scale_factor = options.scale_factor_;
+
+  if (verbose > 255)
+    DumpOptions(stderr, &options);
+
+  if (options.test_mode_) {
+    struct TestInfo info;
+
+    info.real_only_ = options.real_only_;
+    info.max_fft_order_ = options.max_fft_order_;
+    info.min_fft_order_ = options.min_fft_order_;
+    info.do_forward_tests_ = options.do_forward_tests_;
+    info.do_inverse_tests_ = options.do_inverse_tests_;
+    /* No known failures */
+    info.known_failures_ = 0;
+    info.forward_threshold_ = 45;
+    info.inverse_threshold_ = 14;
+
+    RunAllTests(&info);
+  } else {
+    TestFFT(options.fft_log_size_,
+            options.signal_type_,
+            options.scale_factor_);
+  }
+}
+
+void GenerateSignal(struct ComplexFloat* fft,
+                    float* x_true, int size, int sigtype) {
+  int k;
+  struct ComplexFloat *test_signal;
+
+  test_signal = (struct ComplexFloat*) malloc(sizeof(*test_signal) * size);
+  GenerateTestSignalAndFFT(test_signal, fft, size, sigtype, signal_value, 1);
+
+  /*
+   * Convert the complex result to what we want
+   */
+
+  for (k = 0; k < size; ++k) {
+    x_true[k] = test_signal[k].Re;
+  }
+
+  free(test_signal);
+}
+
+void TestFFT(int fft_log_size, int signal_type, int scale_factor) {
+  struct SnrResult snr;
+
+  RunOneForwardTest(fft_log_size, signal_type, signal_value, &snr);
+  printf("Forward float FFT\n");
+  printf("SNR:  real part    %f dB\n", snr.real_snr_);
+  printf("      imag part    %f dB\n", snr.imag_snr_);
+  printf("      complex part %f dB\n", snr.complex_snr_);
+
+  RunOneInverseTest(fft_log_size, signal_type, signal_value, &snr);
+  printf("Inverse float FFT\n");
+  printf("SNR:  %f dB\n", snr.real_snr_);
+}
+
+float RunOneForwardTest(int fft_log_size, int signal_type,
+                        float unused_signal_value,
+                        struct SnrResult* snr) {
+  OMX_S16* x;
+  OMX_SC16* y;
+
+  struct AlignedPtr* x_aligned;
+  struct AlignedPtr* y_aligned;
+
+  float* x_true;
+  struct ComplexFloat* y_true;
+  OMX_SC16* y_scaled;
+
+  OMX_INT n, fft_spec_buffer_size;
+  OMXResult status;
+  OMXFFTSpec_R_S16 * fft_fwd_spec = NULL;
+  int fft_size;
+
+  /*
+   * To get good FFT results, set the forward FFT scale factor
+   * to be the same as the order.
+   */
+  scale_factor = fft_log_size;
+
+  fft_size = 1 << fft_log_size;
+
+  status = omxSP_FFTGetBufSize_R_S16(fft_log_size, &fft_spec_buffer_size);
+  if (verbose > 63) {
+    printf("fft_spec_buffer_size = %d\n", fft_spec_buffer_size);
+  }
+
+  fft_fwd_spec = (OMXFFTSpec_R_S16*) malloc(fft_spec_buffer_size);
+  status = omxSP_FFTInit_R_S16(fft_fwd_spec, fft_log_size);
+  if (status) {
+    fprintf(stderr, "Failed to init forward FFT:  status = %d\n", status);
+    exit(1);
+  }
+
+  x_aligned = AllocAlignedPointer(32, sizeof(*x) * fft_size);
+  y_aligned = AllocAlignedPointer(32, sizeof(*y) * (fft_size + 2));
+
+  x = x_aligned->aligned_pointer_;
+  y = y_aligned->aligned_pointer_;
+
+  x_true = (float*) malloc(sizeof(*x_true) * fft_size);
+  y_true = (struct ComplexFloat*) malloc(sizeof(*y_true) * (fft_size / 2 + 1));
+  y_scaled = (OMX_SC16*) malloc(sizeof(*y_true) * (fft_size / 2 + 1));
+
+  GenerateSignal(y_true, x_true, fft_size, signal_type);
+  for (n = 0; n < fft_size; ++n) {
+    x[n] = 0.5 + x_true[n];
+  }
+
+  {
+    float scale = 1 << fft_log_size;
+
+    for (n = 0; n < fft_size; ++n) {
+      y_scaled[n].Re = 0.5 + y_true[n].Re / scale;
+      y_scaled[n].Im = 0.5 + y_true[n].Im / scale;
+    }
+  }
+
+  if (verbose > 63) {
+    printf("Signal\n");
+    DumpArrayReal16("x", fft_size, x);
+
+    printf("Expected FFT output\n");
+    DumpArrayComplex16("y", fft_size / 2 + 1, y_scaled);
+  }
+
+  status = omxSP_FFTFwd_RToCCS_S16_Sfs(x, (OMX_S16*) y, fft_fwd_spec, scale_factor);
+  if (status) {
+    fprintf(stderr, "Forward FFT failed: status = %d\n", status);
+    exit(1);
+  }
+
+  if (verbose > 63) {
+    printf("FFT Output\n");
+    DumpArrayComplex16("y", fft_size / 2 + 1, y);
+  }
+
+  CompareComplex16(snr, y, y_scaled, fft_size / 2 + 1);
+
+  FreeAlignedPointer(x_aligned);
+  FreeAlignedPointer(y_aligned);
+  free(fft_fwd_spec);
+
+  return snr->complex_snr_;
+}
+
+float RunOneInverseTest(int fft_log_size, int signal_type,
+                        float unused_signal_value,
+                        struct SnrResult* snr) {
+  OMX_S16* x_scaled;
+  OMX_S16* z;
+  OMX_SC16* y;
+  OMX_SC16* y_scaled;
+
+  struct AlignedPtr* y_aligned;
+  struct AlignedPtr* z_aligned;
+
+  float* x_true;
+  struct ComplexFloat* y_true;
+
+  OMX_INT n, fft_spec_buffer_size;
+  OMXResult status;
+  OMXFFTSpec_R_S16 * fft_inv_spec = NULL;
+  int fft_size;
+
+  fft_size = 1 << fft_log_size;
+
+  status = omxSP_FFTGetBufSize_R_S16(fft_log_size, &fft_spec_buffer_size);
+  if (verbose > 3) {
+    printf("fft_spec_buffer_size = %d\n", fft_spec_buffer_size);
+  }
+
+  fft_inv_spec = (OMXFFTSpec_R_S16*)malloc(fft_spec_buffer_size);
+  status = omxSP_FFTInit_R_S16(fft_inv_spec, fft_log_size);
+  if (status) {
+    fprintf(stderr, "Failed to init backward FFT:  status = %d\n", status);
+    exit(1);
+  }
+
+  y_aligned = AllocAlignedPointer(32, sizeof(*y) * (fft_size / 2 + 1));
+  z_aligned = AllocAlignedPointer(32, sizeof(*z) * fft_size);
+
+  x_true = (float*) malloc(sizeof(*x_true) * fft_size);
+  x_scaled = (OMX_S16*) malloc(sizeof(*x_scaled) * fft_size);
+  y_true = (struct ComplexFloat*) malloc(sizeof(*y_true) * fft_size);
+  y_scaled = y_aligned->aligned_pointer_;
+  z = z_aligned->aligned_pointer_;
+
+  GenerateSignal(y_true, x_true, fft_size, signal_type);
+
+  {
+    /*
+     * To get max accuracy, scale the input to the inverse FFT up
+     * to use as many bits as we can.
+     */
+    float scale = 1;
+    float max = 0;
+
+    for (n = 0; n < fft_size / 2 + 1; ++n) {
+      float val;
+      val = fabs(y_true[n].Re);
+      if (val > max) {
+        max = val;
+      }
+      val = fabs(y_true[n].Im);
+      if (val > max) {
+        max = val;
+      }
+    }
+
+    scale = 16384 / max;
+    if (verbose > 63)
+      printf("Inverse FFT input scaled factor %g\n", scale);
+
+    /*
+     * Scale both the true FFT signal and the input so we can
+     * compare them correctly later
+     */
+    for (n = 0; n < fft_size / 2 + 1; ++n) {
+      y_scaled[n].Re = (OMX_S16)(0.5 + y_true[n].Re * scale);
+      y_scaled[n].Im = (OMX_S16)(0.5 + y_true[n].Im * scale);
+    }
+    for (n = 0; n < fft_size; ++n) {
+      x_scaled[n] = 0.5 + x_true[n] * scale;
+    }
+  }
+
+
+  if (verbose > 63) {
+    printf("Inverse FFT Input Signal\n");
+    DumpArrayComplex16("y", fft_size / 2 + 1, y_scaled);
+
+    printf("Expected Inverse FFT output\n");
+    DumpArrayReal16("x", fft_size, x_scaled);
+  }
+
+  status = omxSP_FFTInv_CCSToR_S16_Sfs((OMX_S16 const *)y_scaled, z, fft_inv_spec, 0);
+  if (status) {
+    fprintf(stderr, "Inverse FFT failed: status = %d\n", status);
+    exit(1);
+  }
+
+  if (verbose > 63) {
+    printf("Actual Inverse FFT Output\n");
+    DumpArrayReal16("z", fft_size, z);
+  }
+
+  CompareReal16(snr, z, x_scaled, fft_size);
+
+  FreeAlignedPointer(y_aligned);
+  FreeAlignedPointer(z_aligned);
+  free(fft_inv_spec);
+
+  return snr->real_snr_;
+}
diff --git a/dl/sp/src/test/test_rfft16.c b/dl/sp/src/test/test_rfft16_s32.c
similarity index 96%
rename from dl/sp/src/test/test_rfft16.c
rename to dl/sp/src/test/test_rfft16_s32.c
index 171ccdc..2d825c6 100644
--- a/dl/sp/src/test/test_rfft16.c
+++ b/dl/sp/src/test/test_rfft16_s32.c
@@ -33,8 +33,8 @@
 
   SetDefaultOptions(&options, 1, MAX_FFT_ORDER);
 
-  ProcessCommandLine(&options, argc, argv,
-                     "Test forward and inverse real 16-bit fixed-point FFT\n");
+  ProcessCommandLine(&options, argc, argv, "Test forward and inverse real 16 \
+                     -bit fixed-point FFT, with 32-bit complex FFT routines\n");
 
   verbose = options.verbose_;
   signal_value = options.signal_value_;
@@ -54,7 +54,6 @@
     info.known_failures_ = 0;
     info.forward_threshold_ = 90.12;
     info.inverse_threshold_ = 89.28;
-    signal_value = 32767;
     RunAllTests(&info);
   } else {
     TestFFT(options.fft_log_size_,
@@ -82,8 +81,8 @@
   }
 
   for (k = 0; k < size / 2 + 1; ++k) {
-    fft[k].Re = true_fft[k].Re;
-    fft[k].Im = true_fft[k].Im;
+    fft[k].Re = true_fft[k].Re + 0.5;
+    fft[k].Im = true_fft[k].Im + 0.5;
   }
 
   free(test_signal);
diff --git a/dl/sp/src/test/test_util.c b/dl/sp/src/test/test_util.c
index 88d697b..69830b6 100644
--- a/dl/sp/src/test/test_util.c
+++ b/dl/sp/src/test/test_util.c
@@ -97,7 +97,7 @@
   options->fft_log_size_ = 4;
   options->scale_factor_ = 0;
   options->signal_type_ = 0;
-  options->signal_value_ = 1024;
+  options->signal_value_ = 32767;
   options->signal_value_given_ = 0;
 }
 
@@ -382,7 +382,7 @@
                         const OMX_SC16* array) {
   int n;
 
-  printf("%4s\t%10s.re[n]\t%10s.im[n]\n", "n", array_name);
+  printf("%4s\t%10s.re[n]\t%10s.im[n]\n", "n", array_name, array_name);
   for (n = 0; n < count; ++n) {
     printf("%4d\t%16d\t%16d\n", n, array[n].Re, array[n].Im);
   }