Revert 4347 "Implementation of real value 16 bit FFT with 16 bit..."

> Implementation of real value 16 bit FFT with 16 bit complex FFT routines, for ARM Neon platforms.
> Verified with SNR testing code in Openmax folder.
> 
> R=aedla@chromium.org, rtoy@google.com
> 
> Review URL: https://webrtc-codereview.appspot.com/1323010

TBR=kma@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/1820004

git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@4361 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/dl.gyp b/dl/dl.gyp
index 1f014b0..0573ce2 100644
--- a/dl/dl.gyp
+++ b/dl/dl.gyp
@@ -54,6 +54,8 @@
         'sp/src/omxSP_FFTInit_R_S32.c',
         'sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
         # Complex 16-bit fixed-point FFT
+        'sp/src/omxSP_FFTInit_C_SC16.c',
+        'sp/src/omxSP_FFTGetBufSize_C_SC16.c',
         'sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
         'sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S',
         'sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S',
@@ -63,18 +65,11 @@
         'sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S',
         'sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S',
         'sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
-        'sp/src/omxSP_FFTGetBufSize_C_SC16.c',
-        'sp/src/omxSP_FFTInit_C_SC16.c',
         'sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
         # Real 16-bit fixed-point FFT
-        'sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S',
         'sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
-        'sp/src/omxSP_FFTGetBufSize_R_S16.c',
         'sp/src/omxSP_FFTGetBufSize_R_S16S32.c',
-        'sp/src/omxSP_FFTInit_R_S16.c',
         'sp/src/omxSP_FFTInit_R_S16S32.c',
-        'sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S',
-        'sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S',
         'sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S',
         # Complex floating-point FFT
         'sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
diff --git a/dl/sp/api/armSP.h b/dl/sp/api/armSP.h
index 4972f09..f615a87 100644
--- a/dl/sp/api/armSP.h
+++ b/dl/sp/api/armSP.h
@@ -64,14 +64,6 @@
     OMX_S32     *pBuf;
 }ARMsFFTSpec_R_SC32;
 
-typedef struct  ARMsFFTSpec_R_SC16_Tag 
-{
-    OMX_U32     N;
-    OMX_U16     *pBitRev;    
-    OMX_SC16    *pTwiddle;
-    OMX_S16     *pBuf;
-} ARMsFFTSpec_R_SC16;
-
 typedef struct ARMsFFTSpec_R_FC32_Tag
 {
     OMX_U32 N;
diff --git a/dl/sp/api/omxSP.h b/dl/sp/api/omxSP.h
index 13c64e3..695fa90 100644
--- a/dl/sp/api/omxSP.h
+++ b/dl/sp/api/omxSP.h
@@ -44,7 +44,6 @@
  typedef void OMXFFTSpec_C_SC16;
  typedef void OMXFFTSpec_C_SC32;
  typedef void OMXFFTSpec_R_S16S32;
- typedef void OMXFFTSpec_R_S16;
  typedef void OMXFFTSpec_R_S32;
  typedef void OMXFFTSpec_R_F32;
  typedef void OMXFFTSpec_C_FC32;
diff --git a/dl/sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S b/dl/sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S
deleted file mode 100644
index 7e33484..0000000
--- a/dl/sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S
+++ /dev/null
@@ -1,413 +0,0 @@
-@
-@  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
-@
-@  Use of this source code is governed by a BSD-style license
-@  that can be found in the LICENSE file in the root of the source
-@  tree. An additional intellectual property rights grant can be found
-@  in the file PATENTS.  All contributing project authors may
-@  be found in the AUTHORS file in the root of the source tree.
-@
-@ Some code in this file was originally from file
-@ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
-@ follows. It has been relicensed with permission from the copyright holders.
-@
-
-@
-@ OpenMAX DL: v1.0.2
-@ Last Modified Revision:   7485
-@ Last Modified Date:       Fri, 21 Sep 2007
-@ 
-@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-@
-
-@
-@ Description:
-@ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
-@ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation.
-@ It implements both "scaled"(by 1/2) and "unscaled" versions of the above
-@ formula.
-@ 
-        
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
-        
-@//Input Registers
-#define pSrc            r0
-#define pDst            r1
-#define pFFTSpec        r2
-#define scale           r3
-
-@ Output registers
-#define result          r0
-
-@//Local Scratch Registers
-#define argTwiddle      r1
-#define argDst          r2
-#define argScale        r4
-#define tmpOrder        r4
-#define pTwiddle        r4
-#define pOut            r5
-#define subFFTSize      r7     
-#define subFFTNum       r6
-#define N               r6
-#define order           r14
-#define diff            r9
-@ Total num of radix stages to comple the FFT.
-#define count           r8
-#define x0r             r4    
-#define x0i             r5
-#define diffMinusOne    r2
-#define round           r3
-#define pOut1           r2
-#define size            r7
-#define step            r8            
-#define step1           r9
-#define step2           r10
-#define twStep          r10
-#define pTwiddleTmp     r11
-#define argTwiddle1     r12
-#define zero            r14
-
-@ Neon registers
-#define dX0             D0.S16
-#define dX0S32          D0.S32
-#define dShift          D1.S16
-#define dX1             D1.S16
-#define dX1S32          D1.S32
-#define dY0             D2.S16
-#define dY1             D3.S16
-#define dX0r            D0.S16            
-#define dX0rS32         D0.S32
-#define dX0i            D1.S16
-#define dX1r            D2.S16
-#define dX1i            D3.S16
-#define qX1             Q1.S16
-#define dW0r            D4.S16
-#define dW0i            D5.S16
-#define dW1r            D6.S16
-#define dW1i            D7.S16
-#define dW0rS32         D4.S32
-#define dW0iS32         D5.S32
-#define dW1rS32         D6.S32
-#define dW1iS32         D7.S32
-#define dT0             D8.S16
-#define dT1             D9.S16
-#define dT2             D10.S16
-#define dT3             D11.S16
-#define qT0             Q6.S32
-#define qT1             Q7.S32
-#define qT2             Q8.S32
-#define qT3             Q9.S32
-#define dY0r            D4.S16
-#define dY0i            D5.S16
-#define dY1r            D6.S16
-#define dY1i            D7.S16
-#define qY1             Q3.S16
-#define dY2             D4.S16
-#define dY3             D5.S16
-#define dW0             D6.S16
-#define dW1             D7.S16
-#define dW0Tmp          D10.S16
-#define dW1Neg          D11.S16
-
-        @ Structure offsets for the FFTSpec             
-        .set    ARMsFFTSpec_N, 0
-        .set    ARMsFFTSpec_pBitRev, 4
-        .set    ARMsFFTSpec_pTwiddle, 8
-        .set    ARMsFFTSpec_pBuf, 12
-
-        .MACRO FFTSTAGE scaled, inverse, name
-        
-        @ Read the size from structure and take log
-        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
-        
-        @ Read other structure parameters
-        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
-        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
-        
-        MOV     size,N,ASR #1        @ preserve the contents of N
-        MOV     step,N,LSL #1        @ step = N/2 * 4 bytes
-        
-        @ Process different FFT sizes with different loops.
-        CMP    size,#4
-        BLE    smallFFTSize\name
-        
-        @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
-        @ Note: W^(k) is stored as negated value and also need to
-        @ conjugate the values from the table.
-        
-        @ Z(0) : no need of twiddle multiply
-        @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
-        
-        VLD1    dX0S32[0],[pSrc],step
-        ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes 
-                
-        VLD1    dX1S32[0],[pSrc]!
-        SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
-        
-        MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
-        SUB     step1,step1,#4       @ (N/4-1)*4 bytes
-        
-        VHADD    dY0,dX0,dX1         @ [b+d | a+c]
-        VHSUB    dY1,dX0,dX1         @ [b-d | a-c] 
-        VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d] 
-        
-        .ifeqs  "\scaled", "TRUE"
-            VHSUB   dX0,dY0,dY1
-            SUBS    size,size,#2
-            VHADD   dX1,dY0,dY1
-        .else
-            VSUB   dX0,dY0,dY1
-            SUBS    size,size,#2
-            VADD   dX1,dY0,dY1
-        .endif
-                    
-        SUB     pSrc,pSrc,step
-        VST1    dX0[0],[pOut1]!
-        ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
-        VST1    dX1[1],[pOut1]!
-        ADD     argTwiddle1,pTwiddle,twStep            @ W^1 
-        
-        BLT     decrementScale\name
-        BEQ     lastElement\name
-                        
-        SUB     step,step,#20
-        SUB     step1,step1,#4                         @ (N/4-1)*8 bytes
-        SUB     step2, step1, #4
-                        
-        @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
-        @ Note: W^k is stored as negative values in the table and also need to
-        @ conjugate the values from the table.
-        @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
-        @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
-
-evenOddButterflyLoop\name:     
-        VLD2    {dX0r,dX0i},[pSrc],step
-        VLD2    {dX1r,dX1i},[pSrc]!
-        SUB     pSrc, pSrc, step
-
-        VLD1    dW0r,[argTwiddle1],step1
-        VREV64  qX1,qX1
-        VLD1    dW1r,[argTwiddle1]!
-        VHSUB   dT2,dX0r,dX1r                          @ a-c
-        SUB     argTwiddle1, argTwiddle1, step1
-        SUB     step1,step1,#16
-
-        VLD1    dW0i,[pTwiddleTmp],step2
-        VHADD   dT3,dX0i,dX1i                          @ b+d
-        VLD1    dW1i,[pTwiddleTmp]!
-        VHADD   dT0,dX0r,dX1r                          @ a+c
-        VHSUB   dT1,dX0i,dX1i                          @ b-d
-        SUB     pTwiddleTmp, pTwiddleTmp, step2
-        SUB     step2,step2,#16
-
-        SUBS    size,size,#8
-        
-        VZIP    dW1r,dW1i
-        VTRN    dW0r,dW0i
-        VZIP    dW1iS32, dW1rS32
-                                
-        VMULL   qT0,dW1i,dT2
-        VMLSL   qT0,dW1r,dT3
-        VMULL   qT1,dW1i,dT3
-        VMLAL   qT1,dW1r,dT2
-        VMULL   qT2,dW0r,dT2
-        VMLAL   qT2,dW0i,dT3
-        VMULL   qT3,dW0r,dT3
-        VMLSL   qT3,dW0i,dT2
-        
-        VRSHRN  dX1r,qT0,#15
-        VRSHRN  dX1i,qT1,#15
-        VRSHRN  dX0r,qT2,#15
-        VRSHRN  dX0i,qT3,#15
-        
-        .ifeqs  "\scaled", "TRUE"
-            VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
-            VHSUB    dY1i,dX1r,dT1
-        .else
-            VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
-            VSUB    dY1i,dX1r,dT1
-        .endif
-        
-        .ifeqs  "\scaled", "TRUE"
-            VHADD    dY0r,dT0,dX0i                     @ F(1)
-            VHSUB    dY0i,dT1,dX0r
-        .else
-            VADD    dY0r,dT0,dX0i                      @ F(1)
-            VSUB    dY0i,dT1,dX0r
-        .endif
-        
-        VREV64  qY1,qY1
-
-        VST2    {dY0r,dY0i},[pOut1],step
-        VST2    {dY1r,dY1i},[pOut1]
-        ADD     pOut1,pOut1,#16
-        SUB     pOut1, pOut1, step
-        SUB     step,step,#32
-       
-        BGT     evenOddButterflyLoop\name
-
-        SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
-        SUB     pOut1,pOut1,#4
-        B       lastElement\name
-        
-smallFFTSize\name:
-        @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
-        @ Note: W^(k) is stored as negated value and also need to
-        @ conjugate the values from the table.
-        
-        @ Z(0) : no need of twiddle multiply
-        @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
-        
-        VLD1    dX0S32[0],[pSrc],step
-        ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes 
-                
-        VLD1    dX1S32[0],[pSrc]!
-        SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1
-        
-        MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
-        SUB     step1,step1,#4       @ (N/4-1)*4 bytes
-        
-        VHADD    dY0,dX0,dX1         @ [b+d | a+c]
-        VHSUB    dY1,dX0,dX1         @ [b-d | a-c] 
-        VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d] 
-        
-        .ifeqs  "\scaled", "TRUE"
-            VHSUB   dX0,dY0,dY1
-            SUBS    size,size,#2
-            VHADD   dX1,dY0,dY1
-        .else
-            VSUB   dX0,dY0,dY1
-            SUBS    size,size,#2
-            VADD   dX1,dY0,dY1
-        .endif
-                    
-        SUB     pSrc,pSrc,step
-        VST1    dX0[0],[pOut1]!
-        ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
-        VST1    dX1[1],[pOut1]!
-        ADD     argTwiddle1,pTwiddle,twStep            @ W^1 
-        
-        BLT     decrementScale\name
-        BEQ     lastElement\name
-                        
-        @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
-        @ Note: W^k is stored as negative values in the table and also need to
-        @ conjugate the values from the table.
-        @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
-        @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
-
-        SUB     step,step,#12
-
-evenOddButterflyLoopSize4\name:     
-        VLD1    dW0rS32[0],[argTwiddle1],step1
-        VLD1    dW1rS32[0],[argTwiddle1]!
-        
-        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
-        VLD2    {dX0r[1],dX0i[1]},[pSrc],step
-        SUB     pSrc,pSrc,#4
-        SUB     argTwiddle1,argTwiddle1,step1
-        VLD2    {dX1r[0],dX1i[0]},[pSrc]!
-        VLD2    {dX1r[1],dX1i[1]},[pSrc]!
-        
-        SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
-        VLD1    dW0iS32[0],[pTwiddleTmp],step1
-        VLD1    dW1iS32[0],[pTwiddleTmp]!
-        SUB     pSrc,pSrc,step
-        
-        SUB     pTwiddleTmp,pTwiddleTmp,step1
-        VREV32  dX1r,dX1r
-        VREV32  dX1i,dX1i
-        SUBS    size,size,#4
-                        
-        VHSUB   dT2,dX0r,dX1r                          @ a-c
-        VHADD   dT3,dX0i,dX1i                          @ b+d
-        SUB     step1,step1,#4
-        VHADD   dT0,dX0r,dX1r                          @ a+c
-        VHSUB   dT1,dX0i,dX1i                          @ b-d
-        
-        VTRN    dW1r,dW1i
-        VTRN    dW0r,dW0i
-                                
-        VMULL   qT0,dW1r,dT2
-        VMLSL   qT0,dW1i,dT3
-        VMULL   qT1,dW1r,dT3
-        VMLAL   qT1,dW1i,dT2
-        VMULL   qT2,dW0r,dT2
-        VMLAL   qT2,dW0i,dT3
-        VMULL   qT3,dW0r,dT3
-        VMLSL   qT3,dW0i,dT2
-        
-        VRSHRN  dX1r,qT0,#15
-        VRSHRN  dX1i,qT1,#15
-        
-        .ifeqs  "\scaled", "TRUE"
-            VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
-            VHSUB    dY1i,dX1r,dT1
-        .else
-            VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
-            VSUB    dY1i,dX1r,dT1
-        .endif
-        
-        VREV32  dY1r,dY1r
-        VREV32  dY1i,dY1i
-                            
-        VRSHRN  dX0r,qT2,#15
-        VRSHRN  dX0i,qT3,#15
-        
-        .ifeqs  "\scaled", "TRUE"
-            VHADD    dY0r,dT0,dX0i                     @ F(1)
-            VHSUB    dY0i,dT1,dX0r
-        .else
-            VADD    dY0r,dT0,dX0i                      @ F(1)
-            VSUB    dY0i,dT1,dX0r
-        .endif
-        
-        VST2    {dY0r[0],dY0i[0]},[pOut1]!
-        VST2    {dY0r[1],dY0i[1]},[pOut1],step
-        SUB     pOut1, #4
-        VST2    {dY1r[0],dY1i[0]},[pOut1]!
-        VST2    {dY1r[1],dY1i[1]},[pOut1]!
-        SUB     pOut1,pOut1,step
-        SUB     step,step,#16                          @ (N/2-4)*8 bytes
-        
-        BGT     evenOddButterflyLoopSize4\name
-        
-        SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
-        SUB     pOut1,pOut1,#4
-        
-        @ Last element can be expanded as follows
-        @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
-        @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
-        @ 1/2[2a+j0] - j (c-jd) [0+j2b]
-        @ (a+bc, -bd)
-        @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
-        
-lastElement\name:      
-        VLD1    dX0rS32[0],[pSrc]
-        
-        .ifeqs  "\scaled", "TRUE"
-            VSHR    dX0r,dX0r,#1
-        .endif
-        
-        VST1    dX0r[0],[pOut1]!
-        VNEG    dX0r,dX0r
-        VST1    dX0r[1],[pOut1]
-
-decrementScale\name:          
-        .ifeqs  "\scaled", "TRUE"
-            SUB scale,scale,#1
-        .endif
-        
-        .endm
-        
-        M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
-        FFTSTAGE "FALSE","TRUE",Inv
-        M_END
-        
-        M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
-        FFTSTAGE "TRUE","TRUE",InvSfs
-        M_END
-
-        
-        .end
diff --git a/dl/sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S b/dl/sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S
deleted file mode 100644
index 50d8833..0000000
--- a/dl/sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S
+++ /dev/null
@@ -1,660 +0,0 @@
-@
-@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS.  All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-@ Some code in this file was originally from file
-@ omxSP_FFTFwd_RToCCS_S32_Sfs_s.S which was licensed as follows.
-@ It has been relicensed with permission from the copyright holders.
-@
-
-@
-@ OpenMAX DL: v1.0.2
-@ Last Modified Revision:   7810
-@ Last Modified Date:       Thu, 04 Oct 2007
-@
-@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-@
-
-@
-@ Description:
-@ Compute a forward FFT for a real signal, using 16 bit complex FFT routines.
-@
-
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
-
-.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
-.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
-
-@Input Registers
-#define pSrc            r0
-#define pDst            r1
-#define pFFTSpec        r2
-#define scale           r3
-
-@ Output registers
-#define result          r0
-
-@Local Scratch Registers
-#define argTwiddle      r1
-#define argDst          r2
-#define argScale        r4
-#define pTwiddle        r4
-#define tmpOrder        r4
-#define pOut            r5
-#define subFFTSize      r7
-#define subFFTNum       r6
-#define N               r6
-#define order           r14
-#define diff            r9
-@ Total num of radix stages to comple the FFT
-#define count           r8
-#define x0r             r4
-#define x0i             r5
-#define diffMinusOne    r2
-#define round           r3
-#define subFFTSizeTmp   r6
-#define step            r3
-#define stepr           r11
-#define step1           r10
-#define step1r          r6
-#define step2           r8
-#define step2r          r9
-#define twStep          r8
-#define zero            r9
-#define pTwiddleTmp     r5
-#define t0              r10
-
-@ Neon registers
-#define dX0             d0.s16
-#define dX0S32          d0.s32
-#define dzero           d1.s16
-#define dZero           d2.s16
-#define dShift          d3.s16
-#define qShift          q1.s16
-#define dX0r            d2.s16
-#define dX0i            d3.s16
-#define dX1r            d4.s16
-#define dX1i            d5.s16
-#define qX1             q2.s16
-#define dX0rS32         d2.s32
-#define dX0iS32         d3.s32
-#define dX1rS32         d4.s32
-#define dX1iS32         d5.s32
-#define dT0             d6.s16
-#define dT1             d7.s16
-#define dT2             d8.s16
-#define dT3             d9.s16
-#define qT0             q5.s32
-#define qT1             q6.s32
-#define qT0s            q5.s16
-#define qT1s            q6.s16
-#define dW0r            d14.s16
-#define dW0i            d15.s16
-#define dW1r            d16.s16
-#define dW1i            d17.s16
-#define dW0rS32         d14.s32
-#define dW0iS32         d15.s32
-#define dW1rS32         d16.s32
-#define dW1iS32         d17.s32
-#define dY0r            d14.s16
-#define dY0i            d15.s16
-#define dY0rS32         d14.s32
-#define dY0iS32         d15.s32
-#define dY1r            d16.s16
-#define dY1i            d17.s16
-#define qY1             q8.s16
-#define dY1rS32         d16.s32
-#define dY1iS32         d17.s32
-#define dY0rS64         d14.s32
-#define dY0iS64         d15.s32
-#define qT2             q9.s32
-#define qT3             q10.s32
-#define d18s16          d18.s16
-#define d19s16          d19.s16
-#define d20s16          d20.s16
-#define d21s16          d21.s16
-@ lastThreeelements
-#define dX1             d3.s16
-#define dW0             d4.s16
-#define dW1             d5.s16
-#define dY0             d10.s16
-#define dY1             d11.s16
-#define dY2             d12.s16
-#define dY3             d13.s16
-
-        @ Allocate stack memory required by the function
-        M_ALLOC4        diffOnStack, 4
-
-        @ Write function header
-        M_START     omxSP_FFTFwd_RToCCS_S16_Sfs,r11,d15
-
-        @ Structure offsets for the FFTSpec
-        .set    ARMsFFTSpec_N, 0
-        .set    ARMsFFTSpec_pBitRev, 4
-        .set    ARMsFFTSpec_pTwiddle, 8
-        .set    ARMsFFTSpec_pBuf, 12
-
-        @ Define stack arguments
-
-        @ Read the size from structure and take log
-        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
-
-        @ Read other structure parameters
-        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
-        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
-
-        @  N = 1 Treat seperately
-        CMP     N,#1
-        BGT     sizeGreaterThanOne
-        VLD1    dX0[0],[pSrc]
-        RSB     scale,scale,#0                @ for right shift by a variable
-        MOV     zero,#0
-        VMOV    dShift[0],scale
-        VMOV    dzero[0],zero
-        VRSHL   dX0,dShift
-        VMOV    dZero[0],zero
-        VST3    {dX0[0],dzero[0],dZero[0]},[pDst]
-
-        B       End
-
-sizeGreaterThanOne:
-        @ Do a N/2 point complex FFT including the scaling
-
-        MOV     N,N,ASR #1                    @ N/2 point complex FFT
-
-        CLZ     order,N                       @ N = 2^order
-        RSB     order,order,#31
-        MOV     subFFTSize,#1
-
-        CMP     order,#3
-        BGT     orderGreaterthan3             @ order > 3
-
-        CMP     order,#1
-        BGE     orderGreaterthan0             @ order > 0
-        M_STR   scale, diffOnStack,LT         @ order = 0
-        LDR     x0r,[pSrc]
-        STR     x0r,[pOut]
-        MOV     pSrc,pOut
-        MOV     argDst,pDst
-        B       FFTEnd
-
-orderGreaterthan0:
-        @ set the buffers appropriately for various orders
-        CMP     order,#2
-        MOVEQ   argDst,pDst
-        MOVNE   argDst,pOut
-        MOVNE   pOut,pDst                  @ Pass 1st stage destination in RN5
-        MOV     argTwiddle,pTwiddle
-
-        SUBS    diff,scale,order
-        M_STR   diff,diffOnStack
-        MOVGT   scale,order
-        @ Now scale <= order
-
-        CMP     order,#1
-        BGT     orderGreaterthan1
-        @ order = 1:
-        SUBS    scale,scale,#1
-        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
-        B       FFTEnd
-
-orderGreaterthan1:
-        CMP     order,#2
-        MOV     argScale,scale
-        BGT     orderGreaterthan2
-        @ order = 2:
-        SUBS    argScale,argScale,#1
-        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1
-        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
-        B       FFTEnd
-
-orderGreaterthan2:   @ order = 3
-        SUBS    argScale,argScale,#1
-        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1
-        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1
-        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
-        B       FFTEnd
-
-
-orderGreaterthan3:
-        @ check scale = 0 or scale = order
-        SUBS    diff, scale, order   @ scale > order
-        MOVGT   scale,order
-        BGE     specialScaleCase     @ scale = 0 or scale = order
-        CMP     scale,#0
-        BEQ     specialScaleCase
-        B       generalScaleCase
-
-specialScaleCase:   @ scale = 0, or, scale = order && order > 3
-        TST     order, #2            @ Set input args to fft stages
-        MOVEQ   argDst,pDst
-        MOVNE   argDst,pOut
-        MOVNE   pOut,pDst            @ Pass the first stage destination in RN5
-        MOV     argTwiddle,pTwiddle
-
-        CMP     diff,#0
-        M_STR   diff, diffOnStack
-        BGE     scaleEqualsOrder
-
-        @ check for even or odd order.
-        @ NOTE: The following combination of BL's would work fine even though
-        @ the first BL would corrupt the flags. This is because the end of the
-        @ "grpZeroSetLoop" loop inside
-        @ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.
-
-        TST     order,#0x00000001
-        BLEQ    armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
-        BLNE    armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
-
-        CMP     subFFTNum,#4
-        BLT     FFTEnd
-
-unscaledRadix4Loop:
-        BEQ     lastStageUnscaledRadix4
-        BL      armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
-        CMP     subFFTNum,#4
-        B       unscaledRadix4Loop
-
-lastStageUnscaledRadix4:
-        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
-        B       FFTEnd
-
-scaleEqualsOrder:
-        @ check for even or odd order
-        @ NOTE: The following combination of BL's would work fine even though
-        @ the first BL would corrupt the flags. This is because the end of the
-        @ "grpZeroSetLoop" loop inside
-        @ armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.
-
-        TST     order,#0x00000001
-        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
-        BLNE    armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
-
-        CMP     subFFTNum,#4
-        BLT     FFTEnd
-
-scaledRadix4Loop:
-        BEQ     lastStageScaledRadix4
-        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
-        CMP     subFFTNum,#4
-        B       scaledRadix4Loop
-
-lastStageScaledRadix4:
-        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
-        B       FFTEnd
-
-generalScaleCase:                        @ 0 < scale < order and order > 3
-        @ Determine the correct destination buffer
-        SUB     diff,order,scale
-        TST     diff,#0x01
-        ADDEQ   count,scale,diff,LSR #1  @ count = scale + (order - scale)/2
-        MOVNE   count,order
-        TST     count,#0x01              @ Is count even or odd ?
-
-        MOVEQ   argDst,pDst              @ Set input args to fft stages
-        MOVNE   argDst,pOut
-        MOVNE   pOut,pDst                @ Pass 1st stage destination in RN5
-        MOV     argTwiddle,pTwiddle
-
-        CMP     diff,#1
-        M_STR   diff, diffOnStack
-        BEQ     scaleps                  @ scaling including a radix2_ps stage
-
-        MOV     argScale,scale           @ Put scale in RN4 to save and restore
-        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1
-
-scaledRadix2Loop:
-        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1     @ save, restore scale in scaled stages
-        BGT     scaledRadix2Loop
-        B       outScale
-
-scaleps:
-        SUB     argScale,scale,#1        @ order>3 and diff=1 => scale >= 3
-        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1
-
-scaledRadix2psLoop:
-        BEQ     scaledRadix2psStage
-        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1     @ save, restore scale in scaled stages
-        BGE     scaledRadix2psLoop
-
-scaledRadix2psStage:
-        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
-        B       generalLastStageUnscaledRadix2
-
-outScale:
-        M_LDR   diff, diffOnStack
-        @check for even or odd order
-        TST     diff,#0x00000001
-        BEQ     generalUnscaledRadix4Loop
-        B       unscaledRadix2Loop
-
-generalUnscaledRadix4Loop:
-        CMP     subFFTNum,#4
-        BEQ     generalLastStageUnscaledRadix4
-        BL      armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
-        B       generalUnscaledRadix4Loop
-
-generalLastStageUnscaledRadix4:
-        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
-        B       End
-
-unscaledRadix2Loop:
-        CMP     subFFTNum,#4
-        BEQ     generalLastTwoStagesUnscaledRadix2
-        BL      armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
-        B       unscaledRadix2Loop
-
-generalLastTwoStagesUnscaledRadix2:
-        BL      armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
-generalLastStageUnscaledRadix2:
-        BL      armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
-        B       End
-
-FFTEnd:     @ Does only the scaling
-        M_LDR   diff, diffOnStack
-        CMP     diff,#0
-        BLE     finalComplexToRealFixup
-
-        RSB     diff,diff,#0               @ for right shift by a variable
-        VDUP    qShift,diff
-
-        @ save subFFTSize and use subFFTSizeTmp in the following loop
-        MOV     subFFTSizeTmp,subFFTSize   @ subFFTSizeTmp same reg as subFFTNum
-
-        @ Use parallel loads for bigger FFT size.
-        CMP     subFFTSizeTmp, #8
-        BLT     scaleLessFFTData
-
-scaleFFTData:
-        VLD1    {qT0s, qT1s},[pSrc:256]    @ pSrc contains pDst pointer
-        SUBS    subFFTSizeTmp,subFFTSizeTmp,#8
-        VSHL    qT0s,qShift
-        VSHL    qT1s,qShift
-        VST1    {qT0s, qT1s},[pSrc:256]!
-        BGT     scaleFFTData
-        B       afterScaling
-
-scaleLessFFTData:
-        VLD1    {dX0S32[0]},[pSrc]         @ pSrc contains pDst pointer
-        SUBS    subFFTSizeTmp,subFFTSizeTmp,#1
-        VSHL    dX0,dShift
-        VST1    {dX0S32[0]},[pSrc]!
-        BGT     scaleLessFFTData
-
-afterScaling:
-        SUB     pSrc,pSrc,subFFTSize,LSL #2 @ reset pSrc for final fixup
-
-        @  change the logic so that output after scaling is in pOut and not in pDst
-        @  finally store from pOut to pDst
-        @  change branch "End" to branch "finalComplexToRealFixup" in the above
-        @  chk the code below for multiplication by j factor
-
-finalComplexToRealFixup:
-        @ F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
-        @ 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
-        @ 1/2[2a+j0] - j [0+j2b]
-        @ (a+b, 0)
-
-        @ F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
-        @ 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
-        @ 1/2[2a+j0] + j [0+j2b]
-        @ (a-b, 0)
-
-        CMP    subFFTSize,#4
-        BLE    smallFFTSize
-
-@ SubSize > 3:
-        @ F(0) and F(N/2)
-        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
-        MOV     zero,#0
-        VMOV    dX0r[1],zero
-        MOV     step,subFFTSize,LSL #2        @ step = N/2 * 4 bytes
-        VMOV    dX0i[1],zero
-        SUB     twStep,step,subFFTSize        @ twStep = 3N/8 * 8 bytes
-
-        VADD    dY0r,dX0r,dX0i                @ F(0) = ((Z0.r+Z0.i) , 0)
-        MOV     step1,subFFTSize,LSL #1       @ step1 = N/2 * 2 bytes
-        VSUB    dY0i,dX0r,dX0i                @ F(N/2) = ((Z0.r-Z0.i) , 0)
-        SUBS    subFFTSize,subFFTSize,#2
-
-        VST1    dY0rS32[0],[argDst], step
-        ADD     pTwiddleTmp,argTwiddle,#4     @ W^2
-        VST1    dY0iS32[0],[argDst]!
-        ADD     argTwiddle,argTwiddle,twStep  @ W^1
-
-        VDUP    dzero,zero
-        SUB     argDst,argDst,step
-        SUB     step,step,#20
-        RSB     stepr, step, #16
-        SUB     step1,step1,#8                @ (N/4-1)*8 bytes
-        RSB     step1r,step1,#8
-
-        SUB     step2, step1, #4
-        RSB     step2r, step2, #8
-
-        @ F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
-        @ Note: W^k is stored as negative values in the table.
-        @ Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
-        @ since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1).
-
-evenOddButterflyLoop:
-        VLD2    {dX0r,dX0i},[pSrc],step
-        VLD2    {dX1r,dX1i},[pSrc],stepr
-
-        VLD1    dW0r,[argTwiddle],step1
-        SUB     step1, step1, #16
-        VREV64  qX1,qX1
-
-        VLD1    dW1r,[argTwiddle],step1r
-        ADD     step1r, step1r, #16
-        VSUB    dT2,dX0r,dX1r                 @ a-c
-
-        VLD1    dW0i,[pTwiddleTmp],step2
-        SUB     step2, step2, #16
-        VADD    dT3,dX0i,dX1i                 @ b+d
-
-        VLD1    dW1i,[pTwiddleTmp],step2r
-        ADD     step2r, step2r, #16
-
-        VTRN    dW0r,dW0i
-        VZIP    dW1r, dW1i
-
-        SUBS    subFFTSize,subFFTSize,#8
-
-        VHADD   dT0,dX0r,dX1r                 @ (a+c)/2
-        VZIP    dW1iS32, dW1rS32
-        VHSUB   dT1,dX0i,dX1i                 @ (b-d)/2
-
-        VQDMULH dY0,dW1i,dT2
-        VQDMULH dY1,dW1r,dT3
-        VQDMULH dY2,dW1i,dT3
-        VQDMULH dY3,dW1r,dT2
-
-        VQDMULH d18s16,dW0r,dT2
-        VQDMULH d19s16,dW0i,dT3
-        VQDMULH d20s16,dW0r,dT3
-        VQDMULH d21s16,dW0i,dT2
-
-        VRHADD  dX1r, dY0, dY1
-        VHSUB   dX1i, dY2, dY3
-        VHSUB   dX0r, d18s16, d19s16
-        VRHADD  dX0i, d20s16, d21s16
-
-        VADD    dY1i,dT1,dX1r
-        VSUB    dY1r,dT0,dX1i                 @ F(N/2 -1)
-
-        VSUB    dY0r,dT0,dX0i                 @ F(1)
-        VADD    dY0i,dT1,dX0r
-
-        VNEG    dY1i,dY1i
-        VREV64  qY1, qY1
-
-        VST2    {dY0r,dY0i},[argDst],step
-        SUB     step,step,#32                 @ (N/2-4)*4 bytes
-        VST2    {dY1r,dY1i},[argDst],stepr
-        ADD     stepr,stepr,#32
-
-        BGT     evenOddButterflyLoop
-
-        SUB     pSrc,pSrc,#4                  @ points to the last element.
-        SUB     argDst,argDst,#4              @ points to the last element.
-
-        b lastElement
-
-smallFFTSize:
-
-        @ F(0) and F(N/2)
-        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
-        MOV     zero,#0
-        VMOV    dX0r[1],zero
-        MOV     step,subFFTSize,LSL #2        @ step = N/2 * 4 bytes
-        VMOV    dX0i[1],zero
-        SUB     twStep,step,subFFTSize        @ twStep = 3N/8 * 8 bytes
-
-        VADD    dY0r,dX0r,dX0i                @ F(0) = ((Z0.r+Z0.i) , 0)
-        MOV     step1,subFFTSize,LSL #1       @ step1 = N/2 * 2 bytes
-        VSUB    dY0i,dX0r,dX0i                @ F(N/2) = ((Z0.r-Z0.i) , 0)
-        SUBS    subFFTSize,subFFTSize,#2
-
-
-        VST1    dY0rS32[0],[argDst], step
-        ADD     pTwiddleTmp,argTwiddle,#4     @ W^2
-        VST1    dY0iS32[0],[argDst]!
-        ADD     argTwiddle,argTwiddle,twStep  @ W^1
-
-        VDUP    dzero,zero
-        SUB     argDst,argDst,step
-
-        BLT     End
-        BEQ     lastElement
-
-        SUB     step,step,#12
-        SUB     step1,step1,#4                @ (N/4-1)*8 bytes
-
-        @ F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
-
-butterflyLoopSubFFTSize4:
-        VLD1    dW0rS32[0], [argTwiddle],step1
-        VLD1    dW1rS32[0],[argTwiddle]!
-
-        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
-        VLD2    {dX0r[1],dX0i[1]},[pSrc],step
-        SUB     pSrc,pSrc,#4
-        SUB     argTwiddle,argTwiddle,step1
-        VLD2    {dX1r[0],dX1i[0]},[pSrc]!
-        VLD2    {dX1r[1],dX1i[1]},[pSrc]!
-
-        SUB     step1,step1,#4                @ (N/4-2)*4 bytes
-        VLD1    dW0iS32[0],[pTwiddleTmp],step1
-        VLD1    dW1iS32[0],[pTwiddleTmp]!
-        SUB     pSrc,pSrc,step
-
-        SUB     pTwiddleTmp,pTwiddleTmp,step1
-        VREV32  dX1r,dX1r
-        VREV32  dX1i,dX1i
-        SUBS    subFFTSize,subFFTSize,#4
-
-        VSUB    dT2,dX0r,dX1r                 @ a-c
-        SUB     step1,step1,#4
-        VADD    dT3,dX0i,dX1i                 @ b+d
-        VADD    dT0,dX0r,dX1r                 @ a+c
-        VSUB    dT1,dX0i,dX1i                 @ b-d
-        VHADD   dT0,dT0,dzero
-        VHADD   dT1,dT1,dzero
-
-        VTRN    dW1r,dW1i
-        VTRN    dW0r,dW0i
-
-        VMULL   qT0,dW1r,dT2
-        VMLAL   qT0,dW1i,dT3
-        VMULL   qT1,dW1r,dT3
-        VMLSL   qT1,dW1i,dT2
-
-        VMULL   qT2,dW0r,dT2
-        VMLSL   qT2,dW0i,dT3
-        VMULL   qT3,dW0r,dT3
-        VMLAL   qT3,dW0i,dT2
-
-        VRSHRN  dX1r,qT0,#16
-        VRSHRN  dX1i,qT1,#16
-
-        VSUB    dY1r,dT0,dX1i                 @ F(N/2 -1)
-        VADD    dY1i,dT1,dX1r
-        VNEG    dY1i,dY1i
-
-        VREV32  dY1r,dY1r
-        VREV32  dY1i,dY1i
-
-        VRSHRN  dX0r,qT2,#16
-        VRSHRN  dX0i,qT3,#16
-
-        VSUB    dY0r,dT0,dX0i                 @ F(1)
-        VADD    dY0i,dT1,dX0r
-
-        VST2    {dY0r[0],dY0i[0]},[argDst]!
-        VST2    {dY0r[1],dY0i[1]},[argDst],step
-        SUB     argDst, #4
-        VST2    {dY1r[0],dY1i[0]},[argDst]!
-        VST2    {dY1r[1],dY1i[1]},[argDst]!
-        SUB     argDst,argDst,step
-        SUB     step,step,#16                 @ (N/2-4)*4 bytes
-
-        BGT     butterflyLoopSubFFTSize4
-
-        SUB     pSrc,pSrc,#4                  @ points to the last element.
-        SUB     argDst,argDst,#4              @ points to the last element.
-
-lastElement:
-        @ Last element can be expanded as follows
-        @ 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
-        @ 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
-        @ 1/2[2a+j0] + j (c+jd) [0+j2b]
-        @ (a-bc, -bd)
-        @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
-
-        VLD1    dX0rS32[0],[pSrc]
-        VST1    dX0r[0],[argDst]!
-        VNEG    dX0r,dX0r
-        VST1    dX0r[1],[argDst]!
-
-End:
-        @ Set return value
-        MOV     result, #OMX_Sts_NoErr
-
-        @ Write function tail
-        M_END
-
-    .END
diff --git a/dl/sp/src/omxSP_FFTGetBufSize_R_S16.c b/dl/sp/src/omxSP_FFTGetBufSize_R_S16.c
deleted file mode 100644
index a61a374..0000000
--- a/dl/sp/src/omxSP_FFTGetBufSize_R_S16.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- *
- * Some code in this file was originally from file omxSP_FFTGetBufSize_R_S32.c
- * which was licensed as follows.
- * It has been relicensed with permission from the copyright holders.
- */
-
-/*
- * OpenMAX DL: v1.0.2
- * Last Modified Revision:
- * Last Modified Date:
- */
-
-#include "dl/api/armOMX.h"
-#include "dl/api/omxtypes.h"
-#include "dl/sp/api/armSP.h"
-#include "dl/sp/api/omxSP.h"
-
-/**
- * Function: omxSP_FFTGetBufSize_R_S16
- *
- * Description:
- * Computes the size of the specification structure required for the length
- * 2^order real FFT and IFFT functions.
- *
- * Remarks:
- * This function is used in conjunction with the 16-bit functions
- * <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>.
- *
- * Parameters:
- * [in]  order       base-2 logarithm of the length; valid in the range
- *			   [0,12].
- * [out] pSize	   pointer to the number of bytes required for the
- *			   specification structure.
- *
- * Return Value:
- * Standard omxError result. See enumeration for possible result codes.
- *
- */
-
-OMXResult omxSP_FFTGetBufSize_R_S16(
-  OMX_INT order,
-  OMX_INT *pSize
-) {
-  OMX_INT     NBy2,N,twiddleSize;
-
-  /* Order zero not allowed */
-  if (order == 0) {
-    return OMX_Sts_BadArgErr;
-  }
-
-  NBy2 = 1 << (order - 1);
-  N = NBy2 << 1;
-  twiddleSize = 5 * N / 8;  /* 3 / 4 (N / 2) + N / 4 */
-
-  /* 2 pointers to store bitreversed array and twiddle factor array */
-  *pSize = sizeof(ARMsFFTSpec_R_SC16)
-           /* Twiddle factors  */
-           + sizeof(OMX_SC16) * twiddleSize
-           /* Ping Pong buffer for doing the N/2 point complex FFT; */
-           /* extra size 'N' as a temporary buf for FFTInv_CCSToR_S16_Sfs */
-           + sizeof(OMX_S16) * (N << 1)
-           /* Extra bytes to get 32 byte alignment of ptwiddle and pBuf */
-           + 62 ;
-
-
-  return OMX_Sts_NoErr;
-}
-
-/*****************************************************************************
- *                              END OF FILE
- *****************************************************************************/
-
diff --git a/dl/sp/src/omxSP_FFTInit_R_S16.c b/dl/sp/src/omxSP_FFTInit_R_S16.c
deleted file mode 100644
index 3a12167..0000000
--- a/dl/sp/src/omxSP_FFTInit_R_S16.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- *
- * Some code in this file was originally from file omxSP_FFTInit_R_S16S32.c
- * which was licensed as follows.
- * It has been relicensed with permission from the copyright holders.
- */
-
-/*
- * OpenMAX DL: v1.0.2
- * Last Modified Revision:
- * Last Modified Date:
- *
- * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
- */
-
-#include "dl/api/armOMX.h"
-#include "dl/api/omxtypes.h"
-#include "dl/sp/api/armSP.h"
-#include "dl/sp/api/omxSP.h"
-
-/**
- * Function: omxSP_FFTInit_R_S16
- *
- * Description:
- * Initialize the real forward-FFT specification information struct.
- *
- * Remarks:
- * This function is used to initialize the specification structures
- * for functions <ippsFFTFwd_RToCCS_S16_Sfs> and
- * <ippsFFTInv_CCSToR_S16_Sfs>. Memory for *pFFTSpec must be
- * allocated prior to calling this function. The number of bytes
- * required for *pFFTSpec can be determined using
- * <FFTGetBufSize_R_S16>.
- *
- * Parameters:
- * [in]  order       base-2 logarithm of the desired block length;
- *			   valid in the range [0,12].
- * [out] pFFTFwdSpec pointer to the initialized specification structure.
- *
- * Return Value:
- * Standard omxError result. See enumeration for possible result codes.
- *
- */
-
-OMXResult omxSP_FFTInit_R_S16(
-  OMXFFTSpec_R_S16* pFFTSpec,
-  OMX_INT order
-) {
-  OMX_INT i = 0, j = 0;
-  OMX_SC16 *pTwiddle = NULL, *pTwiddle1 = NULL, *pTwiddle2 = NULL;
-  OMX_SC16 *pTwiddle3 = NULL, *pTwiddle4 = NULL;
-  OMX_S16 *pBuf = NULL;
-  OMX_U16 *pBitRev = NULL;
-  OMX_U32 pTmp = 0;
-  OMX_INT Nby2 = 0, N = 0, M = 0, diff = 0, step = 0;
-  OMX_S16 x = 0, y = 0, xNeg = 0;
-  OMX_S32 xS32 = 0, yS32 = 0;
-  ARMsFFTSpec_R_SC16 *pFFTStruct = NULL;
-
-  /* Order zero not allowed */
-  if (order == 0) {
-    return OMX_Sts_BadArgErr;
-  }
-
-  /* Do the initializations */
-  pFFTStruct = (ARMsFFTSpec_R_SC16*) pFFTSpec;
-  Nby2 = 1 << (order - 1);
-  N = Nby2 << 1;
-  pBitRev = NULL ;  /* optimized implementations don't use bitreversal */
-  pTwiddle = (OMX_SC16*) (sizeof(ARMsFFTSpec_R_SC16) + (OMX_S8*)pFFTSpec);
-
-  /* Align to 32 byte boundary */
-  pTmp = ((OMX_U32)pTwiddle)&31;  /* (OMX_U32)pTwiddle % 32 */
-  if(pTmp != 0) {
-    pTwiddle = (OMX_SC16*) ((OMX_S8*)pTwiddle + (32 - pTmp));
-  }
-
-  pBuf = (OMX_S16*) (sizeof(OMX_SC16) * (5 * N / 8) + (OMX_S8*)pTwiddle);
-
-  /* Align to 32 byte boundary */
-  pTmp = ((OMX_U32)pBuf)&31;                 /* (OMX_U32)pBuf % 32 */
-  if(pTmp != 0) {
-    pBuf = (OMX_SC16*)((OMX_S8*)pBuf + (32 - pTmp));
-  }
-
-  /*
-   * Filling Twiddle factors : exp^(-j*2*PI*k/ (N/2) ) ; k=0,1,2,...,3/4(N/2).
-   * N/2 point complex FFT is used to compute N point real FFT.
-   * The original twiddle table "armSP_FFT_S32TwiddleTable" is of size
-   * (MaxSize/8 + 1). Rest of the values i.e., up to MaxSize are calculated
-   * using the symmetries of sin and cos.
-   * The max size of the twiddle table needed is 3/4(N/2) for a radix-4 stage.
-   *
-   * W = (-2 * PI) / N
-   * N = 1 << order
-   * W = -PI >> (order - 1)
-   * 
-   * Note we use S32 twiddle factor table and round the values to 16 bits.
-   */
-
-  M = Nby2 >> 3;
-  diff = 12 - (order - 1);
-  step = 1 << diff;  /* Step into the twiddle table for the current order */
-
-  xS32 = armSP_FFT_S32TwiddleTable[0];
-  yS32 = armSP_FFT_S32TwiddleTable[1];
-  x = (xS32 + 0x8000) >> 16;
-  y = (yS32 + 0x8000) >> 16;
-  xNeg = 0x7FFF;
-
-  if((order-1) >= 3) {
-    /* i = 0 case */
-    pTwiddle[0].Re = x;
-    pTwiddle[0].Im = y;
-    pTwiddle[2*M].Re = -y;
-    pTwiddle[2*M].Im = xNeg;
-    pTwiddle[4*M].Re = xNeg;
-    pTwiddle[4*M].Im = y;
-
-    for (i=1; i<=M; i++){
-      OMX_S16 x_neg = 0, y_neg = 0;
-      j = i * step;
-
-      xS32 = armSP_FFT_S32TwiddleTable[2 * j];
-      yS32 = armSP_FFT_S32TwiddleTable[2 * j + 1];
-      x = (xS32 + 0x8000) >> 16;
-      y = (yS32 + 0x8000) >> 16;
-      /* |x_neg = -x| doesn't work when x is 0x8000. */
-      x_neg = (-(xS32 + 0x8000)) >> 16;
-      y_neg = (-(yS32 + 0x8000)) >> 16;
-
-      pTwiddle[i].Re = x;
-      pTwiddle[i].Im = y;
-      pTwiddle[2* M- i].Re = y_neg;
-      pTwiddle[2* M- i].Im = x_neg;
-      pTwiddle[2* M+ i].Re = y;
-      pTwiddle[2* M+ i].Im = x_neg;
-      pTwiddle[4* M- i].Re = x_neg;
-      pTwiddle[4* M- i].Im = y;
-      pTwiddle[4* M+ i].Re = x_neg;
-      pTwiddle[4* M+ i].Im = y_neg;
-      pTwiddle[6* M- i].Re = y;
-      pTwiddle[6* M- i].Im = x;
-    }
-  }
-  else {
-    if ((order - 1) == 2) {
-      pTwiddle[0].Re = x;
-      pTwiddle[0].Im = y;
-      pTwiddle[1].Re = -y;
-      pTwiddle[1].Im = xNeg;
-      pTwiddle[2].Re = xNeg;
-      pTwiddle[2].Im = y;
-    }
-    if ((order-1) == 1) {
-      pTwiddle[0].Re = x;
-      pTwiddle[0].Im = y;
-    }
-  }
-
-  /*
-   * Now fill the last N/4 values : exp^(-j*2*PI*k/N);  k=1,3,5,...,N/2-1.
-   * These are used for the final twiddle fix-up for converting complex to
-   * real FFT.
-   */
-
-  M = N >> 3;
-  diff = 12 - order;
-  step = 1 << diff;
-
-  pTwiddle1 = pTwiddle + 3 * N / 8;
-  pTwiddle4 = pTwiddle1 + (N / 4 - 1);
-  pTwiddle3 = pTwiddle1 + N / 8;
-  pTwiddle2 = pTwiddle1 + (N / 8 - 1);
-
-  xS32 = armSP_FFT_S32TwiddleTable[0];
-  yS32 = armSP_FFT_S32TwiddleTable[1];
-  x = (xS32 + 0x8000) >> 16;
-  y = (yS32 + 0x8000) >> 16;
-  xNeg = 0x7FFF;
-
-  if((order) >= 3) {
-    for (i = 1; i <= M; i += 2 ) {
-      OMX_S16 x_neg = 0, y_neg = 0;
-
-      j = i*step;
-
-      xS32 = armSP_FFT_S32TwiddleTable[2 * j];
-      yS32 = armSP_FFT_S32TwiddleTable[2 * j + 1];
-      x = (xS32 + 0x8000) >> 16;
-      y = (yS32 + 0x8000) >> 16;
-      /* |x_neg = -x| doesn't work when x is 0x8000. */
-      x_neg = (-(xS32 + 0x8000)) >> 16;
-      y_neg = (-(yS32 + 0x8000)) >> 16;
-
-      pTwiddle1[0].Re = x;
-      pTwiddle1[0].Im = y;
-      pTwiddle1 += 1;
-      pTwiddle2[0].Re = y_neg;
-      pTwiddle2[0].Im = x_neg;
-      pTwiddle2 -= 1;
-      pTwiddle3[0].Re = y;
-      pTwiddle3[0].Im = x_neg;
-      pTwiddle3 += 1;
-      pTwiddle4[0].Re = x_neg;
-      pTwiddle4[0].Im = y;
-      pTwiddle4 -= 1;
-    }
-  }
-  else {
-    if (order == 2) {
-      pTwiddle1[0].Re = -y;
-      pTwiddle1[0].Im = xNeg;
-    }
-  }
-
-  /* Update the structure */
-  pFFTStruct->N = N;
-  pFFTStruct->pTwiddle = pTwiddle;
-  pFFTStruct->pBitRev = pBitRev;
-  pFFTStruct->pBuf = pBuf;
-
-  return OMX_Sts_NoErr;
-}
-/*****************************************************************************
- *                              END OF FILE
- *****************************************************************************/
-
diff --git a/dl/sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S b/dl/sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S
deleted file mode 100644
index 805623c..0000000
--- a/dl/sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S
+++ /dev/null
@@ -1,397 +0,0 @@
-@
-@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
-@
-@ Use of this source code is governed by a BSD-style license
-@ that can be found in the LICENSE file in the root of the source
-@ tree. An additional intellectual property rights grant can be found
-@ in the file PATENTS.  All contributing project authors may
-@ be found in the AUTHORS file in the root of the source tree.
-@
-@ Some code in this file was originally from file
-@ omxSP_FFTInv_CToC_SC16_Sfs_s.S which was licensed as follows.
-@ It has been relicensed with permission from the copyright holders.
-@
-
-@
-@ File Name:  omxSP_FFTInv_CToC_SC16_Sfs_s.s
-@ OpenMAX DL: v1.0.2
-@ Last Modified Revision:   6729
-@ Last Modified Date:       Tue, 17 Jul 2007
-@
-@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-@
-
-@
-@ Description:
-@ Compute an inverse FFT for a 16-bit real signal, with complex FFT routines.
-@
-
-#include "dl/api/armCOMM_s.h"
-#include "dl/api/omxtypes_s.h"
-
-.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
-.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
-
-@Input Registers
-#define pSrc            r0
-#define pDst            r1
-#define pFFTSpec        r2
-#define scale           r3
-
-@ Output registers
-#define result  r0
-
-@Local Scratch Registers
-#define argTwiddle      r1
-#define argDst          r2
-#define argScale        r4
-#define pTwiddle        r4
-#define tmpOrder        r4
-#define pOut            r5
-#define subFFTSize      r7
-#define subFFTNum       r6
-#define N               r6
-#define order           r14
-#define diff            r9
-@ Total num of radix stages to comple the FFT
-#define count           r8
-#define x0r             r4
-#define x0i             r5
-#define diffMinusOne    r2
-#define round           r3
-#define pOut1           r2
-#define size            r7
-#define step            r8
-#define step1           r9
-#define twStep          r10
-#define pTwiddleTmp     r11
-#define argTwiddle1     r12
-#define zero            r14
-
-@ Neon registers
-#define dX0             D0.S32
-#define dShift          D1.S32
-#define qShift          Q0.s16
-#define dX1             D1.S32
-#define dY0             D2.S32
-#define dY1             D3.S32
-#define dX0r            D0.S32
-#define dX0i            D1.S32
-#define dX1r            D2.S32
-#define dX1i            D3.S32
-#define dW0r            D4.S32
-#define dW0i            D5.S32
-#define dW1r            D6.S32
-#define dW1i            D7.S32
-#define dT0             D8.S32
-#define dT1             D9.S32
-#define dT2             D10.S32
-#define dT3             D11.S32
-#define qT0             Q6.S64
-#define qT1             Q7.S64
-#define qT0s            Q6.S16
-#define qT1s            Q7.S16
-#define qT2             Q8.S64
-#define qT3             Q9.S64
-#define dY0r            D4.S32
-#define dY0i            D5.S32
-#define dY1r            D6.S32
-#define dY1i            D7.S32
-#define dzero           D20.S32
-#define dY2             D4.S32
-#define dY3             D5.S32
-#define dW0             D6.S32
-#define dW1             D7.S32
-#define dW0Tmp          D10.S32
-#define dW1Neg          D11.S32
-
-
-
-    @ Allocate stack memory required by the function
-        M_ALLOC4        diffOnStack, 4
-
-    @ Write function header
-        M_START     omxSP_FFTInv_CCSToR_S16_Sfs,r11,d15
-
-@ Structure offsets for the FFTSpec
-        .set    ARMsFFTSpec_N, 0
-        .set    ARMsFFTSpec_pBitRev, 4
-        .set    ARMsFFTSpec_pTwiddle, 8
-        .set    ARMsFFTSpec_pBuf, 12
-
-        @ Define stack arguments
-
-        @ Read the size from structure and take log
-        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
-
-        @ Read other structure parameters
-        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
-        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
-
-        @  N=1 Treat seperately
-        CMP     N,#1
-        BGT     sizeGreaterThanOne
-        VLD1    dX0[0],[pSrc]
-        RSB     scale,scale,#0                      @ To use VRSHL for right shift by a variable
-        VMOV    dShift[0],scale
-        VRSHL   dX0,dShift
-        VST1    dX0[0],[pDst]
-
-        B       End
-
-sizeGreaterThanOne:
-
-        @ Call the preTwiddle Radix2 stage before doing the complex IFFT
-
-        @ The following conditional BL combination would work since
-        @ evenOddButterflyLoop in the first call would set Z flag to zero
-
-        CMP     scale,#0
-        BLEQ    armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe
-        BLGT    armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe
-
-complexIFFT:
-
-        ASR     N,N,#1                              @ N/2 point complex IFFT
-        ADD     pSrc,pOut,N,LSL #2                  @ set pSrc as pOut1
-
-        CLZ     order,N                             @ N = 2^order
-        RSB     order,order,#31
-        MOV     subFFTSize,#1
-
-        ADD     scale,scale,order                   @ FFTInverse has a final scaling factor by N
-
-        CMP     order,#3
-        BGT     orderGreaterthan3                   @ order > 3
-
-        CMP     order,#1
-        BGE     orderGreaterthan0                   @ order > 0
-        M_STR   scale, diffOnStack,LT               @ order = 0
-        LDRLT   x0r,[pSrc]
-        STRLT   x0r,[pDst]
-        MOVLT   pSrc,pDst
-        BLT     FFTEnd
-
-orderGreaterthan0:
-        @ set the buffers appropriately for various orders
-        CMP     order,#2
-        MOVNE   argDst,pDst
-        MOVEQ   argDst,pOut
-        MOVEQ   pOut,pDst                           @ Pass the first stage destination in RN5
-        MOV     argTwiddle,pTwiddle
-        @ Store the scale factor and scale at the end
-        SUB     diff,scale,order
-        M_STR   diff, diffOnStack
-        BGE     orderGreaterthan1
-        BLLT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  @ order = 1
-        B       FFTEnd
-
-
-orderGreaterthan1:
-        MOV     tmpOrder,order                      @ tmpOrder = RN 4
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
-        CMP     tmpOrder,#2
-        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
-        B       FFTEnd
-
-
-
-
-orderGreaterthan3:
-        @ check scale = 0 or scale = order
-        SUBS    diff, scale, order                  @ scale > order
-        MOVGT   scale,order
-        BGE     specialScaleCase                    @ scale = 0 or scale = order
-        CMP     scale,#0
-        BEQ     specialScaleCase
-        B       generalScaleCase
-
-specialScaleCase:                                   @  scale = 0 or scale = order  and order > 3
-
-        TST     order, #2                           @ Set input args to fft stages
-        MOVNE   argDst,pDst
-        MOVEQ   argDst,pOut
-        MOVEQ   pOut,pDst                           @ Pass the first stage destination in RN5
-        MOV     argTwiddle,pTwiddle
-
-        CMP     diff,#0
-        M_STR   diff, diffOnStack
-        BGE     scaleEqualsOrder
-
-        @check for even or odd order
-        @ NOTE: The following combination of BL's would work fine eventhough the first
-        @ BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
-        @ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
-
-        TST     order,#0x00000001
-        BLEQ    armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
-        BLNE    armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
-
-        CMP     subFFTNum,#4
-        BLT     FFTEnd
-
-unscaledRadix4Loop:
-        BEQ     lastStageUnscaledRadix4
-        BL      armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
-        CMP     subFFTNum,#4
-        B       unscaledRadix4Loop
-
-lastStageUnscaledRadix4:
-        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
-        B       FFTEnd
-
-scaleEqualsOrder:
-        @check for even or odd order
-        @ NOTE: The following combination of BL's would work fine eventhough the first
-        @ BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
-        @ armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
-
-        TST     order,#0x00000001
-        BLEQ    armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
-        BLNE    armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
-
-        CMP     subFFTNum,#4
-        BLT     FFTEnd
-
-scaledRadix4Loop:
-        BEQ     lastStageScaledRadix4
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
-        CMP     subFFTNum,#4
-        B       scaledRadix4Loop
-
-lastStageScaledRadix4:
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
-        B       FFTEnd
-
-
-
-generalScaleCase:                               @ 0 < scale < order and order > 3
-        @ Determine the correct destination buffer
-        SUB     diff,order,scale
-        TST     diff,#0x01
-        ADDEQ   count,scale,diff,LSR #1         @ count = scale + (order - scale)/2
-        MOVNE   count,order
-        TST     count,#0x01                     @ Is count even or odd ?
-
-        MOVNE   argDst,pDst                     @ Set input args to fft stages
-        MOVEQ   argDst,pOut
-        MOVEQ   pOut,pDst                       @ Pass the first stage destination in RN5
-        MOV     argTwiddle,pTwiddle
-
-        CMP     diff,#1
-        M_STR   diff, diffOnStack
-        BEQ     scaleps                         @ scaling including a radix2_ps stage
-
-        MOV     argScale,scale                  @ Put scale in RN4 so as to save and restore
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     @ scaled first stage
-        SUBS    argScale,argScale,#1
-
-scaledRadix2Loop:
-        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1            @ save and restore scale (RN4) in the scaled stages
-        BGT     scaledRadix2Loop
-        B       outScale
-
-scaleps:
-        SUB     argScale,scale,#1               @ order>3 and diff=1 => scale >= 3
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     @ scaled first stage
-        SUBS    argScale,argScale,#1
-
-scaledRadix2psLoop:
-        BEQ     scaledRadix2psStage
-        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1            @ save and restore scale (RN4) in the scaled stages
-        BGE     scaledRadix2psLoop
-
-scaledRadix2psStage:
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
-        B       generalLastStageUnscaledRadix2
-
-
-outScale:
-        M_LDR   diff, diffOnStack
-        @check for even or odd order
-        TST     diff,#0x00000001
-        BEQ     generalUnscaledRadix4Loop
-        B       unscaledRadix2Loop
-
-generalUnscaledRadix4Loop:
-        CMP     subFFTNum,#4
-        BEQ     generalLastStageUnscaledRadix4
-        BL      armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
-        B       generalUnscaledRadix4Loop
-
-generalLastStageUnscaledRadix4:
-        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
-        B       End
-
-unscaledRadix2Loop:
-        CMP     subFFTNum,#4
-        BEQ     generalLastTwoStagesUnscaledRadix2
-        BL      armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
-        B       unscaledRadix2Loop
-
-generalLastTwoStagesUnscaledRadix2:
-        BL      armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
-generalLastStageUnscaledRadix2:
-        BL      armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
-        B       End
-
-
-FFTEnd:                                         @ Does only the scaling
-
-        M_LDR   diff, diffOnStack
-        CMP     diff,#0
-        BLE     End
-
-        RSB     diff,diff,#0                    @ to use VRSHL for right shift by a variable
-        VDUP    qShift,diff
-
-        @ Use parallel loads for bigger FFT size.
-        CMP     subFFTSize, #8
-        BLT     scaleLessFFTData
-
-scaleFFTData:
-        VLD1    {qT0s, qT1s},[pSrc:256]         @ pSrc contains pDst pointer
-        SUBS    subFFTSize,subFFTSize,#8
-        VSHL    qT0s,qShift
-        VSHL    qT1s,qShift
-        VST1    {qT0s, qT1s},[pSrc:256]!
-        BGT     scaleFFTData
-        B       End
-
-scaleLessFFTData:                               @ N = subFFTSize  ; dataptr = pDst  ; scale = diff
-        VLD1    {dX0[0]},[pSrc]                 @ pSrc contains pDst pointer
-        SUBS    subFFTSize,subFFTSize,#1
-        VRSHL   dX0,dShift
-        VST1    {dX0[0]},[pSrc]!
-        BGT     scaleLessFFTData
-
-End:
-        @ Set return value
-        MOV     result, #OMX_Sts_NoErr
-
-        @ Write function tail
-        M_END
-
-
-
-
-
-
-    .END
diff --git a/dl/sp/src/test/test_fft.gyp b/dl/sp/src/test/test_fft.gyp
index 3290550..99b3774 100644
--- a/dl/sp/src/test/test_fft.gyp
+++ b/dl/sp/src/test/test_fft.gyp
@@ -67,19 +67,11 @@
       ],
     },
     {
-      # Test real 16-bit fixed-point FFT implemented with S32 routines.
-      'target_name': 'test_rfft16_s32',
+      # Test real 16-bit fixed-point FFT
+      'target_name': 'test_rfft16',
       'type': 'executable',
       'sources': [
-        'test_rfft16_s32.c',
-      ],
-    },
-    {
-      # Test real 16-bit fixed-point FFT implemented with S16 routines.
-      'target_name': 'test_rfft16_s16',
-      'type': 'executable',
-      'sources': [
-        'test_rfft16_s16.c',
+        'test_rfft16.c',
       ],
     },
     {
@@ -115,8 +107,7 @@
         'test_fft32',
         'test_float_fft',
         'test_float_rfft',
-        'test_rfft16_s32',
-        'test_rfft16_s16',
+        'test_rfft16',
         'test_rfft32',
         'test_fft_time',
       ],
diff --git a/dl/sp/src/test/test_fft16.c b/dl/sp/src/test/test_fft16.c
index bedf278..081bf23 100644
--- a/dl/sp/src/test/test_fft16.c
+++ b/dl/sp/src/test/test_fft16.c
@@ -24,7 +24,7 @@
 #define MAX_FFT_ORDER   12
 
 int verbose = 0;
-int signal_value = 32767;
+int signal_value = 1024;
 int scale_factor = 0;
 
 struct KnownTestFailures known_failures[] = {
diff --git a/dl/sp/src/test/test_fft_time.c b/dl/sp/src/test/test_fft_time.c
index 42431bb..a401594 100644
--- a/dl/sp/src/test/test_fft_time.c
+++ b/dl/sp/src/test/test_fft_time.c
@@ -20,14 +20,9 @@
 #include "dl/sp/src/test/aligned_ptr.h"
 #include "dl/sp/src/test/gensig.h"
 
-#define MAX_FFT_ORDER TWIDDLE_TABLE_ORDER
+#define MAX_FFT_ORDER   TWIDDLE_TABLE_ORDER
 #define MAX_FFT_ORDER_FIXED_POINT 12
 
-typedef enum {
-  rfft16_s16,
-  rfft16_s16s32,
-} rfft16_type;
-
 void TimeOneFloatFFT(int count, int fft_log_size, float signal_value,
                      int signal_type);
 void TimeFloatFFT(int count, float signal_value, int signal_type);
@@ -38,7 +33,7 @@
                     int signal_type);
 void TimeSC32FFT(int count, float signal_value, int signal_type);
 void TimeOneRFFT16(int count, int fft_log_size, float signal_value,
-                   int signal_type, rfft16_type rfft16_type_selection);
+                   int signal_type);
 void TimeRFFT16(int count, float signal_value, int signal_type);
 void TimeOneRFFT32(int count, int fft_log_size, float signal_value,
                    int signal_type);
@@ -104,7 +99,7 @@
 
 void main(int argc, char* argv[]) {
   int fft_log_size = 4;
-  float signal_value = 32767;
+  float signal_value = 1024;
   int signal_type = 0;
   int test_mode = 1;
   int count = 100;
@@ -195,8 +190,7 @@
         TimeOneSC32FFT(count, fft_log_size, signal_value, signal_type);
         break;
       case 3:
-        TimeOneRFFT16(count, fft_log_size, signal_value, signal_type, rfft16_s16s32);
-        TimeOneRFFT16(count, fft_log_size, signal_value, signal_type, rfft16_s16);
+        TimeOneRFFT16(count, fft_log_size, signal_value, signal_type);
         break;
       case 4:
         TimeOneRFFT32(count, fft_log_size, signal_value, signal_type);
@@ -672,12 +666,8 @@
   free(true_fft);
 }
 
-/* Argument rfft16_type_selection:
- *     rfft16_s16s32:       Calculate RFFT16 with 32 bit complex FFT;
- *     otherwise: Calculate RFFT16 with 16 bit complex FFT.
- */
 void TimeOneRFFT16(int count, int fft_log_size, float signal_value,
-                   int signal_type, rfft16_type rfft16_type_selection) {
+                   int signal_type) {
   OMX_S16* x;
   OMX_S32* y;
   OMX_S16* z;
@@ -699,8 +689,8 @@
 
   OMX_INT n, fft_spec_buffer_size;
   OMXResult status;
-  OMXFFTSpec_R_S16 * fft_fwd_spec = NULL;
-  OMXFFTSpec_R_S16 * fft_inv_spec = NULL;
+  OMXFFTSpec_R_S16S32 * fft_fwd_spec = NULL;
+  OMXFFTSpec_R_S16S32 * fft_inv_spec = NULL;
   int fft_size;
   struct timeval start_time;
   struct timeval end_time;
@@ -738,20 +728,13 @@
   GenerateRealFloatSignal(xr, (OMX_FC32*) yrTrue, fft_size, signal_type,
                           signal_value);
 
-  if(rfft16_type_selection == rfft16_s16s32) {
-    status = omxSP_FFTGetBufSize_R_S16S32(fft_log_size, &fft_spec_buffer_size);
-    fft_fwd_spec = malloc(fft_spec_buffer_size);
-    fft_inv_spec = malloc(fft_spec_buffer_size);
-    status = omxSP_FFTInit_R_S16S32(fft_fwd_spec, fft_log_size);
-    status = omxSP_FFTInit_R_S16S32(fft_inv_spec, fft_log_size);
-  }
-  else {
-    status = omxSP_FFTGetBufSize_R_S16(fft_log_size, &fft_spec_buffer_size);
-    fft_fwd_spec = malloc(fft_spec_buffer_size);
-    fft_inv_spec = malloc(fft_spec_buffer_size);
-    status = omxSP_FFTInit_R_S16(fft_fwd_spec, fft_log_size);
-    status = omxSP_FFTInit_R_S16(fft_inv_spec, fft_log_size);
-  }
+  status = omxSP_FFTGetBufSize_R_S16S32(fft_log_size, &fft_spec_buffer_size);
+
+  fft_fwd_spec = (OMXFFTSpec_R_S16S32*) malloc(fft_spec_buffer_size);
+  fft_inv_spec = (OMXFFTSpec_R_S16S32*) malloc(fft_spec_buffer_size);
+  status = omxSP_FFTInit_R_S16S32(fft_fwd_spec, fft_log_size);
+
+  status = omxSP_FFTInit_R_S16S32(fft_inv_spec, fft_log_size);
 
   if (do_forward_test) {
     if (include_conversion) {
@@ -774,14 +757,9 @@
           temp16[n] = factor * xr[n];
         }
 
-        if(rfft16_type_selection == rfft16_s16s32) {
-          status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y,
-              (OMXFFTSpec_R_S16S32*)fft_fwd_spec, (OMX_INT) scaleFactor);
-        }
-        else {
-          status = omxSP_FFTFwd_RToCCS_S16_Sfs(x, y,
-              (OMXFFTSpec_R_S16*)fft_fwd_spec, (OMX_INT) scaleFactor);
-        }
+        status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y, fft_fwd_spec,
+                                                (OMX_INT) scaleFactor);
+
         /*
          * Now spend some time converting the fixed-point FFT back to float.
          */
@@ -796,26 +774,15 @@
 
       GetUserTime(&start_time);
       for (n = 0; n < count; ++n) {
-      if(rfft16_type_selection == rfft16_s16s32) {
-        status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y, 
-            (OMXFFTSpec_R_S16S32*)fft_fwd_spec, (OMX_INT) scaleFactor);
-      }
-      else {
-        status = omxSP_FFTFwd_RToCCS_S16_Sfs(x, y,
-            (OMXFFTSpec_R_S16*)fft_fwd_spec, (OMX_INT) scaleFactor);
-      }
+        status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y, fft_fwd_spec,
+                                                (OMX_INT) scaleFactor);
       }
       GetUserTime(&end_time);
     }
 
     elapsed_time = TimeDifference(&start_time, &end_time);
 
-    if(rfft16_type_selection == rfft16_s16s32) {
-      PrintResult("Forward RFFT16 (with rfft16_s16s32)", fft_log_size, elapsed_time, count);
-    }
-    else {
-      PrintResult("Forward RFFT16 (with rfft16_s16)", fft_log_size, elapsed_time, count);
-    }
+    PrintResult("Forward RFFT16", fft_log_size, elapsed_time, count);
   }
 
   if (do_inverse_test) {
@@ -837,14 +804,9 @@
           temp32[n] = factor * yrTrue[n];
         }
 
-        if(rfft16_type_selection == rfft16_s16s32) {
-          status = omxSP_FFTInv_CCSToR_S32S16_Sfs(y, z,
-              (OMXFFTSpec_R_S16S32*)fft_inv_spec, 0);
-        }
-        else {
-          status = omxSP_FFTInv_CCSToR_S16_Sfs(y, z,
-              (OMXFFTSpec_R_S16*)fft_inv_spec, 0);
-        }
+        /* Inverse test: run CCS->real, as the no-conversion branch does. */
+        status = omxSP_FFTInv_CCSToR_S32S16_Sfs(y, z, fft_inv_spec, 0);
+
         /*
          * Spend some time converting the result back to float
          */
@@ -857,26 +819,14 @@
     } else {
       GetUserTime(&start_time);
       for (n = 0; n < count; ++n) {
-        if(rfft16_type_selection == rfft16_s16s32) {
-          status = omxSP_FFTInv_CCSToR_S32S16_Sfs(y, z,
-              (OMXFFTSpec_R_S16S32*)fft_inv_spec, 0);
-        }
-        else {
-          status = omxSP_FFTInv_CCSToR_S16_Sfs(y, z,
-              (OMXFFTSpec_R_S16*)fft_inv_spec, 0);
-        }
+        status = omxSP_FFTInv_CCSToR_S32S16_Sfs(y, z, fft_inv_spec, 0);
       }
       GetUserTime(&end_time);
     }
 
     elapsed_time = TimeDifference(&start_time, &end_time);
 
-    if(rfft16_type_selection == rfft16_s16s32) {
-      PrintResult("Inverse RFFT16 (with rfft16_s16s32)", fft_log_size, elapsed_time, count);
-    }
-    else {
-      PrintResult("Inverse RFFT16 (with rfft16_s16)", fft_log_size, elapsed_time, count);
-    }
+    PrintResult("Inverse RFFT16", fft_log_size, elapsed_time, count);
   }
 
   FreeAlignedPointer(x_aligned);
@@ -893,18 +843,13 @@
   int k;
   int max_order = (max_fft_order > MAX_FFT_ORDER_FIXED_POINT)
       ? MAX_FFT_ORDER_FIXED_POINT : max_fft_order;
-  if (verbose == 0)
-    printf("RFFT16 (with rfft16_s16s32)\n");
-  for (k = min_fft_order; k <= max_order; ++k) {
-    int testCount = ComputeCount(count, k);
-    TimeOneRFFT16(testCount, k, signal_value, signal_type, 1);
-  }
 
   if (verbose == 0)
-    printf("RFFT16 (with rfft16_s16)\n");
+    printf("RFFT16\n");
+
   for (k = min_fft_order; k <= max_order; ++k) {
     int testCount = ComputeCount(count, k);
-    TimeOneRFFT16(testCount, k, signal_value, signal_type, 0);
+    TimeOneRFFT16(testCount, k, signal_value, signal_type);
   }
 }
 
diff --git a/dl/sp/src/test/test_float_rfft.c b/dl/sp/src/test/test_float_rfft.c
index 20b5e33..cb3262f 100644
--- a/dl/sp/src/test/test_float_rfft.c
+++ b/dl/sp/src/test/test_float_rfft.c
@@ -36,6 +36,8 @@
 
   SetDefaultOptions(&options, 1, MAX_FFT_ORDER);
 
+  options.signal_value_ = 1024;
+
   ProcessCommandLine(&options, argc, argv,
                      "Test forward and inverse real floating-point FFT\n");
 
diff --git a/dl/sp/src/test/test_rfft16_s32.c b/dl/sp/src/test/test_rfft16.c
similarity index 97%
rename from dl/sp/src/test/test_rfft16_s32.c
rename to dl/sp/src/test/test_rfft16.c
index f0e86e4..171ccdc 100644
--- a/dl/sp/src/test/test_rfft16_s32.c
+++ b/dl/sp/src/test/test_rfft16.c
@@ -33,8 +33,8 @@
 
   SetDefaultOptions(&options, 1, MAX_FFT_ORDER);
 
-  ProcessCommandLine(&options, argc, argv, "Test forward and inverse real 16 \
-                     -bit fixed-point FFT, with 32-bit complex FFT routines\n");
+  ProcessCommandLine(&options, argc, argv,
+                     "Test forward and inverse real 16-bit fixed-point FFT\n");
 
   verbose = options.verbose_;
   signal_value = options.signal_value_;
@@ -54,6 +54,7 @@
     info.known_failures_ = 0;
     info.forward_threshold_ = 90.12;
     info.inverse_threshold_ = 89.28;
+    signal_value = 32767;
     RunAllTests(&info);
   } else {
     TestFFT(options.fft_log_size_,
diff --git a/dl/sp/src/test/test_rfft16_s16.c b/dl/sp/src/test/test_rfft16_s16.c
deleted file mode 100644
index 9a9bc12..0000000
--- a/dl/sp/src/test/test_rfft16_s16.c
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <unistd.h>
-
-#include "dl/sp/api/armSP.h"
-#include "dl/sp/api/omxSP.h"
-#include "dl/sp/src/test/aligned_ptr.h"
-#include "dl/sp/src/test/compare.h"
-#include "dl/sp/src/test/gensig.h"
-#include "dl/sp/src/test/test_util.h"
-
-#define MAX_FFT_ORDER   12
-
-int verbose = 0;
-int signal_value = 32767;
-int scale_factor = 0;
-
-void TestFFT(int fftLogSize, int scale_factor, int signalType);
-
-void main(int argc, char* argv[]) {
-  struct Options options;
-
-  SetDefaultOptions(&options, 1, MAX_FFT_ORDER);
-
-  options.signal_value_ = signal_value;
-  options.scale_factor_ = scale_factor;
-
-  ProcessCommandLine(&options, argc, argv, "Test forward and inverse real 16 \
-                     -bit fixed-point FFT, with 16-bit complex FFT routines\n");
-
-  verbose = options.verbose_;
-  signal_value = options.signal_value_;
-  scale_factor = options.scale_factor_;
-
-  if (verbose > 255)
-    DumpOptions(stderr, &options);
-
-  if (options.test_mode_) {
-    struct TestInfo info;
-
-    info.real_only_ = options.real_only_;
-    info.max_fft_order_ = options.max_fft_order_;
-    info.min_fft_order_ = options.min_fft_order_;
-    info.do_forward_tests_ = options.do_forward_tests_;
-    info.do_inverse_tests_ = options.do_inverse_tests_;
-    /* No known failures */
-    info.known_failures_ = 0;
-    info.forward_threshold_ = 45;
-    info.inverse_threshold_ = 14;
-
-    RunAllTests(&info);
-  } else {
-    TestFFT(options.fft_log_size_,
-            options.signal_type_,
-            options.scale_factor_);
-  }
-}
-
-void GenerateSignal(struct ComplexFloat* fft,
-                    float* x_true, int size, int sigtype) {
-  int k;
-  struct ComplexFloat *test_signal;
-
-  test_signal = (struct ComplexFloat*) malloc(sizeof(*test_signal) * size);
-  GenerateTestSignalAndFFT(test_signal, fft, size, sigtype, signal_value, 1);
-
-  /*
-   * Convert the complex result to what we want
-   */
-
-  for (k = 0; k < size; ++k) {
-    x_true[k] = test_signal[k].Re;
-  }
-
-  free(test_signal);
-}
-
-void TestFFT(int fft_log_size, int signal_type, int scale_factor) {
-  struct SnrResult snr;
-
-  RunOneForwardTest(fft_log_size, signal_type, signal_value, &snr);
-  printf("Forward float FFT\n");
-  printf("SNR:  real part    %f dB\n", snr.real_snr_);
-  printf("      imag part    %f dB\n", snr.imag_snr_);
-  printf("      complex part %f dB\n", snr.complex_snr_);
-
-  RunOneInverseTest(fft_log_size, signal_type, signal_value, &snr);
-  printf("Inverse float FFT\n");
-  printf("SNR:  %f dB\n", snr.real_snr_);
-}
-
-float RunOneForwardTest(int fft_log_size, int signal_type,
-                        float unused_signal_value,
-                        struct SnrResult* snr) {
-  OMX_S16* x;
-  OMX_SC16* y;
-
-  struct AlignedPtr* x_aligned;
-  struct AlignedPtr* y_aligned;
-
-  float* x_true;
-  struct ComplexFloat* y_true;
-  OMX_SC16* y_scaled;
-
-  OMX_INT n, fft_spec_buffer_size;
-  OMXResult status;
-  OMXFFTSpec_R_S16 * fft_fwd_spec = NULL;
-  int fft_size;
-
-  /*
-   * To get good FFT results, set the forward FFT scale factor
-   * to be the same as the order.
-   */
-  scale_factor = fft_log_size;
-
-  fft_size = 1 << fft_log_size;
-
-  status = omxSP_FFTGetBufSize_R_S16(fft_log_size, &fft_spec_buffer_size);
-  if (verbose > 63) {
-    printf("fft_spec_buffer_size = %d\n", fft_spec_buffer_size);
-  }
-
-  fft_fwd_spec = (OMXFFTSpec_R_S16*) malloc(fft_spec_buffer_size);
-  status = omxSP_FFTInit_R_S16(fft_fwd_spec, fft_log_size);
-  if (status) {
-    fprintf(stderr, "Failed to init forward FFT:  status = %d\n", status);
-    exit(1);
-  }
-
-  x_aligned = AllocAlignedPointer(32, sizeof(*x) * fft_size);
-  y_aligned = AllocAlignedPointer(32, sizeof(*y) * (fft_size + 2));
-
-  x = x_aligned->aligned_pointer_;
-  y = y_aligned->aligned_pointer_;
-
-  x_true = (float*) malloc(sizeof(*x_true) * fft_size);
-  y_true = (struct ComplexFloat*) malloc(sizeof(*y_true) * (fft_size / 2 + 1));
-  y_scaled = (OMX_SC16*) malloc(sizeof(*y_true) * (fft_size / 2 + 1));
-
-  GenerateSignal(y_true, x_true, fft_size, signal_type);
-  for (n = 0; n < fft_size; ++n) {
-    x[n] = 0.5 + x_true[n];
-  }
-
-  {
-    float scale = 1 << fft_log_size;
-
-    for (n = 0; n < fft_size; ++n) {
-      y_scaled[n].Re = 0.5 + y_true[n].Re / scale;
-      y_scaled[n].Im = 0.5 + y_true[n].Im / scale;
-    }
-  }
-
-  if (verbose > 63) {
-    printf("Signal\n");
-    DumpArrayReal16("x", fft_size, x);
-
-    printf("Expected FFT output\n");
-    DumpArrayComplex16("y", fft_size / 2 + 1, y_scaled);
-  }
-
-  status = omxSP_FFTFwd_RToCCS_S16_Sfs(x, (OMX_S16*) y, fft_fwd_spec, scale_factor);
-  if (status) {
-    fprintf(stderr, "Forward FFT failed: status = %d\n", status);
-    exit(1);
-  }
-
-  if (verbose > 63) {
-    printf("FFT Output\n");
-    DumpArrayComplex16("y", fft_size / 2 + 1, y);
-  }
-
-  CompareComplex16(snr, y, y_scaled, fft_size / 2 + 1);
-
-  FreeAlignedPointer(x_aligned);
-  FreeAlignedPointer(y_aligned);
-  free(fft_fwd_spec);
-
-  return snr->complex_snr_;
-}
-
-float RunOneInverseTest(int fft_log_size, int signal_type,
-                        float unused_signal_value,
-                        struct SnrResult* snr) {
-  OMX_S16* x_scaled;
-  OMX_S16* z;
-  OMX_SC16* y;
-  OMX_SC16* y_scaled;
-
-  struct AlignedPtr* y_aligned;
-  struct AlignedPtr* z_aligned;
-
-  float* x_true;
-  struct ComplexFloat* y_true;
-
-  OMX_INT n, fft_spec_buffer_size;
-  OMXResult status;
-  OMXFFTSpec_R_S16 * fft_inv_spec = NULL;
-  int fft_size;
-
-  fft_size = 1 << fft_log_size;
-
-  status = omxSP_FFTGetBufSize_R_S16(fft_log_size, &fft_spec_buffer_size);
-  if (verbose > 3) {
-    printf("fft_spec_buffer_size = %d\n", fft_spec_buffer_size);
-  }
-
-  fft_inv_spec = (OMXFFTSpec_R_S16*)malloc(fft_spec_buffer_size);
-  status = omxSP_FFTInit_R_S16(fft_inv_spec, fft_log_size);
-  if (status) {
-    fprintf(stderr, "Failed to init backward FFT:  status = %d\n", status);
-    exit(1);
-  }
-
-  y_aligned = AllocAlignedPointer(32, sizeof(*y) * (fft_size / 2 + 1));
-  z_aligned = AllocAlignedPointer(32, sizeof(*z) * fft_size);
-
-  x_true = (float*) malloc(sizeof(*x_true) * fft_size);
-  x_scaled = (OMX_S16*) malloc(sizeof(*x_scaled) * fft_size);
-  y_true = (struct ComplexFloat*) malloc(sizeof(*y_true) * fft_size);
-  y_scaled = y_aligned->aligned_pointer_;
-  z = z_aligned->aligned_pointer_;
-
-  GenerateSignal(y_true, x_true, fft_size, signal_type);
-
-  {
-    /*
-     * To get max accuracy, scale the input to the inverse FFT up
-     * to use as many bits as we can.
-     */
-    float scale = 1;
-    float max = 0;
-
-    for (n = 0; n < fft_size / 2 + 1; ++n) {
-      float val;
-      val = fabs(y_true[n].Re);
-      if (val > max) {
-        max = val;
-      }
-      val = fabs(y_true[n].Im);
-      if (val > max) {
-        max = val;
-      }
-    }
-
-    scale = 16384 / max;
-    if (verbose > 63)
-      printf("Inverse FFT input scaled factor %g\n", scale);
-
-    /*
-     * Scale both the true FFT signal and the input so we can
-     * compare them correctly later
-     */
-    for (n = 0; n < fft_size / 2 + 1; ++n) {
-      y_scaled[n].Re = (OMX_S16)(0.5 + y_true[n].Re * scale);
-      y_scaled[n].Im = (OMX_S16)(0.5 + y_true[n].Im * scale);
-    }
-    for (n = 0; n < fft_size; ++n) {
-      x_scaled[n] = 0.5 + x_true[n] * scale;
-    }
-  }
-
-
-  if (verbose > 63) {
-    printf("Inverse FFT Input Signal\n");
-    DumpArrayComplex16("y", fft_size / 2 + 1, y_scaled);
-
-    printf("Expected Inverse FFT output\n");
-    DumpArrayReal16("x", fft_size, x_scaled);
-  }
-
-  status = omxSP_FFTInv_CCSToR_S16_Sfs((OMX_S32*) y_scaled, z, fft_inv_spec, 0);
-  if (status) {
-    fprintf(stderr, "Inverse FFT failed: status = %d\n", status);
-    exit(1);
-  }
-
-  if (verbose > 63) {
-    printf("Actual Inverse FFT Output\n");
-    DumpArrayReal16("z", fft_size, z);
-  }
-
-  CompareReal16(snr, z, x_scaled, fft_size);
-
-  FreeAlignedPointer(y_aligned);
-  FreeAlignedPointer(z_aligned);
-  free(fft_inv_spec);
-
-  return snr->real_snr_;
-}
diff --git a/dl/sp/src/test/test_util.c b/dl/sp/src/test/test_util.c
index 69830b6..88d697b 100644
--- a/dl/sp/src/test/test_util.c
+++ b/dl/sp/src/test/test_util.c
@@ -97,7 +97,7 @@
   options->fft_log_size_ = 4;
   options->scale_factor_ = 0;
   options->signal_type_ = 0;
-  options->signal_value_ = 32767;
+  options->signal_value_ = 1024;
   options->signal_value_given_ = 0;
 }
 
@@ -382,7 +382,7 @@
                         const OMX_SC16* array) {
   int n;
 
-  printf("%4s\t%10s.re[n]\t%10s.im[n]\n", "n", array_name, array_name);
+  printf("%4s\t%10s.re[n]\t%10s.im[n]\n", "n", array_name, array_name);
   for (n = 0; n < count; ++n) {
     printf("%4d\t%16d\t%16d\n", n, array[n].Re, array[n].Im);
   }