Make 16-bit FFT work with gcc, update license info, and add test program.

Review URL: https://webrtc-codereview.appspot.com/1103006

git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@3504 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/dl.gyp b/dl/dl.gyp
index f0f0eb7..79bae5d 100644
--- a/dl/dl.gyp
+++ b/dl/dl.gyp
@@ -46,6 +46,19 @@
         'sp/src/omxSP_FFTGetBufSize_R_S32.c',
         'sp/src/omxSP_FFTInit_R_S32.c',
         'sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
+        # Complex 16-bit fixed-point FFT
+        'sp/src/omxSP_FFTInit_C_SC16.c',
+        'sp/src/omxSP_FFTGetBufSize_C_SC16.c',
+        'sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
+        'sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S',
+        'sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S',
+        'sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S',
+        'sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S',
+        'sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S',
+        'sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S',
+        'sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S',
+        'sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
+        'sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
         # Real 16-bit fixed-point FFT
         'sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
         'sp/src/omxSP_FFTGetBufSize_R_S16S32.c',
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
index f321502..a16c79f 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
@@ -1,162 +1,170 @@
-;//
-;// 
-;// File Name:  armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision:   6693
-;// Last Modified Date:       Tue, 10 Jul 2007
-;// 
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;// 
-;// 
-;//
-;// Description:
-;// Compute a Radix 2 FFT stage for a N point complex signal
-;// 
-;// 
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
 
-        
-;// Include standard headers
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6693
+@// Last Modified Date:       Tue, 10 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
 
-        INCLUDE omxtypes_s.h
-        INCLUDE armCOMM_s.h
-        
-        M_VARIANTS CortexA8
-        
-;// Import symbols required from other files
-;// (For example tables)
-    
-        
-        
-        
-;// Set debugging level        
-;//DEBUG_ON    SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
 
 
 
-;// Guarding implementation by the processor name
-    
-    
-            
-;// Guarding implementation by the processor name
-    
-    IF  CortexA8 
-    
-;//Input Registers
 
-pSrc            RN  0
-pDst            RN  2
-pTwiddle        RN  1
-pPingPongBuf    RN  5
-subFFTNum       RN  6
-subFFTSize      RN  7
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
 
 
-;//Output Registers
+
+@// Guarding implementation by the processor name
 
 
-;//Local Scratch Registers
 
-pointStep        RN  3
-outPointStep     RN  3
-grpSize          RN  4
-setCount         RN  4
-step             RN  8
-dstStep          RN  8
-
-;// Neon Registers
-
-dX0             DN  D0.S16
-dX1             DN  D1.S16
-dY0             DN  D2.S16
-dY1             DN  D3.S16
-dX0S32          DN  D0.S32
-dX1S32          DN  D1.S32
-dY0S32          DN  D2.S32
-dY1S32          DN  D3.S32
+@// Guarding implementation by the processor name
 
 
-        MACRO
-        FFTSTAGE $scaled, $inverse, $name
-        
-        ;// Define stack arguments
-        
-        
-        ;// update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
-        
-        
+@// Input registers: cpp aliases for ARM core registers r0-r2 and r5-r7
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define pPingPongBuf                    r5
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@// Output registers (no aliases are defined for outputs here)
+
+
+@// Local scratch registers; pointStep/outPointStep (r3), grpSize/setCount (r4) and step/dstStep (r8) are each two names for one register
+
+#define pointStep                       r3
+#define outPointStep                    r3
+#define grpSize                         r4
+#define setCount                        r4
+#define step                            r8
+#define dstStep                         r8
+
+@// NEON registers: D0-D3, each named twice (once with the .S16 suffix, once with the .S32 suffix)
+
+#define dX0                             D0.S16
+#define dX1                             D1.S16
+#define dY0                             D2.S16
+#define dY1                             D3.S16
+#define dX0S32                          D0.S32
+#define dX1S32                          D1.S32
+#define dY0S32                          D2.S32
+#define dY1S32                          D3.S32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Define stack arguments
+
+
+        @// update subFFTNum (r6) and subFFTSize (r7) for the next stage
+
+
         MOV        subFFTSize,#2
-        LSR        grpSize,subFFTNum,#1  
-        MOV        subFFTNum,grpSize 
-        
-        
-        ;// pT0+1 increments pT0 by 8 bytes
-        ;// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
-        ;// Note: outPointStep = pointStep for firststage
-        ;// Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
-        
-        MOV        pointStep,grpSize,LSL #2
-        RSB        step,pointStep,#4 
-        
-        
-        ;// Loop on the sets for grp zero: 1 set at a time
+        LSR        grpSize,subFFTNum,#1
+        MOV        subFFTNum,grpSize
 
-grpZeroSetLoop$name        
-        
+
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
+        @// Note: outPointStep = pointStep for the first stage
+        @// Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
+
+        MOV        pointStep,grpSize,LSL #2
+        RSB        step,pointStep,#4
+
+
+        @// Loop on the sets for grp zero: 1 set at a time
+
+grpZeroSetLoop\name:
+
         VLD1    {dX0S32[0]},[pSrc],pointStep
-        VLD1    {dX1S32[0]},[pSrc],step                   ;// step = -pointStep + 4
-        SUBS    setCount,setCount,#1              ;// decrement the loop counter
-        
-        IF $scaled
-        
+        VLD1    {dX1S32[0]},[pSrc],step                   @// step = -pointStep + 4
+        SUBS    setCount,setCount,#1              @// decrement the loop counter
+
+        .ifeqs "\scaled", "TRUE"
+
             VHADD    dY0,dX0,dX1
             VHSUB    dY1,dX0,dX1
-        
-        ELSE
-        
+
+        .ELSE
+
             VADD    dY0,dX0,dX1
             VSUB    dY1,dX0,dX1
-        
-         
-        ENDIF
-        
+
+
+        .ENDIF
+
         VST1    {dY0S32[0]},[pDst],outPointStep
-        VST1    {dY1S32[0]},[pDst],dstStep                  ;// dstStep =  step = -pointStep + 4
-               
-        BGT     grpZeroSetLoop$name
-        
-        
-        ;// reset pSrc to pDst for the next stage
-        SUB     pSrc,pDst,pointStep                     ;// pDst -= 2*grpSize 
+        VST1    {dY1S32[0]},[pDst],dstStep                  @// dstStep =  step = -pointStep + 4
+
+        BGT     grpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep (start of this stage's output)
         MOV     pDst,pPingPongBuf
-                
-        MEND
-        
-        
-                
+
+        .endm
+
+
+
         M_START armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{FALSE},FWD
+        FFTSTAGE "FALSE","FALSE",FWD
         M_END
 
-        
-        
+
+
         M_START armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{TRUE},INV
+        FFTSTAGE "FALSE","TRUE",INV
         M_END
- 
-        
-        
+
+
+
         M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{FALSE},FWDSFS
+        FFTSTAGE "TRUE","FALSE",FWDSFS
         M_END
 
-        
-        
+
+
         M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{TRUE},INVSFS
+        FFTSTAGE "TRUE","TRUE",INVSFS
         M_END
 
-        
-    ENDIF                                                           ;//CORTEXA8
-    
-     
-    END
\ No newline at end of file
+
+
+
+    .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
index 0932099..9f7b531 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
@@ -1,202 +1,210 @@
-;//
-;// 
-;// File Name:  armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision:   6741
-;// Last Modified Date:       Wed, 18 Jul 2007
-;// 
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;// 
-;// 
-;//
-;// Description:
-;// Compute a Radix 2 FFT stage for a N point complex signal
-;// 
-;// 
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
 
-        
-;// Include standard headers
-
-        INCLUDE omxtypes_s.h
-        INCLUDE armCOMM_s.h
-        
-        M_VARIANTS CortexA8
-        
-;// Import symbols required from other files
-;// (For example tables)
-    
-        
-        
-        
-;// Set debugging level        
-;//DEBUG_ON    SETL {TRUE}
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6741
+@// Last Modified Date:       Wed, 18 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
 
 
-;// Guarding implementation by the processor name
-    
-    
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
 
 
 
 
-            
-;// Guarding implementation by the processor name
-    
-    IF  CortexA8 
-    
-;//Input Registers
-
-pSrc            RN  0
-pDst            RN  2
-pTwiddle        RN  1
-subFFTNum       RN  6
-subFFTSize      RN  7
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
 
 
-;//Output Registers
+@// Guarding implementation by the processor name
 
 
-;//Local Scratch Registers
 
 
-outPointStep     RN  3
-grpCount         RN  4
-dstStep          RN  5
-pTmp             RN  4
-step             RN  8
-
-;// Neon Registers
-
-dWr             DN  D0.S16
-dWi             DN  D1.S16
-dXr0            DN  D2.S16
-dXi0            DN  D3.S16
-dXr1            DN  D4.S16
-dXi1            DN  D5.S16
-dYr0            DN  D6.S16
-dYi0            DN  D7.S16
-dYr1            DN  D8.S16
-dYi1            DN  D9.S16
-qT0             QN  Q5.S32
-qT1             QN  Q6.S32
 
 
-        MACRO
-        FFTSTAGE $scaled, $inverse, $name
-        
-        
+
+@// Guarding implementation by the processor name
+
+
+@// Input registers: cpp aliases for ARM core registers r0-r2, r6 and r7
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@// Output registers (no aliases are defined for outputs here)
+
+
+@// Local scratch registers; grpCount and pTmp are two names for r4
+
+
+#define outPointStep                    r3
+#define grpCount                        r4
+#define dstStep                         r5
+#define pTmp                            r4
+#define step                            r8
+
+@// NEON registers: D0-D9 with the .S16 suffix (twiddles W, inputs X, outputs Y) and Q5/Q6 with the .S32 suffix (qT0/qT1)
+
+#define dWr                             D0.S16
+#define dWi                             D1.S16
+#define dXr0                            D2.S16
+#define dXi0                            D3.S16
+#define dXr1                            D4.S16
+#define dXi1                            D5.S16
+#define dYr0                            D6.S16
+#define dYi0                            D7.S16
+#define dYr1                            D8.S16
+#define dYi1                            D9.S16
+#define qT0                             Q5.S32
+#define qT1                             Q6.S32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+
         MOV     outPointStep,subFFTSize,LSL #2
-        ;// Update grpCount and grpSize rightaway 
-        
-        MOV     subFFTNum,#1                            ;//after the last stage
-        LSL     grpCount,subFFTSize,#1
-        
-        ;// update subFFTSize for the next stage
-        MOV     subFFTSize,grpCount
-                               
-        SUB      step,outPointStep,#4                   ;// step = -4+outPointStep
-        RSB      dstStep,step,#0                        ;// dstStep = -4-outPointStep+8 = -step
-        ;//RSB      dstStep,outPointStep,#16
-        
-        
-        ;// Loop on 2 grps at a time for the last stage
+        @// Update grpCount and grpSize right away
 
-grpLoop$name
-        VLD2    {dWr[0],dWi[0]},[pTwiddle]!             ;// grp 0
-        VLD2    {dWr[1],dWi[1]},[pTwiddle]!             ;// grp 1
-        
-        ;//VLD2    {dWr,dWi},[pTwiddle],#16        
-        
-        VLD4    {dXr0[0],dXi0[0],dXr1[0],dXi1[0]},[pSrc]!   ;// grp 0
-        VLD4    {dXr0[1],dXi0[1],dXr1[1],dXi1[1]},[pSrc]!   ;// grp 1
-        
-        
-        ;//VLD4    {dXr0,dXi0,dXr1,dXi1},[pSrc],#32
-        SUBS    grpCount,grpCount,#4                   ;// grpCount is multiplied by 2 
-        
-        IF  $inverse
+        MOV     subFFTNum,#1                            @//after the last stage
+        LSL     grpCount,subFFTSize,#1
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        SUB      step,outPointStep,#4                   @// step = -4+outPointStep
+        RSB      dstStep,step,#0                        @// dstStep = -4-outPointStep+8 = -step
+        @//RSB      dstStep,outPointStep,#16
+
+
+        @// Loop on 2 grps at a time for the last stage
+
+grpLoop\name:
+        VLD2    {dWr[0],dWi[0]},[pTwiddle]!             @// grp 0
+        VLD2    {dWr[1],dWi[1]},[pTwiddle]!             @// grp 1
+
+        @//VLD2    {dWr,dWi},[pTwiddle],#16
+
+        VLD4    {dXr0[0],dXi0[0],dXr1[0],dXi1[0]},[pSrc]!   @// grp 0
+        VLD4    {dXr0[1],dXi0[1],dXr1[1],dXi1[1]},[pSrc]!   @// grp 1
+
+
+        @//VLD4    {dXr0,dXi0,dXr1,dXi1},[pSrc],#32
+        SUBS    grpCount,grpCount,#4                   @// grpCount is multiplied by 2
+
+        .ifeqs  "\inverse", "TRUE"
             VMULL   qT0,dXr1,dWr
-            VMLAL   qT0,dXi1,dWi                       ;// real part
+            VMLAL   qT0,dXi1,dWi                       @// real part
             VMULL   qT1,dXi1,dWr
-            VMLSL   qT1,dXr1,dWi                       ;// imag part
-            
-        ELSE
+            VMLSL   qT1,dXr1,dWi                       @// imag part
+
+        .ELSE
             VMULL   qT0,dXr1,dWr
-            VMLSL   qT0,dXi1,dWi                       ;// real part
+            VMLSL   qT0,dXi1,dWi                       @// real part
             VMULL   qT1,dXi1,dWr
-            VMLAL   qT1,dXr1,dWi                       ;// imag part
-        
-        ENDIF
-        
+            VMLAL   qT1,dXr1,dWi                       @// imag part
+
+        .ENDIF
+
         VRSHRN  dXr1,qT0,#15
         VRSHRN  dXi1,qT1,#15
-        
-               
-        IF $scaled
-        
+
+
+        .ifeqs "\scaled", "TRUE"
+
             VHSUB    dYr0,dXr0,dXr1
             VHSUB    dYi0,dXi0,dXi1
             VHADD    dYr1,dXr0,dXr1
             VHADD    dYi1,dXi0,dXi1
-            
-        ELSE
-        
+
+        .ELSE
+
             VSUB    dYr0,dXr0,dXr1
             VSUB    dYi0,dXi0,dXi1
             VADD    dYr1,dXr0,dXr1
             VADD    dYi1,dXi0,dXi1
-            
-         
-        ENDIF
-        
+
+
+        .ENDIF
+
         VST2    {dYr0[0],dYi0[0]},[pDst]!
-        VST2    {dYr0[1],dYi0[1]},[pDst],step               ;// step = -4+outPointStep
-        
+        VST2    {dYr0[1],dYi0[1]},[pDst],step               @// step = -4+outPointStep
+
         VST2    {dYr1[0],dYi1[0]},[pDst]!
-        VST2    {dYr1[1],dYi1[1]},[pDst],dstStep            ;// dstStep = -4-outPointStep+8 = -step
-        
-        ;//VST2    {dYr0,dYi0},[pDst],outPointStep
-        ;//VST2    {dYr1,dYi1},[pDst],dstStep                  ;// dstStep =  step = -outPointStep + 16
-               
-        BGT     grpLoop$name
-        
-        
-        ;// Reset and Swap pSrc and pDst for the next stage     
+        VST2    {dYr1[1],dYi1[1]},[pDst],dstStep            @// dstStep = -4-outPointStep+8 = -step
+
+        @//VST2    {dYr0,dYi0},[pDst],outPointStep
+        @//VST2    {dYr1,dYi1},[pDst],dstStep                  @// dstStep =  step = -outPointStep + 16
+
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
         MOV     pTmp,pDst
-        SUB     pDst,pSrc,outPointStep,LSL #1       ;// pDst -= 2*size; pSrc -= 4*size bytes           
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 2*size; pSrc -= 4*size bytes
         SUB     pSrc,pTmp,outPointStep
-        
-        ;// Reset pTwiddle for the next stage
-        SUB     pTwiddle,pTwiddle,outPointStep      ;// pTwiddle -= 2*size bytes
-                
-        MEND
-        
-        
-                
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 2*size bytes
+
+        .endm
+
+
+
         M_START armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{FALSE},FWD
+        FFTSTAGE "FALSE","FALSE",FWD
         M_END
 
-        
-        
+
+
         M_START armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{TRUE},INV
+        FFTSTAGE "FALSE","TRUE",INV
         M_END
- 
-        
-        
+
+
+
         M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{FALSE},FWDSFS
+        FFTSTAGE "TRUE","FALSE",FWDSFS
         M_END
 
-        
-        
+
+
         M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{TRUE},INVSFS
+        FFTSTAGE "TRUE","TRUE",INVSFS
         M_END
 
-        
-    ENDIF                                                           ;//CORTEXA8
-    
-     
-    END
\ No newline at end of file
+
+
+
+    .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
index 49bf607..666f4f3 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
@@ -1,209 +1,216 @@
-;//
-;// 
-;// File Name:  armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision:   6740
-;// Last Modified Date:       Wed, 18 Jul 2007
-;// 
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;// 
-;// 
-;//
-;// Description:
-;// Compute a Radix 2 FFT stage for a N point complex signal
-;// 
-;// 
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
 
-        
-;// Include standard headers
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6740
+@// Last Modified Date:       Wed, 18 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
 
-        INCLUDE omxtypes_s.h
-        INCLUDE armCOMM_s.h
-        
-        M_VARIANTS CortexA8
-        
-;// Import symbols required from other files
-;// (For example tables)
-    
-        
-        
-        
-;// Set debugging level        
-;//DEBUG_ON    SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
 
 
 
-            
-;// Guarding implementation by the processor name
-    
-    IF  CortexA8 
-    
-;//Input Registers
 
-pSrc            RN  0
-pDst            RN  2
-pTwiddle        RN  1
-subFFTNum       RN  6
-subFFTSize      RN  7
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
 
 
-;//Output Registers
 
 
-;//Local Scratch Registers
-
-outPointStep     RN  3
-grpCount         RN  4
-dstStep          RN  5
-twStep           RN  8
-pTmp             RN  4
-
-;// Neon Registers
-
-dW1S32          DN  D0.S32
-dW2S32          DN  D1.S32
-dW1             DN  D0.S16
-dW2             DN  D1.S16
-
-dX0             DN  D2.S16
-dX1             DN  D3.S16
-dX2             DN  D4.S16
-dX3             DN  D5.S16
-dY0             DN  D6.S16
-dY1             DN  D7.S16
-dY2             DN  D8.S16
-dY3             DN  D9.S16
-qT0             QN  Q5.S32
-qT1             QN  Q6.S32
+@// Guarding implementation by the processor name
 
 
-        MACRO
-        FFTSTAGE $scaled, $inverse, $name
-        
-        ;// Define stack arguments
-        
-        
-        ;// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
-        
-        
+@// Input registers: cpp aliases for ARM core registers r0-r2, r6 and r7
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@// Output registers (no aliases are defined for outputs here)
+
+
+@// Local scratch registers; grpCount and pTmp are two names for r4
+
+#define outPointStep                    r3
+#define grpCount                        r4
+#define dstStep                         r5
+#define twStep                          r8
+#define pTmp                            r4
+
+@// NEON registers; dW1S32/dW1 both name D0 and dW2S32/dW2 both name D1 (.S32 vs .S16 suffix)
+
+#define dW1S32                          D0.S32
+#define dW2S32                          D1.S32
+#define dW1                             D0.S16
+#define dW2                             D1.S16
+
+#define dX0                             D2.S16
+#define dX1                             D3.S16
+#define dX2                             D4.S16
+#define dX3                             D5.S16
+#define dY0                             D6.S16
+#define dY1                             D7.S16
+#define dY2                             D8.S16
+#define dY3                             D9.S16
+#define qT0                             Q5.S32
+#define qT1                             Q6.S32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Define stack arguments
+
+
+        @// Update grpCount and grpSize right away in order to reuse the pGrpCount and pGrpSize regs
+
+
         LSL     grpCount,subFFTSize,#1
-        
-        
-        ;// update subFFTSize for the next stage
-        MOV     subFFTSize,grpCount
-        
-        ;// pOut0+1 increments pOut0 by 8 bytes
-        ;// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
-        SMULBB  outPointStep,grpCount,subFFTNum  
-        MOV     twStep,subFFTNum,LSL #1
-        LSR     subFFTNum,subFFTNum,#1                      ;//grpSize
-                                       
-                
-        RSB      dstStep,outPointStep,#8
-                
-        
-        ;// Note: pointStep is 8 in this case: so need of extra reg
-        ;// Loop on the groups: 2 groups at a time
 
-grpLoop$name        
-        
-        VLD1     dW1S32[],[pTwiddle],twStep                ;//[wi | wr] 
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
+        SMULBB  outPointStep,grpCount,subFFTNum
+        MOV     twStep,subFFTNum,LSL #1
+        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
+
+
+        RSB      dstStep,outPointStep,#8
+
+
+        @// Note: pointStep is 8 in this case: so no need of an extra reg
+        @// Loop on the groups: 2 groups at a time
+
+grpLoop\name:
+
+        VLD1     dW1S32[],[pTwiddle],twStep                @//[wi | wr]
         VLD1     dW2S32[],[pTwiddle],twStep
-        
-        ;// Process the sets for each grp:  2 sets at a time (no set looping required)     
-        
-        VLD1    dX0,[pSrc]!            ;// point0: of set0,set1 of grp0
-        VLD1    dX1,[pSrc]!            ;// point1: of set0,set1 of grp0
-        VLD1    dX2,[pSrc]!            ;// point0: of set0,set1 of grp1
-        VLD1    dX3,[pSrc]!            ;// point1: of set0,set1 of grp1
-        
-        SUBS    grpCount,grpCount,#4              ;// decrement the loop counter
+
+        @// Process the sets for each grp:  2 sets at a time (no set looping required)
+
+        VLD1    dX0,[pSrc]!            @// point0: of set0,set1 of grp0
+        VLD1    dX1,[pSrc]!            @// point1: of set0,set1 of grp0
+        VLD1    dX2,[pSrc]!            @// point0: of set0,set1 of grp1
+        VLD1    dX3,[pSrc]!            @// point1: of set0,set1 of grp1
+
+        SUBS    grpCount,grpCount,#4              @// decrement the loop counter
         VUZP    dW1,dW2
         VUZP    dX1,dX3
-        
-        IF  $inverse
+
+        .ifeqs  "\inverse", "TRUE"
             VMULL   qT0,dX1,dW1
-            VMLAL   qT0,dX3,dW2                       ;// real part
+            VMLAL   qT0,dX3,dW2                       @// real part
             VMULL   qT1,dX3,dW1
-            VMLSL   qT1,dX1,dW2                       ;// imag part
-            
-        ELSE
+            VMLSL   qT1,dX1,dW2                       @// imag part
+
+        .ELSE
             VMULL   qT0,dX1,dW1
-            VMLSL   qT0,dX3,dW2                       ;// real part
+            VMLSL   qT0,dX3,dW2                       @// real part
             VMULL   qT1,dX3,dW1
-            VMLAL   qT1,dX1,dW2                       ;// imag part
-        
-        ENDIF
-        
+            VMLAL   qT1,dX1,dW2                       @// imag part
+
+        .ENDIF
+
         VRSHRN  dX1,qT0,#15
         VRSHRN  dX3,qT1,#15
-        
+
         VZIP    dX1,dX3
-        
-        
-        IF $scaled
-        
+
+
+        .ifeqs "\scaled", "TRUE"
+
             VHSUB    dY0,dX0,dX1
             VHADD    dY1,dX0,dX1
             VHSUB    dY2,dX2,dX3
             VHADD    dY3,dX2,dX3
-            
-        ELSE
-        
+
+        .ELSE
+
             VSUB    dY0,dX0,dX1
             VADD    dY1,dX0,dX1
             VSUB    dY2,dX2,dX3
             VADD    dY3,dX2,dX3
-            
-        
-         
-        ENDIF
-        
-        VST1    dY0,[pDst],outPointStep             ;// point0: of set0,set1 of grp0
-        VST1    dY1,[pDst],dstStep                  ;// dstStep = -outPointStep + 8
-        VST1    dY2,[pDst],outPointStep             ;// point0: of set0,set1 of grp1
-        VST1    dY3,[pDst],dstStep                  ;// point1: of set0,set1 of grp1
-        
-               
-        BGT     grpLoop$name
-        
-        
-        ;// Reset and Swap pSrc and pDst for the next stage     
+
+
+
+        .ENDIF
+
+        VST1    dY0,[pDst],outPointStep             @// point0: of set0,set1 of grp0
+        VST1    dY1,[pDst],dstStep                  @// dstStep = -outPointStep + 8
+        VST1    dY2,[pDst],outPointStep             @// point0: of set0,set1 of grp1
+        VST1    dY3,[pDst],dstStep                  @// point1: of set0,set1 of grp1
+
+
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
         MOV     pTmp,pDst
-        SUB     pDst,pSrc,outPointStep,LSL #1       ;// pDst -= 2*size; pSrc -= 4*size bytes           
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 2*size; pSrc -= 4*size bytes
         SUB     pSrc,pTmp,outPointStep
-        
-        ;// Reset pTwiddle for the next stage
-        SUB     pTwiddle,pTwiddle,outPointStep      ;// pTwiddle -= 2*size bytes
-                
-        MEND
-        
-        
-                
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 2*size bytes
+
+        .endm
+
+
+
         M_START armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{FALSE},FWD
+        FFTSTAGE "FALSE","FALSE",FWD
         M_END
 
-        
-        
+
+
         M_START armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{TRUE},INV
+        FFTSTAGE "FALSE","TRUE",INV
         M_END
- 
-        
-        
+
+
+
         M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{FALSE},FWDSFS
+        FFTSTAGE "TRUE","FALSE",FWDSFS
         M_END
 
-        
-        
+
+
         M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{TRUE},INVSFS
+        FFTSTAGE "TRUE","TRUE",INVSFS
         M_END
 
-        
-    ENDIF                                                           ;//CORTEXA8
-    
-     
-    END
\ No newline at end of file
+
+
+    .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
index 133b137..f9bbebc 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
@@ -1,214 +1,219 @@
-;//
-;// 
-;// File Name:  armSP_FFT_CToC_SC16_Radix2_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision:   5892
-;// Last Modified Date:       Thu, 07 Jun 2007
-;// 
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;// 
-;// 
-;//
-;// Description:
-;// Compute a Radix 2 FFT stage for a N point complex signal
-;// 
-;// 
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
 
-        
-;// Include standard headers
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   5892
+@// Last Modified Date:       Thu, 07 Jun 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
 
-        INCLUDE omxtypes_s.h
-        INCLUDE armCOMM_s.h
-        
-        M_VARIANTS CortexA8
-        
-;// Import symbols required from other files
-;// (For example tables)
 
-           
-        
-;// Set debugging level        
-;//DEBUG_ON    SETL {TRUE}
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
 
 
 
-;// Guarding implementation by the processor name
-    
-    
-    
-    
-    ;// Guarding implementation by the processor name
-    
-    IF  CortexA8 
-    
-    
-;//Input Registers
-
-pSrc            RN  0
-pDst            RN  2
-pTwiddle        RN  1
-subFFTNum       RN  6
-subFFTSize      RN  7
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
 
 
-;//Output Registers
+
+@// Guarding implementation by the processor name
 
 
-;//Local Scratch Registers
 
-outPointStep    RN  3
-pointStep       RN  4
-grpCount        RN  5
-setCount        RN  8
-step            RN  10
-dstStep         RN  11
-pTmp            RN  9    
 
-;// Neon Registers
+    @// Guarding implementation by the processor name
 
-dW              DN  D0.S16
-dX0             DN  D2.S16
-dX1                DN  D3.S16
-dX2             DN  D4.S16
-dX3                DN  D5.S16
-dY0             DN  D6.S16
-dY1               DN  D7.S16
-dY2             DN  D8.S16
-dY3               DN  D9.S16
-qT0             QN  Q3.S32
-qT1             QN  Q4.S32
 
-    
-    
-        MACRO
-        FFTSTAGE $scaled, $inverse, $name
-        
-        ;// Define stack arguments
-        
-        
-        ;// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
-        
-        LSR     subFFTNum,subFFTNum,#1                      ;//grpSize
+@//Input Registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define outPointStep                    r3
+#define pointStep                       r4
+#define grpCount                        r5
+#define setCount                        r8
+#define step                            r10
+#define dstStep                         r11
+#define pTmp                            r9
+
+@// Neon Registers
+
+#define dW                              D0.S16
+#define dX0                             D2.S16
+#define dX1                             D3.S16
+#define dX2                             D4.S16
+#define dX3                             D5.S16
+#define dY0                             D6.S16
+#define dY1                             D7.S16
+#define dY2                             D8.S16
+#define dY3                             D9.S16
+#define qT0                             Q3.S32
+#define qT1                             Q4.S32
+
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Define stack arguments
+
+
+        @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+
+        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
         LSL     grpCount,subFFTSize,#1
-        
-        
-        ;// pT0+1 increments pT0 by 8 bytes
-        ;// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
+
+
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
         MOV     pointStep,subFFTNum,LSL #1
-        
-        ;// update subFFTSize for the next stage
+
+        @// update subFFTSize for the next stage
         MOV     subFFTSize,grpCount
-        
-        ;// pOut0+1 increments pOut0 by 8 bytes
-        ;// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
-        SMULBB  outPointStep,grpCount,pointStep  
-        LSL     pointStep,pointStep,#1    
-                               
-        
+
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
+        SMULBB  outPointStep,grpCount,pointStep
+        LSL     pointStep,pointStep,#1
+
+
         RSB      step,pointStep,#16
         RSB      dstStep,outPointStep,#16
-        
-        ;// Loop on the groups
 
-grpLoop$name        
-        
-        VLD1     dW,[pTwiddle],pointStep                ;//[wi | wr]
+        @// Loop on the groups
+
+grpLoop\name:
+
+        VLD1     dW,[pTwiddle],pointStep                @//[wi | wr]
         MOV      setCount,pointStep,LSR #2
-        
-        
-        ;// Loop on the sets: 4 at a time
-        
-        
-setLoop$name        
-        
-        
-        VLD2    {dX0,dX1},[pSrc],pointStep            ;// point0: dX0-real part dX1-img part
-        VLD2    {dX2,dX3},[pSrc],step                 ;// point1: dX2-real part dX3-img part
-        
-        SUBS    setCount,setCount,#4               
-        
-        IF  $inverse
+
+
+        @// Loop on the sets: 4 at a time
+
+
+setLoop\name:
+
+
+        VLD2    {dX0,dX1},[pSrc],pointStep            @// point0: dX0-real part dX1-img part
+        VLD2    {dX2,dX3},[pSrc],step                 @// point1: dX2-real part dX3-img part
+
+        SUBS    setCount,setCount,#4
+
+        .ifeqs  "\inverse", "TRUE"
             VMULL   qT0,dX2,dW[0]
-            VMLAL   qT0,dX3,dW[1]                       ;// real part
+            VMLAL   qT0,dX3,dW[1]                       @// real part
             VMULL   qT1,dX3,dW[0]
-            VMLSL   qT1,dX2,dW[1]                       ;// imag part
-                
-        ELSE
-        
+            VMLSL   qT1,dX2,dW[1]                       @// imag part
+
+        .ELSE
+
             VMULL   qT0,dX2,dW[0]
-            VMLSL   qT0,dX3,dW[1]                       ;// real part
+            VMLSL   qT0,dX3,dW[1]                       @// real part
             VMULL   qT1,dX3,dW[0]
-            VMLAL   qT1,dX2,dW[1]                       ;// imag part
-                    
-        ENDIF
-        
+            VMLAL   qT1,dX2,dW[1]                       @// imag part
+
+        .ENDIF
+
         VRSHRN  dX2,qT0,#15
         VRSHRN  dX3,qT1,#15
-        
-        IF $scaled
+
+        .ifeqs "\scaled", "TRUE"
             VHSUB    dY0,dX0,dX2
             VHSUB    dY1,dX1,dX3
             VHADD    dY2,dX0,dX2
             VHADD    dY3,dX1,dX3
-                
-        ELSE
+
+        .ELSE
             VSUB    dY0,dX0,dX2
             VSUB    dY1,dX1,dX3
             VADD    dY2,dX0,dX2
             VADD    dY3,dX1,dX3
-        
-        ENDIF
-        
+
+        .ENDIF
+
         VST2    {dY0,dY1},[pDst],outPointStep
-        VST2    {dY2,dY3},[pDst],dstStep              ;// dstStep = -outPointStep + 16
-        
-        BGT     setLoop$name
-        
-        SUBS    grpCount,grpCount,#2               
+        VST2    {dY2,dY3},[pDst],dstStep              @// dstStep = -outPointStep + 16
+
+        BGT     setLoop\name
+
+        SUBS    grpCount,grpCount,#2
         ADD     pSrc,pSrc,pointStep
-        BGT     grpLoop$name    
-        
-        
-        ;// Reset and Swap pSrc and pDst for the next stage     
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
         MOV     pTmp,pDst
-        SUB     pDst,pSrc,outPointStep,LSL #1       ;// pDst -= 2*size; pSrc -= 4*size bytes           
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 2*size; pSrc -= 4*size bytes
         SUB     pSrc,pTmp,outPointStep
-        
-        ;// Reset pTwiddle for the next stage
-        SUB     pTwiddle,pTwiddle,outPointStep      ;// pTwiddle -= 2*size bytes
-        
-                
-        MEND
-        
-        
-        
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 2*size bytes
+
+
+        .endm
+
+
+
         M_START armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{FALSE},FWD
+        FFTSTAGE "FALSE","FALSE",FWD
         M_END
 
-        
-        
+
+
         M_START armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{TRUE},INV
+        FFTSTAGE "FALSE","TRUE",INV
         M_END
- 
-        
-        
+
+
+
         M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{FALSE},FWDSFS
+        FFTSTAGE "TRUE","FALSE",FWDSFS
         M_END
 
-        
-        
+
+
         M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{TRUE},INVSFS
+        FFTSTAGE "TRUE","TRUE",INVSFS
         M_END
 
-        
 
-    ENDIF                 ;//CORTEXA8
-        
-    
-     
-    END    
-     
+
+
+
+    .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
index 82662e6..cdb42a9 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
@@ -1,306 +1,314 @@
-;//
-;// 
-;// File Name:  armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision:   7761
-;// Last Modified Date:       Wed, 26 Sep 2007
-;// 
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;// 
-;// 
-;//
-;// Description:
-;// Compute a first stage Radix 4 FFT stage for a N point complex signal
-;// 
-;// 
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
 
-        
-;// Include standard headers
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7761
+@// Last Modified Date:       Wed, 26 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a first stage Radix 4 FFT stage for a N point complex signal
+@//
+@//
 
-        INCLUDE omxtypes_s.h
-        INCLUDE armCOMM_s.h
-        
-        M_VARIANTS CortexA8
-        
-;// Import symbols required from other files
-;// (For example tables)
-    
-        
-        
-        
-;// Set debugging level        
-;//DEBUG_ON    SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
 
 
 
-;// Guarding implementation by the processor name
-    
-    
-    
-;// Guarding implementation by the processor name
-    
-    IF  CortexA8
-    
-;//Input Registers
 
-pSrc            RN  0
-pDst            RN  2
-pTwiddle        RN  1
-pPingPongBuf    RN  5
-subFFTNum       RN  6
-subFFTSize      RN  7
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
 
 
-;//Output Registers
+
+@// Guarding implementation by the processor name
 
 
-;//Local Scratch Registers
 
-grpSize         RN  3
-setCount        RN  3                  ;// Reuse grpSize as setCount
-pointStep       RN  4
-outPointStep    RN  4
-setStep         RN  8
-step1           RN  9
-step3           RN  10
+@// Guarding implementation by the processor name
 
-;// Neon Registers
 
-dXr0             DN  D0.S16
-dXi0             DN  D1.S16
-dXr1             DN  D2.S16
-dXi1             DN  D3.S16
-dXr2             DN  D4.S16
-dXi2             DN  D5.S16
-dXr3             DN  D6.S16
-dXi3             DN  D7.S16
-dYr0             DN  D8.S16
-dYi0             DN  D9.S16
-dYr1             DN  D10.S16
-dYi1             DN  D11.S16
-dYr2             DN  D12.S16
-dYi2             DN  D13.S16
-dYr3             DN  D14.S16
-dYi3             DN  D15.S16
-dZr0             DN  D16.S16
-dZi0             DN  D17.S16
-dZr1             DN  D18.S16
-dZi1             DN  D19.S16
-dZr2             DN  D20.S16
-dZi2             DN  D21.S16
-dZr3             DN  D22.S16
-dZi3             DN  D23.S16
-qY0              QN  Q4.S16
-qY2              QN  Q6.S16
-qX0              QN  Q0.S16
-qX2              QN  Q2.S16
+@//Input Registers
 
-qY1              QN  Q5.S16
-qY3              QN  Q7.S16
-qX1              QN  Q1.S16
-qX3              QN  Q3.S16
-qZ0              QN  Q8.S16
-qZ1              QN  Q9.S16
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define pPingPongBuf                    r5
+#define subFFTNum                       r6
+#define subFFTSize                      r7
 
-    
-        MACRO
-        FFTSTAGE $scaled, $inverse, $name
-        
-        ;// Define stack arguments
-        
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize                         r3
+@// Reuse grpSize as setCount
+#define setCount                        r3
+#define pointStep                       r4
+#define outPointStep                    r4
+#define setStep                         r8
+#define step1                           r9
+#define step3                           r10
+
+@// Neon Registers
+
+#define dXr0                            D0.S16
+#define dXi0                            D1.S16
+#define dXr1                            D2.S16
+#define dXi1                            D3.S16
+#define dXr2                            D4.S16
+#define dXi2                            D5.S16
+#define dXr3                            D6.S16
+#define dXi3                            D7.S16
+#define dYr0                            D8.S16
+#define dYi0                            D9.S16
+#define dYr1                            D10.S16
+#define dYi1                            D11.S16
+#define dYr2                            D12.S16
+#define dYi2                            D13.S16
+#define dYr3                            D14.S16
+#define dYi3                            D15.S16
+#define dZr0                            D16.S16
+#define dZi0                            D17.S16
+#define dZr1                            D18.S16
+#define dZi1                            D19.S16
+#define dZr2                            D20.S16
+#define dZi2                            D21.S16
+#define dZr3                            D22.S16
+#define dZi3                            D23.S16
+#define qY0                             Q4.S16
+#define qY2                             Q6.S16
+#define qX0                             Q0.S16
+#define qX2                             Q2.S16
+
+#define qY1                             Q5.S16
+#define qY3                             Q7.S16
+#define qX1                             Q1.S16
+#define qX3                             Q3.S16
+#define qZ0                             Q8.S16
+#define qZ1                             Q9.S16
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Define stack arguments
+
         MOV     pointStep,subFFTNum
-        ;// Update pSubFFTSize and pSubFFTNum regs
-        
-        
-        VLD2    {dXr0,dXi0},[pSrc@128],pointStep          ;//  data[0]
-        ;// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
-        LSR     grpSize,subFFTNum,#2  
+        @// Update pSubFFTSize and pSubFFTNum regs
+
+
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        @// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2
         MOV     subFFTNum,grpSize
-        
-               
-        ;// pT0+1 increments pT0 by 4 bytes
-        ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
-        ;// Note: outPointStep = pointStep for firststage
-        VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
-        
-                
-        ;// Calculate the step of input data for the next set
-        ;//MOV     setStep,pointStep,LSL #1
+
+
+        @// pT0+1 increments pT0 by 4 bytes
+        @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
+        @// Note: outPointStep = pointStep for the first stage
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     setStep,pointStep,LSL #1
         MOV     setStep,grpSize,LSL #3
-        VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
         MOV     step1,setStep
-        ADD     setStep,setStep,pointStep             ;// setStep = 3*pointStep
-        RSB     setStep,setStep,#16                   ;// setStep = - 3*pointStep+16
-        
-                
-        VLD2    {dXr3,dXi3},[pSrc@128],setStep            ;//  data[3]
-        MOV     subFFTSize,#4                         ;// subFFTSize = 1 for the first stage
-        
-        
-        IF  $scaled 
-            VHADD    qY0,qX0,qX2             ;// u0
-        ELSE
-            VADD   qY0,qX0,qX2               ;// u0
-        ENDIF
+        ADD     setStep,setStep,pointStep             @// setStep = 3*pointStep
+        RSB     setStep,setStep,#16                   @// setStep = - 3*pointStep+16
+
+
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3]
+        MOV     subFFTSize,#4                         @// subFFTSize: 1 during the first stage, 4 for the next stage
+
+
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    qY0,qX0,qX2             @// u0
+        .ELSE
+            VADD   qY0,qX0,qX2               @// u0
+        .ENDIF
         RSB     step3,pointStep,#0
-        
-        ;// grp = 0 a special case since all the twiddle factors are 1
-        ;// Loop on the sets: 4 sets at a time
 
-grpZeroSetLoop$name        
-        
-        
-        IF $scaled
-        
-            ;// finish first stage of 4 point FFT 
-            
-            VHSUB    qY2,qX0,qX2             ;// u1
-            SUBS    setCount,setCount,#4                    ;// decrement the set loop counter 
-            
-            VLD2    {dXr0,dXi0},[pSrc@128],step1          ;//  data[0]
-            VHADD    qY1,qX1,qX3             ;// u2
-            VLD2    {dXr2,dXi2},[pSrc@128],step3
-            VHSUB    qY3,qX1,qX3             ;// u3
-            
-                        
-            
-            ;// finish second stage of 4 point FFT 
-            
-            VLD2    {dXr1,dXi1},[pSrc@128],step1          ;//  data[1]
-            VHADD    qZ0,qY0,qY1             ;// y0
-            
-            VLD2    {dXr3,dXi3},[pSrc@128],setStep 
-                        
-            
-            IF  $inverse 
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets: 4 sets at a time
 
-                VHSUB    dZr3,dYr2,dYi3                  ;// y3
+grpZeroSetLoop\name:
+
+
+        .ifeqs "\scaled", "TRUE"
+
+            @// finish first stage of 4 point FFT
+
+            VHSUB    qY2,qX0,qX2             @// u1
+            SUBS    setCount,setCount,#4                    @// decrement the set loop counter
+
+            VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+            VHADD    qY1,qX1,qX3             @// u2
+            VLD2    {dXr2,dXi2},[pSrc :128],step3
+            VHSUB    qY3,qX1,qX3             @// u3
+
+
+
+            @// finish second stage of 4 point FFT
+
+            VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+            VHADD    qZ0,qY0,qY1             @// y0
+
+            VLD2    {dXr3,dXi3},[pSrc :128],setStep
+
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VHSUB    dZr3,dYr2,dYi3                  @// y3
                 VHADD    dZi3,dYi2,dYr3
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
-                
-                VHSUB    qZ1,qY0,qY1                     ;// y2
-                VST2    {dZr3,dZi3},[pDst@128],outPointStep            
-                
-                VHADD    dZr2,dYr2,dYi3                  ;// y1
-                VST2    {dZr1,dZi1},[pDst@128],outPointStep
-                VHSUB    dZi2,dYi2,dYr3
-                
-                VHADD    qY0,qX0,qX2                     ;// u0 (next loop)
-                VST2    {dZr2,dZi2},[pDst@128],setStep     
-                
-                
-            ELSE
-            
-                VHADD    dZr2,dYr2,dYi3                  ;// y1
-                VHSUB    dZi2,dYi2,dYr3
-            
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
-                VHSUB    qZ1,qY0,qY1                     ;// y2
-                        
-                VST2    {dZr2,dZi2},[pDst@128],outPointStep            
-                VHSUB    dZr3,dYr2,dYi3                  ;// y3
-                VHADD    dZi3,dYi2,dYr3
-                VST2    {dZr1,dZi1},[pDst@128],outPointStep
-                VHADD    qY0,qX0,qX2                     ;// u0 (next loop)
-                VST2    {dZr3,dZi3},[pDst@128],setStep
-                                
-            ENDIF
-        
-        
-        ELSE
-        
-            ;// finish first stage of 4 point FFT 
-            
-            VSUB    qY2,qX0,qX2             ;// u1
-            SUBS    setCount,setCount,#4                    ;// decrement the set loop counter 
-            
-            VLD2    {dXr0,dXi0},[pSrc@128],step1          ;//  data[0]
-            VADD    qY1,qX1,qX3             ;// u2
-            VLD2    {dXr2,dXi2},[pSrc@128],step3
-            VSUB    qY3,qX1,qX3             ;// u3
-            
-                        
-            
-            ;// finish second stage of 4 point FFT 
-            
-            VLD2    {dXr1,dXi1},[pSrc@128],step1          ;//  data[1]
-            VADD    qZ0,qY0,qY1             ;// y0
-            
-            VLD2    {dXr3,dXi3},[pSrc@128],setStep 
-                        
-            
-            IF  $inverse 
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
 
-                VSUB    dZr3,dYr2,dYi3                  ;// y3
+                VHSUB    qZ1,qY0,qY1                     @// y2
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VHADD    dZr2,dYr2,dYi3                  @// y1
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHSUB    dZi2,dYi2,dYr3
+
+                VHADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr2,dZi2},[pDst :128],setStep
+
+
+            .ELSE
+
+                VHADD    dZr2,dYr2,dYi3                  @// y1
+                VHSUB    dZi2,dYi2,dYr3
+
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    qZ1,qY0,qY1                     @// y2
+
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VHSUB    dZr3,dYr2,dYi3                  @// y3
+                VHADD    dZi3,dYi2,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr3,dZi3},[pDst :128],setStep
+
+            .ENDIF
+
+
+        .ELSE
+
+            @// finish first stage of 4 point FFT
+
+            VSUB    qY2,qX0,qX2             @// u1
+            SUBS    setCount,setCount,#4                    @// decrement the set loop counter
+
+            VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+            VADD    qY1,qX1,qX3             @// u2
+            VLD2    {dXr2,dXi2},[pSrc :128],step3
+            VSUB    qY3,qX1,qX3             @// u3
+
+
+
+            @// finish second stage of 4 point FFT
+
+            VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+            VADD    qZ0,qY0,qY1             @// y0
+
+            VLD2    {dXr3,dXi3},[pSrc :128],setStep
+
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VSUB    dZr3,dYr2,dYi3                  @// y3
                 VADD    dZi3,dYi2,dYr3
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
-                
-                VSUB    qZ1,qY0,qY1                     ;// y2
-                VST2    {dZr3,dZi3},[pDst@128],outPointStep            
-                
-                VADD    dZr2,dYr2,dYi3                  ;// y1
-                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+
+                VSUB    qZ1,qY0,qY1                     @// y2
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VADD    dZr2,dYr2,dYi3                  @// y1
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
                 VSUB    dZi2,dYi2,dYr3
-                
-                VADD    qY0,qX0,qX2                     ;// u0 (next loop)
-                VST2    {dZr2,dZi2},[pDst@128],setStep     
-                
-                
-            ELSE
-            
-                VADD    dZr2,dYr2,dYi3                  ;// y1
+
+                VADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr2,dZi2},[pDst :128],setStep
+
+
+            .ELSE
+
+                VADD    dZr2,dYr2,dYi3                  @// y1
                 VSUB    dZi2,dYi2,dYr3
-            
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
-                VSUB    qZ1,qY0,qY1                     ;// y2
-                        
-                VST2    {dZr2,dZi2},[pDst@128],outPointStep            
-                VSUB    dZr3,dYr2,dYi3                  ;// y3
+
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    qZ1,qY0,qY1                     @// y2
+
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VSUB    dZr3,dYr2,dYi3                  @// y3
                 VADD    dZi3,dYi2,dYr3
-                VST2    {dZr1,dZi1},[pDst@128],outPointStep
-                VADD    qY0,qX0,qX2                     ;// u0 (next loop)
-                VST2    {dZr3,dZi3},[pDst@128],setStep
-                                
-            ENDIF
-                        
-                       
-        ENDIF
-        
-        BGT     grpZeroSetLoop$name
-        
-        
-        ;// reset pSrc to pDst for the next stage
-        SUB     pSrc,pDst,pointStep                     ;// pDst -= grpSize  
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr3,dZi3},[pDst :128],setStep
+
+            .ENDIF
+
+
+        .ENDIF
+
+        BGT     grpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep
         MOV     pDst,pPingPongBuf
-        
-        
-        MEND
 
-                
-        
+
+        .endm
+
+
+
         M_START armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{FALSE},FWD
+        FFTSTAGE "FALSE","FALSE",FWD
         M_END
 
-        
-        
+
+
         M_START armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{TRUE},INV
-        M_END
- 
-                
-        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{FALSE},FWDSFS
+        FFTSTAGE "FALSE","TRUE",INV
         M_END
 
-                
-        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{TRUE},INVSFS
+
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
         M_END
-    
-            
-    ENDIF                                                           ;//CortexA8
-    
-    
-     
-    END
\ No newline at end of file
+
+
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+    .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
index ce324f5..23e2c37 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
@@ -1,403 +1,410 @@
-;//
-;// 
-;// File Name:  armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision:   7765
-;// Last Modified Date:       Thu, 27 Sep 2007
-;// 
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;// 
-;// 
-;//
-;// Description:
-;// Compute a Radix 4 FFT stage for a N point complex signal
-;// 
-;// 
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
 
-        
-;// Include standard headers
-
-        INCLUDE omxtypes_s.h
-        INCLUDE armCOMM_s.h
-        INCLUDE armSP_FFT_s.h
-        
-        M_VARIANTS CortexA8
-        
-;// Import symbols required from other files
-;// (For example tables)
-    
-        
-        
-        
-;// Set debugging level        
-;//DEBUG_ON    SETL {TRUE}
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7765
+@// Last Modified Date:       Thu, 27 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
 
 
-;// Guarding implementation by the processor name
-    
-    
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
 
 
 
-    
-;// Guarding implementation by the processor name
-    
-    IF  CortexA8
-    
-;// Import symbols required from other files
-;// (For example tables)
-    ;//IMPORT  armAAC_constTable    
-    
-;//Input Registers
 
-pSrc            RN  0
-pDst            RN  2
-pTwiddle        RN  1
-subFFTNum       RN  6
-subFFTSize      RN  7
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
 
 
 
-;//Output Registers
 
 
-;//Local Scratch Registers
 
-outPointStep     RN  3
-grpCount         RN  4
-dstStep          RN  5
-pw1              RN  8
-pw2              RN  9
-pw3              RN  10   
-pTmp             RN  4
+@// (The armasm "IF CortexA8" processor-name guard is not used in this GNU as version.)
 
 
-;// Neon Registers
+@// Import symbols required from other files
+@// (For example tables)
+    @//IMPORT  armAAC_constTable
 
-dButterfly1Real02   DN  D0.S16
-dButterfly1Imag02   DN  D1.S16
-dButterfly1Real13   DN  D2.S16
-dButterfly1Imag13   DN  D3.S16
-dButterfly2Real02   DN  D4.S16
-dButterfly2Imag02   DN  D5.S16
-dButterfly2Real13   DN  D6.S16
-dButterfly2Imag13   DN  D7.S16
-dXr0             DN  D0.S16
-dXi0             DN  D1.S16
-dXr1             DN  D2.S16
-dXi1             DN  D3.S16
-dXr2             DN  D4.S16
-dXi2             DN  D5.S16
-dXr3             DN  D6.S16
-dXi3             DN  D7.S16
+@//Input Registers
 
-dW1rS32          DN  D8.S32         
-dW1iS32             DN  D9.S32
-dW2rS32             DN  D10.S32
-dW2iS32             DN  D11.S32
-dW3rS32             DN  D12.S32
-dW3iS32             DN  D13.S32
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
 
-dW1r             DN  D8.S16
-dW1i             DN  D9.S16
-dW2r             DN  D10.S16
-dW2i             DN  D11.S16
-dW3r             DN  D12.S16
-dW3i             DN  D13.S16
 
-dTmp0            DN  D12.S16
-dTmp1             DN  D13.S16
-dTmp1S32         DN  D13.S32
-dTmp2S32         DN  D14.S32
-dTmp3S32         DN  D15.S32
 
-dYr0             DN  D18.S16
-dYi0             DN  D19.S16
-dYr1             DN  D16.S16
-dYi1             DN  D17.S16
-dYr2             DN  D20.S16
-dYi2             DN  D21.S16
-dYr3             DN  D14.S16
-dYi3             DN  D15.S16
-qY0              QN  Q9.S16
-qY1              QN  Q8.S16
-qY2              QN  Q10.S16
-qY3              QN  Q7.S16
+@//Output Registers
 
-qX0              QN  Q0.S16
-qX1              QN  Q1.S16
-qX2              QN  Q2.S16
-qX3              QN  Q3.S16
 
-qT0              QN  Q9.S32
-qT1              QN  Q10.S32
-qT2              QN  Q7.S32
-qT3              QN  Q8.S32
+@//Local Scratch Registers
 
-dZr0             DN  D22.S16
-dZi0             DN  D23.S16
-dZr1             DN  D24.S16
-dZi1             DN  D25.S16
-dZr2             DN  D26.S16
-dZi2             DN  D27.S16
-dZr3             DN  D28.S16
-dZi3             DN  D29.S16
+#define outPointStep                    r3
+#define grpCount                        r4
+#define dstStep                         r5
+#define pw1                             r8
+#define pw2                             r9
+#define pw3                             r10
+#define pTmp                            r4
 
-qZ0              QN  Q11.S16
-qZ1              QN  Q12.S16
-qZ2              QN  Q13.S16
-qZ3              QN  Q14.S16
 
-        
-        MACRO
-        FFTSTAGE $scaled, $inverse , $name
-        
-        ;// Define stack arguments
-        
-        MOV     pw2,pTwiddle 
-        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2@256]!
-        
+@// Neon Registers
+
+#define dButterfly1Real02               D0.S16
+#define dButterfly1Imag02               D1.S16
+#define dButterfly1Real13               D2.S16
+#define dButterfly1Imag13               D3.S16
+#define dButterfly2Real02               D4.S16
+#define dButterfly2Imag02               D5.S16
+#define dButterfly2Real13               D6.S16
+#define dButterfly2Imag13               D7.S16
+#define dXr0                            D0.S16
+#define dXi0                            D1.S16
+#define dXr1                            D2.S16
+#define dXi1                            D3.S16
+#define dXr2                            D4.S16
+#define dXi2                            D5.S16
+#define dXr3                            D6.S16
+#define dXi3                            D7.S16
+
+#define dW1rS32                         D8.S32
+#define dW1iS32                         D9.S32
+#define dW2rS32                         D10.S32
+#define dW2iS32                         D11.S32
+#define dW3rS32                         D12.S32
+#define dW3iS32                         D13.S32
+
+#define dW1r                            D8.S16
+#define dW1i                            D9.S16
+#define dW2r                            D10.S16
+#define dW2i                            D11.S16
+#define dW3r                            D12.S16
+#define dW3i                            D13.S16
+
+#define dTmp0                           D12.S16
+#define dTmp1                           D13.S16
+#define dTmp1S32                        D13.S32
+#define dTmp2S32                        D14.S32
+#define dTmp3S32                        D15.S32
+
+#define dYr0                            D18.S16
+#define dYi0                            D19.S16
+#define dYr1                            D16.S16
+#define dYi1                            D17.S16
+#define dYr2                            D20.S16
+#define dYi2                            D21.S16
+#define dYr3                            D14.S16
+#define dYi3                            D15.S16
+#define qY0                             Q9.S16
+#define qY1                             Q8.S16
+#define qY2                             Q10.S16
+#define qY3                             Q7.S16
+
+#define qX0                             Q0.S16
+#define qX1                             Q1.S16
+#define qX2                             Q2.S16
+#define qX3                             Q3.S16
+
+#define qT0                             Q9.S32
+#define qT1                             Q10.S32
+#define qT2                             Q7.S32
+#define qT3                             Q8.S32
+
+#define dZr0                            D22.S16
+#define dZi0                            D23.S16
+#define dZr1                            D24.S16
+#define dZi1                            D25.S16
+#define dZr2                            D26.S16
+#define dZi2                            D27.S16
+#define dZr3                            D28.S16
+#define dZi3                            D29.S16
+
+#define qZ0                             Q11.S16
+#define qZ1                             Q12.S16
+#define qZ2                             Q13.S16
+#define qZ3                             Q14.S16
+
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        @// Define stack arguments
+
+        MOV     pw2,pTwiddle
+        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
+
         MOV     pw3,pTwiddle
         MOV     pw1,pTwiddle
-        ;// pOut0+1 increments pOut0 by 8 bytes
-        ;// pOut0+outPointStep == increment of 4*outPointStep bytes
+        @// pOut0+1 increments pOut0 by 4 bytes (one 16-bit real + 16-bit imag element)
+        @// pOut0+outPointStep == increment of 4*outPointStep bytes
         MOV     outPointStep,subFFTSize,LSL #2
-        
-        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3@64]!
-        MOV     subFFTNum,#1                            ;//after the last stage
+
+        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
+        MOV     subFFTNum,#1                            @//after the last stage
         LSL     grpCount,subFFTSize,#2
-                       
-        
-        ;// Update grpCount and grpSize rightaway 
-        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3@64]!
-        
-        ;// update subFFTSize for the next stage
+
+
+        @// Update grpCount and grpSize right away
+        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
+
+        @// update subFFTSize for the next stage
         MOV     subFFTSize,grpCount
         MOV     dstStep,outPointStep,LSL #1
-        
-        VLD2 {dW1r,dW1i}, [pw1@128]!
-              
-        
-        ADD     dstStep,dstStep,outPointStep                ;// dstStep = 3*outPointStep
-        RSB     dstStep,dstStep,#16                         ;// dstStep = - 3*outPointStep+16
-        
-        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-        
-        ;// Process 4 groups at a time
-        
-grpLoop$name
-        
-                          
-        ;// Rearrange the third twiddle
+
+        VLD2 {dW1r,dW1i}, [pw1 :128]!
+
+
+        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
+
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+        @// Process 4 groups at a time
+
+grpLoop\name:
+
+
+        @// Rearrange the third twiddle
         VUZP    dW3r,dW3i
-        SUBS    grpCount,grpCount,#16                    ;// grpCount is multiplied by 4
-        
-                
-        VUZP     dButterfly1Real13, dButterfly2Real13        ;// B.r D.r
-        VUZP     dButterfly1Imag13, dButterfly2Imag13        ;// B.i D.i
-        VUZP     dButterfly1Real02, dButterfly2Real02        ;// A.r C.r
-        VUZP     dButterfly1Imag02, dButterfly2Imag02        ;// A.i C.i
-        
-                
-        IF  $inverse
+        SUBS    grpCount,grpCount,#16                    @// grpCount is multiplied by 4
+
+
+        VUZP     dButterfly1Real13, dButterfly2Real13        @// B.r D.r
+        VUZP     dButterfly1Imag13, dButterfly2Imag13        @// B.i D.i
+        VUZP     dButterfly1Real02, dButterfly2Real02        @// A.r C.r
+        VUZP     dButterfly1Imag02, dButterfly2Imag02        @// A.i C.i
+
+
+        .ifeqs  "\inverse", "TRUE"
             VMULL   qT0,dXr1,dW1r
-            VMLAL   qT0,dXi1,dW1i                       ;// real part
+            VMLAL   qT0,dXi1,dW1i                       @// real part
             VMULL   qT1,dXi1,dW1r
-            VMLSL   qT1,dXr1,dW1i                       ;// imag part
-            
-        ELSE
+            VMLSL   qT1,dXr1,dW1i                       @// imag part
+
+        .ELSE
             VMULL   qT0,dXr1,dW1r
-            VMLSL   qT0,dXi1,dW1i                       ;// real part
+            VMLSL   qT0,dXi1,dW1i                       @// real part
             VMULL   qT1,dXi1,dW1r
-            VMLAL   qT1,dXr1,dW1i                       ;// imag part
-        
-        ENDIF
-        
-        ;// Load the first twiddle for 4 groups : w^1
-        ;// w^1 twiddle (i+0,i+1,i+2,i+3)       for group 0,1,2,3
-        
-        VLD2 {dW1r,dW1i}, [pw1@128]!
-        
-        IF  $inverse
+            VMLAL   qT1,dXr1,dW1i                       @// imag part
+
+        .ENDIF
+
+        @// Load the first twiddle for 4 groups : w^1
+        @// w^1 twiddle (i+0,i+1,i+2,i+3)       for group 0,1,2,3
+
+        VLD2 {dW1r,dW1i}, [pw1 :128]!
+
+        .ifeqs  "\inverse", "TRUE"
             VMULL   qT2,dXr2,dW2r
-            VMLAL   qT2,dXi2,dW2i                       ;// real part
+            VMLAL   qT2,dXi2,dW2i                       @// real part
             VMULL   qT3,dXi2,dW2r
-            VMLSL   qT3,dXr2,dW2i                       ;// imag part
-            
-        ELSE
+            VMLSL   qT3,dXr2,dW2i                       @// imag part
+
+        .ELSE
             VMULL   qT2,dXr2,dW2r
-            VMLSL   qT2,dXi2,dW2i                       ;// real part
+            VMLSL   qT2,dXi2,dW2i                       @// real part
             VMULL   qT3,dXi2,dW2r
-            VMLAL   qT3,dXr2,dW2i                       ;// imag part
-        
-        ENDIF
-        
+            VMLAL   qT3,dXr2,dW2i                       @// imag part
+
+        .ENDIF
+
         VRSHRN  dZr1,qT0,#15
         VRSHRN  dZi1,qT1,#15
-        
-        
-        
-        IF  $inverse
+
+
+
+        .ifeqs  "\inverse", "TRUE"
             VMULL   qT0,dXr3,dW3r
-            VMLAL   qT0,dXi3,dW3i                       ;// real part
+            VMLAL   qT0,dXi3,dW3i                       @// real part
             VMULL   qT1,dXi3,dW3r
-            VMLSL   qT1,dXr3,dW3i                       ;// imag part
-            
-        ELSE
+            VMLSL   qT1,dXr3,dW3i                       @// imag part
+
+        .ELSE
             VMULL   qT0,dXr3,dW3r
-            VMLSL   qT0,dXi3,dW3i                       ;// real part
+            VMLSL   qT0,dXi3,dW3i                       @// real part
             VMULL   qT1,dXi3,dW3r
-            VMLAL   qT1,dXr3,dW3i                       ;// imag part
-        
-        ENDIF
-        
-        ;// Load the second twiddle for 4 groups : w^2
-        ;// w^2 twiddle (2i+0,2i+2,2i+4,2i+6)   for group 0,1,2,3
-        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2@256]!
-        
-        
+            VMLAL   qT1,dXr3,dW3i                       @// imag part
+
+        .ENDIF
+
+        @// Load the second twiddle for 4 groups : w^2
+        @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6)   for group 0,1,2,3
+        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
+
+
         VRSHRN  dZr2,qT2,#15
         VRSHRN  dZi2,qT3,#15
-        
-        ;// Load the third twiddle for 4 groups : w^3
-        ;// w^3 twiddle (3i+0,3i+3,3i+6,3i+9)   for group 0,1,2,3
-        
-        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3@64]!
-        
+
+        @// Load the third twiddle for 4 groups : w^3
+        @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9)   for group 0,1,2,3
+
+        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
+
         VRSHRN  dZr3,qT0,#15
         VRSHRN  dZi3,qT1,#15
-        
-        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3@64]!        
-        
-        IF $scaled
-        
-            ;// finish first stage of 4 point FFT 
-            
+
+        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
+
+        .ifeqs "\scaled", "TRUE"
+
+            @// finish first stage of 4 point FFT
+
             VHADD    qY0,qX0,qZ2
             VHSUB    qY2,qX0,qZ2
             VHADD    qY1,qZ1,qZ3
-            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-            
+            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
             VHSUB    qY3,qZ1,qZ3
-                        
-            ;// finish second stage of 4 point FFT 
-            
+
+            @// finish second stage of 4 point FFT
+
             VHSUB    qZ0,qY2,qY1
             VHADD    qZ2,qY2,qY1
-            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-            
-                                    
-            IF $inverse
-                
-                VHADD    dZr3,dYr0,dYi3                          ;// y3 = u0-ju3
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+
+            .ifeqs "\inverse", "TRUE"
+
+                VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                 VHSUB    dZi3,dYi0,dYr3
-                
-                VHSUB    dZr1,dYr0,dYi3                          ;// y1 = u0+ju3
+
+                VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
                 VHADD    dZi1,dYi0,dYr3
-                VST2    {dZr3,dZi3},[pDst@128],outPointStep
-                VST2    {dZr2,dZi2},[pDst@128],outPointStep
-                VST2    {dZr1,dZi1},[pDst@128],dstStep              ;// dstStep = -3*outPointStep + 16
-            
-            ELSE
-                
-                VHSUB    dZr1,dYr0,dYi3                          ;// y1 = u0+ju3
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ELSE
+
+                VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
                 VHADD    dZi1,dYi0,dYr3
-            
-                VHADD    dZr3,dYr0,dYi3                          ;// y3 = u0-ju3
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+
+                VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                 VHSUB    dZi3,dYi0,dYr3
-                VST2    {dZr1,dZi1},[pDst@128],outPointStep
-                VST2    {dZr2,dZi2},[pDst@128],outPointStep
-                VST2    {dZr3,dZi3},[pDst@128],dstStep              ;// dstStep = -3*outPointStep + 16
-                
-            ENDIF            
-        
-        ELSE
-        
-            ;// finish first stage of 4 point FFT 
-            
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ENDIF
+
+        .ELSE
+
+            @// finish first stage of 4 point FFT
+
             VADD    qY0,qX0,qZ2
             VSUB    qY2,qX0,qZ2
             VADD    qY1,qZ1,qZ3
-            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-            
+            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
             VSUB    qY3,qZ1,qZ3
-                        
-            ;// finish second stage of 4 point FFT 
-            
+
+            @// finish second stage of 4 point FFT
+
             VSUB    qZ0,qY2,qY1
             VADD    qZ2,qY2,qY1
-            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-            
-                                    
-            IF $inverse
-                
-                VADD    dZr3,dYr0,dYi3                          ;// y3 = u0-ju3
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
-                VSUB    dZi3,dYi0,dYr3
-                
-                VSUB    dZr1,dYr0,dYi3                          ;// y1 = u0+ju3
-                VADD    dZi1,dYi0,dYr3
-                VST2    {dZr3,dZi3},[pDst@128],outPointStep
-                VST2    {dZr2,dZi2},[pDst@128],outPointStep
-                VST2    {dZr1,dZi1},[pDst@128],dstStep              ;// dstStep = -3*outPointStep + 16
-            
-            ELSE
-                
-                VSUB    dZr1,dYr0,dYi3                          ;// y1 = u0+ju3
-                VADD    dZi1,dYi0,dYr3
-            
-                VADD    dZr3,dYr0,dYi3                          ;// y3 = u0-ju3
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
-                VSUB    dZi3,dYi0,dYr3
-                VST2    {dZr1,dZi1},[pDst@128],outPointStep
-                VST2    {dZr2,dZi2},[pDst@128],outPointStep
-                VST2    {dZr3,dZi3},[pDst@128],dstStep              ;// dstStep = -3*outPointStep + 16
-                
-            ENDIF            
-                        
-            
-            
-                        
-        ENDIF
-        
-        BGT     grpLoop$name
-           
+            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
 
-        ;// Reset and Swap pSrc and pDst for the next stage     
+
+            .ifeqs "\inverse", "TRUE"
+
+                VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+
+                VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
+                VADD    dZi1,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ELSE
+
+                VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
+                VADD    dZi1,dYi0,dYr3
+
+                VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ENDIF
+
+
+
+
+        .ENDIF
+
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
         MOV     pTmp,pDst
-        SUB     pSrc,pSrc,#64                       ;// Extra increment currently done in the loop
-        SUB     pDst,pSrc,outPointStep,LSL #2       ;// pDst -= size; pSrc -= 4*size bytes           
+        SUB     pSrc,pSrc,#64                       @// Extra increment currently done in the loop
+        SUB     pDst,pSrc,outPointStep,LSL #2       @// pDst -= size; pSrc -= 4*size bytes
         SUB     pSrc,pTmp,outPointStep
-                
-        MEND
-        
-        
+
+        .endm
+
+
         M_START armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{FALSE},FWD
+        FFTSTAGE "FALSE","FALSE",FWD
         M_END
 
-        
+
         M_START armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
-        FFTSTAGE {FALSE},{TRUE},INV
+        FFTSTAGE "FALSE","TRUE",INV
         M_END
- 
-        
+
+
         M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{FALSE},FWDSFS
+        FFTSTAGE "TRUE","FALSE",FWDSFS
         M_END
 
-        
+
         M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
-        FFTSTAGE {TRUE},{TRUE},INVSFS
+        FFTSTAGE "TRUE","TRUE",INVSFS
         M_END
 
-        
-    ENDIF                                                           ;//CortexA8
-    
 
-    
-     
-    END
\ No newline at end of file
+
+
+
+
+    .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
index c13df04..0eba385 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
@@ -1,392 +1,400 @@
-;//
-;// 
-;// File Name:  armSP_FFT_CToC_SC16_Radix4_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision:   7761
-;// Last Modified Date:       Wed, 26 Sep 2007
-;// 
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;// 
-;// 
-;//
-;// Description:
-;// Compute a Radix 4 FFT stage for a N point complex signal
-;// 
-;// 
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
 
-        
-;// Include standard headers
-
-        INCLUDE omxtypes_s.h
-        INCLUDE armCOMM_s.h
-        INCLUDE armSP_FFT_s.h
-        
-        M_VARIANTS CortexA8
-        
-;// Import symbols required from other files
-;// (For example tables)
-    
-        
-        
-        
-;// Set debugging level        
-;//DEBUG_ON    SETL {TRUE}
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix4_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7761
+@// Last Modified Date:       Wed, 26 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
 
 
-;// Guarding implementation by the processor name
-    
+@// Include standard headers
 
-    
-    ;// Guarding implementation by the processor name
-    
-    IF  CortexA8
-    
-;// Import symbols required from other files
-;// (For example tables)
-    
-    
-;//Input Registers
-
-pSrc            RN  0
-pDst            RN  2
-pTwiddle        RN  1
-subFFTNum       RN  6
-subFFTSize      RN  7
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
 
 
 
-;//Output Registers
+@// Import symbols required from other files
+@// (For example tables)
 
 
-;//Local Scratch Registers
 
-grpCount        RN  3
-pointStep       RN  4
-outPointStep    RN  5
-stepTwiddle     RN  12
-setCount        RN  14
-srcStep         RN  8
-setStep         RN  9
-dstStep         RN  10
-twStep          RN  11
-t1              RN  3
 
-;// Neon Registers
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
 
-dW1              DN  D0.S16
-dW2              DN  D1.S16
-dW3              DN  D2.S16   
 
-dXr0             DN  D4.S16
-dXi0             DN  D5.S16
-dXr1             DN  D6.S16
-dXi1             DN  D7.S16
-dXr2             DN  D8.S16
-dXi2             DN  D9.S16
-dXr3             DN  D10.S16
-dXi3             DN  D11.S16
-dYr0             DN  D12.S16
-dYi0             DN  D13.S16
-dYr1             DN  D14.S16
-dYi1             DN  D15.S16
-dYr2             DN  D16.S16
-dYi2             DN  D17.S16
-dYr3             DN  D18.S16
-dYi3             DN  D19.S16
-qT0              QN  Q8.S32   
-qT1              QN  Q9.S32
-qT2              QN  Q6.S32
-qT3              QN  Q7.S32
+@// Guarding implementation by the processor name
 
-dZr0             DN  D20.S16
-dZi0             DN  D21.S16
-dZr1             DN  D22.S16
-dZi1             DN  D23.S16
-dZr2             DN  D24.S16
-dZi2             DN  D25.S16
-dZr3             DN  D26.S16
-dZi3             DN  D27.S16
-qY0              QN  Q6.S16
-qY1              QN  Q7.S16
-qY2              QN  Q8.S16
-qY3              QN  Q9.S16   
-qX0              QN  Q2.S16
-qZ0              QN  Q10.S16
-qZ1              QN  Q11.S16
-qZ2              QN  Q12.S16
-qZ3              QN  Q13.S16
 
-        
-        MACRO
-        FFTSTAGE $scaled, $inverse , $name
-        
-        ;// Define stack arguments
-        
-        
-        ;// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
-        
+
+    @// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@//Input Registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpCount                        r3
+#define pointStep                       r4
+#define outPointStep                    r5
+#define stepTwiddle                     r12
+#define setCount                        r14
+#define srcStep                         r8
+#define setStep                         r9
+#define dstStep                         r10
+#define twStep                          r11
+#define t1                              r3
+
+@// Neon Registers
+
+#define dW1                             D0.S16
+#define dW2                             D1.S16
+#define dW3                             D2.S16
+
+#define dXr0                            D4.S16
+#define dXi0                            D5.S16
+#define dXr1                            D6.S16
+#define dXi1                            D7.S16
+#define dXr2                            D8.S16
+#define dXi2                            D9.S16
+#define dXr3                            D10.S16
+#define dXi3                            D11.S16
+#define dYr0                            D12.S16
+#define dYi0                            D13.S16
+#define dYr1                            D14.S16
+#define dYi1                            D15.S16
+#define dYr2                            D16.S16
+#define dYi2                            D17.S16
+#define dYr3                            D18.S16
+#define dYi3                            D19.S16
+#define qT0                             Q8.S32
+#define qT1                             Q9.S32
+#define qT2                             Q6.S32
+#define qT3                             Q7.S32
+
+#define dZr0                            D20.S16
+#define dZi0                            D21.S16
+#define dZr1                            D22.S16
+#define dZi1                            D23.S16
+#define dZr2                            D24.S16
+#define dZi2                            D25.S16
+#define dZr3                            D26.S16
+#define dZi3                            D27.S16
+#define qY0                             Q6.S16
+#define qY1                             Q7.S16
+#define qY2                             Q8.S16
+#define qY3                             Q9.S16
+#define qX0                             Q2.S16
+#define qZ0                             Q10.S16
+#define qZ1                             Q11.S16
+#define qZ2                             Q12.S16
+#define qZ3                             Q13.S16
+
+
+        .MACRO FFTSTAGE scaled, inverse , name
+        @// Emits one radix-4 FFT stage. scaled=TRUE uses halving VHADD/VHSUB butterflies (else VADD/VSUB), inverse=TRUE swaps the VMLAL/VMLSL twiddle signs and the butterfly output order, name is appended to the grpLoop/setLoop labels
+        @// Define stack arguments
+
+
+        @// Update grpCount and grpSize right away in order to reuse the pGrpCount and pGrpSize regs
+
         LSL     grpCount,subFFTSize,#2
-        LSR     subFFTNum,subFFTNum,#2  
+        LSR     subFFTNum,subFFTNum,#2
         MOV     subFFTSize,grpCount
-        
-        
-        ;// pOut0+1 increments pOut0 by 4 bytes
-        ;// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes
-        
+
+
+        @// pOut0+1 increments pOut0 by 4 bytes
+        @// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes
+
         MOV     stepTwiddle,#0
-        SMULBB  outPointStep,grpCount,subFFTNum  
-        
-        ;// pT0+1 increments pT0 by 4 bytes
-        ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
-        
-        LSL     pointStep,subFFTNum,#2                      ;// 2*grpSize    
-        
-        VLD1     dW1,[pTwiddle@64]                             ;//[wi | wr]
-        MOV     srcStep,pointStep,LSL #1                    ;// srcStep = 2*pointStep
-        VLD1     dW2,[pTwiddle@64]                             ;//[wi | wr]
-        ADD     setStep,srcStep,pointStep                   ;// setStep = 3*pointStep
-        SUB     srcStep,srcStep,#16                         ;// srcStep = 2*pointStep-16
-        VLD1     dW3,[pTwiddle@64]
-        ;//RSB     setStep,setStep,#16                      ;// setStep = - 3*pointStep+16
-        RSB     setStep,setStep,#0                          ;// setStep = - 3*pointStep
-        
+        SMULBB  outPointStep,grpCount,subFFTNum
+
+        @// pT0+1 increments pT0 by 4 bytes
+        @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
+
+        LSL     pointStep,subFFTNum,#2                      @// 2*grpSize
+
+        VLD1     dW1,[pTwiddle :64]                             @//[wi | wr]
+        MOV     srcStep,pointStep,LSL #1                    @// srcStep = 2*pointStep
+        VLD1     dW2,[pTwiddle :64]                             @//[wi | wr]
+        ADD     setStep,srcStep,pointStep                   @// setStep = 3*pointStep
+        SUB     srcStep,srcStep,#16                         @// srcStep = 2*pointStep-16
+        VLD1     dW3,[pTwiddle :64]
+        @//RSB     setStep,setStep,#16                      @// setStep = - 3*pointStep+16
+        RSB     setStep,setStep,#0                          @// setStep = - 3*pointStep
+
         MOV     dstStep,outPointStep,LSL #1
-        ADD     dstStep,dstStep,outPointStep                ;// dstStep = 3*outPointStep
-        RSB     dstStep,dstStep,#16                         ;// dstStep = - 3*outPointStep+16
-        
+        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
 
-        
-grpLoop$name      
-        
-        VLD2    {dXr0,dXi0},[pSrc@128],pointStep          ;//  data[0]
+
+
+grpLoop\name:
+
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
         ADD      stepTwiddle,stepTwiddle,pointStep
-        VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
-        ADD      pTwiddle,pTwiddle,stepTwiddle               ;// set pTwiddle to the first point
-        VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+        ADD      pTwiddle,pTwiddle,stepTwiddle               @// set pTwiddle to the first point
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
         MOV      twStep,stepTwiddle,LSL #2
-        VLD2    {dXr3,dXi3},[pSrc@128],setStep            ;//  data[3] & reset pSrc 
-        
-        SUB      twStep,stepTwiddle,twStep                   ;// twStep = -3*stepTwiddle
-        
-        
-        MOV      setCount,pointStep,LSR #2
-        ADD     pSrc,pSrc,#16                         ;// set pSrc to data[0] of the next set
-        ADD     pSrc,pSrc,pointStep                   ;// increment to data[1] of the next set
-       
-        ;// Loop on the sets : 4 at a time
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & reset pSrc
 
-setLoop$name        
-               
-        SUBS    setCount,setCount,#4                    ;// decrement the loop counter
-        
-        IF  $inverse
+        SUB      twStep,stepTwiddle,twStep                   @// twStep = -3*stepTwiddle
+
+
+        MOV      setCount,pointStep,LSR #2
+        ADD     pSrc,pSrc,#16                         @// set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,pointStep                   @// increment to data[1] of the next set
+
+        @// Loop on the sets : 4 at a time
+
+setLoop\name:
+
+        SUBS    setCount,setCount,#4                    @// decrement the loop counter
+
+        .ifeqs  "\inverse", "TRUE"
             VMULL   qT0,dXr1,dW1[0]
-            VMLAL   qT0,dXi1,dW1[1]                       ;// real part
+            VMLAL   qT0,dXi1,dW1[1]                       @// real part
             VMULL   qT1,dXi1,dW1[0]
-            VMLSL   qT1,dXr1,dW1[1]                       ;// imag part
-            
-        ELSE
+            VMLSL   qT1,dXr1,dW1[1]                       @// imag part
+
+        .ELSE
             VMULL   qT0,dXr1,dW1[0]
-            VMLSL   qT0,dXi1,dW1[1]                       ;// real part
+            VMLSL   qT0,dXi1,dW1[1]                       @// real part
             VMULL   qT1,dXi1,dW1[0]
-            VMLAL   qT1,dXr1,dW1[1]                       ;// imag part
-        
-        ENDIF
-        
-        VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
-        
-        IF  $inverse
+            VMLAL   qT1,dXr1,dW1[1]                       @// imag part
+
+        .ENDIF
+
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+
+        .ifeqs  "\inverse", "TRUE"
             VMULL   qT2,dXr2,dW2[0]
-            VMLAL   qT2,dXi2,dW2[1]                       ;// real part
+            VMLAL   qT2,dXi2,dW2[1]                       @// real part
             VMULL   qT3,dXi2,dW2[0]
-            VMLSL   qT3,dXr2,dW2[1]                       ;// imag part
-            
-        ELSE
+            VMLSL   qT3,dXr2,dW2[1]                       @// imag part
+
+        .ELSE
             VMULL   qT2,dXr2,dW2[0]
-            VMLSL   qT2,dXi2,dW2[1]                       ;// real part
+            VMLSL   qT2,dXi2,dW2[1]                       @// real part
             VMULL   qT3,dXi2,dW2[0]
-            VMLAL   qT3,dXr2,dW2[1]                       ;// imag part
-        
-        ENDIF
-        
+            VMLAL   qT3,dXr2,dW2[1]                       @// imag part
+
+        .ENDIF
+        @// Round and narrow the 32-bit data[1] products (qT0,qT1) back to 16 bits with a rounding shift right by 15
         VRSHRN  dZr1,qT0,#15
         VRSHRN  dZi1,qT1,#15
-        
-        
-        VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
-        
-        IF  $inverse
+
+
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+
+        .ifeqs  "\inverse", "TRUE"
             VMULL   qT0,dXr3,dW3[0]
-            VMLAL   qT0,dXi3,dW3[1]                       ;// real part
+            VMLAL   qT0,dXi3,dW3[1]                       @// real part
             VMULL   qT1,dXi3,dW3[0]
-            VMLSL   qT1,dXr3,dW3[1]                       ;// imag part
-            
-        ELSE
+            VMLSL   qT1,dXr3,dW3[1]                       @// imag part
+
+        .ELSE
             VMULL   qT0,dXr3,dW3[0]
-            VMLSL   qT0,dXi3,dW3[1]                       ;// real part
+            VMLSL   qT0,dXi3,dW3[1]                       @// real part
             VMULL   qT1,dXi3,dW3[0]
-            VMLAL   qT1,dXr3,dW3[1]                       ;// imag part
-        
-        ENDIF
-        
+            VMLAL   qT1,dXr3,dW3[1]                       @// imag part
+
+        .ENDIF
+        @// Narrow the data[2] products (qT2,qT3) first, then the data[3] products (qT0,qT1)
         VRSHRN  dZr2,qT2,#15
         VRSHRN  dZi2,qT3,#15
-        
-        
+
+
         VRSHRN  dZr3,qT0,#15
         VRSHRN  dZi3,qT1,#15
-        VLD2    {dXr3,dXi3},[pSrc@128],setStep            ;//  data[3] & update pSrc for the next set
-        
-        
-        IF $scaled
-        
-            ;// finish first stage of 4 point FFT 
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set
+
+
+        .ifeqs "\scaled", "TRUE"
+
+            @// finish first stage of 4 point FFT
             VHADD    qY0,qX0,qZ2
             VHSUB    qY2,qX0,qZ2
-                        
-            VLD2    {dXr0,dXi0},[pSrc@128]!          ;//  data[0]
+
+            VLD2    {dXr0,dXi0},[pSrc :128]!          @//  data[0]
             VHADD    qY1,qZ1,qZ3
             VHSUB    qY3,qZ1,qZ3
-            
-                        
-            ;// finish second stage of 4 point FFT 
-                                    
-            IF  $inverse
-                
+
+
+            @// finish second stage of 4 point FFT
+
+            .ifeqs  "\inverse", "TRUE"
+
                 VHSUB    qZ0,qY2,qY1
-                
+
                 VHADD    dZr2,dYr0,dYi3
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                 VHSUB    dZi2,dYi0,dYr3
-                
+
                 VHADD    qZ1,qY2,qY1
-                VST2    {dZr2,dZi2},[pDst@128],outPointStep
-                
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+
                 VHSUB    dZr3,dYr0,dYi3
-                VST2    {dZr1,dZi1},[pDst@128],outPointStep
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
                 VHADD    dZi3,dYi0,dYr3
-                VST2    {dZr3,dZi3},[pDst@128],dstStep
-                
-                
-            ELSE
-                
+                VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+
+            .ELSE
+
                 VHSUB    qZ0,qY2,qY1
-                       
+
                 VHSUB    dZr3,dYr0,dYi3
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                 VHADD    dZi3,dYi0,dYr3
-                        
+
                 VHADD    qZ1,qY2,qY1
-                VST2    {dZr3,dZi3},[pDst@128],outPointStep
-                        
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
                 VHADD    dZr2,dYr0,dYi3
                 VHSUB    dZi2,dYi0,dYr3
-                VST2    {dZr1,dZi1},[pDst@128],outPointStep
-                VST2    {dZr2,dZi2},[pDst@128],dstStep
-                
-            
-            ENDIF
-        
-        
-        ELSE
-        
-            ;// finish first stage of 4 point FFT 
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],dstStep
+
+
+            .ENDIF
+
+
+        .ELSE
+
+            @// finish first stage of 4 point FFT
             VADD    qY0,qX0,qZ2
             VSUB    qY2,qX0,qZ2
-                        
-            VLD2    {dXr0,dXi0},[pSrc]!          ;//  data[0]
+
+            VLD2    {dXr0,dXi0},[pSrc]!          @//  data[0]
             VADD    qY1,qZ1,qZ3
             VSUB    qY3,qZ1,qZ3
-            
-                        
-            ;// finish second stage of 4 point FFT 
-                                   
-                        
-            IF  $inverse
-                
-                VSUB    qZ0,qY2,qY1
-                
-                VADD    dZr2,dYr0,dYi3
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
-                VSUB    dZi2,dYi0,dYr3
-                
-                VADD    qZ1,qY2,qY1
-                VST2    {dZr2,dZi2},[pDst@128],outPointStep
-                
-                VSUB    dZr3,dYr0,dYi3
-                VST2    {dZr1,dZi1},[pDst@128],outPointStep
-                VADD    dZi3,dYi0,dYr3
-                VST2    {dZr3,dZi3},[pDst@128],dstStep
-                
-                
-            ELSE
-                
-                VSUB    qZ0,qY2,qY1
-                       
-                VSUB    dZr3,dYr0,dYi3
-                VST2    {dZr0,dZi0},[pDst@128],outPointStep
-                VADD    dZi3,dYi0,dYr3
-                        
-                VADD    qZ1,qY2,qY1
-                VST2    {dZr3,dZi3},[pDst@128],outPointStep
-                        
-                VADD    dZr2,dYr0,dYi3
-                VSUB    dZi2,dYi0,dYr3
-                VST2    {dZr1,dZi1},[pDst@128],outPointStep
-                VST2    {dZr2,dZi2},[pDst@128],dstStep
-                
-            
-            ENDIF
-                                    
-                        
-            
-        ENDIF
-        
-        ADD     pSrc,pSrc,pointStep                         ;// increment to data[1] of the next set       
-        BGT     setLoop$name
-        
-        VLD1     dW1,[pTwiddle@64],stepTwiddle                 ;//[wi | wr]
-        SUBS    grpCount,grpCount,#4                        ;// subtract 4 since grpCount multiplied by 4               
-        VLD1     dW2,[pTwiddle@64],stepTwiddle                 ;//[wi | wr]
-        ADD     pSrc,pSrc,srcStep                           ;// increment pSrc for the next grp
-        VLD1     dW3,[pTwiddle@64],twStep                      ;//[wi | wr]
-        
-        
-        
-        BGT     grpLoop$name    
 
-                
-        ;// Reset and Swap pSrc and pDst for the next stage
+
+            @// finish second stage of 4 point FFT
+
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VSUB    qZ0,qY2,qY1
+
+                VADD    dZr2,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi2,dYi0,dYr3
+
+                VADD    qZ1,qY2,qY1
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+
+                VSUB    dZr3,dYr0,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VADD    dZi3,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+
+            .ELSE
+
+                VSUB    qZ0,qY2,qY1
+
+                VSUB    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VADD    dZi3,dYi0,dYr3
+
+                VADD    qZ1,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VADD    dZr2,dYr0,dYi3
+                VSUB    dZi2,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],dstStep
+
+
+            .ENDIF
+
+
+
+        .ENDIF
+
+        ADD     pSrc,pSrc,pointStep                         @// increment to data[1] of the next set
+        BGT     setLoop\name
+
+        VLD1     dW1,[pTwiddle :64],stepTwiddle                 @//[wi | wr]
+        SUBS    grpCount,grpCount,#4                        @// subtract 4 since grpCount multiplied by 4
+        VLD1     dW2,[pTwiddle :64],stepTwiddle                 @//[wi | wr]
+        ADD     pSrc,pSrc,srcStep                           @// increment pSrc for the next grp
+        VLD1     dW3,[pTwiddle :64],twStep                      @//[wi | wr]
+
+
+
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
         MOV     t1,pDst
-        SUB     pDst,pSrc,outPointStep,LSL #2           ;// pDst -= size; pSrc -= 4*size bytes           
-        SUB     pSrc,t1,outPointStep    
-        
-        
-        MEND
-        
-        
+        SUB     pDst,pSrc,outPointStep,LSL #2           @// pDst -= size; pSrc -= 4*size bytes
+        SUB     pSrc,t1,outPointStep
+
+
+        .endm
+
+        @// Forward, unscaled radix-4 stage: FFTSTAGE with scaled=FALSE, inverse=FALSE, loop-label suffix FWD
         M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
-            FFTSTAGE {FALSE},{FALSE},FWD
+            FFTSTAGE "FALSE","FALSE",FWD
         M_END
 
-        
+        @// Inverse, unscaled radix-4 stage: FFTSTAGE with scaled=FALSE, inverse=TRUE, loop-label suffix INV
         M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
-            FFTSTAGE {FALSE},{TRUE},INV
+            FFTSTAGE "FALSE","TRUE",INV
         M_END
- 
-        
+
+        @// Forward, scaled (Sfs) radix-4 stage: FFTSTAGE with scaled=TRUE, inverse=FALSE, loop-label suffix FWDSFS
         M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
-            FFTSTAGE {TRUE},{FALSE},FWDSFS
+            FFTSTAGE "TRUE","FALSE",FWDSFS
         M_END
 
-        
+        @// Inverse, scaled (Sfs) radix-4 stage: FFTSTAGE with scaled=TRUE, inverse=TRUE, loop-label suffix INVSFS
         M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
-            FFTSTAGE {TRUE},{TRUE},INVSFS
+            FFTSTAGE "TRUE","TRUE",INVSFS
         M_END
 
-        
-    ENDIF                                                           ;//CortexA8
-    
- 
-    
-    END
\ No newline at end of file
+
+
+
+
+    .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
index 741681f..588c319 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
@@ -1,591 +1,619 @@
-;//
-;// 
-;// File Name:  armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision:   7766
-;// Last Modified Date:       Thu, 27 Sep 2007
-;// 
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;// 
-;// 
-;//
-;// Description:
-;// Compute a first stage Radix 8 FFT stage for a N point complex signal
-;// 
-;// 
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
 
-        
-;// Include standard headers
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7766
+@// Last Modified Date:       Thu, 27 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a first stage Radix 8 FFT stage for a N point complex signal
+@//
+@//
 
-        INCLUDE omxtypes_s.h
-        INCLUDE armCOMM_s.h
-        
-        M_VARIANTS CortexA8
-        
-;// Import symbols required from other files
-;// (For example tables)
-    
-        
-;// Set debugging level        
-;//DEBUG_ON    SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
 
 
 
-;// Guarding implementation by the processor name
-    
-    
-    
-    
-;// Guarding implementation by the processor name
-    
-    IF  CortexA8
-    
-;//Input Registers
-
-pSrc            RN  0
-pDst            RN  2
-pTwiddle        RN  1
-subFFTNum       RN  6
-subFFTSize      RN  7
-pPingPongBuf    RN  5                  ;// dest buffer for the next stage (not pSrc for first stage) 
-
-
-;//Output Registers
-
-
-;//Local Scratch Registers
-
-grpSize         RN  3
-setCount        RN  3                  ;// Reuse grpSize as setCount
-pointStep       RN  4
-outPointStep    RN  4
-setStep         RN  8
-step1           RN  9
-step2           RN  10
-t0              RN  11
-  
-
-;// Neon Registers
-
-dXr0             DN  D14.S16
-dXi0             DN  D15.S16
-dXr1             DN  D2.S16
-dXi1             DN  D3.S16
-dXr2             DN  D4.S16
-dXi2             DN  D5.S16
-dXr3             DN  D6.S16
-dXi3             DN  D7.S16
-dXr4             DN  D8.S16
-dXi4             DN  D9.S16
-dXr5             DN  D10.S16
-dXi5             DN  D11.S16
-dXr6             DN  D12.S16
-dXi6             DN  D13.S16
-dXr7             DN  D0.S16
-dXi7             DN  D1.S16
-qX0              QN  Q7.S16
-qX1              QN  Q1.S16
-qX2              QN  Q2.S16
-qX3              QN  Q3.S16   
-qX4              QN  Q4.S16
-qX5              QN  Q5.S16
-qX6              QN  Q6.S16
-qX7              QN  Q0.S16
-
-dUr0             DN  D16.S16
-dUi0             DN  D17.S16
-dUr2             DN  D18.S16
-dUi2             DN  D19.S16
-dUr4             DN  D20.S16
-dUi4             DN  D21.S16
-dUr6             DN  D22.S16
-dUi6             DN  D23.S16
-dUr1             DN  D24.S16
-dUi1             DN  D25.S16
-dUr3             DN  D26.S16
-dUi3             DN  D27.S16
-dUr5             DN  D28.S16
-dUi5             DN  D29.S16
-dUr7             DN  D30.S16                ;// reuse dXr7 and dXi7
-dUi7             DN  D31.S16
-qU0              QN   Q8.S16
-qU1              QN   Q12.S16
-qU2              QN   Q9.S16
-qU3              QN   Q13.S16   
-qU4              QN   Q10.S16
-qU5              QN   Q14.S16
-qU6              QN   Q11.S16
-qU7              QN   Q15.S16
+@// Guarding implementation by the processor name
 
 
 
-dVr0             DN  D24.S16
-dVi0             DN  D25.S16
-dVr2             DN  D26.S16
-dVi2             DN  D27.S16
-dVr4             DN  D28.S16
-dVi4             DN  D29.S16
-dVr6             DN  D30.S16
-dVi6             DN  D31.S16
-dVr1             DN  D16.S16
-dVi1             DN  D17.S16
-dVr3             DN  D18.S16
-dVi3             DN  D19.S16
-dVr5             DN  D20.S16
-dVi5             DN  D21.S16
-dVr7             DN  D22.S16              ;// reuse dUi7 
-dVi7             DN  D23.S16              ;// reuse dUr7 
-qV0              QN  Q12.S16
-qV1              QN  Q8.S16
-qV2              QN  Q13.S16
-qV3              QN  Q9.S16   
-qV4              QN  Q14.S16
-qV5              QN  Q10.S16
-qV6              QN  Q15.S16
-qV7              QN  Q11.S16
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+@// dest buffer for the next stage (not pSrc for first stage)
+#define pPingPongBuf                    r5
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize                         r3
+@// Reuse grpSize as setCount
+#define setCount                        r3
+#define pointStep                       r4
+#define outPointStep                    r4
+#define setStep                         r8
+#define step1                           r9
+#define step2                           r10
+#define t0                              r11
+
+
+@// Neon Registers (each qXn/qUn is the Q-register view of the dXrn/dXin or dUrn/dUin pair)
+
+#define dXr0                            D14.S16
+#define dXi0                            D15.S16
+#define dXr1                            D2.S16
+#define dXi1                            D3.S16
+#define dXr2                            D4.S16
+#define dXi2                            D5.S16
+#define dXr3                            D6.S16
+#define dXi3                            D7.S16
+#define dXr4                            D8.S16
+#define dXi4                            D9.S16
+#define dXr5                            D10.S16
+#define dXi5                            D11.S16
+#define dXr6                            D12.S16
+#define dXi6                            D13.S16
+#define dXr7                            D0.S16
+#define dXi7                            D1.S16
+#define qX0                             Q7.S16
+#define qX1                             Q1.S16
+#define qX2                             Q2.S16
+#define qX3                             Q3.S16
+#define qX4                             Q4.S16
+#define qX5                             Q5.S16
+#define qX6                             Q6.S16
+#define qX7                             Q0.S16
+
+#define dUr0                            D16.S16
+#define dUi0                            D17.S16
+#define dUr2                            D18.S16
+#define dUi2                            D19.S16
+#define dUr4                            D20.S16
+#define dUi4                            D21.S16
+#define dUr6                            D22.S16
+#define dUi6                            D23.S16
+#define dUr1                            D24.S16
+#define dUi1                            D25.S16
+#define dUr3                            D26.S16
+#define dUi3                            D27.S16
+#define dUr5                            D28.S16
+#define dUi5                            D29.S16
+@// NOTE(review): this said reuse dXr7 and dXi7, but dXr7/dXi7 are D0/D1 whereas dUr7/dUi7 are D30/D31
+#define dUr7                            D30.S16
+#define dUi7                            D31.S16
+#define qU0                             Q8.S16
+#define qU1                             Q12.S16
+#define qU2                             Q9.S16
+#define qU3                             Q13.S16
+#define qU4                             Q10.S16
+#define qU5                             Q14.S16
+#define qU6                             Q11.S16
+#define qU7                             Q15.S16
 
 
 
-dYr0             DN  D16.S16
-dYi0             DN  D17.S16
-dYr2             DN  D18.S16
-dYi2             DN  D19.S16
-dYr4             DN  D20.S16
-dYi4             DN  D21.S16
-dYr6             DN  D22.S16
-dYi6             DN  D23.S16
-dYr1             DN  D24.S16
-dYi1             DN  D25.S16
-dYr3             DN  D26.S16
-dYi3             DN  D27.S16
-dYr5             DN  D28.S16
-dYi5             DN  D29.S16
-dYr7             DN  D30.S16                 ;// reuse dYr4 and dYi4
-dYi7             DN  D31.S16
-qY0              QN   Q8.S16
-qY1              QN   Q12.S16
-qY2              QN   Q9.S16
-qY3              QN   Q13.S16   
-qY4              QN   Q10.S16
-qY5              QN   Q14.S16
-qY6              QN   Q11.S16
-qY7              QN   Q15.S16
+#define dVr0                            D24.S16
+#define dVi0                            D25.S16
+#define dVr2                            D26.S16
+#define dVi2                            D27.S16
+#define dVr4                            D28.S16
+#define dVi4                            D29.S16
+#define dVr6                            D30.S16
+#define dVi6                            D31.S16
+#define dVr1                            D16.S16
+#define dVi1                            D17.S16
+#define dVr3                            D18.S16
+#define dVi3                            D19.S16
+#define dVr5                            D20.S16
+#define dVi5                            D21.S16
+@// D22 is also dUr6 (not dUi7, which is D31)
+#define dVr7                            D22.S16
+@// D23 is also dUi6 (not dUr7, which is D30)
+#define dVi7                            D23.S16
+#define qV0                             Q12.S16
+#define qV1                             Q8.S16
+#define qV2                             Q13.S16
+#define qV3                             Q9.S16
+#define qV4                             Q14.S16
+#define qV5                             Q10.S16
+#define qV6                             Q15.S16
+#define qV7                             Q11.S16
 
 
-dT0              DN  D0.S16             
-dT1              DN  D1.S16
+
+#define dYr0                            D16.S16
+#define dYi0                            D17.S16
+#define dYr2                            D18.S16
+#define dYi2                            D19.S16
+#define dYr4                            D20.S16
+#define dYi4                            D21.S16
+#define dYr6                            D22.S16
+#define dYi6                            D23.S16
+#define dYr1                            D24.S16
+#define dYi1                            D25.S16
+#define dYr3                            D26.S16
+#define dYi3                            D27.S16
+#define dYr5                            D28.S16
+#define dYi5                            D29.S16
+@// D30/D31 are also dUr7/dUi7 and dVr6/dVi6 (not dYr4/dYi4, which are D20/D21)
+#define dYr7                            D30.S16
+#define dYi7                            D31.S16
+#define qY0                             Q8.S16
+#define qY1                             Q12.S16
+#define qY2                             Q9.S16
+#define qY3                             Q13.S16
+#define qY4                             Q10.S16
+#define qY5                             Q14.S16
+#define qY6                             Q11.S16
+#define qY7                             Q15.S16
 
 
-;// Define constants
-ONEBYSQRT2      EQU   0x00005A82        ;// Q15 format
-    
+#define dT0                             D0.S16
+#define dT1                             D1.S16
 
-        MACRO
-        FFTSTAGE $scaled, $inverse , $name
-        
-        ;// Define stack arguments
-        
-        ;// Update pSubFFTSize and pSubFFTNum regs
-        MOV     subFFTSize,#8                               ;// subFFTSize = 1 for the first stage
-        LDR     t0,=ONEBYSQRT2                              ;// t0=(1/sqrt(2)) as Q15 format
-        
-        ;// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
-        LSR     grpSize,subFFTNum,#3  
+
+@// Define constants: 0x5A82 = 23170 = round(32768/sqrt(2))
+        .set   ONEBYSQRT2, 0x00005A82        @// 1/sqrt(2) in Q15 format
+
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        @// Define stack arguments
+
+        @// Update pSubFFTSize and pSubFFTNum regs
+        MOV     subFFTSize,#8                               @// subFFTSize = 1 for the first stage
+        LDR     t0,=ONEBYSQRT2                              @// t0=(1/sqrt(2)) as Q15 format
+
+        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#3
         MOV     subFFTNum,grpSize
-        
-                
-        ;// pT0+1 increments pT0 by 4 bytes
-        ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize/2 bytes
-        ;// Note: outPointStep = pointStep for firststage
-        
-        MOV     pointStep,grpSize,LSL #2
-        
-                                       
-        ;// Calculate the step of input data for the next set
-        ;//MOV     step1,pointStep,LSL #1                      ;// step1 = 2*pointStep
-        VLD2    {dXr0,dXi0},[pSrc@128],pointStep          ;//  data[0]
-        MOV     step1,grpSize,LSL #3
-        
-        MOV     step2,pointStep,LSL #3
-        VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
-        SUB     step2,step2,pointStep                          ;// step2 = 7*pointStep
-        RSB     setStep,step2,#16                              ;// setStep = - 7*pointStep+16
-        
-        
-        
-        VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
-        VLD2    {dXr3,dXi3},[pSrc@128],pointStep          ;//  data[3] 
-        VLD2    {dXr4,dXi4},[pSrc@128],pointStep          ;//  data[4]
-        VLD2    {dXr5,dXi5},[pSrc@128],pointStep          ;//  data[5]
-        VLD2    {dXr6,dXi6},[pSrc@128],pointStep          ;//  data[6]
-        VLD2    {dXr7,dXi7},[pSrc@128],setStep            ;//  data[7] & update pSrc for the next set
-                                                      ;//  setStep = -7*pointStep + 16  
-        ;// grp = 0 a special case since all the twiddle factors are 1
-        ;// Loop on the sets : 4 sets at a time
 
-grpZeroSetLoop$name
-                                                      
-        ;// Decrement setcount
-        SUBS    setCount,setCount,#4                    ;// decrement the set loop counter           
-                                                                         
-        
-        IF $scaled
-            ;// finish first stage of 8 point FFT 
-            
+
+        @// pT0+1 increments pT0 by 4 bytes
+        @// pT0+pointStep = increment of 4*pointStep bytes = grpSize/2 bytes
+        @// Note: outPointStep = pointStep for firststage
+
+        MOV     pointStep,grpSize,LSL #2
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     step1,pointStep,LSL #1                      @// step1 = 2*pointStep
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        MOV     step1,grpSize,LSL #3
+
+        MOV     step2,pointStep,LSL #3
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+        SUB     step2,step2,pointStep                          @// step2 = 7*pointStep
+        RSB     setStep,step2,#16                              @// setStep = - 7*pointStep+16
+
+
+
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
+        VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+        VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+        VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+        VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7] & update pSrc for the next set
+                                                      @//  setStep = -7*pointStep + 16
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets : 4 sets at a time
+
+grpZeroSetLoop\name:
+
+        @// Decrement setcount
+        SUBS    setCount,setCount,#4                    @// decrement the set loop counter
+
+
+        .ifeqs "\scaled", "TRUE"
+            @// finish first stage of 8 point FFT
+
             VHADD    qU0,qX0,qX4
             VHADD    qU2,qX1,qX5
             VHADD    qU4,qX2,qX6
             VHADD    qU6,qX3,qX7
-            
-            ;// finish second stage of 8 point FFT 
-            
+
+            @// finish second stage of 8 point FFT
+
             VHADD    qV0,qU0,qU4
             VHSUB    qV2,qU0,qU4
             VHADD    qV4,qU2,qU6
             VHSUB    qV6,qU2,qU6
-            
-            ;// finish third stage of 8 point FFT 
-            
+
+            @// finish third stage of 8 point FFT
+
             VHADD    qY0,qV0,qV4
             VHSUB    qY4,qV0,qV4
-            VST2    {dYr0,dYi0},[pDst@128],step1                    ;// store y0
-            
-            IF  $inverse
-                
-                VHSUB    dYr2,dVr2,dVi6
-                VHADD    dYi2,dVi2,dVr6
-                
-                VHADD    dYr6,dVr2,dVi6
-                VST2    {dYr2,dYi2},[pDst@128],step1                    ;// store y2
-                VHSUB    dYi6,dVi2,dVr6
-            
-                VHSUB    qU1,qX0,qX4                    
-                VST2    {dYr4,dYi4},[pDst@128],step1                    ;// store y4
-            
-                VHSUB    qU3,qX1,qX5
-                VHSUB    qU5,qX2,qX6
-                VST2    {dYr6,dYi6},[pDst@128],step1                    ;// store y6
-            
-            ELSE
-            
-                VHADD    dYr6,dVr2,dVi6
-                VHSUB    dYi6,dVi2,dVr6
-                
-                VHSUB    dYr2,dVr2,dVi6
-                VST2    {dYr6,dYi6},[pDst@128],step1                    ;// store y2
-                VHADD    dYi2,dVi2,dVr6
-                
-                                
-                VHSUB    qU1,qX0,qX4
-                VST2    {dYr4,dYi4},[pDst@128],step1                    ;// store y4
-                VHSUB    qU3,qX1,qX5
-                VHSUB    qU5,qX2,qX6
-                VST2    {dYr2,dYi2},[pDst@128],step1                    ;// store y6
+            VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
 
-            
-            ENDIF
-            
-            ;// finish first stage of 8 point FFT 
-            
+            .ifeqs  "\inverse", "TRUE"
+
+                VHSUB    dYr2,dVr2,dVi6
+                VHADD    dYi2,dVi2,dVr6
+
+                VHADD    dYr6,dVr2,dVi6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
+                VHSUB    dYi6,dVi2,dVr6
+
+                VHSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+
+                VHSUB    qU3,qX1,qX5
+                VHSUB    qU5,qX2,qX6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
+
+            .ELSE
+
+                VHADD    dYr6,dVr2,dVi6
+                VHSUB    dYi6,dVi2,dVr6
+
+                VHSUB    dYr2,dVr2,dVi6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2
+                VHADD    dYi2,dVi2,dVr6
+
+
+                VHSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+                VHSUB    qU3,qX1,qX5
+                VHSUB    qU5,qX2,qX6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6
+
+
+            .ENDIF
+
+            @// finish first stage of 8 point FFT
+
             VHSUB    qU7,qX3,qX7
-            VMOV    dT0[0],t0                                   
-            
-            ;// finish second stage of 8 point FFT 
-            
+            VMOV    dT0[0],t0
+
+            @// finish second stage of 8 point FFT
+
             VHSUB    dVr1,dUr1,dUi5
-            VLD2    {dXr0,dXi0},[pSrc@128],pointStep          ;//  data[0] for next iteration
+            VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
             VHADD    dVi1,dUi1,dUr5
             VHADD    dVr3,dUr1,dUi5
-            VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
+            VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
             VHSUB    dVi3,dUi1,dUr5
-                        
+
             VHSUB    dVr5,dUr3,dUi7
-            VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
+            VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
             VHADD    dVi5,dUi3,dUr7
             VHADD    dVr7,dUr3,dUi7
-            VLD2    {dXr3,dXi3},[pSrc@128],pointStep          ;//  data[3]
+            VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
             VHSUB    dVi7,dUi3,dUr7
-            
-            ;// finish third stage of 8 point FFT 
-            
-            IF  $inverse
-            
-                ;// calculate a*v5 
-                VQRDMULH    dT1,dVr5,dT0[0]                         ;// use dVi0 for dT1
-                VLD2    {dXr4,dXi4},[pSrc@128],pointStep          ;//  data[4]
+
+            @// finish third stage of 8 point FFT
+
+            .ifeqs  "\inverse", "TRUE"
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// use dVi0 for dT1
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
                 VQRDMULH    dVi5,dVi5,dT0[0]
-                            
-                VLD2    {dXr5,dXi5},[pSrc@128],pointStep          ;//  data[5]
-                VSUB    dVr5,dT1,dVi5                               ;// a * V5
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
                 VADD    dVi5,dT1,dVi5
-                
-                VLD2    {dXr6,dXi6},[pSrc@128],pointStep          ;//  data[6]
-                
-                ;// calculate  b*v7
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate  b*v7
                 VQRDMULH    dT1,dVr7,dT0[0]
                 VQRDMULH    dVi7,dVi7,dT0[0]
-                
+
                 VHADD    qY1,qV1,qV5
                 VHSUB    qY5,qV1,qV5
-                
-                            
-                VADD    dVr7,dT1,dVi7                               ;// b * V7
+
+
+                VADD    dVr7,dT1,dVi7                               @// b * V7
                 VSUB    dVi7,dVi7,dT1
-                SUB     pDst, pDst, step2                           ;// set pDst to y1
-                
-                VLD2    {dXr7,dXi7},[pSrc@128],setStep            ;//  data[7]            
-                
-                
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
+
                 VHSUB    dYr3,dVr3,dVr7
                 VHSUB    dYi3,dVi3,dVi7
-                VST2    {dYr1,dYi1},[pDst@128],step1                    ;// store y1
+                VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
                 VHADD    dYr7,dVr3,dVr7
                 VHADD    dYi7,dVi3,dVi7
 
-                
-                VST2    {dYr3,dYi3},[pDst@128],step1                    ;// store y3
-                VST2    {dYr5,dYi5},[pDst@128],step1                    ;// store y5
-                VST2    {dYr7,dYi7},[pDst@128],#16                      ;// store y7
-            ELSE
-            
-                ;// calculate  b*v7
+
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr7,dYi7},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7
+#endif
+            .ELSE
+
+                @// calculate  b*v7
                 VQRDMULH    dT1,dVr7,dT0[0]
-                VLD2    {dXr4,dXi4},[pSrc@128],pointStep          ;//  data[4]
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
                 VQRDMULH    dVi7,dVi7,dT0[0]
-                
-                VLD2    {dXr5,dXi5},[pSrc@128],pointStep          ;//  data[5]
-                VADD    dVr7,dT1,dVi7                               ;// b * V7
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VADD    dVr7,dT1,dVi7                               @// b * V7
                 VSUB    dVi7,dVi7,dT1
-                
-                VLD2    {dXr6,dXi6},[pSrc@128],pointStep          ;//  data[6]
-                
-                ;// calculate a*v5 
-                VQRDMULH    dT1,dVr5,dT0[0]                         ;// use dVi0 for dT1
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// use dVi0 for dT1
                 VQRDMULH    dVi5,dVi5,dT0[0]
 
                 VHADD    dYr7,dVr3,dVr7
                 VHADD    dYi7,dVi3,dVi7
-                SUB     pDst, pDst, step2                           ;// set pDst to y1
-            
-                VSUB    dVr5,dT1,dVi5                               ;// a * V5
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
                 VADD    dVi5,dT1,dVi5
-                VLD2    {dXr7,dXi7},[pSrc@128],setStep            ;//  data[7]            
-                
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
                 VHSUB    qY5,qV1,qV5
-                
+
                 VHSUB    dYr3,dVr3,dVr7
-                VST2    {dYr7,dYi7},[pDst@128],step1                    ;// store y1
+                VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1
                 VHSUB    dYi3,dVi3,dVi7
                 VHADD    qY1,qV1,qV5
-                
-                
-                VST2    {dYr5,dYi5},[pDst@128],step1                    ;// store y3
-                VST2    {dYr3,dYi3},[pDst@128],step1                    ;// store y5
-                VST2    {dYr1,dYi1},[pDst@128],#16                      ;// store y7
 
-            
-            ENDIF
-            
-            
-           
-        ELSE
-            ;// finish first stage of 8 point FFT 
-            
+
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr1,dYi1},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7
+#endif
+
+            .ENDIF
+
+
+
+        .ELSE
+            @// finish first stage of 8 point FFT
+
             VADD    qU0,qX0,qX4
             VADD    qU2,qX1,qX5
             VADD    qU4,qX2,qX6
             VADD    qU6,qX3,qX7
-            
-            ;// finish second stage of 8 point FFT 
-            
+
+            @// finish second stage of 8 point FFT
+
             VADD    qV0,qU0,qU4
             VSUB    qV2,qU0,qU4
             VADD    qV4,qU2,qU6
             VSUB    qV6,qU2,qU6
-            
-            ;// finish third stage of 8 point FFT 
-            
+
+            @// finish third stage of 8 point FFT
+
             VADD    qY0,qV0,qV4
             VSUB    qY4,qV0,qV4
-            VST2    {dYr0,dYi0},[pDst@128],step1                    ;// store y0
-            
-            IF  $inverse
-                
-                VSUB    dYr2,dVr2,dVi6
-                VADD    dYi2,dVi2,dVr6
-                
-                VADD    dYr6,dVr2,dVi6
-                VST2    {dYr2,dYi2},[pDst@128],step1                    ;// store y2
-                VSUB    dYi6,dVi2,dVr6
-            
-                VSUB    qU1,qX0,qX4                    
-                VST2    {dYr4,dYi4},[pDst@128],step1                    ;// store y4
-            
-                VSUB    qU3,qX1,qX5
-                VSUB    qU5,qX2,qX6
-                VST2    {dYr6,dYi6},[pDst@128],step1                    ;// store y6
-            
-            ELSE
-            
-                VADD    dYr6,dVr2,dVi6
-                VSUB    dYi6,dVi2,dVr6
-                
-                VSUB    dYr2,dVr2,dVi6
-                VST2    {dYr6,dYi6},[pDst@128],step1                    ;// store y2
-                VADD    dYi2,dVi2,dVr6
-                
-                                
-                VSUB    qU1,qX0,qX4
-                VST2    {dYr4,dYi4},[pDst@128],step1                    ;// store y4
-                VSUB    qU3,qX1,qX5
-                VSUB    qU5,qX2,qX6
-                VST2    {dYr2,dYi2},[pDst@128],step1                    ;// store y6
+            VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
 
-            
-            ENDIF
-            
-            ;// finish first stage of 8 point FFT 
-            
+            .ifeqs  "\inverse", "TRUE"
+
+                VSUB    dYr2,dVr2,dVi6
+                VADD    dYi2,dVi2,dVr6
+
+                VADD    dYr6,dVr2,dVi6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
+                VSUB    dYi6,dVi2,dVr6
+
+                VSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+
+                VSUB    qU3,qX1,qX5
+                VSUB    qU5,qX2,qX6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
+
+            .ELSE
+
+                VADD    dYr6,dVr2,dVi6
+                VSUB    dYi6,dVi2,dVr6
+
+                VSUB    dYr2,dVr2,dVi6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2
+                VADD    dYi2,dVi2,dVr6
+
+
+                VSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+                VSUB    qU3,qX1,qX5
+                VSUB    qU5,qX2,qX6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6
+
+
+            .ENDIF
+
+            @// finish first stage of 8 point FFT
+
             VSUB    qU7,qX3,qX7
-            VMOV    dT0[0],t0                                   
-            
-            ;// finish second stage of 8 point FFT 
-            
+            VMOV    dT0[0],t0
+
+            @// finish second stage of 8 point FFT
+
             VSUB    dVr1,dUr1,dUi5
-            VLD2    {dXr0,dXi0},[pSrc@128],pointStep          ;//  data[0] for next iteration
+            VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
             VADD    dVi1,dUi1,dUr5
             VADD    dVr3,dUr1,dUi5
-            VLD2    {dXr1,dXi1},[pSrc@128],pointStep          ;//  data[1]
+            VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
             VSUB    dVi3,dUi1,dUr5
-                        
+
             VSUB    dVr5,dUr3,dUi7
-            VLD2    {dXr2,dXi2},[pSrc@128],pointStep          ;//  data[2]
+            VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
             VADD    dVi5,dUi3,dUr7
             VADD    dVr7,dUr3,dUi7
-            VLD2    {dXr3,dXi3},[pSrc@128],pointStep          ;//  data[3]
+            VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
             VSUB    dVi7,dUi3,dUr7
-            
-            ;// finish third stage of 8 point FFT 
-            
-            IF  $inverse
-            
-                ;// calculate a*v5 
-                VQRDMULH    dT1,dVr5,dT0[0]                         ;// use dVi0 for dT1
-                VLD2    {dXr4,dXi4},[pSrc@128],pointStep          ;//  data[4]
+
+            @// finish third stage of 8 point FFT
+
+            .ifeqs  "\inverse", "TRUE"
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// use dVi0 for dT1
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
                 VQRDMULH    dVi5,dVi5,dT0[0]
-                            
-                VLD2    {dXr5,dXi5},[pSrc@128],pointStep          ;//  data[5]
-                VSUB    dVr5,dT1,dVi5                               ;// a * V5
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
                 VADD    dVi5,dT1,dVi5
-                
-                VLD2    {dXr6,dXi6},[pSrc@128],pointStep          ;//  data[6]
-                
-                ;// calculate  b*v7
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate  b*v7
                 VQRDMULH    dT1,dVr7,dT0[0]
                 VQRDMULH    dVi7,dVi7,dT0[0]
-                
+
                 VADD    qY1,qV1,qV5
                 VSUB    qY5,qV1,qV5
-                
-                            
-                VADD    dVr7,dT1,dVi7                               ;// b * V7
+
+
+                VADD    dVr7,dT1,dVi7                               @// b * V7
                 VSUB    dVi7,dVi7,dT1
-                SUB     pDst, pDst, step2                           ;// set pDst to y1
-                
-                VLD2    {dXr7,dXi7},[pSrc@128],setStep            ;//  data[7]            
-                
-                
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
+
                 VSUB    dYr3,dVr3,dVr7
                 VSUB    dYi3,dVi3,dVi7
-                VST2    {dYr1,dYi1},[pDst@128],step1                    ;// store y1
+                VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
                 VADD    dYr7,dVr3,dVr7
                 VADD    dYi7,dVi3,dVi7
 
-                
-                VST2    {dYr3,dYi3},[pDst@128],step1                    ;// store y3
-                VST2    {dYr5,dYi5},[pDst@128],step1                    ;// store y5
-                VST2    {dYr7,dYi7},[pDst@128],#16                      ;// store y7
-            ELSE
-            
-                ;// calculate  b*v7
+
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr7,dYi7},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7
+#endif
+            .ELSE
+
+                @// calculate  b*v7
                 VQRDMULH    dT1,dVr7,dT0[0]
-                VLD2    {dXr4,dXi4},[pSrc@128],pointStep          ;//  data[4]
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
                 VQRDMULH    dVi7,dVi7,dT0[0]
-                
-                VLD2    {dXr5,dXi5},[pSrc@128],pointStep          ;//  data[5]
-                VADD    dVr7,dT1,dVi7                               ;// b * V7
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VADD    dVr7,dT1,dVi7                               @// b * V7
                 VSUB    dVi7,dVi7,dT1
-                
-                VLD2    {dXr6,dXi6},[pSrc@128],pointStep          ;//  data[6]
-                
-                ;// calculate a*v5 
-                VQRDMULH    dT1,dVr5,dT0[0]                         ;// use dVi0 for dT1
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// use dVi0 for dT1
                 VQRDMULH    dVi5,dVi5,dT0[0]
 
                 VADD    dYr7,dVr3,dVr7
                 VADD    dYi7,dVi3,dVi7
-                SUB     pDst, pDst, step2                           ;// set pDst to y1
-            
-                VSUB    dVr5,dT1,dVi5                               ;// a * V5
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
                 VADD    dVi5,dT1,dVi5
-                VLD2    {dXr7,dXi7},[pSrc@128],setStep            ;//  data[7]            
-                
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
                 VSUB    qY5,qV1,qV5
-                
+
                 VSUB    dYr3,dVr3,dVr7
-                VST2    {dYr7,dYi7},[pDst@128],step1                    ;// store y1
+                VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1
                 VSUB    dYi3,dVi3,dVi7
                 VADD    qY1,qV1,qV5
-                
-                
-                VST2    {dYr5,dYi5},[pDst@128],step1                    ;// store y3
-                VST2    {dYr3,dYi3},[pDst@128],step1                    ;// store y5
-                VST2    {dYr1,dYi1},[pDst@128],#16                      ;// store y7
 
-            
-            ENDIF
-            
-            
-        ENDIF
-        
-        SUB     pDst, pDst, step2                               ;// update pDst for the next set
-        BGT     grpZeroSetLoop$name
-        
-        
-        ;// reset pSrc to pDst for the next stage
-        SUB     pSrc,pDst,pointStep                             ;// pDst -= 2*grpSize  
-        MOV     pDst,pPingPongBuf 
-        
-        
-        
-        MEND
-        
 
-        ;// Allocate stack memory required by the function
-        
-        
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr1,dYi1},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7
+#endif
+
+            .ENDIF
+
+
+        .ENDIF
+
+        SUB     pDst, pDst, step2                               @// update pDst for the next set
+        BGT     grpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                             @// pDst -= 2*grpSize
+        MOV     pDst,pPingPongBuf
+
+
+
+        .endm
+
+
+        @// Allocate stack memory required by the function
+
+
         M_START armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
-            FFTSTAGE {FALSE},{FALSE},FWD
+            FFTSTAGE "FALSE","FALSE",FWD
         M_END
 
-        
+
         M_START armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
-            FFTSTAGE {FALSE},{TRUE},INV
+            FFTSTAGE "FALSE","TRUE",INV
         M_END
- 
-        
+
+
         M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
-            FFTSTAGE {TRUE},{FALSE},FWDSFS
+            FFTSTAGE "TRUE","FALSE",FWDSFS
         M_END
 
-        
+
         M_START armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
-            FFTSTAGE {TRUE},{TRUE},INVSFS
+            FFTSTAGE "TRUE","TRUE",INVSFS
         M_END
 
-    
-    ENDIF                                                           ;//CortexA8
-        
-    
-     
-    END
\ No newline at end of file
+
+
+
+
+    .END
diff --git a/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S b/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
index 399037c..ca15c6b 100644
--- a/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
+++ b/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
@@ -1,353 +1,356 @@
-;//
-;// 
-;// File Name:  omxSP_FFTFwd_CToC_SC16_Sfs_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision:   6729
-;// Last Modified Date:       Tue, 17 Jul 2007
-;// 
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;// 
-;// 
-;//
-;// Description:
-;// Compute an inverse FFT for a complex signal
-;// 
-;// 
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
 
-        
-;// Include standard headers
+@//
+@//
+@// File Name:  omxSP_FFTFwd_CToC_SC16_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6729
+@// Last Modified Date:       Tue, 17 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a forward FFT for a complex signal
+@//
+@//
 
-        INCLUDE omxtypes_s.h
-        INCLUDE armCOMM_s.h
-        
-        M_VARIANTS CortexA8
-        
-;// Import symbols required from other files
-;// (For example tables)
-        
-        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe 
-        IMPORT  armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
-        IMPORT  armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
-        IMPORT  armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
-        IMPORT  armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe 
-        IMPORT  armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
-        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
-        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe  
-        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe 
-        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
-        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-        IMPORT  armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe  
-        IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe 
-        IMPORT  armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe 
-        
-;// Set debugging level        
-;//DEBUG_ON    SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
 
 
 
-;// Guarding implementation by the processor name
-    
-    
-
-;// Guarding implementation by the processor name
-    
-    IF  CortexA8
-    
-    IMPORT  armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe 
-    IMPORT  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe 
-    
-;//Input Registers
-
-pSrc            RN  0
-pDst            RN  1
-pFFTSpec        RN  2
-scale           RN  3
-
-
-;// Output registers
-result          RN  0
-
-;//Local Scratch Registers
-
-argTwiddle      RN  1
-argDst          RN  2
-argScale        RN  4
-pTwiddle        RN  4
-tmpOrder        RN  4
-pOut            RN  5
-subFFTSize      RN  7     
-subFFTNum       RN  6
-N               RN  6
-order           RN  14
-diff            RN  9
-count           RN  8                   ;// Total num of radix stages required to comple the FFT
-x0r             RN  4    
-x0i             RN  5
-diffMinusOne    RN  2
-round           RN  3
-
-;// Neon registers
-
-dX0             DN  D0.S16
-dShift          DN  D1.S16
-dX0S32          DN  D0.S32
+@// Guarding implementation by the processor name
 
 
 
-    ;// Allocate stack memory required by the function
+@// Radix-2 "ps" stage kernels (formerly imported under the CortexA8 guard)
+
+
+    .extern  armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+    .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec                r2
+#define scale           r3
+
+
+@// Output registers
+#define result          r0
+
+@//Local Scratch Registers
+#define argTwiddle              r1
+#define argDst          r2
+#define argScale                r4
+#define pTwiddle                r4
+#define tmpOrder                r4
+#define pOut            r5
+#define subFFTSize              r7
+#define subFFTNum               r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total number of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne            r2
+#define round           r3
+
+@// Neon registers
+
+#define dX0     D0.S16
+#define dShift  D1.S16
+#define dX0S32  D0.S32
+
+
+
+    @// Allocate stack memory required by the function
         M_ALLOC4        diffOnStack, 4
 
-    ;// Write function header
+    @// Write function header
         M_START     omxSP_FFTFwd_CToC_SC16_Sfs,r11,d15
-        
-        M_STRUCT     ARMsFFTSpec
-        M_FIELD      N, 4
-        M_FIELD      pBitRev, 4
-        M_FIELD      pTwiddle, 4
-        M_FIELD      pBuf, 4
-        M_ENDSTRUCT
-        
-        ;// Define stack arguments
-        
-        ;// Read the size from structure and take log
+
+@ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+
+        @// Read the size from structure and take log
         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
-        
-        ;// Read other structure parameters
+
+        @// Read other structure parameters
         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
-                
-        CLZ     order,N                             ;// N = 2^order 
-        RSB     order,order,#31     
+
+        CLZ     order,N                             @// N = 2^order
+        RSB     order,order,#31
         MOV     subFFTSize,#1
-        ;//MOV     subFFTNum,N
-        
+        @//MOV     subFFTNum,N
+
         CMP     order,#3
-        BGT     orderGreaterthan3                   ;// order > 3
-        
+        BGT     orderGreaterthan3                   @// order > 3
+
         CMP     order,#1
-        BGE     orderGreaterthan0                   ;// order > 0
-        M_STR   scale, diffOnStack,LT               ;// order = 0
+        BGE     orderGreaterthan0                   @// order > 0
+        M_STR   scale, diffOnStack,LT               @// order = 0
         LDRLT   x0r,[pSrc]
         STRLT   x0r,[pDst]
         MOVLT   pSrc,pDst
         BLT     FFTEnd
-        
-orderGreaterthan0
-        ;// set the buffers appropriately for various orders
+
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
         CMP     order,#2
-        MOVNE   argDst,pDst        
+        MOVNE   argDst,pDst
         MOVEQ   argDst,pOut
-        MOVEQ   pOut,pDst                           ;// Pass the first stage destination in RN5
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
         MOV     argTwiddle,pTwiddle
-        
+
         SUBS     diff,scale,order
         M_STR   diff,diffOnStack
         MOVGT   scale,order
-        ;// Now scale <= order
-        
+        @// Now scale <= order
+
         CMP     order,#1
         BGT     orderGreaterthan1
         SUBS    scale,scale,#1
-        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  ;// order = 1
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe      ;// order = 1
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  @// order = 1
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe      @// order = 1
         B       FFTEnd
 
-orderGreaterthan1
+orderGreaterthan1:
         CMP     order,#2
         MOV     argScale,scale
         BGT     orderGreaterthan2
         SUBS    argScale,argScale,#1
-        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe      ;// order =2          
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe  
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe      @// order =2
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
         SUBS    argScale,argScale,#1
-        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe  
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe  
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
         B       FFTEnd
-        
-orderGreaterthan2                                                               ;// order =3        
+
+orderGreaterthan2:                                                                     @// order =3
         SUBS    argScale,argScale,#1
-        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe      
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe  
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
         SUBS    argScale,argScale,#1
-        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe  
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
         BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
         SUBS    argScale,argScale,#1
-        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe      
-        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe    
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
         B       FFTEnd
-        
 
-orderGreaterthan3       
-        ;// check scale = 0 or scale = order
-        SUBS    diff, scale, order                 ;// scale > order 
-        MOVGT   scale,order     
-        BGE     specialScaleCase                   ;// scale = 0 or scale = order 
+
+orderGreaterthan3:
+        @// check scale = 0 or scale = order
+        SUBS    diff, scale, order                 @// scale > order
+        MOVGT   scale,order
+        BGE     specialScaleCase                   @// scale = 0 or scale = order
         CMP     scale,#0
         BEQ     specialScaleCase
         B       generalScaleCase
-        
-specialScaleCase                                    ;//  scale = 0 or scale = order  and order > 3     
-        
-        TST     order, #2                           ;// Set input args to fft stages
-        MOVNE   argDst,pDst        
+
+specialScaleCase:                                           @//  scale = 0 or scale = order  and order > 3
+
+        TST     order, #2                           @// Set input args to fft stages
+        MOVNE   argDst,pDst
         MOVEQ   argDst,pOut
-        MOVEQ   pOut,pDst                           ;// Pass the first stage destination in RN5
-        MOV     argTwiddle,pTwiddle  
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
 
         CMP      diff,#0
         M_STR    diff, diffOnStack
-        BGE      scaleEqualsOrder  
-       
-        ;//check for even or odd order
-        ;// NOTE: The following combination of BL's would work fine eventhough the first
-        ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
-        ;// armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
-        
+        BGE      scaleEqualsOrder
+
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine even though the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+        @// armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
         TST     order,#0x00000001
-        BLEQ    armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe 
-        BLNE    armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe 
-        
+        BLEQ    armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
         CMP        subFFTNum,#4
         BLT     FFTEnd
 
-unscaledRadix4Loop
+unscaledRadix4Loop:
         BEQ        lastStageUnscaledRadix4
         BL        armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
          CMP        subFFTNum,#4
          B        unscaledRadix4Loop
 
-lastStageUnscaledRadix4
-        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
-        B        FFTEnd         
+lastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
 
-scaleEqualsOrder         
-        ;//check for even or odd order
-        ;// NOTE: The following combination of BL's would work fine eventhough the first
-        ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
-        ;// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
-                
+scaleEqualsOrder:
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine even though the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+        @// armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
         TST     order,#0x00000001
-        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe 
-        BLNE    armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe 
-        
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
         CMP        subFFTNum,#4
         BLT     FFTEnd
 
-scaledRadix4Loop
+scaledRadix4Loop:
         BEQ        lastStageScaledRadix4
         BL        armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
          CMP        subFFTNum,#4
          B        scaledRadix4Loop
-         
-lastStageScaledRadix4
-        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe 
-        B        FFTEnd                    
-         
-        
-        
-generalScaleCase                                        ;// 0 < scale < order and order > 3
-        ;// Determine the correct destination buffer
+
+lastStageScaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+
+
+generalScaleCase:                                               @// 0 < scale < order and order > 3
+        @// Determine the correct destination buffer
         SUB     diff,order,scale
         TST     diff,#0x01
-        ADDEQ   count,scale,diff,LSR #1         ;// count = scale + (order - scale)/2
+        ADDEQ   count,scale,diff,LSR #1         @// count = scale + (order - scale)/2
         MOVNE   count,order
-        TST     count,#0x01                     ;// Is count even or odd ?
-        
-        MOVNE   argDst,pDst                     ;// Set input args to fft stages
+        TST     count,#0x01                     @// Is count even or odd ?
+
+        MOVNE   argDst,pDst                     @// Set input args to fft stages
         MOVEQ   argDst,pOut
-        MOVEQ   pOut,pDst                       ;// Pass the first stage destination in RN5
-        MOV     argTwiddle,pTwiddle  
+        MOVEQ   pOut,pDst                       @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
 
         CMP     diff,#1
-        M_STR   diff, diffOnStack    
-        BEQ     scaleps                         ;// scaling including a radix2_ps stage
-        
-        MOV     argScale,scale                  ;// Put scale in RN4 so as to save and restore
-        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     ;// scaled first stage
+        M_STR   diff, diffOnStack
+        BEQ     scaleps                         @// scaling including a radix2_ps stage
+
+        MOV     argScale,scale                  @// Put scale in RN4 so as to save and restore
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
         SUBS    argScale,argScale,#1
-        
-scaledRadix2Loop        
+
+scaledRadix2Loop:
         BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1            ;// save and restore scale (RN4) in the scaled stages
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
         BGT     scaledRadix2Loop
         B       outScale
 
-scaleps
-        SUB     argScale,scale,#1                   ;// order>3 and diff=1 => scale >= 3
-        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     ;// scaled first stage
+scaleps:
+        SUB     argScale,scale,#1                   @// order>3 and diff=1 => scale >= 3
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
         SUBS    argScale,argScale,#1
-        
-scaledRadix2psLoop
-        BEQ     scaledRadix2psStage        
+
+scaledRadix2psLoop:
+        BEQ     scaledRadix2psStage
         BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1            ;// save and restore scale (RN4) in the scaled stages
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
         BGE     scaledRadix2psLoop
 
-scaledRadix2psStage
+scaledRadix2psStage:
         BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
-        B       generalLastStageUnscaledRadix2         
-        
-        
-outScale        
-        M_LDR   diff, diffOnStack  
-        ;//check for even or odd order
+        B       generalLastStageUnscaledRadix2
+
+
+outScale:
+        M_LDR   diff, diffOnStack
+        @//check for even or odd order
         TST     diff,#0x00000001
         BEQ     generalUnscaledRadix4Loop
         B       unscaledRadix2Loop
 
-generalUnscaledRadix4Loop
+generalUnscaledRadix4Loop:
         CMP        subFFTNum,#4
          BEQ        generalLastStageUnscaledRadix4
          BL        armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
          B        generalUnscaledRadix4Loop
-         
-generalLastStageUnscaledRadix4
-        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
-        B        End              
 
-unscaledRadix2Loop
+generalLastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B        End
+
+unscaledRadix2Loop:
         CMP        subFFTNum,#4
          BEQ        generalLastTwoStagesUnscaledRadix2
          BL        armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
-         B        unscaledRadix2Loop        
+         B        unscaledRadix2Loop
 
-generalLastTwoStagesUnscaledRadix2
+generalLastTwoStagesUnscaledRadix2:
         BL      armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
-generalLastStageUnscaledRadix2                  
-        BL      armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe 
+generalLastStageUnscaledRadix2:
+        BL      armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
         B        End
 
 
-FFTEnd                                              ;// Does only the scaling
-        
-        M_LDR   diff, diffOnStack  
+FFTEnd:                                               @// Does only the scaling
+
+        M_LDR   diff, diffOnStack
         CMP     diff,#0
         BLE     End
-        
-        RSB     diff,diff,#0                        ;// to use VRSHL for right shift by a variable
-        VDUP    dShift,diff     
-        
-scaleFFTData                                        ;// N = subFFTSize  ; dataptr = pDst  ; scale = diff
-        VLD1    {dX0S32[0]},[pSrc]                        ;// pSrc contains pDst pointer
+
+        RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
+        VDUP    dShift,diff
+
+scaleFFTData:                                           @// N = subFFTSize  ; dataptr = pDst  ; scale = diff
+        VLD1    {dX0S32[0]},[pSrc]                        @// pSrc contains pDst pointer
         SUBS    subFFTSize,subFFTSize,#1
         VRSHL   dX0,dShift
         VST1    {dX0S32[0]},[pSrc]!
-                
+
         BGT     scaleFFTData
-        
-                
-       
-End                        
-        ;// Set return value
-        MOV     result, #OMX_Sts_NoErr       
 
-        ;// Write function tail
+
+
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
         M_END
-        
-    ENDIF                                           ;//CortexA8    
 
-    
-    
-    
-    
-    END
\ No newline at end of file
+    .END
diff --git a/dl/sp/src/omxSP_FFTInit_C_SC16.c b/dl/sp/src/omxSP_FFTInit_C_SC16.c
index 342fc0c..fdab9b0 100644
--- a/dl/sp/src/omxSP_FFTInit_C_SC16.c
+++ b/dl/sp/src/omxSP_FFTInit_C_SC16.c
@@ -12,15 +12,15 @@
  */
 
 /**
- * 
+ *
  * File Name:  omxSP_FFTInit_C_SC16.c
  * OpenMAX DL: v1.0.2
  * Last Modified Revision:   15322
  * Last Modified Date:       Wed, 15 Oct 2008
- * 
+ *
  * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
- * 
- * 
+ *
+ *
  * Description:
  * Initializes the specification structures required
  */
@@ -47,9 +47,9 @@
  * *pFFTSpec, in bytes, can be determined using <FFTGetBufSize_C_SC16>.
  *
  * Parameters:
- * [in]  order       	base-2 logarithm of the desired block length;
- *				valid in the range [0,12].
- * [out] pFFTSpec		pointer to initialized specification structure.
+ * [in]  order          base-2 logarithm of the desired block length;
+ *                              valid in the range [0,12].
+ * [out] pFFTSpec               pointer to initialized specification structure.
  *
  * Return Value:
  * Standard omxError result. See enumeration for possible result codes.
@@ -69,8 +69,8 @@
     ARMsFFTSpec_SC16 *pFFTStruct = 0;
     OMX_S16     x,y,xNeg;
     OMX_S32     xS32,yS32;
-            
-    
+
+
     pFFTStruct = (ARMsFFTSpec_SC16 *) pFFTSpec;
 
     /* if order zero no init is needed */
@@ -84,51 +84,51 @@
     Nby2 = 1 << (order - 1);
     N = Nby2 << 1;
     M = N>>3;
-        
-    pBitRev = NULL ;  
-    
-    pTwiddle = (OMX_SC16 *) 
+
+    pBitRev = NULL ;
+
+    pTwiddle = (OMX_SC16 *)
         (sizeof(ARMsFFTSpec_SC16) + (OMX_S8*) pFFTSpec);
-    
+
     /* Align to 32 byte boundary */
     pTmp = ((OMX_U32)pTwiddle)&31;              /* (OMX_U32)pTwiddle % 32 */
     if(pTmp != 0)
-        pTwiddle = (OMX_SC16*) ((OMX_S8*)pTwiddle + (32-pTmp));        
-    
-    pBuf = (OMX_SC16 *)        
+        pTwiddle = (OMX_SC16*) ((OMX_S8*)pTwiddle + (32-pTmp));
+
+    pBuf = (OMX_SC16 *)
         (sizeof(OMX_SC16) * (3*N/4) + (OMX_S8*) pTwiddle);
-    
+
     /* Align to 32 byte boundary */
     pTmp = ((OMX_U32)pBuf)&31;                 /* (OMX_U32)pBuf % 32 */
     if(pTmp != 0)
-        pBuf = (OMX_SC16*) ((OMX_S8*)pBuf + (32-pTmp));            
+        pBuf = (OMX_SC16*) ((OMX_S8*)pBuf + (32-pTmp));
 
-    
 
-    /* 
-     * Filling Twiddle factors : 
+
+    /*
+     * Filling Twiddle factors :
      * The original twiddle table "armSP_FFT_S16TwiddleTable" is of size (MaxSize/8 + 1)
      * Rest of the values i.e., upto MaxSize are calculated using the symmetries of sin and cos
      * The max size of the twiddle table needed is 3N/4 for a radix-4 stage
      *
-     * W = (-2 * PI) / N 
+     * W = (-2 * PI) / N
      * N = 1 << order
      * W = -PI >> (order - 1)
      */
-     
-    
-   
+
+
+
     diff = 12 - order;
     step = 1<<diff;             /* step into the twiddle table for the current order */
-    
+
     xS32 = armSP_FFT_S32TwiddleTable[0];
     yS32 = armSP_FFT_S32TwiddleTable[1];
     x = (xS32+0x8000)>>16;
     y = (yS32+0x8000)>>16;
 
     xNeg = 0x7FFF;
-    
-    if(order >=3)    
+
+    if(order >=3)
     {
             /* i = 0 case */
             pTwiddle[0].Re = x;
@@ -137,17 +137,17 @@
             pTwiddle[2*M].Im = xNeg;
             pTwiddle[4*M].Re = xNeg;
             pTwiddle[4*M].Im = y;
-            
-    
+
+
         for (i=1; i<=M; i++)
           {
             j = i*step;
-            
+
             xS32 = armSP_FFT_S32TwiddleTable[2*j];
             yS32 = armSP_FFT_S32TwiddleTable[2*j+1];
             x = (xS32+0x8000)>>16;
             y = (yS32+0x8000)>>16;
-            
+
             pTwiddle[i].Re = x;
             pTwiddle[i].Im = y;
             pTwiddle[2*M-i].Re = -y;
@@ -161,8 +161,8 @@
             pTwiddle[6*M-i].Re = y;
             pTwiddle[6*M-i].Im = x;
         }
-        
-     
+
+
     }
     else
     {
@@ -174,19 +174,19 @@
             pTwiddle[1].Im = xNeg;
             pTwiddle[2].Re = xNeg;
             pTwiddle[2].Im = y;
-        
+
         }
         if (order == 1)
         {
             pTwiddle[0].Re = x;
             pTwiddle[0].Im = y;
-        
-        }        
-        
-    
+
+        }
+
+
     }
-    
-       
+
+
     /* Update the structure */
     pFFTStruct->N = N;
     pFFTStruct->pTwiddle = pTwiddle;
diff --git a/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S b/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
index f1a8d03..ff85e2b 100644
--- a/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
+++ b/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
@@ -1,334 +1,342 @@
-;//
-;// 
-;// File Name:  omxSP_FFTInv_CToC_SC16_Sfs_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision:   6729
-;// Last Modified Date:       Tue, 17 Jul 2007
-;// 
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;// 
-;// 
-;//
-;// Description:
-;// Compute an inverse FFT for a complex signal
-;// 
-;// 
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
 
-        
-;// Include standard headers
+@//
+@//
+@// File Name:  omxSP_FFTInv_CToC_SC16_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6729
+@// Last Modified Date:       Tue, 17 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@//
+@//
 
-        INCLUDE omxtypes_s.h
-        INCLUDE armCOMM_s.h
-        
-        M_VARIANTS CortexA8
-        
-;// Import symbols required from other files
-;// (For example tables)
-        
-        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe 
-        IMPORT  armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
-        IMPORT  armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
-        IMPORT  armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
-        IMPORT  armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe 
-        IMPORT  armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
-        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
-        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe  
-        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe 
-        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
-        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-        IMPORT  armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe  
-        IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe 
-        IMPORT  armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe 
-        
-;// Set debugging level        
-;//DEBUG_ON    SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
 
 
 
-;// Guarding implementation by the processor name
-    
-    
-
-;// Guarding implementation by the processor name
-    
-    IF  CortexA8 
-    
-    IMPORT  armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe 
-    IMPORT  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe 
-    
-;//Input Registers
-
-pSrc            RN  0
-pDst            RN  1
-pFFTSpec        RN  2
-scale           RN  3
+@// Guarding implementation by the processor name
 
 
-;// Output registers
-result          RN  0
 
-;//Local Scratch Registers
-
-argTwiddle      RN  1
-argDst          RN  2
-argScale        RN  4
-pTwiddle        RN  4
-tmpOrder        RN  4
-pOut            RN  5
-subFFTSize      RN  7     
-subFFTNum       RN  6
-N               RN  6
-order           RN  14
-diff            RN  9
-count           RN  8                   ;// Total num of radix stages required to comple the FFT
-x0r             RN  4    
-x0i             RN  5
-diffMinusOne    RN  2
-round           RN  3
-
-;// Neon registers
-
-dX0             DN  D0.S16
-dShift          DN  D1.S16
-dX0S32          DN  D0.S32
+@// Guarding implementation by the processor name
 
 
-    ;// Allocate stack memory required by the function
+    .extern  armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+    .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@//Input Registers
+
+#define pSrc    r0
+#define pDst    r1
+#define pFFTSpec        r2
+#define scale   r3
+
+
+@// Output registers
+#define result  r0
+
+@//Local Scratch Registers
+
+#define argTwiddle      r1
+#define argDst  r2
+#define argScale        r4
+#define pTwiddle        r4
+#define tmpOrder        r4
+#define pOut    r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N       r6
+#define order   r14
+#define diff    r9
+@// Total num of radix stages required to complete the FFT
+#define count   r8
+#define x0r     r4
+#define x0i     r5
+#define diffMinusOne    r2
+#define round   r3
+
+@// Neon registers
+
+#define dX0  D0.S16
+#define dShift  D1.S16
+#define dX0S32  D0.S32
+
+
+    @// Allocate stack memory required by the function
         M_ALLOC4        diffOnStack, 4
 
-    ;// Write function header
+    @// Write function header
         M_START     omxSP_FFTInv_CToC_SC16_Sfs,r11,d15
-        
-        M_STRUCT     ARMsFFTSpec
-        M_FIELD      N, 4
-        M_FIELD      pBitRev, 4
-        M_FIELD      pTwiddle, 4
-        M_FIELD      pBuf, 4
-        M_ENDSTRUCT
-        
-        ;// Define stack arguments
-        
-        ;// Read the size from structure and take log
+
+@ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+
+        @// Read the size from structure and take log
         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
-        
-        ;// Read other structure parameters
+
+        @// Read other structure parameters
         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
-                
-        CLZ     order,N                             ;// N = 2^order 
-        RSB     order,order,#31     
+
+        CLZ     order,N                             @// N = 2^order
+        RSB     order,order,#31
         MOV     subFFTSize,#1
-        ;//MOV     subFFTNum,N
-        
-        ADD     scale,scale,order                   ;// FFTInverse has a final scaling factor by N
-        
+        @//MOV     subFFTNum,N
+
+        ADD     scale,scale,order                   @// FFTInverse has a final scaling factor by N
+
         CMP     order,#3
-        BGT     orderGreaterthan3                   ;// order > 3
-        
+        BGT     orderGreaterthan3                   @// order > 3
+
         CMP     order,#1
-        BGE     orderGreaterthan0                   ;// order > 0
-        M_STR   scale, diffOnStack,LT               ;// order = 0
+        BGE     orderGreaterthan0                   @// order > 0
+        M_STR   scale, diffOnStack,LT               @// order = 0
         LDRLT   x0r,[pSrc]
         STRLT   x0r,[pDst]
         MOVLT   pSrc,pDst
         BLT     FFTEnd
-        
-orderGreaterthan0
-        ;// set the buffers appropriately for various orders
+
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
         CMP     order,#2
-        MOVNE   argDst,pDst        
+        MOVNE   argDst,pDst
         MOVEQ   argDst,pOut
-        MOVEQ   pOut,pDst                           ;// Pass the first stage destination in RN5
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
         MOV     argTwiddle,pTwiddle
-        ;// Store the scale factor and scale at the end
+        @// Store the scale factor and scale at the end
         SUB     diff,scale,order
         M_STR   diff, diffOnStack
         BGE     orderGreaterthan1
-        BLLT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  ;// order = 1
+        BLLT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  @// order = 1
         B       FFTEnd
-        
-        
-orderGreaterthan1
-        MOV     tmpOrder,order                          ;// tmpOrder = RN 4
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe        
+
+
+orderGreaterthan1:
+        MOV     tmpOrder,order                          @// tmpOrder = RN 4
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
         CMP     tmpOrder,#2
         BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
         BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
         B       FFTEnd
-        
-               
 
 
-orderGreaterthan3       
-        ;// check scale = 0 or scale = order
-        SUBS    diff, scale, order                 ;// scale > order 
-        MOVGT   scale,order     
-        BGE     specialScaleCase                   ;// scale = 0 or scale = order 
+
+
+orderGreaterthan3:
+        @// check scale = 0 or scale = order
+        SUBS    diff, scale, order                 @// scale > order
+        MOVGT   scale,order
+        BGE     specialScaleCase                   @// scale = 0 or scale = order
         CMP     scale,#0
         BEQ     specialScaleCase
         B       generalScaleCase
-        
-specialScaleCase                                    ;//  scale = 0 or scale = order  and order > 3     
-        
-        TST     order, #2                           ;// Set input args to fft stages
-        MOVNE   argDst,pDst        
+
+specialScaleCase:                                           @//  scale = 0 or scale = order  and order > 3
+
+        TST     order, #2                           @// Set input args to fft stages
+        MOVNE   argDst,pDst
         MOVEQ   argDst,pOut
-        MOVEQ   pOut,pDst                           ;// Pass the first stage destination in RN5
-        MOV     argTwiddle,pTwiddle  
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
 
         CMP      diff,#0
         M_STR    diff, diffOnStack
-        BGE      scaleEqualsOrder  
-       
-        ;//check for even or odd order
-        ;// NOTE: The following combination of BL's would work fine eventhough the first
-        ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
-        ;// armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
-        
+        BGE      scaleEqualsOrder
+
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine even though the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+        @// armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
         TST     order,#0x00000001
-        BLEQ    armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe 
-        BLNE    armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe 
-        
+        BLEQ    armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
         CMP        subFFTNum,#4
         BLT     FFTEnd
 
-unscaledRadix4Loop
+unscaledRadix4Loop:
         BEQ        lastStageUnscaledRadix4
         BL        armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
          CMP        subFFTNum,#4
          B        unscaledRadix4Loop
 
-lastStageUnscaledRadix4
-        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
-        B        FFTEnd         
+lastStageUnscaledRadix4:
+        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
 
-scaleEqualsOrder         
-        ;//check for even or odd order
-        ;// NOTE: The following combination of BL's would work fine eventhough the first
-        ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
-        ;// armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
-                
+scaleEqualsOrder:
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine even though the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+        @// armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
         TST     order,#0x00000001
-        BLEQ    armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe 
-        BLNE    armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe 
-        
+        BLEQ    armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
         CMP        subFFTNum,#4
         BLT     FFTEnd
 
-scaledRadix4Loop
+scaledRadix4Loop:
         BEQ        lastStageScaledRadix4
         BL        armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
          CMP        subFFTNum,#4
          B        scaledRadix4Loop
-         
-lastStageScaledRadix4
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe 
-        B        FFTEnd                    
-         
-        
-        
-generalScaleCase                                        ;// 0 < scale < order and order > 3
-        ;// Determine the correct destination buffer
+
+lastStageScaledRadix4:
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+
+
+generalScaleCase:                                        @// 0 < scale < order and order > 3
+        @// Determine the correct destination buffer
         SUB     diff,order,scale
         TST     diff,#0x01
-        ADDEQ   count,scale,diff,LSR #1         ;// count = scale + (order - scale)/2
+        ADDEQ   count,scale,diff,LSR #1         @// count = scale + (order - scale)/2
         MOVNE   count,order
-        TST     count,#0x01                     ;// Is count even or odd ?
-        
-        MOVNE   argDst,pDst                     ;// Set input args to fft stages
+        TST     count,#0x01                     @// Is count even or odd ?
+
+        MOVNE   argDst,pDst                     @// Set input args to fft stages
         MOVEQ   argDst,pOut
-        MOVEQ   pOut,pDst                       ;// Pass the first stage destination in RN5
-        MOV     argTwiddle,pTwiddle  
+        MOVEQ   pOut,pDst                       @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
 
         CMP     diff,#1
-        M_STR   diff, diffOnStack    
-        BEQ     scaleps                         ;// scaling including a radix2_ps stage
-        
-        MOV     argScale,scale                  ;// Put scale in RN4 so as to save and restore
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     ;// scaled first stage
+        M_STR   diff, diffOnStack
+        BEQ     scaleps                         @// scaling including a radix2_ps stage
+
+        MOV     argScale,scale                  @// Put scale in RN4 so as to save and restore
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
         SUBS    argScale,argScale,#1
-        
-scaledRadix2Loop        
+
+scaledRadix2Loop:
         BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1            ;// save and restore scale (RN4) in the scaled stages
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
         BGT     scaledRadix2Loop
         B       outScale
 
-scaleps
-        SUB     argScale,scale,#1                   ;// order>3 and diff=1 => scale >= 3
-        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     ;// scaled first stage
+scaleps:
+        SUB     argScale,scale,#1                   @// order>3 and diff=1 => scale >= 3
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
         SUBS    argScale,argScale,#1
-        
-scaledRadix2psLoop
-        BEQ     scaledRadix2psStage        
+
+scaledRadix2psLoop:
+        BEQ     scaledRadix2psStage
         BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
-        SUBS    argScale,argScale,#1            ;// save and restore scale (RN4) in the scaled stages
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
         BGE     scaledRadix2psLoop
 
-scaledRadix2psStage
+scaledRadix2psStage:
         BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
-        B       generalLastStageUnscaledRadix2         
-        
-        
-outScale        
-        M_LDR   diff, diffOnStack  
-        ;//check for even or odd order
+        B       generalLastStageUnscaledRadix2
+
+
+outScale:
+        M_LDR   diff, diffOnStack
+        @//check for even or odd order
         TST     diff,#0x00000001
         BEQ     generalUnscaledRadix4Loop
         B       unscaledRadix2Loop
 
-generalUnscaledRadix4Loop
+generalUnscaledRadix4Loop:
         CMP        subFFTNum,#4
          BEQ        generalLastStageUnscaledRadix4
          BL        armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
          B        generalUnscaledRadix4Loop
-         
-generalLastStageUnscaledRadix4
-        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe 
-        B        End              
 
-unscaledRadix2Loop
+generalLastStageUnscaledRadix4:
+        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B        End
+
+unscaledRadix2Loop:
         CMP        subFFTNum,#4
          BEQ        generalLastTwoStagesUnscaledRadix2
          BL        armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
-         B        unscaledRadix2Loop        
+         B        unscaledRadix2Loop
 
-generalLastTwoStagesUnscaledRadix2
+generalLastTwoStagesUnscaledRadix2:
         BL      armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
-generalLastStageUnscaledRadix2                  
-        BL      armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe 
+generalLastStageUnscaledRadix2:
+        BL      armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
         B        End
 
 
-FFTEnd                                              ;// Does only the scaling
-        
-        M_LDR   diff, diffOnStack  
+FFTEnd:                                              @// Does only the scaling
+
+        M_LDR   diff, diffOnStack
         CMP     diff,#0
         BLE     End
-        
-        RSB     diff,diff,#0                        ;// to use VRSHL for right shift by a variable
-        VDUP    dShift,diff     
-        
-scaleFFTData                                        ;// N = subFFTSize  ; dataptr = pDst  ; scale = diff
-        VLD1    {dX0S32[0]},[pSrc]                        ;// pSrc contains pDst pointer
+
+        RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
+        VDUP    dShift,diff
+
+scaleFFTData:                                        @// N = subFFTSize  ; dataptr = pDst  ; scale = diff
+        VLD1    {dX0S32[0]},[pSrc]                        @// pSrc contains pDst pointer
         SUBS    subFFTSize,subFFTSize,#1
         VRSHL   dX0,dShift
         VST1    {dX0S32[0]},[pSrc]!
-                
+
         BGT     scaleFFTData
-        
-       
-End                        
-        ;// Set return value
-        MOV     result, #OMX_Sts_NoErr       
 
-        ;// Write function tail
+
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
         M_END
-        
-    ENDIF                                           ;//CortexA8    
 
-    
-    
-    
-    
-    END
\ No newline at end of file
+
+
+
+
+
+    .END
diff --git a/dl/sp/src/test/compare.c b/dl/sp/src/test/compare.c
index 6cf76b4..c2ed0c0 100644
--- a/dl/sp/src/test/compare.c
+++ b/dl/sp/src/test/compare.c
@@ -69,6 +69,45 @@
   snr->complex_snr_ = CalculateSNR(complex_signal_power, complex_noise_power);
 }
 
+void CompareComplex16(struct SnrResult* snr, OMX_SC16* actual,
+                      OMX_SC16* expected, int size) {
+  /* Fills |snr| with real, imaginary and complex SNR of actual vs expected. */
+  double real_signal_power = 0;
+  double imag_signal_power = 0;
+  double complex_signal_power = 0;
+  double real_noise_power = 0;
+  double imag_noise_power = 0;
+  double complex_noise_power = 0;
+  int k;
+
+  for (k = 0; k < size; ++k) {
+    const double ref_re = expected[k].Re;
+    const double ref_im = expected[k].Im;
+    const double err_re = (double) actual[k].Re - expected[k].Re;
+    const double err_im = (double) actual[k].Im - expected[k].Im;
+
+    if (verbose > 255) {
+      printf("%4d: (%10d, %10d) (%10d, %10d)\n", k,
+             actual[k].Re, actual[k].Im,
+             expected[k].Re, expected[k].Im);
+    }
+
+    /* Signal power is taken from the expected (reference) samples. */
+    real_signal_power += ref_re * ref_re;
+    imag_signal_power += ref_im * ref_im;
+    complex_signal_power += ref_re * ref_re + ref_im * ref_im;
+
+    /* Noise power is the squared error between actual and expected. */
+    real_noise_power += err_re * err_re;
+    imag_noise_power += err_im * err_im;
+    complex_noise_power += err_re * err_re + err_im * err_im;
+  }
+
+  snr->real_snr_ = CalculateSNR(real_signal_power, real_noise_power);
+  snr->imag_snr_ = CalculateSNR(imag_signal_power, imag_noise_power);
+  snr->complex_snr_ = CalculateSNR(complex_signal_power, complex_noise_power);
+}
+
 /*
  * Compute the SNR of the actual real signal, returning the SNR.
  */
diff --git a/dl/sp/src/test/compare.h b/dl/sp/src/test/compare.h
index 258c627..348c407 100644
--- a/dl/sp/src/test/compare.h
+++ b/dl/sp/src/test/compare.h
@@ -32,6 +32,8 @@
  */
 void CompareComplex32(struct SnrResult* snr, OMX_SC32* actual,
                       OMX_SC32* expected, int size);
+void CompareComplex16(struct SnrResult* snr, OMX_SC16* actual,
+                      OMX_SC16* expected, int size);
 void CompareReal32(struct SnrResult* snr, OMX_S32* actual,
                    OMX_S32* expected, int size);
 void CompareReal16(struct SnrResult* snr, OMX_S16* actual,
diff --git a/dl/sp/src/test/test_fft.gyp b/dl/sp/src/test/test_fft.gyp
index ee63818..f47bf83 100644
--- a/dl/sp/src/test/test_fft.gyp
+++ b/dl/sp/src/test/test_fft.gyp
@@ -43,6 +43,14 @@
       ],
     },
     {
+      # Test complex fixed-point 16-bit FFT
+      'target_name': 'test_fft16',
+      'type': 'executable',
+      'sources': [
+        'test_fft16.c',
+      ],
+    },
+    {
       # Test complex fixed-point 32-bit FFT
       'target_name': 'test_fft32',
       'type': 'executable',
@@ -95,6 +103,7 @@
       'target_name': 'All',
       'type': 'none',
       'dependencies': [
+        'test_fft16',
         'test_fft32',
         'test_float_fft',
         'test_float_rfft',
diff --git a/dl/sp/src/test/test_fft16.c b/dl/sp/src/test/test_fft16.c
new file mode 100644
index 0000000..081bf23
--- /dev/null
+++ b/dl/sp/src/test/test_fft16.c
@@ -0,0 +1,364 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/src/test/aligned_ptr.h"
+#include "dl/sp/src/test/compare.h"
+#include "dl/sp/src/test/gensig.h"
+#include "dl/sp/src/test/test_util.h"
+
+#define MAX_FFT_ORDER   12  /* Upper FFT order given to SetDefaultOptions() */
+
+int verbose = 0;  /* Verbosity level; main() copies options.verbose_ here */
+int signal_value = 1024;  /* Default; the RunOne*Test() routines set 32767 */
+int scale_factor = 0;  /* Default; RunOneForwardTest() sets it to the order */
+
+struct KnownTestFailures known_failures[] = {  /* passed to RunAllTests() */
+    {11, 0, 1},
+    {11, 0, 2},
+    {11, 0, 3},
+    {12, 0, 1},
+    {12, 0, 2},
+    {12, 0, 3},
+    { 6, 1, 3},
+    { 7, 1, 3},
+    { 8, 1, 3},
+    { 9, 1, 3},
+    {10, 1, 3},
+    {11, 1, 1},
+    {11, 1, 2},
+    {11, 1, 3},
+    {12, 1, 1},
+    {12, 1, 2},
+    {12, 1, 3},
+    /* Marker to terminate array (presumably the -1 first field; confirm) */
+    {-1, 0, 0}
+};
+
+void TestFFT(int fft_log_size, int signal_type, int scale_factor);
+
+int main(int argc, char* argv[]) {
+  struct Options options;
+  /* Run the whole suite in test mode, else one forward/inverse FFT pair. */
+  SetDefaultOptions(&options, 0, MAX_FFT_ORDER);
+
+  options.signal_value_ = signal_value;
+  options.scale_factor_ = scale_factor;
+
+  ProcessCommandLine(&options, argc, argv,
+                     "Test forward and inverse 16-bit fixed-point FFT\n");
+
+  verbose = options.verbose_;
+  signal_value = options.signal_value_;
+  scale_factor = options.scale_factor_;
+
+  if (verbose > 255)
+    DumpOptions(stderr, &options);
+
+  if (options.test_mode_) {
+    struct TestInfo info;
+
+    info.real_only_ = options.real_only_;
+    info.max_fft_order_ = options.max_fft_order_;
+    info.min_fft_order_ = options.min_fft_order_;
+    info.do_forward_tests_ = options.do_forward_tests_;
+    info.do_inverse_tests_ = options.do_inverse_tests_;
+    info.known_failures_ = known_failures;
+    /*
+     * These SNR threshold values critically depend on the
+     * signal_value that is set for the tests!
+     */
+    info.forward_threshold_ = 33.01;
+    info.inverse_threshold_ = 35.59;
+    RunAllTests(&info);
+  } else {
+    TestFFT(options.fft_log_size_,
+            options.signal_type_,
+            options.scale_factor_);
+  }
+  return 0;
+}
+
+void GenerateSignal(OMX_SC16* x, struct ComplexFloat* fft,
+                    struct ComplexFloat* x_true, int size, int sigtype,
+                    int scale_factor) {
+  int idx;
+
+  /* Float reference signal and its FFT, using the global signal_value. */
+  GenerateTestSignalAndFFT(x_true, fft, size, sigtype, signal_value, 0);
+
+  /*
+   * Quantize into |x|: add 0.5, then the integer store drops the fraction.
+   * The scale_factor argument is accepted but never read.
+   */
+  for (idx = 0; idx < size; ++idx) {
+    x[idx].Re = 0.5 + x_true[idx].Re;
+    x[idx].Im = 0.5 + x_true[idx].Im;
+  }
+
+void DumpFFTSpec(OMXFFTSpec_C_SC16* pSpec) {
+  ARMsFFTSpec_SC16* p = (ARMsFFTSpec_SC16*) pSpec;  /* ARM-internal view */
+  printf(" N = %d\n", p->N);
+  printf(" pBitRev  = %p\n", (void*) p->pBitRev);  /* %p requires void* */
+  printf(" pTwiddle = %p\n", (void*) p->pTwiddle);
+  printf(" pBuf     = %p\n", (void*) p->pBuf);
+}
+
+void TestFFT(int fft_log_size, int signal_type, int scale_factor) {
+  struct SnrResult snr;
+  /* Run one forward and one inverse test; scale_factor is currently unused. */
+  RunOneForwardTest(fft_log_size, signal_type, signal_value, &snr);
+  printf("Forward 16-bit fixed-point FFT\n");
+  printf("SNR:  real part    %f dB\n", snr.real_snr_);
+  printf("      imag part    %f dB\n", snr.imag_snr_);
+  printf("      complex part %f dB\n", snr.complex_snr_);
+
+  RunOneInverseTest(fft_log_size, signal_type, signal_value, &snr);
+  printf("Inverse 16-bit fixed-point FFT\n");
+  printf("SNR:  real part    %f dB\n", snr.real_snr_);
+  printf("      imag part    %f dB\n", snr.imag_snr_);
+  printf("      complex part %f dB\n", snr.complex_snr_);
+}
+
+
+float RunOneForwardTest(int fft_log_size, int signal_type,
+                        float unused_signal_value,
+                        struct SnrResult* snr) {
+  /* Forward-FFT a generated signal and report its SNR vs the reference. */
+  OMX_SC16* x;
+  OMX_SC16* y;
+  OMX_SC16* y_scaled;
+  struct AlignedPtr* x_aligned;
+  struct AlignedPtr* y_aligned;
+  struct ComplexFloat* x_true;
+  struct ComplexFloat* y_true;
+  OMX_INT n, fft_spec_buffer_size;
+  OMXResult status;
+  OMXFFTSpec_C_SC16 * fft_fwd_spec = NULL;
+  int fft_size;
+
+  /*
+   * Use (close to) the full 16-bit range for accuracy and scale the forward
+   * FFT by its order (found by experimentation); both overwrite the globals.
+   */
+  signal_value = 32767;
+  scale_factor = fft_log_size;
+  fft_size = 1 << fft_log_size;
+
+  status = omxSP_FFTGetBufSize_C_SC16(fft_log_size, &fft_spec_buffer_size);
+  if (verbose > 63) {
+    printf("bufSize = %d\n", fft_spec_buffer_size);
+  }
+
+  fft_fwd_spec = (OMXFFTSpec_C_SC16*) malloc(fft_spec_buffer_size);
+  status = omxSP_FFTInit_C_SC16(fft_fwd_spec, fft_log_size);
+  if (status) {
+    fprintf(stderr, "Failed to init forward FFT:  status = %d\n", status);
+    exit(1);
+  }
+
+  x_aligned = AllocAlignedPointer(32, sizeof(*x) * fft_size);
+  y_aligned = AllocAlignedPointer(32, sizeof(*y) * (fft_size + 2));
+  x = x_aligned->aligned_pointer_;
+  y = y_aligned->aligned_pointer_;
+  x_true = (struct ComplexFloat*) malloc(sizeof(*x_true) * fft_size);
+  y_true = (struct ComplexFloat*) malloc(sizeof(*y_true) * fft_size);
+  y_scaled = (OMX_SC16*) malloc(sizeof(*y_scaled) * fft_size);
+
+  GenerateSignal(x, y_true, x_true, fft_size, signal_type, scale_factor);
+
+  {
+    /* Expected output: the reference FFT divided by 2^order, quantized. */
+    float scale = pow(2.0, fft_log_size);
+
+    for (n = 0; n < fft_size; ++n) {
+      y_scaled[n].Re = 0.5 + y_true[n].Re / scale;
+      y_scaled[n].Im = 0.5 + y_true[n].Im / scale;
+    }
+  }
+
+  if (verbose > 63) {
+    printf("Signal\n");
+    DumpArrayComplex16("x", fft_size, x);
+    printf("Expected FFT output\n");
+    DumpArrayComplex16("y", fft_size, y_scaled);
+  }
+
+  status = omxSP_FFTFwd_CToC_SC16_Sfs(x, y, fft_fwd_spec, scale_factor);
+  if (status) {
+    fprintf(stderr, "Forward FFT failed: status = %d\n", status);
+    exit(1);
+  }
+
+  if (verbose > 63) {
+    printf("FFT Output\n");
+    DumpArrayComplex16("y", fft_size, y);
+  }
+
+  CompareComplex16(snr, y, y_scaled, fft_size);
+
+  /*
+   * Release the malloc'ed buffers.  NOTE(review): x_aligned/y_aligned are
+   * still not released; presumably aligned_ptr.h has a matching free.
+   */
+  free(x_true);
+  free(y_true);
+  free(y_scaled);
+  free(fft_fwd_spec);
+
+  return snr->complex_snr_;
+}
+
+float RunOneInverseTest(int fft_log_size, int signal_type,
+                        float unused_signal_value,
+                        struct SnrResult* snr) {
+  /* Inverse-FFT a scaled reference spectrum and report SNR vs the signal. */
+  OMX_SC16* x;
+  OMX_SC16* y;
+  OMX_SC16* z;
+  OMX_SC16* y_scaled;
+  struct AlignedPtr* x_aligned;
+  struct AlignedPtr* y_aligned;
+  struct AlignedPtr* z_aligned;
+  struct AlignedPtr* y_scaled_aligned;
+  struct ComplexFloat* x_true;
+  struct ComplexFloat* y_true;
+  OMX_INT n, fft_spec_buffer_size;
+  OMXResult status;
+  OMXFFTSpec_C_SC16 * fft_inv_spec = NULL;
+  int fft_size;
+
+  /*
+   * With 16-bit numbers, use (close to) the full range to get good
+   * accuracy.  This overwrites the file-level signal_value.
+   */
+  signal_value = 32767;
+  fft_size = 1 << fft_log_size;
+
+  status = omxSP_FFTGetBufSize_C_SC16(fft_log_size, &fft_spec_buffer_size);
+  if (verbose > 3) {
+    printf("bufSize = %d\n", fft_spec_buffer_size);
+  }
+
+  fft_inv_spec = (OMXFFTSpec_C_SC16*)malloc(fft_spec_buffer_size);
+  status = omxSP_FFTInit_C_SC16(fft_inv_spec, fft_log_size);
+  if (status) {
+    fprintf(stderr, "Failed to init backward FFT:  status = %d\n", status);
+    exit(1);
+  }
+
+  x_aligned = AllocAlignedPointer(32, sizeof(*x) * fft_size);
+  y_aligned = AllocAlignedPointer(32, sizeof(*y) * (fft_size + 2));
+  z_aligned = AllocAlignedPointer(32, sizeof(*z) * fft_size);
+  y_scaled_aligned = AllocAlignedPointer(32, sizeof(*y_scaled) * fft_size);
+  x = x_aligned->aligned_pointer_;
+  y = y_aligned->aligned_pointer_;
+  z = z_aligned->aligned_pointer_;
+  y_scaled = y_scaled_aligned->aligned_pointer_;
+
+  y_true = (struct ComplexFloat*) malloc(sizeof(*y_true) * fft_size);
+  x_true = (struct ComplexFloat*) malloc(sizeof(*x_true) * fft_size);
+
+  GenerateSignal(x, y_true, x_true, fft_size, signal_type, fft_log_size);
+
+  {
+    /*
+     * To get max accuracy, scale the input to the inverse FFT up to use
+     * as many bits as we can (scale stays 1 for an all-zero spectrum).
+     */
+    float scale = 1;
+    float max = 0;
+
+    for (n = 0; n < fft_size; ++n) {
+      float val;
+      val = fabs(y_true[n].Re);
+      if (val > max) {
+        max = val;
+      }
+      val = fabs(y_true[n].Im);
+      if (val > max) {
+        max = val;
+      }
+    }
+
+    if (max > 0)
+      scale = 16384 / max;
+    if (verbose > 63)
+      printf("Inverse FFT input scaled factor %g\n", scale);
+
+    /*
+     * Scale both the true FFT signal and the input so we can
+     * compare them correctly later
+     */
+    for (n = 0; n < fft_size; ++n) {
+      y_scaled[n].Re = 0.5 + y_true[n].Re * scale;
+      y_scaled[n].Im = 0.5 + y_true[n].Im * scale;
+      x_true[n].Re *= scale;
+      x_true[n].Im *= scale;
+    }
+  }
+
+  if (verbose > 63) {
+    printf("Inverse FFT Input Signal\n");
+    DumpArrayComplex16("yScaled", fft_size, y_scaled);
+    printf("Expected Inverse FFT Output\n");
+    DumpArrayComplexFloat("x_true", fft_size, (OMX_FC32*) x_true);
+  }
+
+  status = omxSP_FFTInv_CToC_SC16_Sfs(y_scaled, z, fft_inv_spec, 0);
+  if (status) {
+    fprintf(stderr, "Inverse FFT failed: status = %d\n", status);
+    exit(1);
+  }
+
+  if (verbose > 7)
+    printf("Inverse FFT scaling = %d\n", status);
+
+  if (verbose > 127) {
+    printf("Raw Inverse FFT Output\n");
+    DumpArrayComplex16("z", fft_size, z);
+  }
+
+  /*
+   * Quantize the (already scaled) expected signal to 16 bits so it can
+   * be compared directly against the inverse FFT output.
+   */
+  for (n = 0; n < fft_size; ++n) {
+    x[n].Re = 0.5 + x_true[n].Re;
+    x[n].Im = 0.5 + x_true[n].Im;
+  }
+
+  if (verbose > 63) {
+    printf("Inverse FFT Output\n");
+    printf(" Actual\n");
+    DumpArrayComplex16("z", fft_size, z);
+    printf(" Expected (scaled)\n");
+    DumpArrayComplex16("x", fft_size, x);
+  }
+
+  CompareComplex16(snr, z, x, fft_size);
+
+  /*
+   * Release the malloc'ed buffers.  NOTE(review): the four AlignedPtr
+   * buffers are still not released; presumably aligned_ptr.h has a free.
+   */
+  free(x_true);
+  free(y_true);
+  free(fft_inv_spec);
+
+  return snr->complex_snr_;
+}
diff --git a/dl/sp/src/test/test_util.c b/dl/sp/src/test/test_util.c
index ab989c4..88d697b 100644
--- a/dl/sp/src/test/test_util.c
+++ b/dl/sp/src/test/test_util.c
@@ -378,6 +378,16 @@
   }
 }
 
+void DumpArrayComplex16(const char* array_name, int count,
+                        const OMX_SC16* array) {
+  int n;
+  /* Prints |count| entries; the header labels both columns with the name. */
+  printf("%4s\t%10s.re[n]\t%10s.im[n]\n", "n", array_name, array_name);
+  for (n = 0; n < count; ++n) {
+    printf("%4d\t%16d\t%16d\n", n, array[n].Re, array[n].Im);
+  }
+}
+
 void DumpArrayFloat(const char* array_name, int count, const OMX_F32* array) {
   int n;
 
diff --git a/dl/sp/src/test/test_util.h b/dl/sp/src/test/test_util.h
index c75ccc7..5851e12 100644
--- a/dl/sp/src/test/test_util.h
+++ b/dl/sp/src/test/test_util.h
@@ -152,6 +152,8 @@
  */
 void DumpArrayReal16(const char* array_name, int count, const OMX_S16* array);
 void DumpArrayReal32(const char* array_name, int count, const OMX_S32* array);
+void DumpArrayComplex16(const char* array_name, int count,
+                        const OMX_SC16* array);
 void DumpArrayComplex32(const char* array_name, int count,
                         const OMX_SC32* array);
 void DumpArrayFloat(const char* array_name, int count, const OMX_F32* array);