Make 16-bit FFT work with gcc, update license info, and add test program.
Review URL: https://webrtc-codereview.appspot.com/1103006
git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@3504 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/dl.gyp b/dl/dl.gyp
index f0f0eb7..79bae5d 100644
--- a/dl/dl.gyp
+++ b/dl/dl.gyp
@@ -46,6 +46,19 @@
'sp/src/omxSP_FFTGetBufSize_R_S32.c',
'sp/src/omxSP_FFTInit_R_S32.c',
'sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
+ # Complex 16-bit fixed-point FFT
+ 'sp/src/omxSP_FFTInit_C_SC16.c',
+ 'sp/src/omxSP_FFTGetBufSize_C_SC16.c',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S',
+ 'sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
+ 'sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
# Real 16-bit fixed-point FFT
'sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
'sp/src/omxSP_FFTGetBufSize_R_S16S32.c',
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
index f321502..a16c79f 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
@@ -1,162 +1,170 @@
-;//
-;//
-;// File Name: armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision: 6693
-;// Last Modified Date: Tue, 10 Jul 2007
-;//
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;//
-;//
-;//
-;// Description:
-;// Compute a Radix 2 FFT stage for a N point complex signal
-;//
-;//
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This file was originally licensed as follows. It has been
+@// relicensed with permission from the copyright holders.
-
-;// Include standard headers
+@//
+@//
+@// File Name: armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision: 6693
+@// Last Modified Date: Tue, 10 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
-
- M_VARIANTS CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
-
-
-
-;// Set debugging level
-;//DEBUG_ON SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
-;// Guarding implementation by the processor name
-
-
-
-;// Guarding implementation by the processor name
-
- IF CortexA8
-
-;//Input Registers
-pSrc RN 0
-pDst RN 2
-pTwiddle RN 1
-pPingPongBuf RN 5
-subFFTNum RN 6
-subFFTSize RN 7
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
-;//Output Registers
+
+@// Guarding implementation by the processor name
-;//Local Scratch Registers
-pointStep RN 3
-outPointStep RN 3
-grpSize RN 4
-setCount RN 4
-step RN 8
-dstStep RN 8
-
-;// Neon Registers
-
-dX0 DN D0.S16
-dX1 DN D1.S16
-dY0 DN D2.S16
-dY1 DN D3.S16
-dX0S32 DN D0.S32
-dX1S32 DN D1.S32
-dY0S32 DN D2.S32
-dY1S32 DN D3.S32
+@// Guarding implementation by the processor name
- MACRO
- FFTSTAGE $scaled, $inverse, $name
-
- ;// Define stack arguments
-
-
- ;// update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
-
-
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define pPingPongBuf r5
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define pointStep r3
+#define outPointStep r3
+#define grpSize r4
+#define setCount r4
+#define step r8
+#define dstStep r8
+
+@// Neon Registers
+
+#define dX0 D0.S16
+#define dX1 D1.S16
+#define dY0 D2.S16
+#define dY1 D3.S16
+#define dX0S32 D0.S32
+#define dX1S32 D1.S32
+#define dY0S32 D2.S32
+#define dY1S32 D3.S32
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ @// Define stack arguments
+
+
+ @// update subFFTNum (r6) and subFFTSize (r7) for the next stage
+
+
MOV subFFTSize,#2
- LSR grpSize,subFFTNum,#1
- MOV subFFTNum,grpSize
-
-
- ;// pT0+1 increments pT0 by 8 bytes
- ;// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
- ;// Note: outPointStep = pointStep for firststage
- ;// Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
-
- MOV pointStep,grpSize,LSL #2
- RSB step,pointStep,#4
-
-
- ;// Loop on the sets for grp zero: 1 set at a time
+ LSR grpSize,subFFTNum,#1
+ MOV subFFTNum,grpSize
-grpZeroSetLoop$name
-
+
+ @// pT0+1 increments pT0 by 8 bytes
+ @// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
+ @// Note: outPointStep = pointStep for the first stage
+ @// Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
+
+ MOV pointStep,grpSize,LSL #2
+ RSB step,pointStep,#4
+
+
+ @// Loop on the sets for grp zero: 1 set at a time
+
+grpZeroSetLoop\name:
+
VLD1 {dX0S32[0]},[pSrc],pointStep
- VLD1 {dX1S32[0]},[pSrc],step ;// step = -pointStep + 4
- SUBS setCount,setCount,#1 ;// decrement the loop counter
-
- IF $scaled
-
+ VLD1 {dX1S32[0]},[pSrc],step @// step = -pointStep + 4
+ SUBS setCount,setCount,#1 @// decrement the loop counter
+
+ .ifeqs "\scaled", "TRUE"
+
VHADD dY0,dX0,dX1
VHSUB dY1,dX0,dX1
-
- ELSE
-
+
+ .ELSE
+
VADD dY0,dX0,dX1
VSUB dY1,dX0,dX1
-
-
- ENDIF
-
+
+
+ .ENDIF
+
VST1 {dY0S32[0]},[pDst],outPointStep
- VST1 {dY1S32[0]},[pDst],dstStep ;// dstStep = step = -pointStep + 4
-
- BGT grpZeroSetLoop$name
-
-
- ;// reset pSrc to pDst for the next stage
- SUB pSrc,pDst,pointStep ;// pDst -= 2*grpSize
+ VST1 {dY1S32[0]},[pDst],dstStep @// dstStep = step = -pointStep + 4
+
+ BGT grpZeroSetLoop\name
+
+
+ @// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep @// pSrc = pDst - pointStep, i.e. the start of this stage's output
MOV pDst,pPingPongBuf
-
- MEND
-
-
-
+
+ .endm
+
+
+
M_START armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{FALSE},FWD
+ FFTSTAGE "FALSE","FALSE",FWD
M_END
-
-
+
+
M_START armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{TRUE},INV
+ FFTSTAGE "FALSE","TRUE",INV
M_END
-
-
-
+
+
+
M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{FALSE},FWDSFS
+ FFTSTAGE "TRUE","FALSE",FWDSFS
M_END
-
-
+
+
M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{TRUE},INVSFS
+ FFTSTAGE "TRUE","TRUE",INVSFS
M_END
-
- ENDIF ;//CORTEXA8
-
-
- END
\ No newline at end of file
+
+
+
+ .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
index 0932099..9f7b531 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
@@ -1,202 +1,210 @@
-;//
-;//
-;// File Name: armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision: 6741
-;// Last Modified Date: Wed, 18 Jul 2007
-;//
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;//
-;//
-;//
-;// Description:
-;// Compute a Radix 2 FFT stage for a N point complex signal
-;//
-;//
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This file was originally licensed as follows. It has been
+@// relicensed with permission from the copyright holders.
-
-;// Include standard headers
-
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
-
- M_VARIANTS CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
-
-
-
-;// Set debugging level
-;//DEBUG_ON SETL {TRUE}
+@//
+@//
+@// File Name: armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision: 6741
+@// Last Modified Date: Wed, 18 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
-;// Guarding implementation by the processor name
-
-
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
-
-;// Guarding implementation by the processor name
-
- IF CortexA8
-
-;//Input Registers
-
-pSrc RN 0
-pDst RN 2
-pTwiddle RN 1
-subFFTNum RN 6
-subFFTSize RN 7
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
-;//Output Registers
+@// Guarding implementation by the processor name
-;//Local Scratch Registers
-outPointStep RN 3
-grpCount RN 4
-dstStep RN 5
-pTmp RN 4
-step RN 8
-
-;// Neon Registers
-
-dWr DN D0.S16
-dWi DN D1.S16
-dXr0 DN D2.S16
-dXi0 DN D3.S16
-dXr1 DN D4.S16
-dXi1 DN D5.S16
-dYr0 DN D6.S16
-dYi0 DN D7.S16
-dYr1 DN D8.S16
-dYi1 DN D9.S16
-qT0 QN Q5.S32
-qT1 QN Q6.S32
- MACRO
- FFTSTAGE $scaled, $inverse, $name
-
-
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+
+#define outPointStep r3
+#define grpCount r4
+#define dstStep r5
+#define pTmp r4
+#define step r8
+
+@// Neon Registers
+
+#define dWr D0.S16
+#define dWi D1.S16
+#define dXr0 D2.S16
+#define dXi0 D3.S16
+#define dXr1 D4.S16
+#define dXi1 D5.S16
+#define dYr0 D6.S16
+#define dYi0 D7.S16
+#define dYr1 D8.S16
+#define dYi1 D9.S16
+#define qT0 Q5.S32
+#define qT1 Q6.S32
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+
MOV outPointStep,subFFTSize,LSL #2
- ;// Update grpCount and grpSize rightaway
-
- MOV subFFTNum,#1 ;//after the last stage
- LSL grpCount,subFFTSize,#1
-
- ;// update subFFTSize for the next stage
- MOV subFFTSize,grpCount
-
- SUB step,outPointStep,#4 ;// step = -4+outPointStep
- RSB dstStep,step,#0 ;// dstStep = -4-outPointStep+8 = -step
- ;//RSB dstStep,outPointStep,#16
-
-
- ;// Loop on 2 grps at a time for the last stage
+ @// Update grpCount and grpSize right away
-grpLoop$name
- VLD2 {dWr[0],dWi[0]},[pTwiddle]! ;// grp 0
- VLD2 {dWr[1],dWi[1]},[pTwiddle]! ;// grp 1
-
- ;//VLD2 {dWr,dWi},[pTwiddle],#16
-
- VLD4 {dXr0[0],dXi0[0],dXr1[0],dXi1[0]},[pSrc]! ;// grp 0
- VLD4 {dXr0[1],dXi0[1],dXr1[1],dXi1[1]},[pSrc]! ;// grp 1
-
-
- ;//VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc],#32
- SUBS grpCount,grpCount,#4 ;// grpCount is multiplied by 2
-
- IF $inverse
+ MOV subFFTNum,#1 @//after the last stage
+ LSL grpCount,subFFTSize,#1
+
+ @// update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+
+ SUB step,outPointStep,#4 @// step = -4+outPointStep
+ RSB dstStep,step,#0 @// dstStep = -4-outPointStep+8 = -step
+ @//RSB dstStep,outPointStep,#16
+
+
+ @// Loop on 2 grps at a time for the last stage
+
+grpLoop\name:
+ VLD2 {dWr[0],dWi[0]},[pTwiddle]! @// grp 0
+ VLD2 {dWr[1],dWi[1]},[pTwiddle]! @// grp 1
+
+ @//VLD2 {dWr,dWi},[pTwiddle],#16
+
+ VLD4 {dXr0[0],dXi0[0],dXr1[0],dXi1[0]},[pSrc]! @// grp 0
+ VLD4 {dXr0[1],dXi0[1],dXr1[1],dXi1[1]},[pSrc]! @// grp 1
+
+
+ @//VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc],#32
+ SUBS grpCount,grpCount,#4 @// grpCount is multiplied by 2
+
+ .ifeqs "\inverse", "TRUE"
VMULL qT0,dXr1,dWr
- VMLAL qT0,dXi1,dWi ;// real part
+ VMLAL qT0,dXi1,dWi @// real part
VMULL qT1,dXi1,dWr
- VMLSL qT1,dXr1,dWi ;// imag part
-
- ELSE
+ VMLSL qT1,dXr1,dWi @// imag part
+
+ .ELSE
VMULL qT0,dXr1,dWr
- VMLSL qT0,dXi1,dWi ;// real part
+ VMLSL qT0,dXi1,dWi @// real part
VMULL qT1,dXi1,dWr
- VMLAL qT1,dXr1,dWi ;// imag part
-
- ENDIF
-
+ VMLAL qT1,dXr1,dWi @// imag part
+
+ .ENDIF
+
VRSHRN dXr1,qT0,#15
VRSHRN dXi1,qT1,#15
-
-
- IF $scaled
-
+
+
+ .ifeqs "\scaled", "TRUE"
+
VHSUB dYr0,dXr0,dXr1
VHSUB dYi0,dXi0,dXi1
VHADD dYr1,dXr0,dXr1
VHADD dYi1,dXi0,dXi1
-
- ELSE
-
+
+ .ELSE
+
VSUB dYr0,dXr0,dXr1
VSUB dYi0,dXi0,dXi1
VADD dYr1,dXr0,dXr1
VADD dYi1,dXi0,dXi1
-
-
- ENDIF
-
+
+
+ .ENDIF
+
VST2 {dYr0[0],dYi0[0]},[pDst]!
- VST2 {dYr0[1],dYi0[1]},[pDst],step ;// step = -4+outPointStep
-
+ VST2 {dYr0[1],dYi0[1]},[pDst],step @// step = -4+outPointStep
+
VST2 {dYr1[0],dYi1[0]},[pDst]!
- VST2 {dYr1[1],dYi1[1]},[pDst],dstStep ;// dstStep = -4-outPointStep+8 = -step
-
- ;//VST2 {dYr0,dYi0},[pDst],outPointStep
- ;//VST2 {dYr1,dYi1},[pDst],dstStep ;// dstStep = step = -outPointStep + 16
-
- BGT grpLoop$name
-
-
- ;// Reset and Swap pSrc and pDst for the next stage
+ VST2 {dYr1[1],dYi1[1]},[pDst],dstStep @// dstStep = -4-outPointStep+8 = -step
+
+ @//VST2 {dYr0,dYi0},[pDst],outPointStep
+ @//VST2 {dYr1,dYi1},[pDst],dstStep @// dstStep = step = -outPointStep + 16
+
+ BGT grpLoop\name
+
+
+ @// Reset and Swap pSrc and pDst for the next stage
MOV pTmp,pDst
- SUB pDst,pSrc,outPointStep,LSL #1 ;// pDst -= 2*size; pSrc -= 4*size bytes
+ SUB pDst,pSrc,outPointStep,LSL #1 @// pDst -= 2*size; pSrc -= 4*size bytes
SUB pSrc,pTmp,outPointStep
-
- ;// Reset pTwiddle for the next stage
- SUB pTwiddle,pTwiddle,outPointStep ;// pTwiddle -= 2*size bytes
-
- MEND
-
-
-
+
+ @// Reset pTwiddle for the next stage
+ SUB pTwiddle,pTwiddle,outPointStep @// pTwiddle -= 2*size bytes
+
+ .endm
+
+
+
M_START armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{FALSE},FWD
+ FFTSTAGE "FALSE","FALSE",FWD
M_END
-
-
+
+
M_START armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{TRUE},INV
+ FFTSTAGE "FALSE","TRUE",INV
M_END
-
-
-
+
+
+
M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{FALSE},FWDSFS
+ FFTSTAGE "TRUE","FALSE",FWDSFS
M_END
-
-
+
+
M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{TRUE},INVSFS
+ FFTSTAGE "TRUE","TRUE",INVSFS
M_END
-
- ENDIF ;//CORTEXA8
-
-
- END
\ No newline at end of file
+
+
+
+ .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
index 49bf607..666f4f3 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
@@ -1,209 +1,216 @@
-;//
-;//
-;// File Name: armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision: 6740
-;// Last Modified Date: Wed, 18 Jul 2007
-;//
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;//
-;//
-;//
-;// Description:
-;// Compute a Radix 2 FFT stage for a N point complex signal
-;//
-;//
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This file was originally licensed as follows. It has been
+@// relicensed with permission from the copyright holders.
-
-;// Include standard headers
+@//
+@//
+@// File Name: armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision: 6740
+@// Last Modified Date: Wed, 18 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
-
- M_VARIANTS CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
-
-
-
-;// Set debugging level
-;//DEBUG_ON SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
-
-;// Guarding implementation by the processor name
-
- IF CortexA8
-
-;//Input Registers
-pSrc RN 0
-pDst RN 2
-pTwiddle RN 1
-subFFTNum RN 6
-subFFTSize RN 7
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
-;//Output Registers
-;//Local Scratch Registers
-
-outPointStep RN 3
-grpCount RN 4
-dstStep RN 5
-twStep RN 8
-pTmp RN 4
-
-;// Neon Registers
-
-dW1S32 DN D0.S32
-dW2S32 DN D1.S32
-dW1 DN D0.S16
-dW2 DN D1.S16
-
-dX0 DN D2.S16
-dX1 DN D3.S16
-dX2 DN D4.S16
-dX3 DN D5.S16
-dY0 DN D6.S16
-dY1 DN D7.S16
-dY2 DN D8.S16
-dY3 DN D9.S16
-qT0 QN Q5.S32
-qT1 QN Q6.S32
+@// Guarding implementation by the processor name
- MACRO
- FFTSTAGE $scaled, $inverse, $name
-
- ;// Define stack arguments
-
-
- ;// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
-
-
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define outPointStep r3
+#define grpCount r4
+#define dstStep r5
+#define twStep r8
+#define pTmp r4
+
+@// Neon Registers
+
+#define dW1S32 D0.S32
+#define dW2S32 D1.S32
+#define dW1 D0.S16
+#define dW2 D1.S16
+
+#define dX0 D2.S16
+#define dX1 D3.S16
+#define dX2 D4.S16
+#define dX3 D5.S16
+#define dY0 D6.S16
+#define dY1 D7.S16
+#define dY2 D8.S16
+#define dY3 D9.S16
+#define qT0 Q5.S32
+#define qT1 Q6.S32
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ @// Define stack arguments
+
+
+ @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+
+
LSL grpCount,subFFTSize,#1
-
-
- ;// update subFFTSize for the next stage
- MOV subFFTSize,grpCount
-
- ;// pOut0+1 increments pOut0 by 8 bytes
- ;// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
- SMULBB outPointStep,grpCount,subFFTNum
- MOV twStep,subFFTNum,LSL #1
- LSR subFFTNum,subFFTNum,#1 ;//grpSize
-
-
- RSB dstStep,outPointStep,#8
-
-
- ;// Note: pointStep is 8 in this case: so need of extra reg
- ;// Loop on the groups: 2 groups at a time
-grpLoop$name
-
- VLD1 dW1S32[],[pTwiddle],twStep ;//[wi | wr]
+
+ @// update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+
+ @// pOut0+1 increments pOut0 by 8 bytes
+ @// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
+ SMULBB outPointStep,grpCount,subFFTNum
+ MOV twStep,subFFTNum,LSL #1
+ LSR subFFTNum,subFFTNum,#1 @//grpSize
+
+
+ RSB dstStep,outPointStep,#8
+
+
+ @// Note: pointStep is 8 in this case, so there is no need for an extra reg
+ @// Loop on the groups: 2 groups at a time
+
+grpLoop\name:
+
+ VLD1 dW1S32[],[pTwiddle],twStep @//[wi | wr]
VLD1 dW2S32[],[pTwiddle],twStep
-
- ;// Process the sets for each grp: 2 sets at a time (no set looping required)
-
- VLD1 dX0,[pSrc]! ;// point0: of set0,set1 of grp0
- VLD1 dX1,[pSrc]! ;// point1: of set0,set1 of grp0
- VLD1 dX2,[pSrc]! ;// point0: of set0,set1 of grp1
- VLD1 dX3,[pSrc]! ;// point1: of set0,set1 of grp1
-
- SUBS grpCount,grpCount,#4 ;// decrement the loop counter
+
+ @// Process the sets for each grp: 2 sets at a time (no set looping required)
+
+ VLD1 dX0,[pSrc]! @// point0: of set0,set1 of grp0
+ VLD1 dX1,[pSrc]! @// point1: of set0,set1 of grp0
+ VLD1 dX2,[pSrc]! @// point0: of set0,set1 of grp1
+ VLD1 dX3,[pSrc]! @// point1: of set0,set1 of grp1
+
+ SUBS grpCount,grpCount,#4 @// decrement the loop counter
VUZP dW1,dW2
VUZP dX1,dX3
-
- IF $inverse
+
+ .ifeqs "\inverse", "TRUE"
VMULL qT0,dX1,dW1
- VMLAL qT0,dX3,dW2 ;// real part
+ VMLAL qT0,dX3,dW2 @// real part
VMULL qT1,dX3,dW1
- VMLSL qT1,dX1,dW2 ;// imag part
-
- ELSE
+ VMLSL qT1,dX1,dW2 @// imag part
+
+ .ELSE
VMULL qT0,dX1,dW1
- VMLSL qT0,dX3,dW2 ;// real part
+ VMLSL qT0,dX3,dW2 @// real part
VMULL qT1,dX3,dW1
- VMLAL qT1,dX1,dW2 ;// imag part
-
- ENDIF
-
+ VMLAL qT1,dX1,dW2 @// imag part
+
+ .ENDIF
+
VRSHRN dX1,qT0,#15
VRSHRN dX3,qT1,#15
-
+
VZIP dX1,dX3
-
-
- IF $scaled
-
+
+
+ .ifeqs "\scaled", "TRUE"
+
VHSUB dY0,dX0,dX1
VHADD dY1,dX0,dX1
VHSUB dY2,dX2,dX3
VHADD dY3,dX2,dX3
-
- ELSE
-
+
+ .ELSE
+
VSUB dY0,dX0,dX1
VADD dY1,dX0,dX1
VSUB dY2,dX2,dX3
VADD dY3,dX2,dX3
-
-
-
- ENDIF
-
- VST1 dY0,[pDst],outPointStep ;// point0: of set0,set1 of grp0
- VST1 dY1,[pDst],dstStep ;// dstStep = -outPointStep + 8
- VST1 dY2,[pDst],outPointStep ;// point0: of set0,set1 of grp1
- VST1 dY3,[pDst],dstStep ;// point1: of set0,set1 of grp1
-
-
- BGT grpLoop$name
-
-
- ;// Reset and Swap pSrc and pDst for the next stage
+
+
+
+ .ENDIF
+
+ VST1 dY0,[pDst],outPointStep @// point0: of set0,set1 of grp0
+ VST1 dY1,[pDst],dstStep @// dstStep = -outPointStep + 8
+ VST1 dY2,[pDst],outPointStep @// point0: of set0,set1 of grp1
+ VST1 dY3,[pDst],dstStep @// point1: of set0,set1 of grp1
+
+
+ BGT grpLoop\name
+
+
+ @// Reset and Swap pSrc and pDst for the next stage
MOV pTmp,pDst
- SUB pDst,pSrc,outPointStep,LSL #1 ;// pDst -= 2*size; pSrc -= 4*size bytes
+ SUB pDst,pSrc,outPointStep,LSL #1 @// pDst -= 2*size; pSrc -= 4*size bytes
SUB pSrc,pTmp,outPointStep
-
- ;// Reset pTwiddle for the next stage
- SUB pTwiddle,pTwiddle,outPointStep ;// pTwiddle -= 2*size bytes
-
- MEND
-
-
-
+
+ @// Reset pTwiddle for the next stage
+ SUB pTwiddle,pTwiddle,outPointStep @// pTwiddle -= 2*size bytes
+
+ .endm
+
+
+
M_START armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{FALSE},FWD
+ FFTSTAGE "FALSE","FALSE",FWD
M_END
-
-
+
+
M_START armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{TRUE},INV
+ FFTSTAGE "FALSE","TRUE",INV
M_END
-
-
-
+
+
+
M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{FALSE},FWDSFS
+ FFTSTAGE "TRUE","FALSE",FWDSFS
M_END
-
-
+
+
M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{TRUE},INVSFS
+ FFTSTAGE "TRUE","TRUE",INVSFS
M_END
-
- ENDIF ;//CORTEXA8
-
-
- END
\ No newline at end of file
+
+
+ .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
index 133b137..f9bbebc 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
@@ -1,214 +1,219 @@
-;//
-;//
-;// File Name: armSP_FFT_CToC_SC16_Radix2_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision: 5892
-;// Last Modified Date: Thu, 07 Jun 2007
-;//
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;//
-;//
-;//
-;// Description:
-;// Compute a Radix 2 FFT stage for a N point complex signal
-;//
-;//
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This file was originally licensed as follows. It has been
+@// relicensed with permission from the copyright holders.
-
-;// Include standard headers
+@//
+@//
+@// File Name: armSP_FFT_CToC_SC16_Radix2_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision: 5892
+@// Last Modified Date: Thu, 07 Jun 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
-
- M_VARIANTS CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
-
-;// Set debugging level
-;//DEBUG_ON SETL {TRUE}
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
-;// Guarding implementation by the processor name
-
-
-
-
- ;// Guarding implementation by the processor name
-
- IF CortexA8
-
-
-;//Input Registers
-
-pSrc RN 0
-pDst RN 2
-pTwiddle RN 1
-subFFTNum RN 6
-subFFTSize RN 7
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
-;//Output Registers
+
+@// Guarding implementation by the processor name
-;//Local Scratch Registers
-outPointStep RN 3
-pointStep RN 4
-grpCount RN 5
-setCount RN 8
-step RN 10
-dstStep RN 11
-pTmp RN 9
-;// Neon Registers
+ @// Guarding implementation by the processor name
-dW DN D0.S16
-dX0 DN D2.S16
-dX1 DN D3.S16
-dX2 DN D4.S16
-dX3 DN D5.S16
-dY0 DN D6.S16
-dY1 DN D7.S16
-dY2 DN D8.S16
-dY3 DN D9.S16
-qT0 QN Q3.S32
-qT1 QN Q4.S32
-
-
- MACRO
- FFTSTAGE $scaled, $inverse, $name
-
- ;// Define stack arguments
-
-
- ;// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
-
- LSR subFFTNum,subFFTNum,#1 ;//grpSize
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define outPointStep r3
+#define pointStep r4
+#define grpCount r5
+#define setCount r8
+#define step r10
+#define dstStep r11
+#define pTmp r9
+
+@// Neon Registers
+
+#define dW D0.S16
+#define dX0 D2.S16
+#define dX1 D3.S16
+#define dX2 D4.S16
+#define dX3 D5.S16
+#define dY0 D6.S16
+#define dY1 D7.S16
+#define dY2 D8.S16
+#define dY3 D9.S16
+#define qT0 Q3.S32
+#define qT1 Q4.S32
+
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ @// Define stack arguments
+
+
+ @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+
+ LSR subFFTNum,subFFTNum,#1 @//grpSize
LSL grpCount,subFFTSize,#1
-
-
- ;// pT0+1 increments pT0 by 8 bytes
- ;// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
+
+
+ @// pT0+1 increments pT0 by 8 bytes
+ @// pT0+pointStep = increment of 4*pointStep bytes = 2*grpSize bytes
MOV pointStep,subFFTNum,LSL #1
-
- ;// update subFFTSize for the next stage
+
+ @// update subFFTSize for the next stage
MOV subFFTSize,grpCount
-
- ;// pOut0+1 increments pOut0 by 8 bytes
- ;// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
- SMULBB outPointStep,grpCount,pointStep
- LSL pointStep,pointStep,#1
-
-
+
+ @// pOut0+1 increments pOut0 by 8 bytes
+ @// pOut0+outPointStep == increment of 4*outPointStep bytes = 2*size bytes
+ SMULBB outPointStep,grpCount,pointStep
+ LSL pointStep,pointStep,#1
+
+
RSB step,pointStep,#16
RSB dstStep,outPointStep,#16
-
- ;// Loop on the groups
-grpLoop$name
-
- VLD1 dW,[pTwiddle],pointStep ;//[wi | wr]
+ @// Loop on the groups
+
+grpLoop\name:
+
+ VLD1 dW,[pTwiddle],pointStep @//[wi | wr]
MOV setCount,pointStep,LSR #2
-
-
- ;// Loop on the sets: 4 at a time
-
-
-setLoop$name
-
-
- VLD2 {dX0,dX1},[pSrc],pointStep ;// point0: dX0-real part dX1-img part
- VLD2 {dX2,dX3},[pSrc],step ;// point1: dX2-real part dX3-img part
-
- SUBS setCount,setCount,#4
-
- IF $inverse
+
+
+ @// Loop on the sets: 4 at a time
+
+
+setLoop\name:
+
+
+ VLD2 {dX0,dX1},[pSrc],pointStep @// point0: dX0-real part dX1-img part
+ VLD2 {dX2,dX3},[pSrc],step @// point1: dX2-real part dX3-img part
+
+ SUBS setCount,setCount,#4
+
+ .ifeqs "\inverse", "TRUE"
VMULL qT0,dX2,dW[0]
- VMLAL qT0,dX3,dW[1] ;// real part
+ VMLAL qT0,dX3,dW[1] @// real part
VMULL qT1,dX3,dW[0]
- VMLSL qT1,dX2,dW[1] ;// imag part
-
- ELSE
-
+ VMLSL qT1,dX2,dW[1] @// imag part
+
+ .ELSE
+
VMULL qT0,dX2,dW[0]
- VMLSL qT0,dX3,dW[1] ;// real part
+ VMLSL qT0,dX3,dW[1] @// real part
VMULL qT1,dX3,dW[0]
- VMLAL qT1,dX2,dW[1] ;// imag part
-
- ENDIF
-
+ VMLAL qT1,dX2,dW[1] @// imag part
+
+ .ENDIF
+
VRSHRN dX2,qT0,#15
VRSHRN dX3,qT1,#15
-
- IF $scaled
+
+ .ifeqs "\scaled", "TRUE"
VHSUB dY0,dX0,dX2
VHSUB dY1,dX1,dX3
VHADD dY2,dX0,dX2
VHADD dY3,dX1,dX3
-
- ELSE
+
+ .ELSE
VSUB dY0,dX0,dX2
VSUB dY1,dX1,dX3
VADD dY2,dX0,dX2
VADD dY3,dX1,dX3
-
- ENDIF
-
+
+ .ENDIF
+
VST2 {dY0,dY1},[pDst],outPointStep
- VST2 {dY2,dY3},[pDst],dstStep ;// dstStep = -outPointStep + 16
-
- BGT setLoop$name
-
- SUBS grpCount,grpCount,#2
+ VST2 {dY2,dY3},[pDst],dstStep @// dstStep = -outPointStep + 16
+
+ BGT setLoop\name
+
+ SUBS grpCount,grpCount,#2
ADD pSrc,pSrc,pointStep
- BGT grpLoop$name
-
-
- ;// Reset and Swap pSrc and pDst for the next stage
+ BGT grpLoop\name
+
+
+ @// Reset and Swap pSrc and pDst for the next stage
MOV pTmp,pDst
- SUB pDst,pSrc,outPointStep,LSL #1 ;// pDst -= 2*size; pSrc -= 4*size bytes
+ SUB pDst,pSrc,outPointStep,LSL #1 @// pDst -= 2*size; pSrc -= 4*size bytes
SUB pSrc,pTmp,outPointStep
-
- ;// Reset pTwiddle for the next stage
- SUB pTwiddle,pTwiddle,outPointStep ;// pTwiddle -= 2*size bytes
-
-
- MEND
-
-
-
+
+ @// Reset pTwiddle for the next stage
+ SUB pTwiddle,pTwiddle,outPointStep @// pTwiddle -= 2*size bytes
+
+
+ .endm
+
+
+
M_START armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{FALSE},FWD
+ FFTSTAGE "FALSE","FALSE",FWD
M_END
-
-
+
+
M_START armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{TRUE},INV
+ FFTSTAGE "FALSE","TRUE",INV
M_END
-
-
-
+
+
+
M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{FALSE},FWDSFS
+ FFTSTAGE "TRUE","FALSE",FWDSFS
M_END
-
-
+
+
M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{TRUE},INVSFS
+ FFTSTAGE "TRUE","TRUE",INVSFS
M_END
-
- ENDIF ;//CORTEXA8
-
-
-
- END
-
+
+
+
+ .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
index 82662e6..cdb42a9 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
@@ -1,306 +1,314 @@
-;//
-;//
-;// File Name: armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision: 7761
-;// Last Modified Date: Wed, 26 Sep 2007
-;//
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;//
-;//
-;//
-;// Description:
-;// Compute a first stage Radix 4 FFT stage for a N point complex signal
-;//
-;//
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This file was originally licensed as follows. It has been
+@// relicensed with permission from the copyright holders.
-
-;// Include standard headers
+@//
+@//
+@// File Name: armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision: 7761
+@// Last Modified Date: Wed, 26 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a first stage Radix 4 FFT stage for a N point complex signal
+@//
+@//
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
-
- M_VARIANTS CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
-
-
-
-;// Set debugging level
-;//DEBUG_ON SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
-;// Guarding implementation by the processor name
-
-
-
-;// Guarding implementation by the processor name
-
- IF CortexA8
-
-;//Input Registers
-pSrc RN 0
-pDst RN 2
-pTwiddle RN 1
-pPingPongBuf RN 5
-subFFTNum RN 6
-subFFTSize RN 7
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
-;//Output Registers
+
+@// Guarding implementation by the processor name
-;//Local Scratch Registers
-grpSize RN 3
-setCount RN 3 ;// Reuse grpSize as setCount
-pointStep RN 4
-outPointStep RN 4
-setStep RN 8
-step1 RN 9
-step3 RN 10
+@// Guarding implementation by the processor name
-;// Neon Registers
-dXr0 DN D0.S16
-dXi0 DN D1.S16
-dXr1 DN D2.S16
-dXi1 DN D3.S16
-dXr2 DN D4.S16
-dXi2 DN D5.S16
-dXr3 DN D6.S16
-dXi3 DN D7.S16
-dYr0 DN D8.S16
-dYi0 DN D9.S16
-dYr1 DN D10.S16
-dYi1 DN D11.S16
-dYr2 DN D12.S16
-dYi2 DN D13.S16
-dYr3 DN D14.S16
-dYi3 DN D15.S16
-dZr0 DN D16.S16
-dZi0 DN D17.S16
-dZr1 DN D18.S16
-dZi1 DN D19.S16
-dZr2 DN D20.S16
-dZi2 DN D21.S16
-dZr3 DN D22.S16
-dZi3 DN D23.S16
-qY0 QN Q4.S16
-qY2 QN Q6.S16
-qX0 QN Q0.S16
-qX2 QN Q2.S16
+@//Input Registers
-qY1 QN Q5.S16
-qY3 QN Q7.S16
-qX1 QN Q1.S16
-qX3 QN Q3.S16
-qZ0 QN Q8.S16
-qZ1 QN Q9.S16
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define pPingPongBuf r5
+#define subFFTNum r6
+#define subFFTSize r7
-
- MACRO
- FFTSTAGE $scaled, $inverse, $name
-
- ;// Define stack arguments
-
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize r3
+@// Reuse grpSize as setCount
+#define setCount r3
+#define pointStep r4
+#define outPointStep r4
+#define setStep r8
+#define step1 r9
+#define step3 r10
+
+@// Neon Registers
+
+#define dXr0 D0.S16
+#define dXi0 D1.S16
+#define dXr1 D2.S16
+#define dXi1 D3.S16
+#define dXr2 D4.S16
+#define dXi2 D5.S16
+#define dXr3 D6.S16
+#define dXi3 D7.S16
+#define dYr0 D8.S16
+#define dYi0 D9.S16
+#define dYr1 D10.S16
+#define dYi1 D11.S16
+#define dYr2 D12.S16
+#define dYi2 D13.S16
+#define dYr3 D14.S16
+#define dYi3 D15.S16
+#define dZr0 D16.S16
+#define dZi0 D17.S16
+#define dZr1 D18.S16
+#define dZi1 D19.S16
+#define dZr2 D20.S16
+#define dZi2 D21.S16
+#define dZr3 D22.S16
+#define dZi3 D23.S16
+#define qY0 Q4.S16
+#define qY2 Q6.S16
+#define qX0 Q0.S16
+#define qX2 Q2.S16
+
+#define qY1 Q5.S16
+#define qY3 Q7.S16
+#define qX1 Q1.S16
+#define qX3 Q3.S16
+#define qZ0 Q8.S16
+#define qZ1 Q9.S16
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ @// Define stack arguments
+
MOV pointStep,subFFTNum
- ;// Update pSubFFTSize and pSubFFTNum regs
-
-
- VLD2 {dXr0,dXi0},[pSrc@128],pointStep ;// data[0]
- ;// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
- LSR grpSize,subFFTNum,#2
+ @// Update pSubFFTSize and pSubFFTNum regs
+
+
+ VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
+ @// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+ LSR grpSize,subFFTNum,#2
MOV subFFTNum,grpSize
-
-
- ;// pT0+1 increments pT0 by 4 bytes
- ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
- ;// Note: outPointStep = pointStep for firststage
- VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
-
-
- ;// Calculate the step of input data for the next set
- ;//MOV setStep,pointStep,LSL #1
+
+
+ @// pT0+1 increments pT0 by 4 bytes
+ @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
+ @// Note: outPointStep = pointStep for the first stage
+ VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
+
+
+ @// Calculate the step of input data for the next set
+ @//MOV setStep,pointStep,LSL #1
MOV setStep,grpSize,LSL #3
- VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
+ VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
MOV step1,setStep
- ADD setStep,setStep,pointStep ;// setStep = 3*pointStep
- RSB setStep,setStep,#16 ;// setStep = - 3*pointStep+16
-
-
- VLD2 {dXr3,dXi3},[pSrc@128],setStep ;// data[3]
- MOV subFFTSize,#4 ;// subFFTSize = 1 for the first stage
-
-
- IF $scaled
- VHADD qY0,qX0,qX2 ;// u0
- ELSE
- VADD qY0,qX0,qX2 ;// u0
- ENDIF
+ ADD setStep,setStep,pointStep @// setStep = 3*pointStep
+ RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
+
+
+ VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3]
+ MOV subFFTSize,#4 @// subFFTSize is 1 during the first stage; set it to 4 for the next stage
+
+
+ .ifeqs "\scaled", "TRUE"
+ VHADD qY0,qX0,qX2 @// u0
+ .ELSE
+ VADD qY0,qX0,qX2 @// u0
+ .ENDIF
RSB step3,pointStep,#0
-
- ;// grp = 0 a special case since all the twiddle factors are 1
- ;// Loop on the sets: 4 sets at a time
-grpZeroSetLoop$name
-
-
- IF $scaled
-
- ;// finish first stage of 4 point FFT
-
- VHSUB qY2,qX0,qX2 ;// u1
- SUBS setCount,setCount,#4 ;// decrement the set loop counter
-
- VLD2 {dXr0,dXi0},[pSrc@128],step1 ;// data[0]
- VHADD qY1,qX1,qX3 ;// u2
- VLD2 {dXr2,dXi2},[pSrc@128],step3
- VHSUB qY3,qX1,qX3 ;// u3
-
-
-
- ;// finish second stage of 4 point FFT
-
- VLD2 {dXr1,dXi1},[pSrc@128],step1 ;// data[1]
- VHADD qZ0,qY0,qY1 ;// y0
-
- VLD2 {dXr3,dXi3},[pSrc@128],setStep
-
-
- IF $inverse
+ @// grp = 0 a special case since all the twiddle factors are 1
+ @// Loop on the sets: 4 sets at a time
- VHSUB dZr3,dYr2,dYi3 ;// y3
+grpZeroSetLoop\name:
+
+
+ .ifeqs "\scaled", "TRUE"
+
+ @// finish first stage of 4 point FFT
+
+ VHSUB qY2,qX0,qX2 @// u1
+ SUBS setCount,setCount,#4 @// decrement the set loop counter
+
+ VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
+ VHADD qY1,qX1,qX3 @// u2
+ VLD2 {dXr2,dXi2},[pSrc :128],step3
+ VHSUB qY3,qX1,qX3 @// u3
+
+
+
+ @// finish second stage of 4 point FFT
+
+ VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
+ VHADD qZ0,qY0,qY1 @// y0
+
+ VLD2 {dXr3,dXi3},[pSrc :128],setStep
+
+
+ .ifeqs "\inverse", "TRUE"
+
+ VHSUB dZr3,dYr2,dYi3 @// y3
VHADD dZi3,dYi2,dYr3
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
-
- VHSUB qZ1,qY0,qY1 ;// y2
- VST2 {dZr3,dZi3},[pDst@128],outPointStep
-
- VHADD dZr2,dYr2,dYi3 ;// y1
- VST2 {dZr1,dZi1},[pDst@128],outPointStep
- VHSUB dZi2,dYi2,dYr3
-
- VHADD qY0,qX0,qX2 ;// u0 (next loop)
- VST2 {dZr2,dZi2},[pDst@128],setStep
-
-
- ELSE
-
- VHADD dZr2,dYr2,dYi3 ;// y1
- VHSUB dZi2,dYi2,dYr3
-
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
- VHSUB qZ1,qY0,qY1 ;// y2
-
- VST2 {dZr2,dZi2},[pDst@128],outPointStep
- VHSUB dZr3,dYr2,dYi3 ;// y3
- VHADD dZi3,dYi2,dYr3
- VST2 {dZr1,dZi1},[pDst@128],outPointStep
- VHADD qY0,qX0,qX2 ;// u0 (next loop)
- VST2 {dZr3,dZi3},[pDst@128],setStep
-
- ENDIF
-
-
- ELSE
-
- ;// finish first stage of 4 point FFT
-
- VSUB qY2,qX0,qX2 ;// u1
- SUBS setCount,setCount,#4 ;// decrement the set loop counter
-
- VLD2 {dXr0,dXi0},[pSrc@128],step1 ;// data[0]
- VADD qY1,qX1,qX3 ;// u2
- VLD2 {dXr2,dXi2},[pSrc@128],step3
- VSUB qY3,qX1,qX3 ;// u3
-
-
-
- ;// finish second stage of 4 point FFT
-
- VLD2 {dXr1,dXi1},[pSrc@128],step1 ;// data[1]
- VADD qZ0,qY0,qY1 ;// y0
-
- VLD2 {dXr3,dXi3},[pSrc@128],setStep
-
-
- IF $inverse
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
- VSUB dZr3,dYr2,dYi3 ;// y3
+ VHSUB qZ1,qY0,qY1 @// y2
+ VST2 {dZr3,dZi3},[pDst :128],outPointStep
+
+ VHADD dZr2,dYr2,dYi3 @// y1
+ VST2 {dZr1,dZi1},[pDst :128],outPointStep
+ VHSUB dZi2,dYi2,dYr3
+
+ VHADD qY0,qX0,qX2 @// u0 (next loop)
+ VST2 {dZr2,dZi2},[pDst :128],setStep
+
+
+ .ELSE
+
+ VHADD dZr2,dYr2,dYi3 @// y1
+ VHSUB dZi2,dYi2,dYr3
+
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
+ VHSUB qZ1,qY0,qY1 @// y2
+
+ VST2 {dZr2,dZi2},[pDst :128],outPointStep
+ VHSUB dZr3,dYr2,dYi3 @// y3
+ VHADD dZi3,dYi2,dYr3
+ VST2 {dZr1,dZi1},[pDst :128],outPointStep
+ VHADD qY0,qX0,qX2 @// u0 (next loop)
+ VST2 {dZr3,dZi3},[pDst :128],setStep
+
+ .ENDIF
+
+
+ .ELSE
+
+ @// finish first stage of 4 point FFT
+
+ VSUB qY2,qX0,qX2 @// u1
+ SUBS setCount,setCount,#4 @// decrement the set loop counter
+
+ VLD2 {dXr0,dXi0},[pSrc :128],step1 @// data[0]
+ VADD qY1,qX1,qX3 @// u2
+ VLD2 {dXr2,dXi2},[pSrc :128],step3
+ VSUB qY3,qX1,qX3 @// u3
+
+
+
+ @// finish second stage of 4 point FFT
+
+ VLD2 {dXr1,dXi1},[pSrc :128],step1 @// data[1]
+ VADD qZ0,qY0,qY1 @// y0
+
+ VLD2 {dXr3,dXi3},[pSrc :128],setStep
+
+
+ .ifeqs "\inverse", "TRUE"
+
+ VSUB dZr3,dYr2,dYi3 @// y3
VADD dZi3,dYi2,dYr3
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
-
- VSUB qZ1,qY0,qY1 ;// y2
- VST2 {dZr3,dZi3},[pDst@128],outPointStep
-
- VADD dZr2,dYr2,dYi3 ;// y1
- VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
+
+ VSUB qZ1,qY0,qY1 @// y2
+ VST2 {dZr3,dZi3},[pDst :128],outPointStep
+
+ VADD dZr2,dYr2,dYi3 @// y1
+ VST2 {dZr1,dZi1},[pDst :128],outPointStep
VSUB dZi2,dYi2,dYr3
-
- VADD qY0,qX0,qX2 ;// u0 (next loop)
- VST2 {dZr2,dZi2},[pDst@128],setStep
-
-
- ELSE
-
- VADD dZr2,dYr2,dYi3 ;// y1
+
+ VADD qY0,qX0,qX2 @// u0 (next loop)
+ VST2 {dZr2,dZi2},[pDst :128],setStep
+
+
+ .ELSE
+
+ VADD dZr2,dYr2,dYi3 @// y1
VSUB dZi2,dYi2,dYr3
-
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
- VSUB qZ1,qY0,qY1 ;// y2
-
- VST2 {dZr2,dZi2},[pDst@128],outPointStep
- VSUB dZr3,dYr2,dYi3 ;// y3
+
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
+ VSUB qZ1,qY0,qY1 @// y2
+
+ VST2 {dZr2,dZi2},[pDst :128],outPointStep
+ VSUB dZr3,dYr2,dYi3 @// y3
VADD dZi3,dYi2,dYr3
- VST2 {dZr1,dZi1},[pDst@128],outPointStep
- VADD qY0,qX0,qX2 ;// u0 (next loop)
- VST2 {dZr3,dZi3},[pDst@128],setStep
-
- ENDIF
-
-
- ENDIF
-
- BGT grpZeroSetLoop$name
-
-
- ;// reset pSrc to pDst for the next stage
- SUB pSrc,pDst,pointStep ;// pDst -= grpSize
+ VST2 {dZr1,dZi1},[pDst :128],outPointStep
+ VADD qY0,qX0,qX2 @// u0 (next loop)
+ VST2 {dZr3,dZi3},[pDst :128],setStep
+
+ .ENDIF
+
+
+ .ENDIF
+
+ BGT grpZeroSetLoop\name
+
+
+ @// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep @// pSrc = pDst - pointStep
MOV pDst,pPingPongBuf
-
-
- MEND
-
-
+
+ .endm
+
+
+
M_START armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{FALSE},FWD
+ FFTSTAGE "FALSE","FALSE",FWD
M_END
-
-
+
+
M_START armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{TRUE},INV
- M_END
-
-
- M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{FALSE},FWDSFS
+ FFTSTAGE "FALSE","TRUE",INV
M_END
-
- M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{TRUE},INVSFS
+
+ M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE "TRUE","FALSE",FWDSFS
M_END
-
-
- ENDIF ;//CortexA8
-
-
-
- END
\ No newline at end of file
+
+
+ M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+ FFTSTAGE "TRUE","TRUE",INVSFS
+ M_END
+
+
+
+
+
+ .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
index ce324f5..23e2c37 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
@@ -1,403 +1,410 @@
-;//
-;//
-;// File Name: armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision: 7765
-;// Last Modified Date: Thu, 27 Sep 2007
-;//
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;//
-;//
-;//
-;// Description:
-;// Compute a Radix 4 FFT stage for a N point complex signal
-;//
-;//
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This file was originally licensed as follows. It has been
+@// relicensed with permission from the copyright holders.
-
-;// Include standard headers
-
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
- INCLUDE armSP_FFT_s.h
-
- M_VARIANTS CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
-
-
-
-;// Set debugging level
-;//DEBUG_ON SETL {TRUE}
+@//
+@//
+@// File Name: armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision: 7765
+@// Last Modified Date: Thu, 27 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
-;// Guarding implementation by the processor name
-
-
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
-
-;// Guarding implementation by the processor name
-
- IF CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
- ;//IMPORT armAAC_constTable
-
-;//Input Registers
-pSrc RN 0
-pDst RN 2
-pTwiddle RN 1
-subFFTNum RN 6
-subFFTSize RN 7
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
-;//Output Registers
-;//Local Scratch Registers
-outPointStep RN 3
-grpCount RN 4
-dstStep RN 5
-pw1 RN 8
-pw2 RN 9
-pw3 RN 10
-pTmp RN 4
+@// Guarding implementation by the processor name
-;// Neon Registers
+@// Import symbols required from other files
+@// (For example tables)
+ @//IMPORT armAAC_constTable
-dButterfly1Real02 DN D0.S16
-dButterfly1Imag02 DN D1.S16
-dButterfly1Real13 DN D2.S16
-dButterfly1Imag13 DN D3.S16
-dButterfly2Real02 DN D4.S16
-dButterfly2Imag02 DN D5.S16
-dButterfly2Real13 DN D6.S16
-dButterfly2Imag13 DN D7.S16
-dXr0 DN D0.S16
-dXi0 DN D1.S16
-dXr1 DN D2.S16
-dXi1 DN D3.S16
-dXr2 DN D4.S16
-dXi2 DN D5.S16
-dXr3 DN D6.S16
-dXi3 DN D7.S16
+@//Input Registers
-dW1rS32 DN D8.S32
-dW1iS32 DN D9.S32
-dW2rS32 DN D10.S32
-dW2iS32 DN D11.S32
-dW3rS32 DN D12.S32
-dW3iS32 DN D13.S32
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define subFFTNum r6
+#define subFFTSize r7
-dW1r DN D8.S16
-dW1i DN D9.S16
-dW2r DN D10.S16
-dW2i DN D11.S16
-dW3r DN D12.S16
-dW3i DN D13.S16
-dTmp0 DN D12.S16
-dTmp1 DN D13.S16
-dTmp1S32 DN D13.S32
-dTmp2S32 DN D14.S32
-dTmp3S32 DN D15.S32
-dYr0 DN D18.S16
-dYi0 DN D19.S16
-dYr1 DN D16.S16
-dYi1 DN D17.S16
-dYr2 DN D20.S16
-dYi2 DN D21.S16
-dYr3 DN D14.S16
-dYi3 DN D15.S16
-qY0 QN Q9.S16
-qY1 QN Q8.S16
-qY2 QN Q10.S16
-qY3 QN Q7.S16
+@//Output Registers
-qX0 QN Q0.S16
-qX1 QN Q1.S16
-qX2 QN Q2.S16
-qX3 QN Q3.S16
-qT0 QN Q9.S32
-qT1 QN Q10.S32
-qT2 QN Q7.S32
-qT3 QN Q8.S32
+@//Local Scratch Registers
-dZr0 DN D22.S16
-dZi0 DN D23.S16
-dZr1 DN D24.S16
-dZi1 DN D25.S16
-dZr2 DN D26.S16
-dZi2 DN D27.S16
-dZr3 DN D28.S16
-dZi3 DN D29.S16
+#define outPointStep r3
+#define grpCount r4
+#define dstStep r5
+#define pw1 r8
+#define pw2 r9
+#define pw3 r10
+#define pTmp r4
-qZ0 QN Q11.S16
-qZ1 QN Q12.S16
-qZ2 QN Q13.S16
-qZ3 QN Q14.S16
-
- MACRO
- FFTSTAGE $scaled, $inverse , $name
-
- ;// Define stack arguments
-
- MOV pw2,pTwiddle
- VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2@256]!
-
+@// Neon Registers
+
+#define dButterfly1Real02 D0.S16
+#define dButterfly1Imag02 D1.S16
+#define dButterfly1Real13 D2.S16
+#define dButterfly1Imag13 D3.S16
+#define dButterfly2Real02 D4.S16
+#define dButterfly2Imag02 D5.S16
+#define dButterfly2Real13 D6.S16
+#define dButterfly2Imag13 D7.S16
+#define dXr0 D0.S16
+#define dXi0 D1.S16
+#define dXr1 D2.S16
+#define dXi1 D3.S16
+#define dXr2 D4.S16
+#define dXi2 D5.S16
+#define dXr3 D6.S16
+#define dXi3 D7.S16
+
+#define dW1rS32 D8.S32
+#define dW1iS32 D9.S32
+#define dW2rS32 D10.S32
+#define dW2iS32 D11.S32
+#define dW3rS32 D12.S32
+#define dW3iS32 D13.S32
+
+#define dW1r D8.S16
+#define dW1i D9.S16
+#define dW2r D10.S16
+#define dW2i D11.S16
+#define dW3r D12.S16
+#define dW3i D13.S16
+
+#define dTmp0 D12.S16
+#define dTmp1 D13.S16
+#define dTmp1S32 D13.S32
+#define dTmp2S32 D14.S32
+#define dTmp3S32 D15.S32
+
+#define dYr0 D18.S16
+#define dYi0 D19.S16
+#define dYr1 D16.S16
+#define dYi1 D17.S16
+#define dYr2 D20.S16
+#define dYi2 D21.S16
+#define dYr3 D14.S16
+#define dYi3 D15.S16
+#define qY0 Q9.S16
+#define qY1 Q8.S16
+#define qY2 Q10.S16
+#define qY3 Q7.S16
+
+#define qX0 Q0.S16
+#define qX1 Q1.S16
+#define qX2 Q2.S16
+#define qX3 Q3.S16
+
+#define qT0 Q9.S32
+#define qT1 Q10.S32
+#define qT2 Q7.S32
+#define qT3 Q8.S32
+
+#define dZr0 D22.S16
+#define dZi0 D23.S16
+#define dZr1 D24.S16
+#define dZi1 D25.S16
+#define dZr2 D26.S16
+#define dZi2 D27.S16
+#define dZr3 D28.S16
+#define dZi3 D29.S16
+
+#define qZ0 Q11.S16
+#define qZ1 Q12.S16
+#define qZ2 Q13.S16
+#define qZ3 Q14.S16
+
+
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ @// Define stack arguments
+
+ MOV pw2,pTwiddle
+ VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
+
MOV pw3,pTwiddle
MOV pw1,pTwiddle
- ;// pOut0+1 increments pOut0 by 8 bytes
- ;// pOut0+outPointStep == increment of 4*outPointStep bytes
+ @// pOut0+1 increments pOut0 by 8 bytes
+ @// pOut0+outPointStep == increment of 4*outPointStep bytes
MOV outPointStep,subFFTSize,LSL #2
-
- VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3@64]!
- MOV subFFTNum,#1 ;//after the last stage
+
+ VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
+ MOV subFFTNum,#1 @//after the last stage
LSL grpCount,subFFTSize,#2
-
-
- ;// Update grpCount and grpSize rightaway
- VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3@64]!
-
- ;// update subFFTSize for the next stage
+
+
+ @// Update grpCount and grpSize right away
+ VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
+
+ @// update subFFTSize for the next stage
MOV subFFTSize,grpCount
MOV dstStep,outPointStep,LSL #1
-
- VLD2 {dW1r,dW1i}, [pw1@128]!
-
-
- ADD dstStep,dstStep,outPointStep ;// dstStep = 3*outPointStep
- RSB dstStep,dstStep,#16 ;// dstStep = - 3*outPointStep+16
-
- VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
- VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-
- ;// Process 4 groups at a time
-
-grpLoop$name
-
-
- ;// Rearrange the third twiddle
+
+ VLD2 {dW1r,dW1i}, [pw1 :128]!
+
+
+ ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
+ RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
+
+ VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+ VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+ @// Process 4 groups at a time
+
+grpLoop\name:
+
+
+ @// Rearrange the third twiddle
VUZP dW3r,dW3i
- SUBS grpCount,grpCount,#16 ;// grpCount is multiplied by 4
-
-
- VUZP dButterfly1Real13, dButterfly2Real13 ;// B.r D.r
- VUZP dButterfly1Imag13, dButterfly2Imag13 ;// B.i D.i
- VUZP dButterfly1Real02, dButterfly2Real02 ;// A.r C.r
- VUZP dButterfly1Imag02, dButterfly2Imag02 ;// A.i C.i
-
-
- IF $inverse
+ SUBS grpCount,grpCount,#16 @// grpCount is multiplied by 4
+
+
+ VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r
+ VUZP dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i
+ VUZP dButterfly1Real02, dButterfly2Real02 @// A.r C.r
+ VUZP dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i
+
+
+ .ifeqs "\inverse", "TRUE"
VMULL qT0,dXr1,dW1r
- VMLAL qT0,dXi1,dW1i ;// real part
+ VMLAL qT0,dXi1,dW1i @// real part
VMULL qT1,dXi1,dW1r
- VMLSL qT1,dXr1,dW1i ;// imag part
-
- ELSE
+ VMLSL qT1,dXr1,dW1i @// imag part
+
+ .ELSE
VMULL qT0,dXr1,dW1r
- VMLSL qT0,dXi1,dW1i ;// real part
+ VMLSL qT0,dXi1,dW1i @// real part
VMULL qT1,dXi1,dW1r
- VMLAL qT1,dXr1,dW1i ;// imag part
-
- ENDIF
-
- ;// Load the first twiddle for 4 groups : w^1
- ;// w^1 twiddle (i+0,i+1,i+2,i+3) for group 0,1,2,3
-
- VLD2 {dW1r,dW1i}, [pw1@128]!
-
- IF $inverse
+ VMLAL qT1,dXr1,dW1i @// imag part
+
+ .ENDIF
+
+ @// Load the first twiddle for 4 groups : w^1
+ @// w^1 twiddle (i+0,i+1,i+2,i+3) for group 0,1,2,3
+
+ VLD2 {dW1r,dW1i}, [pw1 :128]!
+
+ .ifeqs "\inverse", "TRUE"
VMULL qT2,dXr2,dW2r
- VMLAL qT2,dXi2,dW2i ;// real part
+ VMLAL qT2,dXi2,dW2i @// real part
VMULL qT3,dXi2,dW2r
- VMLSL qT3,dXr2,dW2i ;// imag part
-
- ELSE
+ VMLSL qT3,dXr2,dW2i @// imag part
+
+ .ELSE
VMULL qT2,dXr2,dW2r
- VMLSL qT2,dXi2,dW2i ;// real part
+ VMLSL qT2,dXi2,dW2i @// real part
VMULL qT3,dXi2,dW2r
- VMLAL qT3,dXr2,dW2i ;// imag part
-
- ENDIF
-
+ VMLAL qT3,dXr2,dW2i @// imag part
+
+ .ENDIF
+
VRSHRN dZr1,qT0,#15
VRSHRN dZi1,qT1,#15
-
-
-
- IF $inverse
+
+
+
+ .ifeqs "\inverse", "TRUE"
VMULL qT0,dXr3,dW3r
- VMLAL qT0,dXi3,dW3i ;// real part
+ VMLAL qT0,dXi3,dW3i @// real part
VMULL qT1,dXi3,dW3r
- VMLSL qT1,dXr3,dW3i ;// imag part
-
- ELSE
+ VMLSL qT1,dXr3,dW3i @// imag part
+
+ .ELSE
VMULL qT0,dXr3,dW3r
- VMLSL qT0,dXi3,dW3i ;// real part
+ VMLSL qT0,dXi3,dW3i @// real part
VMULL qT1,dXi3,dW3r
- VMLAL qT1,dXr3,dW3i ;// imag part
-
- ENDIF
-
- ;// Load the second twiddle for 4 groups : w^2
- ;// w^2 twiddle (2i+0,2i+2,2i+4,2i+6) for group 0,1,2,3
- VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2@256]!
-
-
+ VMLAL qT1,dXr3,dW3i @// imag part
+
+ .ENDIF
+
+ @// Load the second twiddle for 4 groups : w^2
+ @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6) for group 0,1,2,3
+ VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
+
+
VRSHRN dZr2,qT2,#15
VRSHRN dZi2,qT3,#15
-
- ;// Load the third twiddle for 4 groups : w^3
- ;// w^3 twiddle (3i+0,3i+3,3i+6,3i+9) for group 0,1,2,3
-
- VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3@64]!
-
+
+ @// Load the third twiddle for 4 groups : w^3
+ @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9) for group 0,1,2,3
+
+ VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
+
VRSHRN dZr3,qT0,#15
VRSHRN dZi3,qT1,#15
-
- VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3@64]!
-
- IF $scaled
-
- ;// finish first stage of 4 point FFT
-
+
+ VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
+
+ .ifeqs "\scaled", "TRUE"
+
+ @// finish first stage of 4 point FFT
+
VHADD qY0,qX0,qZ2
VHSUB qY2,qX0,qZ2
VHADD qY1,qZ1,qZ3
- VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-
+ VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
VHSUB qY3,qZ1,qZ3
-
- ;// finish second stage of 4 point FFT
-
+
+ @// finish second stage of 4 point FFT
+
VHSUB qZ0,qY2,qY1
VHADD qZ2,qY2,qY1
- VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-
-
- IF $inverse
-
- VHADD dZr3,dYr0,dYi3 ;// y3 = u0-ju3
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+
+ .ifeqs "\inverse", "TRUE"
+
+ VHADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB dZi3,dYi0,dYr3
-
- VHSUB dZr1,dYr0,dYi3 ;// y1 = u0+ju3
+
+ VHSUB dZr1,dYr0,dYi3 @// y1 = u0+ju3
VHADD dZi1,dYi0,dYr3
- VST2 {dZr3,dZi3},[pDst@128],outPointStep
- VST2 {dZr2,dZi2},[pDst@128],outPointStep
- VST2 {dZr1,dZi1},[pDst@128],dstStep ;// dstStep = -3*outPointStep + 16
-
- ELSE
-
- VHSUB dZr1,dYr0,dYi3 ;// y1 = u0+ju3
+ VST2 {dZr3,dZi3},[pDst :128],outPointStep
+ VST2 {dZr2,dZi2},[pDst :128],outPointStep
+ VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
+
+ .ELSE
+
+ VHSUB dZr1,dYr0,dYi3 @// y1 = u0+ju3
VHADD dZi1,dYi0,dYr3
-
- VHADD dZr3,dYr0,dYi3 ;// y3 = u0-ju3
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
+
+ VHADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB dZi3,dYi0,dYr3
- VST2 {dZr1,dZi1},[pDst@128],outPointStep
- VST2 {dZr2,dZi2},[pDst@128],outPointStep
- VST2 {dZr3,dZi3},[pDst@128],dstStep ;// dstStep = -3*outPointStep + 16
-
- ENDIF
-
- ELSE
-
- ;// finish first stage of 4 point FFT
-
+ VST2 {dZr1,dZi1},[pDst :128],outPointStep
+ VST2 {dZr2,dZi2},[pDst :128],outPointStep
+ VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
+
+ .ENDIF
+
+ .ELSE
+
+ @// finish first stage of 4 point FFT
+
VADD qY0,qX0,qZ2
VSUB qY2,qX0,qZ2
VADD qY1,qZ1,qZ3
- VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-
+ VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
VSUB qY3,qZ1,qZ3
-
- ;// finish second stage of 4 point FFT
-
+
+ @// finish second stage of 4 point FFT
+
VSUB qZ0,qY2,qY1
VADD qZ2,qY2,qY1
- VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc@256]! ;// AC.r AC.i BD.r BD.i
-
-
- IF $inverse
-
- VADD dZr3,dYr0,dYi3 ;// y3 = u0-ju3
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
- VSUB dZi3,dYi0,dYr3
-
- VSUB dZr1,dYr0,dYi3 ;// y1 = u0+ju3
- VADD dZi1,dYi0,dYr3
- VST2 {dZr3,dZi3},[pDst@128],outPointStep
- VST2 {dZr2,dZi2},[pDst@128],outPointStep
- VST2 {dZr1,dZi1},[pDst@128],dstStep ;// dstStep = -3*outPointStep + 16
-
- ELSE
-
- VSUB dZr1,dYr0,dYi3 ;// y1 = u0+ju3
- VADD dZi1,dYi0,dYr3
-
- VADD dZr3,dYr0,dYi3 ;// y3 = u0-ju3
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
- VSUB dZi3,dYi0,dYr3
- VST2 {dZr1,dZi1},[pDst@128],outPointStep
- VST2 {dZr2,dZi2},[pDst@128],outPointStep
- VST2 {dZr3,dZi3},[pDst@128],dstStep ;// dstStep = -3*outPointStep + 16
-
- ENDIF
-
-
-
-
- ENDIF
-
- BGT grpLoop$name
-
+ VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
- ;// Reset and Swap pSrc and pDst for the next stage
+
+ .ifeqs "\inverse", "TRUE"
+
+ VADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
+ VSUB dZi3,dYi0,dYr3
+
+ VSUB dZr1,dYr0,dYi3 @// y1 = u0+ju3
+ VADD dZi1,dYi0,dYr3
+ VST2 {dZr3,dZi3},[pDst :128],outPointStep
+ VST2 {dZr2,dZi2},[pDst :128],outPointStep
+ VST2 {dZr1,dZi1},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
+
+ .ELSE
+
+ VSUB dZr1,dYr0,dYi3 @// y1 = u0+ju3
+ VADD dZi1,dYi0,dYr3
+
+ VADD dZr3,dYr0,dYi3 @// y3 = u0-ju3
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
+ VSUB dZi3,dYi0,dYr3
+ VST2 {dZr1,dZi1},[pDst :128],outPointStep
+ VST2 {dZr2,dZi2},[pDst :128],outPointStep
+ VST2 {dZr3,dZi3},[pDst :128],dstStep @// dstStep = -3*outPointStep + 16
+
+ .ENDIF
+
+
+
+
+ .ENDIF
+
+ BGT grpLoop\name
+
+
+ @// Reset and Swap pSrc and pDst for the next stage
MOV pTmp,pDst
- SUB pSrc,pSrc,#64 ;// Extra increment currently done in the loop
- SUB pDst,pSrc,outPointStep,LSL #2 ;// pDst -= size; pSrc -= 4*size bytes
+ SUB pSrc,pSrc,#64 @// Extra increment currently done in the loop
+ SUB pDst,pSrc,outPointStep,LSL #2 @// pDst = pSrc - 4*outPointStep bytes
SUB pSrc,pTmp,outPointStep
-
- MEND
-
-
+
+ .endm
+
+
M_START armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{FALSE},FWD
+ FFTSTAGE "FALSE","FALSE",FWD
M_END
-
+
M_START armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{TRUE},INV
+ FFTSTAGE "FALSE","TRUE",INV
M_END
-
-
+
+
M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{FALSE},FWDSFS
+ FFTSTAGE "TRUE","FALSE",FWDSFS
M_END
-
+
M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{TRUE},INVSFS
+ FFTSTAGE "TRUE","TRUE",INVSFS
M_END
-
- ENDIF ;//CortexA8
-
-
-
- END
\ No newline at end of file
+
+
+
+
+ .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
index c13df04..0eba385 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
@@ -1,392 +1,400 @@
-;//
-;//
-;// File Name: armSP_FFT_CToC_SC16_Radix4_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision: 7761
-;// Last Modified Date: Wed, 26 Sep 2007
-;//
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;//
-;//
-;//
-;// Description:
-;// Compute a Radix 4 FFT stage for a N point complex signal
-;//
-;//
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This file was originally licensed as follows. It has been
+@// relicensed with permission from the copyright holders.
-
-;// Include standard headers
-
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
- INCLUDE armSP_FFT_s.h
-
- M_VARIANTS CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
-
-
-
-;// Set debugging level
-;//DEBUG_ON SETL {TRUE}
+@//
+@//
+@// File Name: armSP_FFT_CToC_SC16_Radix4_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision: 7761
+@// Last Modified Date: Wed, 26 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
-;// Guarding implementation by the processor name
-
+@// Include standard headers
-
- ;// Guarding implementation by the processor name
-
- IF CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
-
-;//Input Registers
-
-pSrc RN 0
-pDst RN 2
-pTwiddle RN 1
-subFFTNum RN 6
-subFFTSize RN 7
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
-;//Output Registers
+@// Import symbols required from other files
+@// (For example tables)
-;//Local Scratch Registers
-grpCount RN 3
-pointStep RN 4
-outPointStep RN 5
-stepTwiddle RN 12
-setCount RN 14
-srcStep RN 8
-setStep RN 9
-dstStep RN 10
-twStep RN 11
-t1 RN 3
-;// Neon Registers
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
-dW1 DN D0.S16
-dW2 DN D1.S16
-dW3 DN D2.S16
-dXr0 DN D4.S16
-dXi0 DN D5.S16
-dXr1 DN D6.S16
-dXi1 DN D7.S16
-dXr2 DN D8.S16
-dXi2 DN D9.S16
-dXr3 DN D10.S16
-dXi3 DN D11.S16
-dYr0 DN D12.S16
-dYi0 DN D13.S16
-dYr1 DN D14.S16
-dYi1 DN D15.S16
-dYr2 DN D16.S16
-dYi2 DN D17.S16
-dYr3 DN D18.S16
-dYi3 DN D19.S16
-qT0 QN Q8.S32
-qT1 QN Q9.S32
-qT2 QN Q6.S32
-qT3 QN Q7.S32
+@// Guarding implementation by the processor name
-dZr0 DN D20.S16
-dZi0 DN D21.S16
-dZr1 DN D22.S16
-dZi1 DN D23.S16
-dZr2 DN D24.S16
-dZi2 DN D25.S16
-dZr3 DN D26.S16
-dZi3 DN D27.S16
-qY0 QN Q6.S16
-qY1 QN Q7.S16
-qY2 QN Q8.S16
-qY3 QN Q9.S16
-qX0 QN Q2.S16
-qZ0 QN Q10.S16
-qZ1 QN Q11.S16
-qZ2 QN Q12.S16
-qZ3 QN Q13.S16
-
- MACRO
- FFTSTAGE $scaled, $inverse , $name
-
- ;// Define stack arguments
-
-
- ;// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
-
+
+ @// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define subFFTNum r6
+#define subFFTSize r7
+
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpCount r3
+#define pointStep r4
+#define outPointStep r5
+#define stepTwiddle r12
+#define setCount r14
+#define srcStep r8
+#define setStep r9
+#define dstStep r10
+#define twStep r11
+#define t1 r3
+
+@// Neon Registers
+
+#define dW1 D0.S16
+#define dW2 D1.S16
+#define dW3 D2.S16
+
+#define dXr0 D4.S16
+#define dXi0 D5.S16
+#define dXr1 D6.S16
+#define dXi1 D7.S16
+#define dXr2 D8.S16
+#define dXi2 D9.S16
+#define dXr3 D10.S16
+#define dXi3 D11.S16
+#define dYr0 D12.S16
+#define dYi0 D13.S16
+#define dYr1 D14.S16
+#define dYi1 D15.S16
+#define dYr2 D16.S16
+#define dYi2 D17.S16
+#define dYr3 D18.S16
+#define dYi3 D19.S16
+#define qT0 Q8.S32
+#define qT1 Q9.S32
+#define qT2 Q6.S32
+#define qT3 Q7.S32
+
+#define dZr0 D20.S16
+#define dZi0 D21.S16
+#define dZr1 D22.S16
+#define dZi1 D23.S16
+#define dZr2 D24.S16
+#define dZi2 D25.S16
+#define dZr3 D26.S16
+#define dZi3 D27.S16
+#define qY0 Q6.S16
+#define qY1 Q7.S16
+#define qY2 Q8.S16
+#define qY3 Q9.S16
+#define qX0 Q2.S16
+#define qZ0 Q10.S16
+#define qZ1 Q11.S16
+#define qZ2 Q12.S16
+#define qZ3 Q13.S16
+
+
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ @// Define stack arguments
+
+
+ @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+
LSL grpCount,subFFTSize,#2
- LSR subFFTNum,subFFTNum,#2
+ LSR subFFTNum,subFFTNum,#2
MOV subFFTSize,grpCount
-
-
- ;// pOut0+1 increments pOut0 by 4 bytes
- ;// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes
-
+
+
+ @// pOut0+1 increments pOut0 by 4 bytes
+ @// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes
+
MOV stepTwiddle,#0
- SMULBB outPointStep,grpCount,subFFTNum
-
- ;// pT0+1 increments pT0 by 4 bytes
- ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
-
- LSL pointStep,subFFTNum,#2 ;// 2*grpSize
-
- VLD1 dW1,[pTwiddle@64] ;//[wi | wr]
- MOV srcStep,pointStep,LSL #1 ;// srcStep = 2*pointStep
- VLD1 dW2,[pTwiddle@64] ;//[wi | wr]
- ADD setStep,srcStep,pointStep ;// setStep = 3*pointStep
- SUB srcStep,srcStep,#16 ;// srcStep = 2*pointStep-16
- VLD1 dW3,[pTwiddle@64]
- ;//RSB setStep,setStep,#16 ;// setStep = - 3*pointStep+16
- RSB setStep,setStep,#0 ;// setStep = - 3*pointStep
-
+ SMULBB outPointStep,grpCount,subFFTNum
+
+ @// pT0+1 increments pT0 by 4 bytes
+ @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
+
+ LSL pointStep,subFFTNum,#2 @// 2*grpSize
+
+ VLD1 dW1,[pTwiddle :64] @//[wi | wr]
+ MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep
+ VLD1 dW2,[pTwiddle :64] @//[wi | wr]
+ ADD setStep,srcStep,pointStep @// setStep = 3*pointStep
+ SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16
+ VLD1 dW3,[pTwiddle :64]
+ @//RSB setStep,setStep,#16 @// setStep = - 3*pointStep+16
+ RSB setStep,setStep,#0 @// setStep = - 3*pointStep
+
MOV dstStep,outPointStep,LSL #1
- ADD dstStep,dstStep,outPointStep ;// dstStep = 3*outPointStep
- RSB dstStep,dstStep,#16 ;// dstStep = - 3*outPointStep+16
-
+ ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
+ RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
-
-grpLoop$name
-
- VLD2 {dXr0,dXi0},[pSrc@128],pointStep ;// data[0]
+
+
+grpLoop\name:
+
+ VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
ADD stepTwiddle,stepTwiddle,pointStep
- VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
- ADD pTwiddle,pTwiddle,stepTwiddle ;// set pTwiddle to the first point
- VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
+ VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
+ ADD pTwiddle,pTwiddle,stepTwiddle @// set pTwiddle to the first point
+ VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
MOV twStep,stepTwiddle,LSL #2
- VLD2 {dXr3,dXi3},[pSrc@128],setStep ;// data[3] & reset pSrc
-
- SUB twStep,stepTwiddle,twStep ;// twStep = -3*stepTwiddle
-
-
- MOV setCount,pointStep,LSR #2
- ADD pSrc,pSrc,#16 ;// set pSrc to data[0] of the next set
- ADD pSrc,pSrc,pointStep ;// increment to data[1] of the next set
-
- ;// Loop on the sets : 4 at a time
+ VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & reset pSrc
-setLoop$name
-
- SUBS setCount,setCount,#4 ;// decrement the loop counter
-
- IF $inverse
+ SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle
+
+
+ MOV setCount,pointStep,LSR #2
+ ADD pSrc,pSrc,#16 @// set pSrc to data[0] of the next set
+ ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
+
+ @// Loop on the sets : 4 at a time
+
+setLoop\name:
+
+ SUBS setCount,setCount,#4 @// decrement the loop counter
+
+ .ifeqs "\inverse", "TRUE"
VMULL qT0,dXr1,dW1[0]
- VMLAL qT0,dXi1,dW1[1] ;// real part
+ VMLAL qT0,dXi1,dW1[1] @// real part
VMULL qT1,dXi1,dW1[0]
- VMLSL qT1,dXr1,dW1[1] ;// imag part
-
- ELSE
+ VMLSL qT1,dXr1,dW1[1] @// imag part
+
+ .ELSE
VMULL qT0,dXr1,dW1[0]
- VMLSL qT0,dXi1,dW1[1] ;// real part
+ VMLSL qT0,dXi1,dW1[1] @// real part
VMULL qT1,dXi1,dW1[0]
- VMLAL qT1,dXr1,dW1[1] ;// imag part
-
- ENDIF
-
- VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
-
- IF $inverse
+ VMLAL qT1,dXr1,dW1[1] @// imag part
+
+ .ENDIF
+
+ VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
+
+ .ifeqs "\inverse", "TRUE"
VMULL qT2,dXr2,dW2[0]
- VMLAL qT2,dXi2,dW2[1] ;// real part
+ VMLAL qT2,dXi2,dW2[1] @// real part
VMULL qT3,dXi2,dW2[0]
- VMLSL qT3,dXr2,dW2[1] ;// imag part
-
- ELSE
+ VMLSL qT3,dXr2,dW2[1] @// imag part
+
+ .ELSE
VMULL qT2,dXr2,dW2[0]
- VMLSL qT2,dXi2,dW2[1] ;// real part
+ VMLSL qT2,dXi2,dW2[1] @// real part
VMULL qT3,dXi2,dW2[0]
- VMLAL qT3,dXr2,dW2[1] ;// imag part
-
- ENDIF
-
+ VMLAL qT3,dXr2,dW2[1] @// imag part
+
+ .ENDIF
+
VRSHRN dZr1,qT0,#15
VRSHRN dZi1,qT1,#15
-
-
- VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
-
- IF $inverse
+
+
+ VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
+
+ .ifeqs "\inverse", "TRUE"
VMULL qT0,dXr3,dW3[0]
- VMLAL qT0,dXi3,dW3[1] ;// real part
+ VMLAL qT0,dXi3,dW3[1] @// real part
VMULL qT1,dXi3,dW3[0]
- VMLSL qT1,dXr3,dW3[1] ;// imag part
-
- ELSE
+ VMLSL qT1,dXr3,dW3[1] @// imag part
+
+ .ELSE
VMULL qT0,dXr3,dW3[0]
- VMLSL qT0,dXi3,dW3[1] ;// real part
+ VMLSL qT0,dXi3,dW3[1] @// real part
VMULL qT1,dXi3,dW3[0]
- VMLAL qT1,dXr3,dW3[1] ;// imag part
-
- ENDIF
-
+ VMLAL qT1,dXr3,dW3[1] @// imag part
+
+ .ENDIF
+
VRSHRN dZr2,qT2,#15
VRSHRN dZi2,qT3,#15
-
-
+
+
VRSHRN dZr3,qT0,#15
VRSHRN dZi3,qT1,#15
- VLD2 {dXr3,dXi3},[pSrc@128],setStep ;// data[3] & update pSrc for the next set
-
-
- IF $scaled
-
- ;// finish first stage of 4 point FFT
+ VLD2 {dXr3,dXi3},[pSrc :128],setStep @// data[3] & update pSrc for the next set
+
+
+ .ifeqs "\scaled", "TRUE"
+
+ @// finish first stage of 4 point FFT
VHADD qY0,qX0,qZ2
VHSUB qY2,qX0,qZ2
-
- VLD2 {dXr0,dXi0},[pSrc@128]! ;// data[0]
+
+ VLD2 {dXr0,dXi0},[pSrc :128]! @// data[0]
VHADD qY1,qZ1,qZ3
VHSUB qY3,qZ1,qZ3
-
-
- ;// finish second stage of 4 point FFT
-
- IF $inverse
-
+
+
+ @// finish second stage of 4 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
VHSUB qZ0,qY2,qY1
-
+
VHADD dZr2,dYr0,dYi3
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHSUB dZi2,dYi0,dYr3
-
+
VHADD qZ1,qY2,qY1
- VST2 {dZr2,dZi2},[pDst@128],outPointStep
-
+ VST2 {dZr2,dZi2},[pDst :128],outPointStep
+
VHSUB dZr3,dYr0,dYi3
- VST2 {dZr1,dZi1},[pDst@128],outPointStep
+ VST2 {dZr1,dZi1},[pDst :128],outPointStep
VHADD dZi3,dYi0,dYr3
- VST2 {dZr3,dZi3},[pDst@128],dstStep
-
-
- ELSE
-
+ VST2 {dZr3,dZi3},[pDst :128],dstStep
+
+
+ .ELSE
+
VHSUB qZ0,qY2,qY1
-
+
VHSUB dZr3,dYr0,dYi3
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
VHADD dZi3,dYi0,dYr3
-
+
VHADD qZ1,qY2,qY1
- VST2 {dZr3,dZi3},[pDst@128],outPointStep
-
+ VST2 {dZr3,dZi3},[pDst :128],outPointStep
+
VHADD dZr2,dYr0,dYi3
VHSUB dZi2,dYi0,dYr3
- VST2 {dZr1,dZi1},[pDst@128],outPointStep
- VST2 {dZr2,dZi2},[pDst@128],dstStep
-
-
- ENDIF
-
-
- ELSE
-
- ;// finish first stage of 4 point FFT
+ VST2 {dZr1,dZi1},[pDst :128],outPointStep
+ VST2 {dZr2,dZi2},[pDst :128],dstStep
+
+
+ .ENDIF
+
+
+ .ELSE
+
+ @// finish first stage of 4 point FFT
VADD qY0,qX0,qZ2
VSUB qY2,qX0,qZ2
-
- VLD2 {dXr0,dXi0},[pSrc]! ;// data[0]
+
+ VLD2 {dXr0,dXi0},[pSrc]! @// data[0]
VADD qY1,qZ1,qZ3
VSUB qY3,qZ1,qZ3
-
-
- ;// finish second stage of 4 point FFT
-
-
- IF $inverse
-
- VSUB qZ0,qY2,qY1
-
- VADD dZr2,dYr0,dYi3
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
- VSUB dZi2,dYi0,dYr3
-
- VADD qZ1,qY2,qY1
- VST2 {dZr2,dZi2},[pDst@128],outPointStep
-
- VSUB dZr3,dYr0,dYi3
- VST2 {dZr1,dZi1},[pDst@128],outPointStep
- VADD dZi3,dYi0,dYr3
- VST2 {dZr3,dZi3},[pDst@128],dstStep
-
-
- ELSE
-
- VSUB qZ0,qY2,qY1
-
- VSUB dZr3,dYr0,dYi3
- VST2 {dZr0,dZi0},[pDst@128],outPointStep
- VADD dZi3,dYi0,dYr3
-
- VADD qZ1,qY2,qY1
- VST2 {dZr3,dZi3},[pDst@128],outPointStep
-
- VADD dZr2,dYr0,dYi3
- VSUB dZi2,dYi0,dYr3
- VST2 {dZr1,dZi1},[pDst@128],outPointStep
- VST2 {dZr2,dZi2},[pDst@128],dstStep
-
-
- ENDIF
-
-
-
- ENDIF
-
- ADD pSrc,pSrc,pointStep ;// increment to data[1] of the next set
- BGT setLoop$name
-
- VLD1 dW1,[pTwiddle@64],stepTwiddle ;//[wi | wr]
- SUBS grpCount,grpCount,#4 ;// subtract 4 since grpCount multiplied by 4
- VLD1 dW2,[pTwiddle@64],stepTwiddle ;//[wi | wr]
- ADD pSrc,pSrc,srcStep ;// increment pSrc for the next grp
- VLD1 dW3,[pTwiddle@64],twStep ;//[wi | wr]
-
-
-
- BGT grpLoop$name
-
- ;// Reset and Swap pSrc and pDst for the next stage
+
+ @// finish second stage of 4 point FFT
+
+
+ .ifeqs "\inverse", "TRUE"
+
+ VSUB qZ0,qY2,qY1
+
+ VADD dZr2,dYr0,dYi3
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
+ VSUB dZi2,dYi0,dYr3
+
+ VADD qZ1,qY2,qY1
+ VST2 {dZr2,dZi2},[pDst :128],outPointStep
+
+ VSUB dZr3,dYr0,dYi3
+ VST2 {dZr1,dZi1},[pDst :128],outPointStep
+ VADD dZi3,dYi0,dYr3
+ VST2 {dZr3,dZi3},[pDst :128],dstStep
+
+
+ .ELSE
+
+ VSUB qZ0,qY2,qY1
+
+ VSUB dZr3,dYr0,dYi3
+ VST2 {dZr0,dZi0},[pDst :128],outPointStep
+ VADD dZi3,dYi0,dYr3
+
+ VADD qZ1,qY2,qY1
+ VST2 {dZr3,dZi3},[pDst :128],outPointStep
+
+ VADD dZr2,dYr0,dYi3
+ VSUB dZi2,dYi0,dYr3
+ VST2 {dZr1,dZi1},[pDst :128],outPointStep
+ VST2 {dZr2,dZi2},[pDst :128],dstStep
+
+
+ .ENDIF
+
+
+
+ .ENDIF
+
+ ADD pSrc,pSrc,pointStep @// increment to data[1] of the next set
+ BGT setLoop\name
+
+ VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
+ SUBS grpCount,grpCount,#4 @// subtract 4 since grpCount multiplied by 4
+ VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
+ ADD pSrc,pSrc,srcStep @// increment pSrc for the next grp
+ VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]
+
+
+
+ BGT grpLoop\name
+
+
+ @// Reset and Swap pSrc and pDst for the next stage
MOV t1,pDst
- SUB pDst,pSrc,outPointStep,LSL #2 ;// pDst -= size; pSrc -= 4*size bytes
- SUB pSrc,t1,outPointStep
-
-
- MEND
-
-
+ SUB pDst,pSrc,outPointStep,LSL #2 @// pDst -= size; pSrc -= 4*size bytes
+ SUB pSrc,t1,outPointStep
+
+
+ .endm
+
+
M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{FALSE},FWD
+ FFTSTAGE "FALSE","FALSE",FWD
M_END
-
+
M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{TRUE},INV
+ FFTSTAGE "FALSE","TRUE",INV
M_END
-
-
+
+
M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{FALSE},FWDSFS
+ FFTSTAGE "TRUE","FALSE",FWDSFS
M_END
-
+
M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{TRUE},INVSFS
+ FFTSTAGE "TRUE","TRUE",INVSFS
M_END
-
- ENDIF ;//CortexA8
-
-
-
- END
\ No newline at end of file
+
+
+
+
+ .END
diff --git a/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S b/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
index 741681f..588c319 100644
--- a/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
+++ b/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
@@ -1,591 +1,619 @@
-;//
-;//
-;// File Name: armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision: 7766
-;// Last Modified Date: Thu, 27 Sep 2007
-;//
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;//
-;//
-;//
-;// Description:
-;// Compute a first stage Radix 8 FFT stage for a N point complex signal
-;//
-;//
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This file was originally licensed as follows. It has been
+@// relicensed with permission from the copyright holders.
-
-;// Include standard headers
+@//
+@//
+@// File Name: armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision: 7766
+@// Last Modified Date: Thu, 27 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a first stage Radix 8 FFT stage for a N point complex signal
+@//
+@//
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
-
- M_VARIANTS CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
-
-;// Set debugging level
-;//DEBUG_ON SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
-;// Guarding implementation by the processor name
-
-
-
-
-;// Guarding implementation by the processor name
-
- IF CortexA8
-
-;//Input Registers
-
-pSrc RN 0
-pDst RN 2
-pTwiddle RN 1
-subFFTNum RN 6
-subFFTSize RN 7
-pPingPongBuf RN 5 ;// dest buffer for the next stage (not pSrc for first stage)
-
-
-;//Output Registers
-
-
-;//Local Scratch Registers
-
-grpSize RN 3
-setCount RN 3 ;// Reuse grpSize as setCount
-pointStep RN 4
-outPointStep RN 4
-setStep RN 8
-step1 RN 9
-step2 RN 10
-t0 RN 11
-
-
-;// Neon Registers
-
-dXr0 DN D14.S16
-dXi0 DN D15.S16
-dXr1 DN D2.S16
-dXi1 DN D3.S16
-dXr2 DN D4.S16
-dXi2 DN D5.S16
-dXr3 DN D6.S16
-dXi3 DN D7.S16
-dXr4 DN D8.S16
-dXi4 DN D9.S16
-dXr5 DN D10.S16
-dXi5 DN D11.S16
-dXr6 DN D12.S16
-dXi6 DN D13.S16
-dXr7 DN D0.S16
-dXi7 DN D1.S16
-qX0 QN Q7.S16
-qX1 QN Q1.S16
-qX2 QN Q2.S16
-qX3 QN Q3.S16
-qX4 QN Q4.S16
-qX5 QN Q5.S16
-qX6 QN Q6.S16
-qX7 QN Q0.S16
-
-dUr0 DN D16.S16
-dUi0 DN D17.S16
-dUr2 DN D18.S16
-dUi2 DN D19.S16
-dUr4 DN D20.S16
-dUi4 DN D21.S16
-dUr6 DN D22.S16
-dUi6 DN D23.S16
-dUr1 DN D24.S16
-dUi1 DN D25.S16
-dUr3 DN D26.S16
-dUi3 DN D27.S16
-dUr5 DN D28.S16
-dUi5 DN D29.S16
-dUr7 DN D30.S16 ;// reuse dXr7 and dXi7
-dUi7 DN D31.S16
-qU0 QN Q8.S16
-qU1 QN Q12.S16
-qU2 QN Q9.S16
-qU3 QN Q13.S16
-qU4 QN Q10.S16
-qU5 QN Q14.S16
-qU6 QN Q11.S16
-qU7 QN Q15.S16
+@// Guarding implementation by the processor name
-dVr0 DN D24.S16
-dVi0 DN D25.S16
-dVr2 DN D26.S16
-dVi2 DN D27.S16
-dVr4 DN D28.S16
-dVi4 DN D29.S16
-dVr6 DN D30.S16
-dVi6 DN D31.S16
-dVr1 DN D16.S16
-dVi1 DN D17.S16
-dVr3 DN D18.S16
-dVi3 DN D19.S16
-dVr5 DN D20.S16
-dVi5 DN D21.S16
-dVr7 DN D22.S16 ;// reuse dUi7
-dVi7 DN D23.S16 ;// reuse dUr7
-qV0 QN Q12.S16
-qV1 QN Q8.S16
-qV2 QN Q13.S16
-qV3 QN Q9.S16
-qV4 QN Q14.S16
-qV5 QN Q10.S16
-qV6 QN Q15.S16
-qV7 QN Q11.S16
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r2
+#define pTwiddle r1
+#define subFFTNum r6
+#define subFFTSize r7
+@// dest buffer for the next stage (not pSrc for first stage)
+#define pPingPongBuf r5
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize r3
+@// Reuse grpSize as setCount
+#define setCount r3
+#define pointStep r4
+#define outPointStep r4
+#define setStep r8
+#define step1 r9
+#define step2 r10
+#define t0 r11
+
+
+@// Neon Registers
+
+#define dXr0 D14.S16
+#define dXi0 D15.S16
+#define dXr1 D2.S16
+#define dXi1 D3.S16
+#define dXr2 D4.S16
+#define dXi2 D5.S16
+#define dXr3 D6.S16
+#define dXi3 D7.S16
+#define dXr4 D8.S16
+#define dXi4 D9.S16
+#define dXr5 D10.S16
+#define dXi5 D11.S16
+#define dXr6 D12.S16
+#define dXi6 D13.S16
+#define dXr7 D0.S16
+#define dXi7 D1.S16
+#define qX0 Q7.S16
+#define qX1 Q1.S16
+#define qX2 Q2.S16
+#define qX3 Q3.S16
+#define qX4 Q4.S16
+#define qX5 Q5.S16
+#define qX6 Q6.S16
+#define qX7 Q0.S16
+
+#define dUr0 D16.S16
+#define dUi0 D17.S16
+#define dUr2 D18.S16
+#define dUi2 D19.S16
+#define dUr4 D20.S16
+#define dUi4 D21.S16
+#define dUr6 D22.S16
+#define dUi6 D23.S16
+#define dUr1 D24.S16
+#define dUi1 D25.S16
+#define dUr3 D26.S16
+#define dUi3 D27.S16
+#define dUr5 D28.S16
+#define dUi5 D29.S16
+@// dUr7/dUi7 occupy D30/D31 (Q15, i.e. qU7); dXr7/dXi7 are D0/D1
+#define dUr7 D30.S16
+#define dUi7 D31.S16
+#define qU0 Q8.S16
+#define qU1 Q12.S16
+#define qU2 Q9.S16
+#define qU3 Q13.S16
+#define qU4 Q10.S16
+#define qU5 Q14.S16
+#define qU6 Q11.S16
+#define qU7 Q15.S16
-dYr0 DN D16.S16
-dYi0 DN D17.S16
-dYr2 DN D18.S16
-dYi2 DN D19.S16
-dYr4 DN D20.S16
-dYi4 DN D21.S16
-dYr6 DN D22.S16
-dYi6 DN D23.S16
-dYr1 DN D24.S16
-dYi1 DN D25.S16
-dYr3 DN D26.S16
-dYi3 DN D27.S16
-dYr5 DN D28.S16
-dYi5 DN D29.S16
-dYr7 DN D30.S16 ;// reuse dYr4 and dYi4
-dYi7 DN D31.S16
-qY0 QN Q8.S16
-qY1 QN Q12.S16
-qY2 QN Q9.S16
-qY3 QN Q13.S16
-qY4 QN Q10.S16
-qY5 QN Q14.S16
-qY6 QN Q11.S16
-qY7 QN Q15.S16
+#define dVr0 D24.S16
+#define dVi0 D25.S16
+#define dVr2 D26.S16
+#define dVi2 D27.S16
+#define dVr4 D28.S16
+#define dVi4 D29.S16
+#define dVr6 D30.S16
+#define dVi6 D31.S16
+#define dVr1 D16.S16
+#define dVi1 D17.S16
+#define dVr3 D18.S16
+#define dVi3 D19.S16
+#define dVr5 D20.S16
+#define dVi5 D21.S16
+@// dVr7 shares D22 with dUr6 (dUi7 is D31)
+#define dVr7 D22.S16
+@// dVi7 shares D23 with dUi6 (dUr7 is D30)
+#define dVi7 D23.S16
+#define qV0 Q12.S16
+#define qV1 Q8.S16
+#define qV2 Q13.S16
+#define qV3 Q9.S16
+#define qV4 Q14.S16
+#define qV5 Q10.S16
+#define qV6 Q15.S16
+#define qV7 Q11.S16
-dT0 DN D0.S16
-dT1 DN D1.S16
+
+#define dYr0 D16.S16
+#define dYi0 D17.S16
+#define dYr2 D18.S16
+#define dYi2 D19.S16
+#define dYr4 D20.S16
+#define dYi4 D21.S16
+#define dYr6 D22.S16
+#define dYi6 D23.S16
+#define dYr1 D24.S16
+#define dYi1 D25.S16
+#define dYr3 D26.S16
+#define dYi3 D27.S16
+#define dYr5 D28.S16
+#define dYi5 D29.S16
+@// dYr7/dYi7 share D30/D31 with dUr7/dUi7 (dYr4/dYi4 are D20/D21)
+#define dYr7 D30.S16
+#define dYi7 D31.S16
+#define qY0 Q8.S16
+#define qY1 Q12.S16
+#define qY2 Q9.S16
+#define qY3 Q13.S16
+#define qY4 Q10.S16
+#define qY5 Q14.S16
+#define qY6 Q11.S16
+#define qY7 Q15.S16
-;// Define constants
-ONEBYSQRT2 EQU 0x00005A82 ;// Q15 format
-
+#define dT0 D0.S16
+#define dT1 D1.S16
- MACRO
- FFTSTAGE $scaled, $inverse , $name
-
- ;// Define stack arguments
-
- ;// Update pSubFFTSize and pSubFFTNum regs
- MOV subFFTSize,#8 ;// subFFTSize = 1 for the first stage
- LDR t0,=ONEBYSQRT2 ;// t0=(1/sqrt(2)) as Q15 format
-
- ;// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
- LSR grpSize,subFFTNum,#3
+
+@// Define constants
+ .set ONEBYSQRT2, 0x00005A82 @// Q15 format
+
+
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ @// Define stack arguments
+
+ @// Update pSubFFTSize and pSubFFTNum regs
+ MOV subFFTSize,#8 @// subFFTSize = 8 after the first (radix-8) stage
+ LDR t0,=ONEBYSQRT2 @// t0=(1/sqrt(2)) as Q15 format
+
+ @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+ LSR grpSize,subFFTNum,#3
MOV subFFTNum,grpSize
-
-
- ;// pT0+1 increments pT0 by 4 bytes
- ;// pT0+pointStep = increment of 4*pointStep bytes = grpSize/2 bytes
- ;// Note: outPointStep = pointStep for firststage
-
- MOV pointStep,grpSize,LSL #2
-
-
- ;// Calculate the step of input data for the next set
- ;//MOV step1,pointStep,LSL #1 ;// step1 = 2*pointStep
- VLD2 {dXr0,dXi0},[pSrc@128],pointStep ;// data[0]
- MOV step1,grpSize,LSL #3
-
- MOV step2,pointStep,LSL #3
- VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
- SUB step2,step2,pointStep ;// step2 = 7*pointStep
- RSB setStep,step2,#16 ;// setStep = - 7*pointStep+16
-
-
-
- VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
- VLD2 {dXr3,dXi3},[pSrc@128],pointStep ;// data[3]
- VLD2 {dXr4,dXi4},[pSrc@128],pointStep ;// data[4]
- VLD2 {dXr5,dXi5},[pSrc@128],pointStep ;// data[5]
- VLD2 {dXr6,dXi6},[pSrc@128],pointStep ;// data[6]
- VLD2 {dXr7,dXi7},[pSrc@128],setStep ;// data[7] & update pSrc for the next set
- ;// setStep = -7*pointStep + 16
- ;// grp = 0 a special case since all the twiddle factors are 1
- ;// Loop on the sets : 4 sets at a time
-grpZeroSetLoop$name
-
- ;// Decrement setcount
- SUBS setCount,setCount,#4 ;// decrement the set loop counter
-
-
- IF $scaled
- ;// finish first stage of 8 point FFT
-
+
+ @// pT0+1 increments pT0 by 4 bytes
+ @// pT0+pointStep = increment of 4*pointStep bytes = grpSize/2 bytes
+ @// Note: outPointStep = pointStep for the first stage
+
+ MOV pointStep,grpSize,LSL #2
+
+
+ @// Calculate the step of input data for the next set
+ @//MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
+ VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0]
+ MOV step1,grpSize,LSL #3
+
+ MOV step2,pointStep,LSL #3
+ VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
+ SUB step2,step2,pointStep @// step2 = 7*pointStep
+ RSB setStep,step2,#16 @// setStep = - 7*pointStep+16
+
+
+
+ VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
+ VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
+ VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
+ VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
+ VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
+ VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7] & update pSrc for the next set
+ @// setStep = -7*pointStep + 16
+ @// grp = 0 a special case since all the twiddle factors are 1
+ @// Loop on the sets : 4 sets at a time
+
+grpZeroSetLoop\name:
+
+ @// Decrement setcount
+ SUBS setCount,setCount,#4 @// decrement the set loop counter
+
+
+ .ifeqs "\scaled", "TRUE"
+ @// finish first stage of 8 point FFT
+
VHADD qU0,qX0,qX4
VHADD qU2,qX1,qX5
VHADD qU4,qX2,qX6
VHADD qU6,qX3,qX7
-
- ;// finish second stage of 8 point FFT
-
+
+ @// finish second stage of 8 point FFT
+
VHADD qV0,qU0,qU4
VHSUB qV2,qU0,qU4
VHADD qV4,qU2,qU6
VHSUB qV6,qU2,qU6
-
- ;// finish third stage of 8 point FFT
-
+
+ @// finish third stage of 8 point FFT
+
VHADD qY0,qV0,qV4
VHSUB qY4,qV0,qV4
- VST2 {dYr0,dYi0},[pDst@128],step1 ;// store y0
-
- IF $inverse
-
- VHSUB dYr2,dVr2,dVi6
- VHADD dYi2,dVi2,dVr6
-
- VHADD dYr6,dVr2,dVi6
- VST2 {dYr2,dYi2},[pDst@128],step1 ;// store y2
- VHSUB dYi6,dVi2,dVr6
-
- VHSUB qU1,qX0,qX4
- VST2 {dYr4,dYi4},[pDst@128],step1 ;// store y4
-
- VHSUB qU3,qX1,qX5
- VHSUB qU5,qX2,qX6
- VST2 {dYr6,dYi6},[pDst@128],step1 ;// store y6
-
- ELSE
-
- VHADD dYr6,dVr2,dVi6
- VHSUB dYi6,dVi2,dVr6
-
- VHSUB dYr2,dVr2,dVi6
- VST2 {dYr6,dYi6},[pDst@128],step1 ;// store y2
- VHADD dYi2,dVi2,dVr6
-
-
- VHSUB qU1,qX0,qX4
- VST2 {dYr4,dYi4},[pDst@128],step1 ;// store y4
- VHSUB qU3,qX1,qX5
- VHSUB qU5,qX2,qX6
- VST2 {dYr2,dYi2},[pDst@128],step1 ;// store y6
+ VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
-
- ENDIF
-
- ;// finish first stage of 8 point FFT
-
+ .ifeqs "\inverse", "TRUE"
+
+ VHSUB dYr2,dVr2,dVi6
+ VHADD dYi2,dVi2,dVr6
+
+ VHADD dYr6,dVr2,dVi6
+ VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
+ VHSUB dYi6,dVi2,dVr6
+
+ VHSUB qU1,qX0,qX4
+ VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
+
+ VHSUB qU3,qX1,qX5
+ VHSUB qU5,qX2,qX6
+ VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
+
+ .ELSE
+
+ VHADD dYr6,dVr2,dVi6
+ VHSUB dYi6,dVi2,dVr6
+
+ VHSUB dYr2,dVr2,dVi6
+ VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
+ VHADD dYi2,dVi2,dVr6
+
+
+ VHSUB qU1,qX0,qX4
+ VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
+ VHSUB qU3,qX1,qX5
+ VHSUB qU5,qX2,qX6
+ VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
+
+
+ .ENDIF
+
+ @// finish first stage of 8 point FFT
+
VHSUB qU7,qX3,qX7
- VMOV dT0[0],t0
-
- ;// finish second stage of 8 point FFT
-
+ VMOV dT0[0],t0
+
+ @// finish second stage of 8 point FFT
+
VHSUB dVr1,dUr1,dUi5
- VLD2 {dXr0,dXi0},[pSrc@128],pointStep ;// data[0] for next iteration
+ VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] for next iteration
VHADD dVi1,dUi1,dUr5
VHADD dVr3,dUr1,dUi5
- VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
+ VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
VHSUB dVi3,dUi1,dUr5
-
+
VHSUB dVr5,dUr3,dUi7
- VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
+ VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
VHADD dVi5,dUi3,dUr7
VHADD dVr7,dUr3,dUi7
- VLD2 {dXr3,dXi3},[pSrc@128],pointStep ;// data[3]
+ VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
VHSUB dVi7,dUi3,dUr7
-
- ;// finish third stage of 8 point FFT
-
- IF $inverse
-
- ;// calculate a*v5
- VQRDMULH dT1,dVr5,dT0[0] ;// use dVi0 for dT1
- VLD2 {dXr4,dXi4},[pSrc@128],pointStep ;// data[4]
+
+ @// finish third stage of 8 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
+ @// calculate a*v5
+ VQRDMULH dT1,dVr5,dT0[0] @// dT1 is D1 (shared with dXi7), not dVi0
+ VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi5,dVi5,dT0[0]
-
- VLD2 {dXr5,dXi5},[pSrc@128],pointStep ;// data[5]
- VSUB dVr5,dT1,dVi5 ;// a * V5
+
+ VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
+ VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
-
- VLD2 {dXr6,dXi6},[pSrc@128],pointStep ;// data[6]
-
- ;// calculate b*v7
+
+ VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
+
+ @// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
VQRDMULH dVi7,dVi7,dT0[0]
-
+
VHADD qY1,qV1,qV5
VHSUB qY5,qV1,qV5
-
-
- VADD dVr7,dT1,dVi7 ;// b * V7
+
+
+ VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
- SUB pDst, pDst, step2 ;// set pDst to y1
-
- VLD2 {dXr7,dXi7},[pSrc@128],setStep ;// data[7]
-
-
+ SUB pDst, pDst, step2 @// set pDst to y1
+
+ VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
+
+
VHSUB dYr3,dVr3,dVr7
VHSUB dYi3,dVi3,dVi7
- VST2 {dYr1,dYi1},[pDst@128],step1 ;// store y1
+ VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
VHADD dYr7,dVr3,dVr7
VHADD dYi7,dVi3,dVi7
-
- VST2 {dYr3,dYi3},[pDst@128],step1 ;// store y3
- VST2 {dYr5,dYi5},[pDst@128],step1 ;// store y5
- VST2 {dYr7,dYi7},[pDst@128],#16 ;// store y7
- ELSE
-
- ;// calculate b*v7
+
+ VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
+ VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
+#if 0
+ VST2 {dYr7,dYi7},[pDst :128],#16 @// store y7
+#else
+ VST2 {dYr7,dYi7},[pDst :128]! @// store y7
+#endif
+ .ELSE
+
+ @// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
- VLD2 {dXr4,dXi4},[pSrc@128],pointStep ;// data[4]
+ VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi7,dVi7,dT0[0]
-
- VLD2 {dXr5,dXi5},[pSrc@128],pointStep ;// data[5]
- VADD dVr7,dT1,dVi7 ;// b * V7
+
+ VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
+ VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
-
- VLD2 {dXr6,dXi6},[pSrc@128],pointStep ;// data[6]
-
- ;// calculate a*v5
- VQRDMULH dT1,dVr5,dT0[0] ;// use dVi0 for dT1
+
+ VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
+
+ @// calculate a*v5
+ VQRDMULH dT1,dVr5,dT0[0] @// dT1 is D1 (shared with dXi7), not dVi0
VQRDMULH dVi5,dVi5,dT0[0]
VHADD dYr7,dVr3,dVr7
VHADD dYi7,dVi3,dVi7
- SUB pDst, pDst, step2 ;// set pDst to y1
-
- VSUB dVr5,dT1,dVi5 ;// a * V5
+ SUB pDst, pDst, step2 @// set pDst to y1
+
+ VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
- VLD2 {dXr7,dXi7},[pSrc@128],setStep ;// data[7]
-
+ VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
+
VHSUB qY5,qV1,qV5
-
+
VHSUB dYr3,dVr3,dVr7
- VST2 {dYr7,dYi7},[pDst@128],step1 ;// store y1
+ VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
VHSUB dYi3,dVi3,dVi7
VHADD qY1,qV1,qV5
-
-
- VST2 {dYr5,dYi5},[pDst@128],step1 ;// store y3
- VST2 {dYr3,dYi3},[pDst@128],step1 ;// store y5
- VST2 {dYr1,dYi1},[pDst@128],#16 ;// store y7
-
- ENDIF
-
-
-
- ELSE
- ;// finish first stage of 8 point FFT
-
+
+ VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
+ VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
+#if 0
+ VST2 {dYr1,dYi1},[pDst :128],#16 @// store y7
+#else
+ VST2 {dYr1,dYi1},[pDst :128]! @// store y7
+#endif
+
+ .ENDIF
+
+
+
+ .ELSE
+ @// finish first stage of 8 point FFT
+
VADD qU0,qX0,qX4
VADD qU2,qX1,qX5
VADD qU4,qX2,qX6
VADD qU6,qX3,qX7
-
- ;// finish second stage of 8 point FFT
-
+
+ @// finish second stage of 8 point FFT
+
VADD qV0,qU0,qU4
VSUB qV2,qU0,qU4
VADD qV4,qU2,qU6
VSUB qV6,qU2,qU6
-
- ;// finish third stage of 8 point FFT
-
+
+ @// finish third stage of 8 point FFT
+
VADD qY0,qV0,qV4
VSUB qY4,qV0,qV4
- VST2 {dYr0,dYi0},[pDst@128],step1 ;// store y0
-
- IF $inverse
-
- VSUB dYr2,dVr2,dVi6
- VADD dYi2,dVi2,dVr6
-
- VADD dYr6,dVr2,dVi6
- VST2 {dYr2,dYi2},[pDst@128],step1 ;// store y2
- VSUB dYi6,dVi2,dVr6
-
- VSUB qU1,qX0,qX4
- VST2 {dYr4,dYi4},[pDst@128],step1 ;// store y4
-
- VSUB qU3,qX1,qX5
- VSUB qU5,qX2,qX6
- VST2 {dYr6,dYi6},[pDst@128],step1 ;// store y6
-
- ELSE
-
- VADD dYr6,dVr2,dVi6
- VSUB dYi6,dVi2,dVr6
-
- VSUB dYr2,dVr2,dVi6
- VST2 {dYr6,dYi6},[pDst@128],step1 ;// store y2
- VADD dYi2,dVi2,dVr6
-
-
- VSUB qU1,qX0,qX4
- VST2 {dYr4,dYi4},[pDst@128],step1 ;// store y4
- VSUB qU3,qX1,qX5
- VSUB qU5,qX2,qX6
- VST2 {dYr2,dYi2},[pDst@128],step1 ;// store y6
+ VST2 {dYr0,dYi0},[pDst :128],step1 @// store y0
-
- ENDIF
-
- ;// finish first stage of 8 point FFT
-
+ .ifeqs "\inverse", "TRUE"
+
+ VSUB dYr2,dVr2,dVi6
+ VADD dYi2,dVi2,dVr6
+
+ VADD dYr6,dVr2,dVi6
+ VST2 {dYr2,dYi2},[pDst :128],step1 @// store y2
+ VSUB dYi6,dVi2,dVr6
+
+ VSUB qU1,qX0,qX4
+ VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
+
+ VSUB qU3,qX1,qX5
+ VSUB qU5,qX2,qX6
+ VST2 {dYr6,dYi6},[pDst :128],step1 @// store y6
+
+ .ELSE
+
+ VADD dYr6,dVr2,dVi6
+ VSUB dYi6,dVi2,dVr6
+
+ VSUB dYr2,dVr2,dVi6
+ VST2 {dYr6,dYi6},[pDst :128],step1 @// store y2
+ VADD dYi2,dVi2,dVr6
+
+
+ VSUB qU1,qX0,qX4
+ VST2 {dYr4,dYi4},[pDst :128],step1 @// store y4
+ VSUB qU3,qX1,qX5
+ VSUB qU5,qX2,qX6
+ VST2 {dYr2,dYi2},[pDst :128],step1 @// store y6
+
+
+ .ENDIF
+
+ @// finish first stage of 8 point FFT
+
VSUB qU7,qX3,qX7
- VMOV dT0[0],t0
-
- ;// finish second stage of 8 point FFT
-
+ VMOV dT0[0],t0
+
+ @// finish second stage of 8 point FFT
+
VSUB dVr1,dUr1,dUi5
- VLD2 {dXr0,dXi0},[pSrc@128],pointStep ;// data[0] for next iteration
+ VLD2 {dXr0,dXi0},[pSrc :128],pointStep @// data[0] for next iteration
VADD dVi1,dUi1,dUr5
VADD dVr3,dUr1,dUi5
- VLD2 {dXr1,dXi1},[pSrc@128],pointStep ;// data[1]
+ VLD2 {dXr1,dXi1},[pSrc :128],pointStep @// data[1]
VSUB dVi3,dUi1,dUr5
-
+
VSUB dVr5,dUr3,dUi7
- VLD2 {dXr2,dXi2},[pSrc@128],pointStep ;// data[2]
+ VLD2 {dXr2,dXi2},[pSrc :128],pointStep @// data[2]
VADD dVi5,dUi3,dUr7
VADD dVr7,dUr3,dUi7
- VLD2 {dXr3,dXi3},[pSrc@128],pointStep ;// data[3]
+ VLD2 {dXr3,dXi3},[pSrc :128],pointStep @// data[3]
VSUB dVi7,dUi3,dUr7
-
- ;// finish third stage of 8 point FFT
-
- IF $inverse
-
- ;// calculate a*v5
- VQRDMULH dT1,dVr5,dT0[0] ;// use dVi0 for dT1
- VLD2 {dXr4,dXi4},[pSrc@128],pointStep ;// data[4]
+
+ @// finish third stage of 8 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
+ @// calculate a*v5
+ VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
+ VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi5,dVi5,dT0[0]
-
- VLD2 {dXr5,dXi5},[pSrc@128],pointStep ;// data[5]
- VSUB dVr5,dT1,dVi5 ;// a * V5
+
+ VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
+ VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
-
- VLD2 {dXr6,dXi6},[pSrc@128],pointStep ;// data[6]
-
- ;// calculate b*v7
+
+ VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
+
+ @// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
VQRDMULH dVi7,dVi7,dT0[0]
-
+
VADD qY1,qV1,qV5
VSUB qY5,qV1,qV5
-
-
- VADD dVr7,dT1,dVi7 ;// b * V7
+
+
+ VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
- SUB pDst, pDst, step2 ;// set pDst to y1
-
- VLD2 {dXr7,dXi7},[pSrc@128],setStep ;// data[7]
-
-
+ SUB pDst, pDst, step2 @// set pDst to y1
+
+ VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
+
+
VSUB dYr3,dVr3,dVr7
VSUB dYi3,dVi3,dVi7
- VST2 {dYr1,dYi1},[pDst@128],step1 ;// store y1
+ VST2 {dYr1,dYi1},[pDst :128],step1 @// store y1
VADD dYr7,dVr3,dVr7
VADD dYi7,dVi3,dVi7
-
- VST2 {dYr3,dYi3},[pDst@128],step1 ;// store y3
- VST2 {dYr5,dYi5},[pDst@128],step1 ;// store y5
- VST2 {dYr7,dYi7},[pDst@128],#16 ;// store y7
- ELSE
-
- ;// calculate b*v7
+
+ VST2 {dYr3,dYi3},[pDst :128],step1 @// store y3
+ VST2 {dYr5,dYi5},[pDst :128],step1 @// store y5
+#if 0
+ VST2 {dYr7,dYi7},[pDst :128],#16 @// store y7
+#else
+ VST2 {dYr7,dYi7},[pDst :128]! @// store y7
+#endif
+ .ELSE
+
+ @// calculate b*v7
VQRDMULH dT1,dVr7,dT0[0]
- VLD2 {dXr4,dXi4},[pSrc@128],pointStep ;// data[4]
+ VLD2 {dXr4,dXi4},[pSrc :128],pointStep @// data[4]
VQRDMULH dVi7,dVi7,dT0[0]
-
- VLD2 {dXr5,dXi5},[pSrc@128],pointStep ;// data[5]
- VADD dVr7,dT1,dVi7 ;// b * V7
+
+ VLD2 {dXr5,dXi5},[pSrc :128],pointStep @// data[5]
+ VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
-
- VLD2 {dXr6,dXi6},[pSrc@128],pointStep ;// data[6]
-
- ;// calculate a*v5
- VQRDMULH dT1,dVr5,dT0[0] ;// use dVi0 for dT1
+
+ VLD2 {dXr6,dXi6},[pSrc :128],pointStep @// data[6]
+
+ @// calculate a*v5
+ VQRDMULH dT1,dVr5,dT0[0] @// use dVi0 for dT1
VQRDMULH dVi5,dVi5,dT0[0]
VADD dYr7,dVr3,dVr7
VADD dYi7,dVi3,dVi7
- SUB pDst, pDst, step2 ;// set pDst to y1
-
- VSUB dVr5,dT1,dVi5 ;// a * V5
+ SUB pDst, pDst, step2 @// set pDst to y1
+
+ VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
- VLD2 {dXr7,dXi7},[pSrc@128],setStep ;// data[7]
-
+ VLD2 {dXr7,dXi7},[pSrc :128],setStep @// data[7]
+
VSUB qY5,qV1,qV5
-
+
VSUB dYr3,dVr3,dVr7
- VST2 {dYr7,dYi7},[pDst@128],step1 ;// store y1
+ VST2 {dYr7,dYi7},[pDst :128],step1 @// store y1
VSUB dYi3,dVi3,dVi7
VADD qY1,qV1,qV5
-
-
- VST2 {dYr5,dYi5},[pDst@128],step1 ;// store y3
- VST2 {dYr3,dYi3},[pDst@128],step1 ;// store y5
- VST2 {dYr1,dYi1},[pDst@128],#16 ;// store y7
-
- ENDIF
-
-
- ENDIF
-
- SUB pDst, pDst, step2 ;// update pDst for the next set
- BGT grpZeroSetLoop$name
-
-
- ;// reset pSrc to pDst for the next stage
- SUB pSrc,pDst,pointStep ;// pDst -= 2*grpSize
- MOV pDst,pPingPongBuf
-
-
-
- MEND
-
- ;// Allocate stack memory required by the function
-
-
+ VST2 {dYr5,dYi5},[pDst :128],step1 @// store y3
+ VST2 {dYr3,dYi3},[pDst :128],step1 @// store y5
+#if 0
+ VST2 {dYr1,dYi1},[pDst :128],#16 @// store y7
+#else
+ VST2 {dYr1,dYi1},[pDst :128]! @// store y7
+#endif
+
+ .ENDIF
+
+
+ .ENDIF
+
+ SUB pDst, pDst, step2 @// update pDst for the next set
+ BGT grpZeroSetLoop\name
+
+
+ @// reset pSrc to pDst for the next stage
+ SUB pSrc,pDst,pointStep @// pDst -= 2*grpSize
+ MOV pDst,pPingPongBuf
+
+
+
+ .endm
+
+
+ @// Allocate stack memory required by the function
+
+
M_START armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{FALSE},FWD
+ FFTSTAGE "FALSE","FALSE",FWD
M_END
-
+
M_START armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {FALSE},{TRUE},INV
+ FFTSTAGE "FALSE","TRUE",INV
M_END
-
-
+
+
M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{FALSE},FWDSFS
+ FFTSTAGE "TRUE","FALSE",FWDSFS
M_END
-
+
M_START armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
- FFTSTAGE {TRUE},{TRUE},INVSFS
+ FFTSTAGE "TRUE","TRUE",INVSFS
M_END
-
- ENDIF ;//CortexA8
-
-
-
- END
\ No newline at end of file
+
+
+
+
+ .END
diff --git a/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S b/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
index 399037c..ca15c6b 100644
--- a/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
+++ b/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
@@ -1,353 +1,356 @@
-;//
-;//
-;// File Name: omxSP_FFTFwd_CToC_SC16_Sfs_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision: 6729
-;// Last Modified Date: Tue, 17 Jul 2007
-;//
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;//
-;//
-;//
-;// Description:
-;// Compute an inverse FFT for a complex signal
-;//
-;//
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This file was originally licensed as follows. It has been
+@// relicensed with permission from the copyright holders.
-
-;// Include standard headers
+@//
+@//
+@// File Name: omxSP_FFTFwd_CToC_SC16_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision: 6729
+@// Last Modified Date: Tue, 17 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a forward FFT for a complex signal
+@//
+@//
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
-
- M_VARIANTS CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
- IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
-
-;// Set debugging level
-;//DEBUG_ON SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+ .extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
-;// Guarding implementation by the processor name
-
-
-
-;// Guarding implementation by the processor name
-
- IF CortexA8
-
- IMPORT armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
- IMPORT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
-
-;//Input Registers
-
-pSrc RN 0
-pDst RN 1
-pFFTSpec RN 2
-scale RN 3
-
-
-;// Output registers
-result RN 0
-
-;//Local Scratch Registers
-
-argTwiddle RN 1
-argDst RN 2
-argScale RN 4
-pTwiddle RN 4
-tmpOrder RN 4
-pOut RN 5
-subFFTSize RN 7
-subFFTNum RN 6
-N RN 6
-order RN 14
-diff RN 9
-count RN 8 ;// Total num of radix stages required to comple the FFT
-x0r RN 4
-x0i RN 5
-diffMinusOne RN 2
-round RN 3
-
-;// Neon registers
-
-dX0 DN D0.S16
-dShift DN D1.S16
-dX0S32 DN D0.S32
+@// Guarding implementation by the processor name
- ;// Allocate stack memory required by the function
+@// Guarding implementation by the processor name
+
+
+ .extern armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+ .extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+#define scale r3
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define tmpOrder r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+@// Total number of radix stages required to complete the FFT
+#define count r8
+#define x0r r4
+#define x0i r5
+#define diffMinusOne r2
+#define round r3
+
+@// Neon registers
+
+#define dX0 D0.S16
+#define dShift D1.S16
+#define dX0S32 D0.S32
+
+
+
+ @// Allocate stack memory required by the function
M_ALLOC4 diffOnStack, 4
- ;// Write function header
+ @// Write function header
M_START omxSP_FFTFwd_CToC_SC16_Sfs,r11,d15
-
- M_STRUCT ARMsFFTSpec
- M_FIELD N, 4
- M_FIELD pBitRev, 4
- M_FIELD pTwiddle, 4
- M_FIELD pBuf, 4
- M_ENDSTRUCT
-
- ;// Define stack arguments
-
- ;// Read the size from structure and take log
+
+@ Structure offsets for the FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Define stack arguments
+
+ @// Read the size from structure and take log
LDR N, [pFFTSpec, #ARMsFFTSpec_N]
-
- ;// Read other structure parameters
+
+ @// Read other structure parameters
LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
-
- CLZ order,N ;// N = 2^order
- RSB order,order,#31
+
+ CLZ order,N @// N = 2^order
+ RSB order,order,#31
MOV subFFTSize,#1
- ;//MOV subFFTNum,N
-
+ @//MOV subFFTNum,N
+
CMP order,#3
- BGT orderGreaterthan3 ;// order > 3
-
+ BGT orderGreaterthan3 @// order > 3
+
CMP order,#1
- BGE orderGreaterthan0 ;// order > 0
- M_STR scale, diffOnStack,LT ;// order = 0
+ BGE orderGreaterthan0 @// order > 0
+ M_STR scale, diffOnStack,LT @// order = 0
LDRLT x0r,[pSrc]
STRLT x0r,[pDst]
MOVLT pSrc,pDst
BLT FFTEnd
-
-orderGreaterthan0
- ;// set the buffers appropriately for various orders
+
+orderGreaterthan0:
+ @// set the buffers appropriately for various orders
CMP order,#2
- MOVNE argDst,pDst
+ MOVNE argDst,pDst
MOVEQ argDst,pOut
- MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
+ MOVEQ pOut,pDst @// Pass the first stage destination in RN5
MOV argTwiddle,pTwiddle
-
+
SUBS diff,scale,order
M_STR diff,diffOnStack
MOVGT scale,order
- ;// Now scale <= order
-
+ @// Now scale <= order
+
CMP order,#1
BGT orderGreaterthan1
SUBS scale,scale,#1
- BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// order = 1
- BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe ;// order = 1
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe @// order = 1
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe @// order = 1
B FFTEnd
-orderGreaterthan1
+orderGreaterthan1:
CMP order,#2
MOV argScale,scale
BGT orderGreaterthan2
SUBS argScale,argScale,#1
- BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// order =2
- BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe @// order =2
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
SUBS argScale,argScale,#1
- BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
- BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
B FFTEnd
-
-orderGreaterthan2 ;// order =3
+
+orderGreaterthan2: @// order =3
SUBS argScale,argScale,#1
- BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
- BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
SUBS argScale,argScale,#1
- BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
BLLT armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
SUBS argScale,argScale,#1
- BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
- BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
B FFTEnd
-
-orderGreaterthan3
- ;// check scale = 0 or scale = order
- SUBS diff, scale, order ;// scale > order
- MOVGT scale,order
- BGE specialScaleCase ;// scale = 0 or scale = order
+
+orderGreaterthan3:
+ @// check scale = 0 or scale = order
+ SUBS diff, scale, order @// scale > order
+ MOVGT scale,order
+ BGE specialScaleCase @// scale = 0 or scale = order
CMP scale,#0
BEQ specialScaleCase
B generalScaleCase
-
-specialScaleCase ;// scale = 0 or scale = order and order > 3
-
- TST order, #2 ;// Set input args to fft stages
- MOVNE argDst,pDst
+
+specialScaleCase: @// scale = 0 or scale = order and order > 3
+
+ TST order, #2 @// Set input args to fft stages
+ MOVNE argDst,pDst
MOVEQ argDst,pOut
- MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
- MOV argTwiddle,pTwiddle
+ MOVEQ pOut,pDst @// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
CMP diff,#0
M_STR diff, diffOnStack
- BGE scaleEqualsOrder
-
- ;//check for even or odd order
- ;// NOTE: The following combination of BL's would work fine eventhough the first
- ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
- ;// armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
-
+ BGE scaleEqualsOrder
+
+ @//check for even or odd order
+ @// NOTE: The following combination of BLs works even though the first
+ @// BL corrupts the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
TST order,#0x00000001
- BLEQ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
- BLNE armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
-
+ BLEQ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
CMP subFFTNum,#4
BLT FFTEnd
-unscaledRadix4Loop
+unscaledRadix4Loop:
BEQ lastStageUnscaledRadix4
BL armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
CMP subFFTNum,#4
B unscaledRadix4Loop
-lastStageUnscaledRadix4
- BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
- B FFTEnd
+lastStageUnscaledRadix4:
+ BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
-scaleEqualsOrder
- ;//check for even or odd order
- ;// NOTE: The following combination of BL's would work fine eventhough the first
- ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
- ;// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
-
+scaleEqualsOrder:
+ @//check for even or odd order
+ @// NOTE: The following combination of BLs works even though the first
+ @// BL corrupts the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
TST order,#0x00000001
- BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
- BLNE armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
-
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
CMP subFFTNum,#4
BLT FFTEnd
-scaledRadix4Loop
+scaledRadix4Loop:
BEQ lastStageScaledRadix4
BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
CMP subFFTNum,#4
B scaledRadix4Loop
-
-lastStageScaledRadix4
- BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
- B FFTEnd
-
-
-
-generalScaleCase ;// 0 < scale < order and order > 3
- ;// Determine the correct destination buffer
+
+lastStageScaledRadix4:
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+
+
+generalScaleCase: @// 0 < scale < order and order > 3
+ @// Determine the correct destination buffer
SUB diff,order,scale
TST diff,#0x01
- ADDEQ count,scale,diff,LSR #1 ;// count = scale + (order - scale)/2
+ ADDEQ count,scale,diff,LSR #1 @// count = scale + (order - scale)/2
MOVNE count,order
- TST count,#0x01 ;// Is count even or odd ?
-
- MOVNE argDst,pDst ;// Set input args to fft stages
+ TST count,#0x01 @// Is count even or odd ?
+
+ MOVNE argDst,pDst @// Set input args to fft stages
MOVEQ argDst,pOut
- MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
- MOV argTwiddle,pTwiddle
+ MOVEQ pOut,pDst @// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
CMP diff,#1
- M_STR diff, diffOnStack
- BEQ scaleps ;// scaling including a radix2_ps stage
-
- MOV argScale,scale ;// Put scale in RN4 so as to save and restore
- BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// scaled first stage
+ M_STR diff, diffOnStack
+ BEQ scaleps @// scaling including a radix2_ps stage
+
+ MOV argScale,scale @// Put scale in RN4 so as to save and restore
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe @// scaled first stage
SUBS argScale,argScale,#1
-
-scaledRadix2Loop
+
+scaledRadix2Loop:
BLGT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
- SUBS argScale,argScale,#1 ;// save and restore scale (RN4) in the scaled stages
+ SUBS argScale,argScale,#1 @// save and restore scale (RN4) in the scaled stages
BGT scaledRadix2Loop
B outScale
-scaleps
- SUB argScale,scale,#1 ;// order>3 and diff=1 => scale >= 3
- BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// scaled first stage
+scaleps:
+ SUB argScale,scale,#1 @// order>3 and diff=1 => scale >= 3
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe @// scaled first stage
SUBS argScale,argScale,#1
-
-scaledRadix2psLoop
- BEQ scaledRadix2psStage
+
+scaledRadix2psLoop:
+ BEQ scaledRadix2psStage
BLGT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
- SUBS argScale,argScale,#1 ;// save and restore scale (RN4) in the scaled stages
+ SUBS argScale,argScale,#1 @// save and restore scale (RN4) in the scaled stages
BGE scaledRadix2psLoop
-scaledRadix2psStage
+scaledRadix2psStage:
BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
- B generalLastStageUnscaledRadix2
-
-
-outScale
- M_LDR diff, diffOnStack
- ;//check for even or odd order
+ B generalLastStageUnscaledRadix2
+
+
+outScale:
+ M_LDR diff, diffOnStack
+ @//check for even or odd order
TST diff,#0x00000001
BEQ generalUnscaledRadix4Loop
B unscaledRadix2Loop
-generalUnscaledRadix4Loop
+generalUnscaledRadix4Loop:
CMP subFFTNum,#4
BEQ generalLastStageUnscaledRadix4
BL armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
B generalUnscaledRadix4Loop
-
-generalLastStageUnscaledRadix4
- BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
- B End
-unscaledRadix2Loop
+generalLastStageUnscaledRadix4:
+ BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B End
+
+unscaledRadix2Loop:
CMP subFFTNum,#4
BEQ generalLastTwoStagesUnscaledRadix2
BL armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
- B unscaledRadix2Loop
+ B unscaledRadix2Loop
-generalLastTwoStagesUnscaledRadix2
+generalLastTwoStagesUnscaledRadix2:
BL armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
-generalLastStageUnscaledRadix2
- BL armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2:
+ BL armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
B End
-FFTEnd ;// Does only the scaling
-
- M_LDR diff, diffOnStack
+FFTEnd: @// Does only the scaling
+
+ M_LDR diff, diffOnStack
CMP diff,#0
BLE End
-
- RSB diff,diff,#0 ;// to use VRSHL for right shift by a variable
- VDUP dShift,diff
-
-scaleFFTData ;// N = subFFTSize ; dataptr = pDst ; scale = diff
- VLD1 {dX0S32[0]},[pSrc] ;// pSrc contains pDst pointer
+
+ RSB diff,diff,#0 @// to use VRSHL for right shift by a variable
+ VDUP dShift,diff
+
+scaleFFTData: @// N = subFFTSize ; dataptr = pDst ; scale = diff
+ VLD1 {dX0S32[0]},[pSrc] @// pSrc contains pDst pointer
SUBS subFFTSize,subFFTSize,#1
VRSHL dX0,dShift
VST1 {dX0S32[0]},[pSrc]!
-
+
BGT scaleFFTData
-
-
-
-End
- ;// Set return value
- MOV result, #OMX_Sts_NoErr
- ;// Write function tail
+
+
+End:
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @// Write function tail
M_END
-
- ENDIF ;//CortexA8
-
-
-
-
- END
\ No newline at end of file
+ .END
diff --git a/dl/sp/src/omxSP_FFTInit_C_SC16.c b/dl/sp/src/omxSP_FFTInit_C_SC16.c
index 342fc0c..fdab9b0 100644
--- a/dl/sp/src/omxSP_FFTInit_C_SC16.c
+++ b/dl/sp/src/omxSP_FFTInit_C_SC16.c
@@ -12,15 +12,15 @@
*/
/**
- *
+ *
* File Name: omxSP_FFTInit_C_SC16.c
* OpenMAX DL: v1.0.2
* Last Modified Revision: 15322
* Last Modified Date: Wed, 15 Oct 2008
- *
+ *
* (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
- *
- *
+ *
+ *
* Description:
* Initializes the specification structures required
*/
@@ -47,9 +47,9 @@
* *pFFTSpec, in bytes, can be determined using <FFTGetBufSize_C_SC16>.
*
* Parameters:
- * [in] order base-2 logarithm of the desired block length;
- * valid in the range [0,12].
- * [out] pFFTSpec pointer to initialized specification structure.
+ * [in] order base-2 logarithm of the desired block length;
+ * valid in the range [0,12].
+ * [out] pFFTSpec pointer to initialized specification structure.
*
* Return Value:
* Standard omxError result. See enumeration for possible result codes.
@@ -69,8 +69,8 @@
ARMsFFTSpec_SC16 *pFFTStruct = 0;
OMX_S16 x,y,xNeg;
OMX_S32 xS32,yS32;
-
-
+
+
pFFTStruct = (ARMsFFTSpec_SC16 *) pFFTSpec;
/* if order zero no init is needed */
@@ -84,51 +84,51 @@
Nby2 = 1 << (order - 1);
N = Nby2 << 1;
M = N>>3;
-
- pBitRev = NULL ;
-
- pTwiddle = (OMX_SC16 *)
+
+ pBitRev = NULL ;
+
+ pTwiddle = (OMX_SC16 *)
(sizeof(ARMsFFTSpec_SC16) + (OMX_S8*) pFFTSpec);
-
+
/* Align to 32 byte boundary */
pTmp = ((OMX_U32)pTwiddle)&31; /* (OMX_U32)pTwiddle % 32 */
if(pTmp != 0)
- pTwiddle = (OMX_SC16*) ((OMX_S8*)pTwiddle + (32-pTmp));
-
- pBuf = (OMX_SC16 *)
+ pTwiddle = (OMX_SC16*) ((OMX_S8*)pTwiddle + (32-pTmp));
+
+ pBuf = (OMX_SC16 *)
(sizeof(OMX_SC16) * (3*N/4) + (OMX_S8*) pTwiddle);
-
+
/* Align to 32 byte boundary */
pTmp = ((OMX_U32)pBuf)&31; /* (OMX_U32)pBuf % 32 */
if(pTmp != 0)
- pBuf = (OMX_SC16*) ((OMX_S8*)pBuf + (32-pTmp));
+ pBuf = (OMX_SC16*) ((OMX_S8*)pBuf + (32-pTmp));
-
- /*
- * Filling Twiddle factors :
+
+ /*
+ * Filling Twiddle factors :
* The original twiddle table "armSP_FFT_S16TwiddleTable" is of size (MaxSize/8 + 1)
* Rest of the values i.e., upto MaxSize are calculated using the symmetries of sin and cos
* The max size of the twiddle table needed is 3N/4 for a radix-4 stage
*
- * W = (-2 * PI) / N
+ * W = (-2 * PI) / N
* N = 1 << order
* W = -PI >> (order - 1)
*/
-
-
-
+
+
+
diff = 12 - order;
step = 1<<diff; /* step into the twiddle table for the current order */
-
+
xS32 = armSP_FFT_S32TwiddleTable[0];
yS32 = armSP_FFT_S32TwiddleTable[1];
x = (xS32+0x8000)>>16;
y = (yS32+0x8000)>>16;
xNeg = 0x7FFF;
-
- if(order >=3)
+
+ if(order >=3)
{
/* i = 0 case */
pTwiddle[0].Re = x;
@@ -137,17 +137,17 @@
pTwiddle[2*M].Im = xNeg;
pTwiddle[4*M].Re = xNeg;
pTwiddle[4*M].Im = y;
-
-
+
+
for (i=1; i<=M; i++)
{
j = i*step;
-
+
xS32 = armSP_FFT_S32TwiddleTable[2*j];
yS32 = armSP_FFT_S32TwiddleTable[2*j+1];
x = (xS32+0x8000)>>16;
y = (yS32+0x8000)>>16;
-
+
pTwiddle[i].Re = x;
pTwiddle[i].Im = y;
pTwiddle[2*M-i].Re = -y;
@@ -161,8 +161,8 @@
pTwiddle[6*M-i].Re = y;
pTwiddle[6*M-i].Im = x;
}
-
-
+
+
}
else
{
@@ -174,19 +174,19 @@
pTwiddle[1].Im = xNeg;
pTwiddle[2].Re = xNeg;
pTwiddle[2].Im = y;
-
+
}
if (order == 1)
{
pTwiddle[0].Re = x;
pTwiddle[0].Im = y;
-
- }
-
-
+
+ }
+
+
}
-
-
+
+
/* Update the structure */
pFFTStruct->N = N;
pFFTStruct->pTwiddle = pTwiddle;
diff --git a/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S b/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
index f1a8d03..ff85e2b 100644
--- a/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
+++ b/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
@@ -1,334 +1,342 @@
-;//
-;//
-;// File Name: omxSP_FFTInv_CToC_SC16_Sfs_s.s
-;// OpenMAX DL: v1.0.2
-;// Last Modified Revision: 6729
-;// Last Modified Date: Tue, 17 Jul 2007
-;//
-;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-;//
-;//
-;//
-;// Description:
-;// Compute an inverse FFT for a complex signal
-;//
-;//
+@//
+@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@// Use of this source code is governed by a BSD-style license
+@// that can be found in the LICENSE file in the root of the source
+@// tree. An additional intellectual property rights grant can be found
+@// in the file PATENTS. All contributing project authors may
+@// be found in the AUTHORS file in the root of the source tree.
+@//
+@// This file was originally licensed as follows. It has been
+@// relicensed with permission from the copyright holders.
-
-;// Include standard headers
+@//
+@//
+@// File Name: omxSP_FFTInv_CToC_SC16_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision: 6729
+@// Last Modified Date: Tue, 17 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@//
+@//
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
-
- M_VARIANTS CortexA8
-
-;// Import symbols required from other files
-;// (For example tables)
-
- IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
-
-;// Set debugging level
-;//DEBUG_ON SETL {TRUE}
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+ .extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+
+@// Set debugging level
+@//DEBUG_ON SETL {TRUE}
-;// Guarding implementation by the processor name
-
-
-
-;// Guarding implementation by the processor name
-
- IF CortexA8
-
- IMPORT armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
- IMPORT armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
-
-;//Input Registers
-
-pSrc RN 0
-pDst RN 1
-pFFTSpec RN 2
-scale RN 3
+@// Guarding implementation by the processor name
-;// Output registers
-result RN 0
-;//Local Scratch Registers
-
-argTwiddle RN 1
-argDst RN 2
-argScale RN 4
-pTwiddle RN 4
-tmpOrder RN 4
-pOut RN 5
-subFFTSize RN 7
-subFFTNum RN 6
-N RN 6
-order RN 14
-diff RN 9
-count RN 8 ;// Total num of radix stages required to comple the FFT
-x0r RN 4
-x0i RN 5
-diffMinusOne RN 2
-round RN 3
-
-;// Neon registers
-
-dX0 DN D0.S16
-dShift DN D1.S16
-dX0S32 DN D0.S32
+@// Guarding implementation by the processor name
- ;// Allocate stack memory required by the function
+ .extern armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+ .extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@//Input Registers
+
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+#define scale r3
+
+
+@// Output registers
+#define result r0
+
+@//Local Scratch Registers
+
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define tmpOrder r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+@// Total number of radix stages required to complete the FFT
+#define count r8
+#define x0r r4
+#define x0i r5
+#define diffMinusOne r2
+#define round r3
+
+@// Neon registers
+
+#define dX0 D0.S16
+#define dShift D1.S16
+#define dX0S32 D0.S32
+
+
+ @// Allocate stack memory required by the function
M_ALLOC4 diffOnStack, 4
- ;// Write function header
+ @// Write function header
M_START omxSP_FFTInv_CToC_SC16_Sfs,r11,d15
-
- M_STRUCT ARMsFFTSpec
- M_FIELD N, 4
- M_FIELD pBitRev, 4
- M_FIELD pTwiddle, 4
- M_FIELD pBuf, 4
- M_ENDSTRUCT
-
- ;// Define stack arguments
-
- ;// Read the size from structure and take log
+
+@ Structure offsets for the FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @// Define stack arguments
+
+ @// Read the size from structure and take log
LDR N, [pFFTSpec, #ARMsFFTSpec_N]
-
- ;// Read other structure parameters
+
+ @// Read other structure parameters
LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
-
- CLZ order,N ;// N = 2^order
- RSB order,order,#31
+
+ CLZ order,N @// N = 2^order
+ RSB order,order,#31
MOV subFFTSize,#1
- ;//MOV subFFTNum,N
-
- ADD scale,scale,order ;// FFTInverse has a final scaling factor by N
-
+ @//MOV subFFTNum,N
+
+ ADD scale,scale,order @// FFTInverse has a final scaling factor by N
+
CMP order,#3
- BGT orderGreaterthan3 ;// order > 3
-
+ BGT orderGreaterthan3 @// order > 3
+
CMP order,#1
- BGE orderGreaterthan0 ;// order > 0
- M_STR scale, diffOnStack,LT ;// order = 0
+ BGE orderGreaterthan0 @// order > 0
+ M_STR scale, diffOnStack,LT @// order = 0
LDRLT x0r,[pSrc]
STRLT x0r,[pDst]
MOVLT pSrc,pDst
BLT FFTEnd
-
-orderGreaterthan0
- ;// set the buffers appropriately for various orders
+
+orderGreaterthan0:
+ @// set the buffers appropriately for various orders
CMP order,#2
- MOVNE argDst,pDst
+ MOVNE argDst,pDst
MOVEQ argDst,pOut
- MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
+ MOVEQ pOut,pDst @// Pass the first stage destination in RN5
MOV argTwiddle,pTwiddle
- ;// Store the scale factor and scale at the end
+ @// Store the scale factor and scale at the end
SUB diff,scale,order
M_STR diff, diffOnStack
BGE orderGreaterthan1
- BLLT armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// order = 1
+ BLLT armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe @// order = 1
B FFTEnd
-
-
-orderGreaterthan1
- MOV tmpOrder,order ;// tmpOrder = RN 4
- BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+
+
+orderGreaterthan1:
+ MOV tmpOrder,order @// tmpOrder = RN 4
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
CMP tmpOrder,#2
BLGT armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
B FFTEnd
-
-
-orderGreaterthan3
- ;// check scale = 0 or scale = order
- SUBS diff, scale, order ;// scale > order
- MOVGT scale,order
- BGE specialScaleCase ;// scale = 0 or scale = order
+
+
+orderGreaterthan3:
+ @// check scale = 0 or scale = order
+ SUBS diff, scale, order @// scale > order
+ MOVGT scale,order
+ BGE specialScaleCase @// scale = 0 or scale = order
CMP scale,#0
BEQ specialScaleCase
B generalScaleCase
-
-specialScaleCase ;// scale = 0 or scale = order and order > 3
-
- TST order, #2 ;// Set input args to fft stages
- MOVNE argDst,pDst
+
+specialScaleCase: @// scale = 0 or scale = order and order > 3
+
+ TST order, #2 @// Set input args to fft stages
+ MOVNE argDst,pDst
MOVEQ argDst,pOut
- MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
- MOV argTwiddle,pTwiddle
+ MOVEQ pOut,pDst @// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
CMP diff,#0
M_STR diff, diffOnStack
- BGE scaleEqualsOrder
-
- ;//check for even or odd order
- ;// NOTE: The following combination of BL's would work fine eventhough the first
- ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
- ;// armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
-
+ BGE scaleEqualsOrder
+
+ @//check for even or odd order
+ @// NOTE: The following combination of BLs works even though the first
+ @// BL corrupts the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
TST order,#0x00000001
- BLEQ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
- BLNE armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
-
+ BLEQ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
CMP subFFTNum,#4
BLT FFTEnd
-unscaledRadix4Loop
+unscaledRadix4Loop:
BEQ lastStageUnscaledRadix4
BL armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
CMP subFFTNum,#4
B unscaledRadix4Loop
-lastStageUnscaledRadix4
- BL armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
- B FFTEnd
+lastStageUnscaledRadix4:
+ BL armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
-scaleEqualsOrder
- ;//check for even or odd order
- ;// NOTE: The following combination of BL's would work fine eventhough the first
- ;// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
- ;// armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
-
+scaleEqualsOrder:
+ @//check for even or odd order
+ @// NOTE: The following combination of BLs works even though the first
+ @// BL corrupts the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ @// armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
TST order,#0x00000001
- BLEQ armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
- BLNE armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
-
+ BLEQ armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
CMP subFFTNum,#4
BLT FFTEnd
-scaledRadix4Loop
+scaledRadix4Loop:
BEQ lastStageScaledRadix4
BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
CMP subFFTNum,#4
B scaledRadix4Loop
-
-lastStageScaledRadix4
- BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
- B FFTEnd
-
-
-
-generalScaleCase ;// 0 < scale < order and order > 3
- ;// Determine the correct destination buffer
+
+lastStageScaledRadix4:
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+
+
+generalScaleCase: @// 0 < scale < order and order > 3
+ @// Determine the correct destination buffer
SUB diff,order,scale
TST diff,#0x01
- ADDEQ count,scale,diff,LSR #1 ;// count = scale + (order - scale)/2
+ ADDEQ count,scale,diff,LSR #1 @// count = scale + (order - scale)/2
MOVNE count,order
- TST count,#0x01 ;// Is count even or odd ?
-
- MOVNE argDst,pDst ;// Set input args to fft stages
+ TST count,#0x01 @// Is count even or odd ?
+
+ MOVNE argDst,pDst @// Set input args to fft stages
MOVEQ argDst,pOut
- MOVEQ pOut,pDst ;// Pass the first stage destination in RN5
- MOV argTwiddle,pTwiddle
+ MOVEQ pOut,pDst @// Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
CMP diff,#1
- M_STR diff, diffOnStack
- BEQ scaleps ;// scaling including a radix2_ps stage
-
- MOV argScale,scale ;// Put scale in RN4 so as to save and restore
- BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// scaled first stage
+ M_STR diff, diffOnStack
+ BEQ scaleps @// scaling including a radix2_ps stage
+
+ MOV argScale,scale @// Put scale in RN4 so as to save and restore
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe @// scaled first stage
SUBS argScale,argScale,#1
-
-scaledRadix2Loop
+
+scaledRadix2Loop:
BLGT armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
- SUBS argScale,argScale,#1 ;// save and restore scale (RN4) in the scaled stages
+ SUBS argScale,argScale,#1 @// save and restore scale (RN4) in the scaled stages
BGT scaledRadix2Loop
B outScale
-scaleps
- SUB argScale,scale,#1 ;// order>3 and diff=1 => scale >= 3
- BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe ;// scaled first stage
+scaleps:
+ SUB argScale,scale,#1 @// order>3 and diff=1 => scale >= 3
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe @// scaled first stage
SUBS argScale,argScale,#1
-
-scaledRadix2psLoop
- BEQ scaledRadix2psStage
+
+scaledRadix2psLoop:
+ BEQ scaledRadix2psStage
BLGT armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
- SUBS argScale,argScale,#1 ;// save and restore scale (RN4) in the scaled stages
+ SUBS argScale,argScale,#1 @// save and restore scale (RN4) in the scaled stages
BGE scaledRadix2psLoop
-scaledRadix2psStage
+scaledRadix2psStage:
BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
- B generalLastStageUnscaledRadix2
-
-
-outScale
- M_LDR diff, diffOnStack
- ;//check for even or odd order
+ B generalLastStageUnscaledRadix2
+
+
+outScale:
+ M_LDR diff, diffOnStack
+ @//check for even or odd order
TST diff,#0x00000001
BEQ generalUnscaledRadix4Loop
B unscaledRadix2Loop
-generalUnscaledRadix4Loop
+generalUnscaledRadix4Loop:
CMP subFFTNum,#4
BEQ generalLastStageUnscaledRadix4
BL armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
B generalUnscaledRadix4Loop
-
-generalLastStageUnscaledRadix4
- BL armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
- B End
-unscaledRadix2Loop
+generalLastStageUnscaledRadix4:
+ BL armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B End
+
+unscaledRadix2Loop:
CMP subFFTNum,#4
BEQ generalLastTwoStagesUnscaledRadix2
BL armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
- B unscaledRadix2Loop
+ B unscaledRadix2Loop
-generalLastTwoStagesUnscaledRadix2
+generalLastTwoStagesUnscaledRadix2:
BL armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
-generalLastStageUnscaledRadix2
- BL armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2:
+ BL armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
B End
-FFTEnd ;// Does only the scaling
-
- M_LDR diff, diffOnStack
+FFTEnd: @// Does only the scaling
+
+ M_LDR diff, diffOnStack
CMP diff,#0
BLE End
-
- RSB diff,diff,#0 ;// to use VRSHL for right shift by a variable
- VDUP dShift,diff
-
-scaleFFTData ;// N = subFFTSize ; dataptr = pDst ; scale = diff
- VLD1 {dX0S32[0]},[pSrc] ;// pSrc contains pDst pointer
+
+ RSB diff,diff,#0 @// to use VRSHL for right shift by a variable
+ VDUP dShift,diff
+
+scaleFFTData: @// N = subFFTSize ; dataptr = pDst ; scale = diff
+ VLD1 {dX0S32[0]},[pSrc] @// pSrc contains pDst pointer
SUBS subFFTSize,subFFTSize,#1
VRSHL dX0,dShift
VST1 {dX0S32[0]},[pSrc]!
-
+
BGT scaleFFTData
-
-
-End
- ;// Set return value
- MOV result, #OMX_Sts_NoErr
- ;// Write function tail
+
+End:
+ @// Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @// Write function tail
M_END
-
- ENDIF ;//CortexA8
-
-
-
-
- END
\ No newline at end of file
+
+
+
+
+
+ .END
diff --git a/dl/sp/src/test/compare.c b/dl/sp/src/test/compare.c
index 6cf76b4..c2ed0c0 100644
--- a/dl/sp/src/test/compare.c
+++ b/dl/sp/src/test/compare.c
@@ -69,6 +69,45 @@
snr->complex_snr_ = CalculateSNR(complex_signal_power, complex_noise_power);
}
+/* Compute the real, imaginary and complex SNR of actual vs. expected. */
+void CompareComplex16(struct SnrResult* snr, OMX_SC16* actual,
+                      OMX_SC16* expected, int size) {
+  double realSignalPower = 0;
+  double imagSignalPower = 0;
+  double complexSignalPower = 0;
+  double realNoisePower = 0;
+  double imagNoisePower = 0;
+  double complexNoisePower = 0;
+  int k;
+  for (k = 0; k < size; ++k) {
+    double x2;
+    double y2;
+
+    if (verbose > 255) {
+      printf("%4d: (%10d, %10d) (%10d, %10d)\n", k,
+             actual[k].Re, actual[k].Im,
+             expected[k].Re, expected[k].Im);
+    }
+
+    x2 = pow((double) expected[k].Re, 2);
+    y2 = pow((double) expected[k].Im, 2);
+    realSignalPower += x2;
+    imagSignalPower += y2;
+    complexSignalPower += x2 + y2;
+
+    x2 = pow((double) actual[k].Re - expected[k].Re, 2);
+    y2 = pow((double) actual[k].Im - expected[k].Im, 2);
+
+    realNoisePower += x2;
+    imagNoisePower += y2;
+    complexNoisePower += x2 + y2;
+  }
+
+  snr->real_snr_ = CalculateSNR(realSignalPower, realNoisePower);
+  snr->imag_snr_ = CalculateSNR(imagSignalPower, imagNoisePower);
+  snr->complex_snr_ = CalculateSNR(complexSignalPower, complexNoisePower);
+}
+
/*
* Compute the SNR of the actual real signal, returning the SNR.
*/
diff --git a/dl/sp/src/test/compare.h b/dl/sp/src/test/compare.h
index 258c627..348c407 100644
--- a/dl/sp/src/test/compare.h
+++ b/dl/sp/src/test/compare.h
@@ -32,6 +32,8 @@
*/
void CompareComplex32(struct SnrResult* snr, OMX_SC32* actual,
OMX_SC32* expected, int size);
+void CompareComplex16(struct SnrResult* snr, OMX_SC16* actual,
+ OMX_SC16* expected, int size);
void CompareReal32(struct SnrResult* snr, OMX_S32* actual,
OMX_S32* expected, int size);
void CompareReal16(struct SnrResult* snr, OMX_S16* actual,
diff --git a/dl/sp/src/test/test_fft.gyp b/dl/sp/src/test/test_fft.gyp
index ee63818..f47bf83 100644
--- a/dl/sp/src/test/test_fft.gyp
+++ b/dl/sp/src/test/test_fft.gyp
@@ -43,6 +43,14 @@
],
},
{
+ # Test complex fixed-point 16-bit FFT
+ 'target_name': 'test_fft16',
+ 'type': 'executable',
+ 'sources': [
+ 'test_fft16.c',
+ ],
+ },
+ {
# Test complex fixed-point 32-bit FFT
'target_name': 'test_fft32',
'type': 'executable',
@@ -95,6 +103,7 @@
'target_name': 'All',
'type': 'none',
'dependencies': [
+ 'test_fft16',
'test_fft32',
'test_float_fft',
'test_float_rfft',
diff --git a/dl/sp/src/test/test_fft16.c b/dl/sp/src/test/test_fft16.c
new file mode 100644
index 0000000..081bf23
--- /dev/null
+++ b/dl/sp/src/test/test_fft16.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/src/test/aligned_ptr.h"
+#include "dl/sp/src/test/compare.h"
+#include "dl/sp/src/test/gensig.h"
+#include "dl/sp/src/test/test_util.h"
+
+#define MAX_FFT_ORDER 12
+
+int verbose = 0;
+int signal_value = 1024;
+int scale_factor = 0;
+
+struct KnownTestFailures known_failures[] = {
+ {11, 0, 1},
+ {11, 0, 2},
+ {11, 0, 3},
+ {12, 0, 1},
+ {12, 0, 2},
+ {12, 0, 3},
+ { 6, 1, 3},
+ { 7, 1, 3},
+ { 8, 1, 3},
+ { 9, 1, 3},
+ {10, 1, 3},
+ {11, 1, 1},
+ {11, 1, 2},
+ {11, 1, 3},
+ {12, 1, 1},
+ {12, 1, 2},
+ {12, 1, 3},
+ /* Marker to terminate array */
+ {-1, 0, 0}
+};
+
+void TestFFT(int fft_log_size, int signal_type, int scale_factor);
+
+int main(int argc, char* argv[]) {
+  struct Options options;
+
+  SetDefaultOptions(&options, 0, MAX_FFT_ORDER);
+  /* Seed the option defaults from the file-level settings. */
+  options.signal_value_ = signal_value;
+  options.scale_factor_ = scale_factor;
+
+  ProcessCommandLine(&options, argc, argv,
+                     "Test forward and inverse 16-bit fixed-point FFT\n");
+  /* Publish the parsed values through the file-level globals. */
+  verbose = options.verbose_;
+  signal_value = options.signal_value_;
+  scale_factor = options.scale_factor_;
+
+  if (verbose > 255)
+    DumpOptions(stderr, &options);
+
+  if (options.test_mode_) {
+    struct TestInfo info;
+
+    info.real_only_ = options.real_only_;
+    info.max_fft_order_ = options.max_fft_order_;
+    info.min_fft_order_ = options.min_fft_order_;
+    info.do_forward_tests_ = options.do_forward_tests_;
+    info.do_inverse_tests_ = options.do_inverse_tests_;
+    info.known_failures_ = known_failures;
+    /*
+     * These SNR threshold values critically depend on the
+     * signal_value that is set for the tests!
+     */
+    info.forward_threshold_ = 33.01;
+    info.inverse_threshold_ = 35.59;
+
+    RunAllTests(&info);
+  } else {
+    TestFFT(options.fft_log_size_, options.signal_type_,
+            options.scale_factor_);
+  }
+  return 0;
+}
+
+void GenerateSignal(OMX_SC16* x, struct ComplexFloat* fft,
+ struct ComplexFloat* x_true, int size, int sigtype,
+ int scale_factor) {
+ int k;
+
+ GenerateTestSignalAndFFT(x_true, fft, size, sigtype, signal_value, 0);
+
+ /*
+ * Store the generated float signal x_true into the 16-bit array x,
+ * adding 0.5 first (presumably to round). scale_factor is unused.
+ */
+ for (k = 0; k < size; ++k) {
+ x[k].Re = 0.5 + x_true[k].Re;
+ x[k].Im = 0.5 + x_true[k].Im;
+ }
+}
+
+void DumpFFTSpec(OMXFFTSpec_C_SC16* pSpec) {
+  ARMsFFTSpec_SC16* p = (ARMsFFTSpec_SC16*) pSpec;  /* view the spec's fields */
+  printf(" N = %d\n", p->N);
+  printf(" pBitRev = %p\n", (void*) p->pBitRev);  /* %p requires void* */
+  printf(" pTwiddle = %p\n", (void*) p->pTwiddle);
+  printf(" pBuf = %p\n", (void*) p->pBuf);
+}
+
+void TestFFT(int fft_log_size, int signal_type, int scale_factor) {
+  struct SnrResult snr;
+  /* NOTE: scale_factor is currently unused in this function. */
+  RunOneForwardTest(fft_log_size, signal_type, signal_value, &snr);
+  printf("Forward 16-bit fixed-point FFT\n");
+  printf("SNR: real part %f dB\n", snr.real_snr_);
+  printf(" imag part %f dB\n", snr.imag_snr_);
+  printf(" complex part %f dB\n", snr.complex_snr_);
+
+  RunOneInverseTest(fft_log_size, signal_type, signal_value, &snr);
+  printf("Inverse 16-bit fixed-point FFT\n");
+  printf("SNR: real part %f dB\n", snr.real_snr_);
+  printf(" imag part %f dB\n", snr.imag_snr_);
+  printf(" complex part %f dB\n", snr.complex_snr_);
+}
+
+
+float RunOneForwardTest(int fft_log_size, int signal_type,
+ float unused_signal_value,
+ struct SnrResult* snr) {
+ OMX_SC16* x;
+ OMX_SC16* y;
+
+ struct AlignedPtr* x_aligned;
+ struct AlignedPtr* y_aligned;
+
+ struct ComplexFloat* x_true;
+ struct ComplexFloat* y_true;
+ OMX_SC16* y_scaled;
+
+ OMX_INT n, fft_spec_buffer_size;
+ OMXResult status;
+ OMXFFTSpec_C_SC16 * fft_fwd_spec = NULL;
+ int fft_size;
+
+ /*
+ * With 16-bit numbers, we need to be careful to use all of the
+ * available bits to get good accuracy. Hence, set signal_value to
+ * the max 16-bit value (or close to it).
+ *
+ * To get good FFT results, also set the forward FFT scale factor
+ * to be the same as the order. This was determined by
+ * experimentation, so be careful!
+ */
+ signal_value = 32767;
+ scale_factor = fft_log_size;
+
+ fft_size = 1 << fft_log_size;
+
+ status = omxSP_FFTGetBufSize_C_SC16(fft_log_size, &fft_spec_buffer_size);
+ if (verbose > 63) {
+ printf("bufSize = %d\n", fft_spec_buffer_size);
+ }
+
+ fft_fwd_spec = (OMXFFTSpec_C_SC16*) malloc(fft_spec_buffer_size);
+ status = omxSP_FFTInit_C_SC16(fft_fwd_spec, fft_log_size);
+ if (status) {
+ fprintf(stderr, "Failed to init forward FFT: status = %d\n", status);
+ exit(1);
+ }
+
+ x_aligned = AllocAlignedPointer(32, sizeof(*x) * fft_size);
+ y_aligned = AllocAlignedPointer(32, sizeof(*y) * (fft_size + 2));
+
+ x = x_aligned->aligned_pointer_;
+ y = y_aligned->aligned_pointer_;
+
+ x_true = (struct ComplexFloat*) malloc(sizeof(*x_true) * fft_size);
+ y_true = (struct ComplexFloat*) malloc(sizeof(*y_true) * fft_size);
+ y_scaled = (OMX_SC16*) malloc(sizeof(*y_true) * fft_size);
+
+ GenerateSignal(x, y_true, x_true, fft_size, signal_type, scale_factor);
+
+ {
+ float scale = pow(2.0, fft_log_size);
+
+ for (n = 0; n < fft_size; ++n) {
+ y_scaled[n].Re = 0.5 + y_true[n].Re / scale;
+ y_scaled[n].Im = 0.5 + y_true[n].Im / scale;
+ }
+ }
+
+ if (verbose > 63) {
+ printf("Signal\n");
+ DumpArrayComplex16("x", fft_size, x);
+ printf("Expected FFT output\n");
+ DumpArrayComplex16("y", fft_size, y_scaled);
+ }
+
+ status = omxSP_FFTFwd_CToC_SC16_Sfs(x, y, fft_fwd_spec, scale_factor);
+ if (status) {
+ fprintf(stderr, "Forward FFT failed: status = %d\n", status);
+ exit(1);
+ }
+
+ if (verbose > 63) {
+ printf("FFT Output\n");
+ DumpArrayComplex16("y", fft_size, y);
+ }
+
+ CompareComplex16(snr, y, y_scaled, fft_size);
+
+ return snr->complex_snr_;
+}
+
+float RunOneInverseTest(int fft_log_size, int signal_type,
+ float unused_signal_value,
+ struct SnrResult* snr) {
+ OMX_SC16* x;
+ OMX_SC16* y;
+ OMX_SC16* z;
+ OMX_SC16* y_scaled;
+
+ struct AlignedPtr* x_aligned;
+ struct AlignedPtr* y_aligned;
+ struct AlignedPtr* z_aligned;
+ struct AlignedPtr* y_scaled_aligned;
+
+ struct ComplexFloat* x_true;
+ struct ComplexFloat* y_true;
+
+ OMX_INT n, fft_spec_buffer_size;
+ OMXResult status;
+ OMXFFTSpec_C_SC16 * fft_fwd_spec = NULL;
+ OMXFFTSpec_C_SC16 * fft_inv_spec = NULL;
+ int fft_size;
+
+ /*
+ * With 16-bit numbers, we need to be careful to use all of the
+ * available bits to get good accuracy. Hence, set signal_value to
+ * the max 16-bit value (or close to it).
+ *
+ * Unlike the forward test, no scale factor is requested from the
+ * inverse FFT (it is called with 0 below); instead the input
+ * spectrum is scaled up before the call.
+ */
+ signal_value = 32767;
+
+ fft_size = 1 << fft_log_size;
+
+ status = omxSP_FFTGetBufSize_C_SC16(fft_log_size, &fft_spec_buffer_size);
+ if (verbose > 3) {
+ printf("bufSize = %d\n", fft_spec_buffer_size);
+ }
+
+ fft_inv_spec = (OMXFFTSpec_C_SC16*)malloc(fft_spec_buffer_size);
+ status = omxSP_FFTInit_C_SC16(fft_inv_spec, fft_log_size);
+ if (status) {
+ fprintf(stderr, "Failed to init backward FFT: status = %d\n", status);
+ exit(1);
+ }
+
+ x_aligned = AllocAlignedPointer(32, sizeof(*x) * fft_size);
+ y_aligned = AllocAlignedPointer(32, sizeof(*y) * (fft_size + 2));
+ z_aligned = AllocAlignedPointer(32, sizeof(*z) * fft_size);
+ y_scaled_aligned = AllocAlignedPointer(32, sizeof(*y_true) * fft_size);
+
+ x = x_aligned->aligned_pointer_;
+ y = y_aligned->aligned_pointer_;
+ z = z_aligned->aligned_pointer_;
+ y_scaled = y_scaled_aligned->aligned_pointer_;
+
+ y_true = (struct ComplexFloat*) malloc(sizeof(*y_true) * fft_size);
+ x_true = (struct ComplexFloat*) malloc(sizeof(*x_true) * fft_size);
+
+
+ GenerateSignal(x, y_true, x_true, fft_size, signal_type, fft_log_size);
+
+ {
+ /*
+ * To get max accuracy, scale the input to the inverse FFT up
+ * to use as many bits as we can.
+ */
+ float scale = 1;
+ float max = 0;
+
+ for (n = 0; n < fft_size; ++n) {
+ float val;
+ val = fabs(y_true[n].Re);
+ if (val > max) {
+ max = val;
+ }
+ val = fabs(y_true[n].Im);
+ if (val > max) {
+ max = val;
+ }
+ }
+
+ scale = 16384 / max;
+ if (verbose > 63)
+ printf("Inverse FFT input scaled factor %g\n", scale);
+
+ /*
+ * Scale both the true FFT signal and the input so we can
+ * compare them correctly later
+ */
+ for (n = 0; n < fft_size; ++n) {
+ y_scaled[n].Re = 0.5 + y_true[n].Re * scale;
+ y_scaled[n].Im = 0.5 + y_true[n].Im * scale;
+ x_true[n].Re *= scale;
+ x_true[n].Im *= scale;
+ }
+ }
+
+
+ if (verbose > 63) {
+ printf("Inverse FFT Input Signal\n");
+ DumpArrayComplex16("yScaled", fft_size, y_scaled);
+ printf("Expected Inverse FFT Output\n");
+ DumpArrayComplexFloat("x_true", fft_size, (OMX_FC32*) x_true);
+ }
+
+ status = omxSP_FFTInv_CToC_SC16_Sfs(y_scaled, z, fft_inv_spec, 0);
+
+ if (verbose > 7)
+ printf("Inverse FFT scaling = %d\n", status);
+
+ if (verbose > 127) {
+ printf("Raw Inverse FFT Output\n");
+ DumpArrayComplex16("z", fft_size, z);
+ }
+
+ /*
+ * Round the expected time-domain signal (x_true, already multiplied
+ * by the input scale above) to 16-bit values so that it can be
+ * compared directly with the inverse FFT output z.
+ */
+ for (n = 0; n < fft_size; ++n) {
+ x[n].Re = 0.5 + x_true[n].Re;
+ x[n].Im = 0.5 + x_true[n].Im;
+ }
+
+ if (verbose > 63) {
+ printf("Inverse FFT Output\n");
+ printf(" Actual\n");
+ DumpArrayComplex16("z", fft_size, z);
+ printf(" Expected (scaled)\n");
+ DumpArrayComplex16("x", fft_size, x);
+ }
+
+ CompareComplex16(snr, z, x, fft_size);
+
+ return snr->complex_snr_;
+}
diff --git a/dl/sp/src/test/test_util.c b/dl/sp/src/test/test_util.c
index ab989c4..88d697b 100644
--- a/dl/sp/src/test/test_util.c
+++ b/dl/sp/src/test/test_util.c
@@ -378,6 +378,16 @@
}
}
+void DumpArrayComplex16(const char* array_name, int count,
+                        const OMX_SC16* array) {
+  int n;
+  /* The header uses array_name twice: once per column label. */
+  printf("%4s\t%10s.re[n]\t%10s.im[n]\n", "n", array_name, array_name);
+  for (n = 0; n < count; ++n) {
+    printf("%4d\t%16d\t%16d\n", n, array[n].Re, array[n].Im);
+  }
+}
+
void DumpArrayFloat(const char* array_name, int count, const OMX_F32* array) {
int n;
diff --git a/dl/sp/src/test/test_util.h b/dl/sp/src/test/test_util.h
index c75ccc7..5851e12 100644
--- a/dl/sp/src/test/test_util.h
+++ b/dl/sp/src/test/test_util.h
@@ -152,6 +152,8 @@
*/
void DumpArrayReal16(const char* array_name, int count, const OMX_S16* array);
void DumpArrayReal32(const char* array_name, int count, const OMX_S32* array);
+void DumpArrayComplex16(const char* array_name, int count,
+ const OMX_SC16* array);
void DumpArrayComplex32(const char* array_name, int count,
const OMX_SC32* array);
void DumpArrayFloat(const char* array_name, int count, const OMX_F32* array);