dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S - deps/third_party/openmax - Git at Google

 @//
 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 @//
 @//  Use of this source code is governed by a BSD-style license
 @//  that can be found in the LICENSE file in the root of the source
 @//  tree. An additional intellectual property rights grant can be found
 @//  in the file PATENTS.  All contributing project authors may
 @//  be found in the AUTHORS file in the root of the source tree.
 @//
 @//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
 @//  to support float instead of SC32.
 @//

 @//
 @// Description:
 @// Compute FFT for a real signal
 @//
 @//


 @// Include standard headers

 #include "dl/api/armCOMM_s.h"
 #include "dl/api/omxtypes_s.h"


 @// Import symbols required from other files
 @// (For example tables)

         .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
         .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
         .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
         .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
         .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
         .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
         .extern  armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe

 @// Set debugging level
 @//DEBUG_ON    SETL {TRUE}


 @// Guarding implementation by the processor name


     @// Guarding implementation by the processor name

 @// Import symbols required from other files
 @// (For example tables)
         .extern  armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
         .extern  armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe


 @//Input Registers
 @// Incoming arguments, named after the core registers they are read from.
 @// scale is defined but no instruction in the code visible here uses it
 @// (r3 is reused as step).

 #define pSrc            r0
 #define pDst            r1
 #define pFFTSpec        r2
 #define scale           r3


 @// Output registers
 @// result shares r0 with pSrc; it is written only at the End label.
 #define result          r0

 @//Local Scratch Registers
 @// Several names below share one physical register, so writing one
 @// destroys the others. Aliases that are actually used by the routine:
 @//   r1  : pDst, argTwiddle
 @//   r2  : pFFTSpec, argDst
 @//   r3  : step
 @//   r4  : pTwiddle, step1
 @//   r5  : pOut, pTwiddleTmp
 @//   r6  : N, subFFTNum
 @//   r8  : twStep
 @//   r9  : zero
 @//   r14 : order (this is the link register, so every BL overwrites it)
 @// argScale, tmpOrder, diff, count, x0r, x0i, diffMinusOne and
 @// subFFTSizeTmp are defined but not referenced in the code visible here.

 #define argTwiddle      r1
 #define argDst          r2
 #define argScale        r4
 #define tmpOrder        r4
 #define pTwiddle        r4
 #define pOut            r5
 #define subFFTSize      r7
 #define subFFTNum       r6
 #define N               r6
 #define order           r14
 #define diff            r9
 @// Total number of radix stages required to complete the FFT
 #define count           r8
 #define x0r             r4
 #define x0i             r5
 #define diffMinusOne    r2
 #define subFFTSizeTmp   r6
 #define step            r3
 #define step1           r4
 #define twStep          r8
 #define zero            r9
 #define pTwiddleTmp     r5
 #define t0              r10

 @// Neon registers
 @// All names expand to 64-bit D registers with an .f32 (or .s64) type
 @// suffix. Note that:
 @//   - dzero (d1) and dZero (d2) differ only in letter case;
 @//   - dZero and dX0r are both d2, dShift/dX0i/dX1 are all d3;
 @//   - the twiddle inputs dW0r/dW0i/dW1r/dW1i and the outputs
 @//     dY0r/dY0i/dY1r/dY1i occupy the same registers d14-d17;
 @//   - qT0..qT3 carry a q prefix but are defined as the D registers
 @//     d10, d12, d18 and d20.

 #define dX0       d0.f32
 #define dzero     d1.f32
 #define dZero     d2.f32
 #define dShift    d3.f32
 #define dX0r      d2.f32
 #define dX0i      d3.f32
 #define dX1r      d4.f32
 #define dX1i      d5.f32
 #define dT0       d6.f32
 #define dT1       d7.f32
 #define dT2       d8.f32
 #define dT3       d9.f32
 #define qT0       d10.f32
 #define qT1       d12.f32
 #define dW0r      d14.f32
 #define dW0i      d15.f32
 #define dW1r      d16.f32
 #define dW1i      d17.f32
 #define dY0r      d14.f32
 #define dY0i      d15.f32
 #define dY1r      d16.f32
 #define dY1i      d17.f32
 #define dY0rS64   d14.s64
 #define dY0iS64   d15.s64
 #define qT2       d18.f32
 #define qT3       d20.f32
 @// Names for a last-three-elements case. None of dX1, dW0, dW1 or
 @// dY0..dY3 is referenced in the code visible here.
 #define dX1       d3.f32
 #define dW0       d4.f32
 #define dW1       d5.f32
 #define dY0       d10.f32
 #define dY1       d11.f32
 #define dY2       d12.f32
 #define dY3       d13.f32

 @// half shares d0 with dX0; only lane 0 is loaded and used.
 #define half      d0.f32

 @// Single-precision constant 0.5. The routine takes its address with
 @// LDR t0, =HALF and loads the value into half[0] just before
 @// evenOddButterflyLoop.
 @// NOTE(review): LDR with =HALF goes through a literal-pool entry holding
 @// the address of HALF - confirm this is acceptable for
 @// position-independent builds.
 HALF:   .float  0.5

     @// Allocate stack memory required by the function

     @// ------------------------------------------------------------------
     @// omxSP_FFTFwd_RToCCS_F32_Sfs
     @//
     @// Forward FFT of a real float signal (see the file description).
     @//
     @// In:   r0 = pSrc, r1 = pDst, r2 = pFFTSpec
     @// Out:  r0 = OMX_Sts_NoErr (every path ends at the End label, which
     @//       is the only place result is written)
     @//
     @// Outline:
     @//   1. Read N, the twiddle pointer and the buffer pointer from the
     @//      spec structure.
     @//   2. N <= 1: store the input sample followed by two zero words.
     @//   3. Otherwise halve N and run the complex FFT stages by calling
     @//      the external armSP_FFTFwd_CToC_FC32_* routines, chosen by
     @//      order = log2(N/2).
     @//   4. finalComplexToRealFixup: combine the complex result into the
     @//      real-signal spectrum and store it.
     @//
     @// The stage routines are external. The code below relies on them to
     @// update pSrc, argDst, argTwiddle, subFFTSize and subFFTNum between
     @// calls - TODO confirm against their sources.
     @//
     @// order is r14 (lr), so each BL destroys it; on every path all tests
     @// of order are made before the first BL.
     @//
     @// M_START/M_END come from armCOMM_s.h; presumably they emit the
     @// prologue/epilogue saving registers up to r11 and d15 - verify there.
     @// ------------------------------------------------------------------

     @// Write function header
         M_START     omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15

 @ Structure offsets for the FFTSpec
         .set    ARMsFFTSpec_N, 0
         .set    ARMsFFTSpec_pBitRev, 4
         .set    ARMsFFTSpec_pTwiddle, 8
         .set    ARMsFFTSpec_pBuf, 12

         @// Define stack arguments

         @// Read the size from structure and take log
         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

         @// Read other structure parameters
         @// pFFTSpec (r2) is not needed after these loads; r2 is reused
         @// as argDst further down.
         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

         @//  N=1 Treat separately
         @// Any N <= 1 takes this path (signed compare). It stores three
         @// words to pDst: the input sample, 0, 0.
         CMP     N,#1
         BGT     sizeGreaterThanOne
         VLD1    dX0[0],[pSrc]
         MOV     zero,#0
         VMOV    dzero[0],zero
         VMOV    dZero[0],zero
         VST3    {dX0[0],dzero[0],dZero[0]},[pDst]

         B       End


 sizeGreaterThanOne:
         @// Do a N/2 point complex FFT including the scaling

         MOV     N,N,ASR #1                          @// N/2 point complex FFT

         @// order = 31 - clz(N) for the halved N
         CLZ     order,N                             @// N = 2^order
         RSB     order,order,#31
         MOV     subFFTSize,#1
         @//MOV     subFFTNum,N

         CMP     order,#3
         BGT     orderGreaterthan3                   @// order > 3

         CMP     order,#1
         BGE     orderGreaterthan0                   @// order > 0
         @// order == 0: a single complex point. Copy the 8 input bytes to
         @// the buffer unchanged and let the fixup read them from there.
         @// No instruction below changes the flags set by CMP order,#1,
         @// so the BLT is always taken on this path.
         VLD1    dX0,[pSrc]
         VST1    dX0,[pOut]
         MOV     pSrc,pOut
         MOV     argDst,pDst
         BLT     FFTEnd

 orderGreaterthan0:
         @// set the buffers appropriately for various orders
         @// order == 2: argDst = pDst.
         @// otherwise : argDst = buffer from the spec, and pOut = pDst.
         @// The MOVNE pair must stay in this order: the first still reads
         @// the old pOut, the second still reads pDst (r1) before r1 is
         @// overwritten as argTwiddle.
         CMP     order,#2
         MOVEQ   argDst,pDst
         MOVNE   argDst,pOut
         @// Pass the first stage destination in RN5
         MOVNE   pOut,pDst
         MOV     argTwiddle,pTwiddle

         CMP     order,#1
         BGT     orderGreaterthan1
         @// order = 1
         BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
         B       FFTEnd

 orderGreaterthan1:
         @// order (r14) is still valid here: no BL has run on this path.
         CMP     order,#2
         BGT     orderGreaterthan2
         @// order =2
         BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
         BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
         B       FFTEnd

 orderGreaterthan2:@// order =3
         @// Three radix-2 calls: the fs, the plain and the ls variant.
         BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
         BL      armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
         BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe

         B       FFTEnd


 orderGreaterthan3:
 specialScaleCase:

         @// Set input args to fft stages
         @// Bit 1 of order clear: argDst = pDst.
         @// Bit 1 of order set  : argDst = buffer from the spec, pOut = pDst.
         TST     order, #2
         MOVEQ   argDst,pDst
         MOVNE   argDst,pOut
         @// Pass the first stage destination in RN5
         MOVNE   pOut,pDst
         MOV     argTwiddle,pTwiddle

         @//check for even or odd order
         @// NOTE: The following combination of BL's would work fine even though
         @// the first BL would corrupt the flags. This is because the end of
         @// the "grpZeroSetLoop" loop inside
         @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
         @// to EQ

         @// Even order: Radix4 first stage. Odd order: Radix8 first stage.
         TST     order,#0x00000001
         BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
         BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe

         @// subFFTNum < 4: done. subFFTNum == 4: run the ls stage.
         @// Otherwise run a plain Radix4 stage and test again. The flags
         @// tested by BEQ come from the CMP here on the first pass and
         @// from the CMP inside the loop afterwards.
         CMP        subFFTNum,#4
         BLT     FFTEnd


 unscaledRadix4Loop:
         BEQ        lastStageUnscaledRadix4
          BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
          CMP        subFFTNum,#4
          B        unscaledRadix4Loop

 lastStageUnscaledRadix4:
         BL      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
         B        FFTEnd


 FFTEnd:
 finalComplexToRealFixup:
         @// From here on pSrc is read as the complex FFT result Z and
         @// argDst is written with the output F. subFFTSize is used as the
         @// complex point count N/2 (it is 1 on the order == 0 path).


         @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
         @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
         @// 1/2[2a+j0] - j [0+j2b]
         @// (a+b, 0)

         @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
         @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
         @// 1/2[2a+j0] + j [0+j2b]
         @// (a-b, 0)

         @// F(0) and F(N/2)
         @// Load Z(0) as (re, im) into lane 0 of dX0r/dX0i and advance
         @// pSrc by 8 bytes; lane 1 of both registers is cleared so the
         @// imaginary parts of F(0) and F(N/2) come out as zero.
         VLD2    {dX0r[0],dX0i[0]},[pSrc]!
         MOV     zero,#0
         VMOV    dX0r[1],zero
         MOV     step,subFFTSize,LSL #3            @// step = N/2 * 8 bytes
         VMOV    dX0i[1],zero
         @// twStep = 3N/8 * 8 bytes pointing to W^1
         SUB     twStep,step,subFFTSize,LSL #1

         VADD    dY0r,dX0r,dX0i                    @// F(0) = ((Z0.r+Z0.i) , 0)
         MOV     step1,subFFTSize,LSL #2           @// step1 = N/2 * 4 bytes
         VSUB    dY0i,dX0r,dX0i                    @// F(N/2) = ((Z0.r-Z0.i) , 0)
         @// The flags set here are consumed by BLT End / BEQ lastElement
         @// below; nothing in between modifies them.
         SUBS    subFFTSize,subFFTSize,#2

         @// Store F(0) at argDst and F(N/2) step bytes further on, then
         @// rewind so that argDst ends up 8 bytes past its starting value.
         VST1    dY0r,[argDst],step
         ADD     pTwiddleTmp,argTwiddle,#8         @// W^2
         VST1    dY0i,[argDst]!
         ADD     argTwiddle,argTwiddle,twStep      @// W^1

         @// dzero is not read again after this point.
         VDUP    dzero,zero
         SUB     argDst,argDst,step

         @// subFFTSize was 1: nothing more to do.
         @// subFFTSize was 2: only the middle element remains.
         BLT     End
         BEQ     lastElement
         SUB     step,step,#24
         SUB     step1,step1,#8                    @// (N/4-1)*8 bytes

         @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
         @// Note: W^k is stored as negative values in the table
         @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
         @// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)


         @// half[0] = 0.5, used as the scalar operand of the VMULs below.
         LDR     t0, =HALF
         VLD1    half[0], [t0]

 evenOddButterflyLoop:
         @// Each pass consumes two complex inputs from the front (pSrc) and
         @// two from the mirrored position step bytes further on, and
         @// writes two outputs at argDst plus two at the mirrored position.
         @// Every load/store pair is followed by a SUB of the same stride,
         @// so the net pointer advance per pass is 8 bytes for the twiddle
         @// pointers and 16 bytes for pSrc and argDst.


         VLD1    dW0r,[argTwiddle],step1
         VLD1    dW1r,[argTwiddle]!

         VLD2    {dX0r,dX0i},[pSrc],step
         SUB     argTwiddle,argTwiddle,step1
         VLD2    {dX1r,dX1i},[pSrc]!


         SUB     step1,step1,#8                    @// (N/4-2)*8 bytes
         VLD1    dW0i,[pTwiddleTmp],step1
         VLD1    dW1i,[pTwiddleTmp]!
         SUB     pSrc,pSrc,step

         SUB     pTwiddleTmp,pTwiddleTmp,step1
         @// Swap the two lanes of the mirrored inputs so they pair up
         @// with the front inputs lane by lane.
         VREV64  dX1r,dX1r
         VREV64  dX1i,dX1i
         @// Loop counter; the flags are tested by BGT at the loop bottom.
         @// No instruction between here and there sets flags.
         SUBS    subFFTSize,subFFTSize,#4


         VSUB    dT2,dX0r,dX1r                     @// a-c
         SUB     step1,step1,#8
         VADD    dT0,dX0r,dX1r                     @// a+c
         VSUB    dT1,dX0i,dX1i                     @// b-d
         VADD    dT3,dX0i,dX1i                     @// b+d
         VMUL   dT0,dT0,half[0]
         VMUL   dT1,dT1,half[0]
         @// Interleave the lanes of each twiddle register pair.
         VZIP    dW1r,dW1i
         VZIP    dW0r,dW0i


         @// qT0 = W1r*(a-c) + W1i*(b+d)
         @// qT1 = W1r*(b+d) - W1i*(a-c)
         @// qT2 = W0r*(a-c) - W0i*(b+d)
         @// qT3 = W0r*(b+d) + W0i*(a-c)
         VMUL   qT0,dW1r,dT2
         VMUL   qT1,dW1r,dT3
         VMUL   qT2,dW0r,dT2
         VMUL   qT3,dW0r,dT3

         VMLA   qT0,dW1i,dT3
         VMLS   qT1,dW1i,dT2

         VMLS   qT2,dW0i,dT3
         VMLA   qT3,dW0i,dT2


         VMUL  dX1r,qT0,half[0]
         VMUL  dX1i,qT1,half[0]

         @// dY1r/dY1i overwrite d16/d17, which held dW1r/dW1i; those
         @// twiddles are no longer needed once qT0/qT1 are complete.
         VSUB    dY1r,dT0,dX1i                     @// F(N/2 -1)
         VADD    dY1i,dT1,dX1r
         VNEG    dY1i,dY1i

         @// Swap lanes back so the mirrored outputs are stored in
         @// ascending address order.
         VREV64  dY1r,dY1r
         VREV64  dY1i,dY1i


         VMUL  dX0r,qT2,half[0]
         VMUL  dX0i,qT3,half[0]

         @// dY0r/dY0i likewise overwrite d14/d15 (dW0r/dW0i).
         VSUB    dY0r,dT0,dX0i                     @// F(1)
         VADD    dY0i,dT1,dX0r


         VST2    {dY0r,dY0i},[argDst],step
         VST2    {dY1r,dY1i},[argDst]!
         SUB     argDst,argDst,step
         SUB     step,step,#32                     @// (N/2-4)*8 bytes


         BGT     evenOddButterflyLoop

         @// set both the ptrs to the last element
         SUB     pSrc,pSrc,#8
         SUB     argDst,argDst,#8


         @// Last element can be expanded as follows
         @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
         @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
         @// 1/2[2a+j0] + j (c+jd) [0+j2b]
         @// (a-bc, -bd)
         @// Since (c,d) = (0,1) for the last element, result is just (a,-b)

 lastElement:
         @// Load (a, b), store a, negate the register, then store lane 1,
         @// which now holds -b.
         VLD1    dX0r,[pSrc]

         VST1    dX0r[0],[argDst]!
         VNEG    dX0r,dX0r
         VST1    dX0r[1],[argDst]!

 End:
         @// Set return value
         MOV     result, #OMX_Sts_NoErr

         @// Write function tail
         M_END

         .end
	@//
	@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
	@//
	@// Use of this source code is governed by a BSD-style license
	@// that can be found in the LICENSE file in the root of the source
	@// tree. An additional intellectual property rights grant can be found
	@// in the file PATENTS. All contributing project authors may
	@// be found in the AUTHORS file in the root of the source tree.
	@//
	@// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
	@// to support float instead of SC32.
	@//

	@//
	@// Description:
	@// Compute FFT for a real signal
	@//
	@//


	@// Include standard headers

	#include "dl/api/armCOMM_s.h"
	#include "dl/api/omxtypes_s.h"


	@// Import symbols required from other files
	@// (For example tables)

	.extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
	.extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
	.extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
	.extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
	.extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
	.extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
	.extern armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe

	@// Set debugging level
	@//DEBUG_ON SETL {TRUE}



	@// Guarding implementation by the processor name



	@// Guarding implementation by the processor name

	@// Import symbols required from other files
	@// (For example tables)
	.extern armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
	.extern armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe


	@//Input Registers
	@// NOTE(review): these definitions repeat, token for token, the ones
	@// that appear before the .end directive earlier in this file -
	@// confirm whether this second copy is intentional.
	@// Incoming arguments, named after the core registers they are read from.
	@// scale is defined but no instruction in the code visible here uses it
	@// (r3 is reused as step).

	#define pSrc r0
	#define pDst r1
	#define pFFTSpec r2
	#define scale r3


	@// Output registers
	@// result shares r0 with pSrc; it is written only at the End label.
	#define result r0

	@//Local Scratch Registers
	@// Several names below share one physical register, so writing one
	@// destroys the others. Aliases that are actually used by the routine:
	@//   r1  : pDst, argTwiddle
	@//   r2  : pFFTSpec, argDst
	@//   r3  : step
	@//   r4  : pTwiddle, step1
	@//   r5  : pOut, pTwiddleTmp
	@//   r6  : N, subFFTNum
	@//   r8  : twStep
	@//   r9  : zero
	@//   r14 : order (this is the link register, so every BL overwrites it)
	@// argScale, tmpOrder, diff, count, x0r, x0i, diffMinusOne and
	@// subFFTSizeTmp are defined but not referenced in the code visible here.

	#define argTwiddle r1
	#define argDst r2
	#define argScale r4
	#define tmpOrder r4
	#define pTwiddle r4
	#define pOut r5
	#define subFFTSize r7
	#define subFFTNum r6
	#define N r6
	#define order r14
	#define diff r9
	@// Total number of radix stages required to complete the FFT
	#define count r8
	#define x0r r4
	#define x0i r5
	#define diffMinusOne r2
	#define subFFTSizeTmp r6
	#define step r3
	#define step1 r4
	#define twStep r8
	#define zero r9
	#define pTwiddleTmp r5
	#define t0 r10

	@// Neon registers
	@// All names expand to 64-bit D registers with an .f32 (or .s64) type
	@// suffix. Note that:
	@//   - dzero (d1) and dZero (d2) differ only in letter case;
	@//   - dZero and dX0r are both d2, dShift/dX0i/dX1 are all d3;
	@//   - the twiddle inputs dW0r/dW0i/dW1r/dW1i and the outputs
	@//     dY0r/dY0i/dY1r/dY1i occupy the same registers d14-d17;
	@//   - qT0..qT3 carry a q prefix but are defined as the D registers
	@//     d10, d12, d18 and d20.

	#define dX0 d0.f32
	#define dzero d1.f32
	#define dZero d2.f32
	#define dShift d3.f32
	#define dX0r d2.f32
	#define dX0i d3.f32
	#define dX1r d4.f32
	#define dX1i d5.f32
	#define dT0 d6.f32
	#define dT1 d7.f32
	#define dT2 d8.f32
	#define dT3 d9.f32
	#define qT0 d10.f32
	#define qT1 d12.f32
	#define dW0r d14.f32
	#define dW0i d15.f32
	#define dW1r d16.f32
	#define dW1i d17.f32
	#define dY0r d14.f32
	#define dY0i d15.f32
	#define dY1r d16.f32
	#define dY1i d17.f32
	#define dY0rS64 d14.s64
	#define dY0iS64 d15.s64
	#define qT2 d18.f32
	#define qT3 d20.f32
	@// Names for a last-three-elements case. None of dX1, dW0, dW1 or
	@// dY0..dY3 is referenced in the code visible here.
	#define dX1 d3.f32
	#define dW0 d4.f32
	#define dW1 d5.f32
	#define dY0 d10.f32
	#define dY1 d11.f32
	#define dY2 d12.f32
	#define dY3 d13.f32

	@// half shares d0 with dX0; only lane 0 is loaded and used.
	#define half d0.f32

	@// NOTE(review): this second copy of the routine comes after a .end
	@// directive earlier in the file, so the assembler is expected to stop
	@// before reaching it - confirm, and consider removing the duplicate.
	@//
	@// Single-precision constant 0.5. The routine takes its address with
	@// LDR t0, =HALF and loads the value into half[0] just before
	@// evenOddButterflyLoop.
	@// NOTE(review): LDR with =HALF goes through a literal-pool entry holding
	@// the address of HALF - confirm this is acceptable for
	@// position-independent builds.
	HALF: .float 0.5

	@// Allocate stack memory required by the function

	@// ------------------------------------------------------------------
	@// omxSP_FFTFwd_RToCCS_F32_Sfs
	@//
	@// Forward FFT of a real float signal (see the file description).
	@//
	@// In:   r0 = pSrc, r1 = pDst, r2 = pFFTSpec
	@// Out:  r0 = OMX_Sts_NoErr (every path ends at the End label, which
	@//       is the only place result is written)
	@//
	@// Outline:
	@//   1. Read N, the twiddle pointer and the buffer pointer from the
	@//      spec structure.
	@//   2. N <= 1: store the input sample followed by two zero words.
	@//   3. Otherwise halve N and run the complex FFT stages by calling
	@//      the external armSP_FFTFwd_CToC_FC32_* routines, chosen by
	@//      order = log2(N/2).
	@//   4. finalComplexToRealFixup: combine the complex result into the
	@//      real-signal spectrum and store it.
	@//
	@// The stage routines are external. The code below relies on them to
	@// update pSrc, argDst, argTwiddle, subFFTSize and subFFTNum between
	@// calls - TODO confirm against their sources.
	@//
	@// order is r14 (lr), so each BL destroys it; on every path all tests
	@// of order are made before the first BL.
	@//
	@// M_START/M_END come from armCOMM_s.h; presumably they emit the
	@// prologue/epilogue saving registers up to r11 and d15 - verify there.
	@// ------------------------------------------------------------------

	@// Write function header
	M_START omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15

	@ Structure offsets for the FFTSpec
	.set ARMsFFTSpec_N, 0
	.set ARMsFFTSpec_pBitRev, 4
	.set ARMsFFTSpec_pTwiddle, 8
	.set ARMsFFTSpec_pBuf, 12

	@// Define stack arguments

	@// Read the size from structure and take log
	LDR N, [pFFTSpec, #ARMsFFTSpec_N]

	@// Read other structure parameters
	@// pFFTSpec (r2) is not needed after these loads; r2 is reused
	@// as argDst further down.
	LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
	LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

	@// N=1 Treat separately
	@// Any N <= 1 takes this path (signed compare). It stores three
	@// words to pDst: the input sample, 0, 0.
	CMP N,#1
	BGT sizeGreaterThanOne
	VLD1 dX0[0],[pSrc]
	MOV zero,#0
	VMOV dzero[0],zero
	VMOV dZero[0],zero
	VST3 {dX0[0],dzero[0],dZero[0]},[pDst]

	B End



	sizeGreaterThanOne:
	@// Do a N/2 point complex FFT including the scaling

	MOV N,N,ASR #1 @// N/2 point complex FFT

	@// order = 31 - clz(N) for the halved N
	CLZ order,N @// N = 2^order
	RSB order,order,#31
	MOV subFFTSize,#1
	@//MOV subFFTNum,N

	CMP order,#3
	BGT orderGreaterthan3 @// order > 3

	CMP order,#1
	BGE orderGreaterthan0 @// order > 0
	@// order == 0: a single complex point. Copy the 8 input bytes to
	@// the buffer unchanged and let the fixup read them from there.
	@// No instruction below changes the flags set by CMP order,#1,
	@// so the BLT is always taken on this path.
	VLD1 dX0,[pSrc]
	VST1 dX0,[pOut]
	MOV pSrc,pOut
	MOV argDst,pDst
	BLT FFTEnd

	orderGreaterthan0:
	@// set the buffers appropriately for various orders
	@// order == 2: argDst = pDst.
	@// otherwise : argDst = buffer from the spec, and pOut = pDst.
	@// The MOVNE pair must stay in this order: the first still reads
	@// the old pOut, the second still reads pDst (r1) before r1 is
	@// overwritten as argTwiddle.
	CMP order,#2
	MOVEQ argDst,pDst
	MOVNE argDst,pOut
	@// Pass the first stage destination in RN5
	MOVNE pOut,pDst
	MOV argTwiddle,pTwiddle

	CMP order,#1
	BGT orderGreaterthan1
	@// order = 1
	BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
	B FFTEnd

	orderGreaterthan1:
	@// order (r14) is still valid here: no BL has run on this path.
	CMP order,#2
	BGT orderGreaterthan2
	@// order =2
	BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
	BL armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
	B FFTEnd

	orderGreaterthan2:@// order =3
	@// Three radix-2 calls: the fs, the plain and the ls variant.
	BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
	BL armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
	BL armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe

	B FFTEnd



	orderGreaterthan3:
	specialScaleCase:

	@// Set input args to fft stages
	@// Bit 1 of order clear: argDst = pDst.
	@// Bit 1 of order set  : argDst = buffer from the spec, pOut = pDst.
	TST order, #2
	MOVEQ argDst,pDst
	MOVNE argDst,pOut
	@// Pass the first stage destination in RN5
	MOVNE pOut,pDst
	MOV argTwiddle,pTwiddle

	@//check for even or odd order
	@// NOTE: The following combination of BL's would work fine even though
	@// the first BL would corrupt the flags. This is because the end of
	@// the "grpZeroSetLoop" loop inside
	@// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
	@// to EQ

	@// Even order: Radix4 first stage. Odd order: Radix8 first stage.
	TST order,#0x00000001
	BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
	BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe

	@// subFFTNum < 4: done. subFFTNum == 4: run the ls stage.
	@// Otherwise run a plain Radix4 stage and test again. The flags
	@// tested by BEQ come from the CMP here on the first pass and
	@// from the CMP inside the loop afterwards.
	CMP subFFTNum,#4
	BLT FFTEnd


	unscaledRadix4Loop:
	BEQ lastStageUnscaledRadix4
	BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
	CMP subFFTNum,#4
	B unscaledRadix4Loop

	lastStageUnscaledRadix4:
	BL armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
	B FFTEnd


	FFTEnd:
	finalComplexToRealFixup:
	@// From here on pSrc is read as the complex FFT result Z and
	@// argDst is written with the output F. subFFTSize is used as the
	@// complex point count N/2 (it is 1 on the order == 0 path).


	@// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
	@// 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
	@// 1/2[2a+j0] - j [0+j2b]
	@// (a+b, 0)

	@// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
	@// 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
	@// 1/2[2a+j0] + j [0+j2b]
	@// (a-b, 0)

	@// F(0) and F(N/2)
	@// Load Z(0) as (re, im) into lane 0 of dX0r/dX0i and advance
	@// pSrc by 8 bytes; lane 1 of both registers is cleared so the
	@// imaginary parts of F(0) and F(N/2) come out as zero.
	VLD2 {dX0r[0],dX0i[0]},[pSrc]!
	MOV zero,#0
	VMOV dX0r[1],zero
	MOV step,subFFTSize,LSL #3 @// step = N/2 * 8 bytes
	VMOV dX0i[1],zero
	@// twStep = 3N/8 * 8 bytes pointing to W^1
	SUB twStep,step,subFFTSize,LSL #1

	VADD dY0r,dX0r,dX0i @// F(0) = ((Z0.r+Z0.i) , 0)
	MOV step1,subFFTSize,LSL #2 @// step1 = N/2 * 4 bytes
	VSUB dY0i,dX0r,dX0i @// F(N/2) = ((Z0.r-Z0.i) , 0)
	@// The flags set here are consumed by BLT End / BEQ lastElement
	@// below; nothing in between modifies them.
	SUBS subFFTSize,subFFTSize,#2

	@// Store F(0) at argDst and F(N/2) step bytes further on, then
	@// rewind so that argDst ends up 8 bytes past its starting value.
	VST1 dY0r,[argDst],step
	ADD pTwiddleTmp,argTwiddle,#8 @// W^2
	VST1 dY0i,[argDst]!
	ADD argTwiddle,argTwiddle,twStep @// W^1

	@// dzero is not read again after this point.
	VDUP dzero,zero
	SUB argDst,argDst,step

	@// subFFTSize was 1: nothing more to do.
	@// subFFTSize was 2: only the middle element remains.
	BLT End
	BEQ lastElement
	SUB step,step,#24
	SUB step1,step1,#8 @// (N/4-1)*8 bytes

	@// F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
	@// Note: W^k is stored as negative values in the table
	@// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
	@// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)


	@// half[0] = 0.5, used as the scalar operand of the VMULs below.
	LDR t0, =HALF
	VLD1 half[0], [t0]

	evenOddButterflyLoop:
	@// Each pass consumes two complex inputs from the front (pSrc) and
	@// two from the mirrored position step bytes further on, and
	@// writes two outputs at argDst plus two at the mirrored position.
	@// Every load/store pair is followed by a SUB of the same stride,
	@// so the net pointer advance per pass is 8 bytes for the twiddle
	@// pointers and 16 bytes for pSrc and argDst.


	VLD1 dW0r,[argTwiddle],step1
	VLD1 dW1r,[argTwiddle]!

	VLD2 {dX0r,dX0i},[pSrc],step
	SUB argTwiddle,argTwiddle,step1
	VLD2 {dX1r,dX1i},[pSrc]!



	SUB step1,step1,#8 @// (N/4-2)*8 bytes
	VLD1 dW0i,[pTwiddleTmp],step1
	VLD1 dW1i,[pTwiddleTmp]!
	SUB pSrc,pSrc,step

	SUB pTwiddleTmp,pTwiddleTmp,step1
	@// Swap the two lanes of the mirrored inputs so they pair up
	@// with the front inputs lane by lane.
	VREV64 dX1r,dX1r
	VREV64 dX1i,dX1i
	@// Loop counter; the flags are tested by BGT at the loop bottom.
	@// No instruction between here and there sets flags.
	SUBS subFFTSize,subFFTSize,#4



	VSUB dT2,dX0r,dX1r @// a-c
	SUB step1,step1,#8
	VADD dT0,dX0r,dX1r @// a+c
	VSUB dT1,dX0i,dX1i @// b-d
	VADD dT3,dX0i,dX1i @// b+d
	VMUL dT0,dT0,half[0]
	VMUL dT1,dT1,half[0]
	@// Interleave the lanes of each twiddle register pair.
	VZIP dW1r,dW1i
	VZIP dW0r,dW0i


	@// qT0 = W1r*(a-c) + W1i*(b+d)
	@// qT1 = W1r*(b+d) - W1i*(a-c)
	@// qT2 = W0r*(a-c) - W0i*(b+d)
	@// qT3 = W0r*(b+d) + W0i*(a-c)
	VMUL qT0,dW1r,dT2
	VMUL qT1,dW1r,dT3
	VMUL qT2,dW0r,dT2
	VMUL qT3,dW0r,dT3

	VMLA qT0,dW1i,dT3
	VMLS qT1,dW1i,dT2

	VMLS qT2,dW0i,dT3
	VMLA qT3,dW0i,dT2


	VMUL dX1r,qT0,half[0]
	VMUL dX1i,qT1,half[0]

	@// dY1r/dY1i overwrite d16/d17, which held dW1r/dW1i; those
	@// twiddles are no longer needed once qT0/qT1 are complete.
	VSUB dY1r,dT0,dX1i @// F(N/2 -1)
	VADD dY1i,dT1,dX1r
	VNEG dY1i,dY1i

	@// Swap lanes back so the mirrored outputs are stored in
	@// ascending address order.
	VREV64 dY1r,dY1r
	VREV64 dY1i,dY1i


	VMUL dX0r,qT2,half[0]
	VMUL dX0i,qT3,half[0]

	@// dY0r/dY0i likewise overwrite d14/d15 (dW0r/dW0i).
	VSUB dY0r,dT0,dX0i @// F(1)
	VADD dY0i,dT1,dX0r


	VST2 {dY0r,dY0i},[argDst],step
	VST2 {dY1r,dY1i},[argDst]!
	SUB argDst,argDst,step
	SUB step,step,#32 @// (N/2-4)*8 bytes


	BGT evenOddButterflyLoop

	@// set both the ptrs to the last element
	SUB pSrc,pSrc,#8
	SUB argDst,argDst,#8



	@// Last element can be expanded as follows
	@// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
	@// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
	@// 1/2[2a+j0] + j (c+jd) [0+j2b]
	@// (a-bc, -bd)
	@// Since (c,d) = (0,1) for the last element, result is just (a,-b)

	lastElement:
	@// Load (a, b), store a, negate the register, then store lane 1,
	@// which now holds -b.
	VLD1 dX0r,[pSrc]

	VST1 dX0r[0],[argDst]!
	VNEG dX0r,dX0r
	VST1 dX0r[1],[argDst]!

	End:
	@// Set return value
	MOV result, #OMX_Sts_NoErr

	@// Write function tail
	M_END

	.end