 @// dl/sp/src/arm/armv7/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S - deps/third_party/openmax - Git at Google

 @//
 @//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 @//
 @//  Use of this source code is governed by a BSD-style license
 @//  that can be found in the LICENSE file in the root of the source
 @//  tree. An additional intellectual property rights grant can be found
 @//  in the file PATENTS.  All contributing project authors may
 @//  be found in the AUTHORS file in the root of the source tree.
 @//
 @//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
 @//  to support float instead of SC32.
 @//

 @//
 @// Description:
 @// Compute FFT for a real signal
 @//
 @//


 @// Include standard headers

 #include "dl/api/arm/armCOMM_s.h"
 #include "dl/api/arm/omxtypes_s.h"

 @//        M_VARIANTS ARM1136JS

 @// Import symbols required from other files
 @// (For example tables)

         .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
         .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
         .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
         .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp

 @// Set debugging level
 @//DEBUG_ON    SETL {TRUE}


 @// Guarding implementation by the processor name

 @//    IF  ARM1136JS

 @// Register aliases used by omxSP_FFTFwd_RToCCS_F32_Sfs_vfp.
 @//
 @// Several names deliberately map to the same physical register (for
 @// example N, subFFTNum, step1 and t4 are all r6), so only one name of
 @// each group can be live at any point in the code.
 @//
 @// The following aliases are not referenced by the code visible in this
 @// file: scaleMinusOne, rnd, zero, Zero, argScale, diff, count,
 @// diffMinusOne, round, t1, t2, t3, t4, t5.
 @// NOTE(review): presumably inherited from the S32 variant this file was
 @// derived from - confirm before removing.

 @//Input Registers

 #define pSrc            r0
 #define pDst            r1
 #define pFFTSpec        r2


 @// Output registers
 #define result          r0

 @//Local Scratch Registers

 @// N=1 case
 #define scaleMinusOne   r2
 #define rnd             r2
 #define zero            r8
 #define Zero            r9


 @// Core registers used while setting up and running the FFT stages.
 @// argTwiddle reuses r1 (pDst) and argDst reuses r2 (pFFTSpec).
 #define argTwiddle      r1
 #define argDst          r2
 #define argScale        r4
 #define pTwiddle        r4
 #define pOut            r5
 #define subFFTSize      r7
 #define subFFTNum       r6
 #define N               r6
 #define order           r14
 #define diff            r9
 #define count           r8
 #define diffMinusOne    r10
 #define round           r3

 @// Core registers used by the complex-to-real fixup.
 @// The trailing notes on t1, t3 and t5 name the aliases that share the
 @// registers of each t-pair (t0/t1, t2/t3, t4/t5).
 #define step            r3
 #define step1           r6
 #define twStep          r12
 #define pTwiddleTmp     r14
 #define t0              r12
 #define t1              r14              /*@// pTwiddleTmp*/
 #define t2              r0
 #define t3              r1               /*@// pSrc,argTwiddle*/
 #define t4              r6
 #define t5              r7               /*@// step1,subFFTSize*/

 @// Single-precision VFP registers. y0r/y0i, w1r/w1i and y1r/y1i all
 @// name s2/s3: the twiddle pair w1 is overwritten by the result y1.
 #define x0r     s0
 #define x0i     s1
 #define y0r     s2
 #define y0i     s3
 #define x1r     s4
 #define x1i     s5
 #define w1r     s2
 #define w1i     s3
 #define w0r     s6
 #define w0i     s7
 #define y1r     s2              /*@// w1r,w1i*/
 #define y1i     s3
 #define st0     s8
 #define st1     s9
 #define st2     s10
 #define st3     s11
 #define st4     s12
 #define st5     s13
 #define half    s15

     @// Allocate stack memory required by the function


     @// Write function header
     @//
     @// omxSP_FFTFwd_RToCCS_F32_Sfs_vfp: forward FFT of a real signal (VFP).
     @//
     @// In:   r0 = pSrc      source data
     @//       r1 = pDst      destination buffer
     @//       r2 = pFFTSpec  spec structure; N, pTwiddle and pBuf are read
     @//                      from it at the ARMsFFTSpec_* offsets below
     @// Out:  r0 = result    OMX_Sts_NoErr on every path in this function
     @//
     @// Method: run an N/2-point complex FFT through the external
     @// armSP_FFTFwd_CToC_FC32_* stage routines, then apply the
     @// complex-to-real fixup at finalComplexToRealFixup.
     @//
     @// NOTE(review): M_START/M_END come from armCOMM_s.h; the trailing r11
     @// presumably asks them to save/restore r4-r11 and lr - confirm there.
     @// NOTE(review): the stage routines are "unsafe" helpers whose register
     @// contract is not visible here. The code below assumes that on return
     @// pSrc points at the stage output, argDst at the next destination,
     @// argTwiddle at the twiddle table, that subFFTNum/subFFTSize hold the
     @// updated counts, and that s15 (half) is preserved - confirm against
     @// those files.
         M_START     omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11

 @ Structure offsets for FFTSpec
         .set    ARMsFFTSpec_N, 0
         .set    ARMsFFTSpec_pBitRev, 4
         .set    ARMsFFTSpec_pTwiddle, 8
         .set    ARMsFFTSpec_pBuf, 12

         @// Define stack arguments

         @// Setup half value: 0x3f000000 is the IEEE-754 single encoding of 0.5
         movw    N, #0                   @// Use N as a temp.
         movt    N, #0x3f00
         vmov.f32 half, N

         @// Read the size from structure and take log
         LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

         @// Read other structure parameters
         LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
         LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

         @//  N=1 Treat separately
         CMP     N,#1
         BGT     sizeGreaterThanOne
         @// N<=1 is not supported: return NoErr without writing to pDst
         @// Set return value
         MOV     result, #OMX_Sts_NoErr
         B       FunctionEnd

 sizeGreaterThanOne:
         @// Do a N/2 point complex FFT including the scaling

         MOV     N,N,ASR #1              @// N/2 point complex FFT
         CLZ     order,N                 @// N = 2^order
         RSB     order,order,#31         @// order = 31 - clz(N/2)
         MOV     subFFTSize,#1
         @//MOV     subFFTNum,N


         @// Three-way dispatch on the flags of this single CMP:
         @//   order > 1  -> orderGreaterthan1
         @//   order < 1  -> LT-conditional copy below, then FFTEnd
         @//   order == 1 -> fall through to the radix-2 stage
         CMP     order,#1
         BGT     orderGreaterthan1       @// order > 1
         @// order == 0: copy the single complex value from pSrc to pOut and
         @// run the fixup on it, writing the result to pDst.
         vldmlt.f32 pSrc, {x0r, x0i}
         vstmlt.f32 pOut, {x0r, x0i}
         MOVLT   pSrc,pOut
         MOVLT   argDst,pDst
         BLT     FFTEnd

         MOV     argDst,pOut             @// Set input args to fft stages
         MOV     pOut,pDst               @// Set input args to fft stages
         MOV     argTwiddle,pTwiddle

         BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
         B     finalComplexToRealFixup

 orderGreaterthan1:

         @// Bit 1 of order selects which buffer the first stage writes to:
         @//   clear -> argDst = pDst, pOut stays at the spec buffer
         @//   set   -> argDst = spec buffer, pOut = pDst
         TST     order, #2               @// Set input args to fft stages
         MOVEQ   argDst,pDst
         MOVNE   argDst,pOut
         MOVNE   pOut,pDst               @// Pass the first stage dest in RN5
         MOV     argTwiddle,pTwiddle

         @//check for even or odd order

         @// NOTE: The following combination of BLs works even though
         @// the first BL corrupts the flags. This is because the end
         @// of the "grpZeroSetLoop" loop inside
         @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
         @// the Z flag to EQ, so the BLNE is not taken after the BLEQ.

         TST     order,#0x00000001
         BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
         BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp

         @// Keep calling the radix-4 stage until subFFTNum reaches 1.
 unscaledRadix4Loop:
         CMP        subFFTNum,#1
          BEQ        FFTEnd
          BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
          B        unscaledRadix4Loop

 FFTEnd:
 finalComplexToRealFixup:

         @// step = N/2 * 8 bytes
         MOV     step,subFFTSize,LSL #3
         @// twStep = 3N/8 * 8 bytes pointing to W^1 (6 * subFFTSize bytes)
         SUB     twStep,step,subFFTSize,LSL #1
         @// step1 = N/4 * 8 = N/2*4 bytes
         MOV     step1,subFFTSize,LSL #2
         @// (N/4-1)*8 bytes
         SUB     step1,step1,#8

         @// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
         @// 1/2 [(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
         @// 1/2 [2a+j0] - j [0+j2b]
         @// (a+b, 0)

         @// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
         @// 1/2 [(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
         @// 1/2 [2a+j0] + j [0+j2b]
         @// (a-b, 0)

         @// F(0) and F(N/2)
         vldm.f32 pSrc!, {x0r, x0i}
         vadd.f32 y0r,x0r,x0i            @// F(0).re   = Z0.r + Z0.i
         vsub.f32 x0r,x0r,x0i            @// F(N/2).re = Z0.r - Z0.i
         @// Force both imaginary parts to +0.0 from an integer zero.
         @// A "vsub x, x" must not be used for this: it yields NaN when x
         @// holds Inf or NaN, and y0i (s3) is not written before this point
         @// on the order == 0 path, so it may hold any stale value.
         @// r14 is free as a temp here: order is no longer needed and
         @// pTwiddleTmp is only assigned further down.
         MOV      pTwiddleTmp, #0
         vmov.f32 y0i, pTwiddleTmp
         vmov.f32 x0i, pTwiddleTmp

         add      argDst, step
         vstm.f32 argDst, {x0r, x0i}     @// {x0r,x0i}->[argDst, step]
         sub      argDst, step
         vstm.f32 argDst!, {y0r, y0i}

         @// The flags set here are consumed by the BLT/BEQ below; the two
         @// ADDs in between have no S suffix and leave them untouched.
         SUBS    subFFTSize,subFFTSize,#2

         ADD     pTwiddleTmp,argTwiddle,#8       @// W^2
         ADD     argTwiddle,argTwiddle,twStep    @// W^1
         BLT     End
         BEQ     lastElement


         @// F(k) = 1/2 [Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
         @// Process 2 elements at a time. E.g: F(1) and F(N/2-1) since
         @// both of them require Z(1) and Z(N/2-1)

         ASR     subFFTSize,subFFTSize,#1        @// loop count = (N/2-2)/2
 evenOddButterflyLoop:

         @// pSrc and argDst advance by 8 bytes per pass, so the offset to
         @// the mirrored element shrinks by 16.
         SUB     step,step,#16           @// (N/2-2)*8 bytes

         add      pSrc, step
         vldm.f32 pSrc, {x1r, x1i}       @// {x1r, x1i} = [pSrc, step]
         sub      pSrc, step
         vldm.f32 pSrc!, {x0r, x0i}
         add      argTwiddle, step1
         vldm.f32 argTwiddle, {w1r, w1i}  @// {w1r, w1i} = [argTwiddle, step1]
         sub      argTwiddle, step1
         vldm.f32 argTwiddle!, {w0r, w0i} @// {w0r, w0i} = [argTwiddle], #8

         SUB     step1,step1,#8
         @// Loop-control flags for the BGT at the bottom; nothing between
         @// here and there is a compare or carries an S suffix.
         SUBS    subFFTSize,subFFTSize,#1

         vsub.f32 st2,x0r,x1r            @// a-c
         vadd.f32 st3,x0i,x1i            @// b+d
         vadd.f32 st0,x0r,x1r            @// a+c
         vsub.f32 st1,x0i,x1i            @// b-d

         vmul.f32 x1r,w1r,st2
         vmul.f32 x1i,w1r,st3
         vmla.f32 x1r,w1i,st3            @// x1r = w1r*st2 + w1i*st3
         @//RSB     x1r,x1r,#0
         vmls.f32 x1i,w1i,st2            @// x1i = w1r*st3 - w1i*st2

         @// y1 overwrites w1 (same registers s2/s3); w1 is dead by now.
         vsub.f32 y1r, st0, x1i
         vadd.f32 y1i, x1r, st1
         vneg.f32 y1i, y1i

         vmul.f32  x0r,w0r,st2
         vmul.f32  x0i,w0r,st3
         vmls.f32  x0r,w0i,st3           @// x0r = w0r*st2 - w0i*st3
         vmla.f32  x0i,w0i,st2           @// x0i = w0r*st3 + w0i*st2

         vsub.f32   st4,st0,x0i          @// F(1)
         vadd.f32   st5,x0r,st1


         @// Apply the 1/2 factor of the F(k) formula to both outputs.
         vmul.f32 y1r, half
         vmul.f32 y1i, half
         vmul.f32 st4, half
         vmul.f32 st5, half

         add      argDst, step
         vstm.f32 argDst, {y1r, y1i}     @// {y1r,y1i} -> [argDst,step]
         sub      argDst, step
         vstm.f32 argDst!, {st4, st5}


         MOV     t0,argTwiddle           @// swap ptr for even and odd twiddles
         MOV     argTwiddle,pTwiddleTmp
         MOV     pTwiddleTmp,t0

         BGT     evenOddButterflyLoop

         @// Last element can be expanded as follows
         @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
         @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
         @// 1/2[2a+j0] + j (c+jd) [0+j2b]
         @// (a-bc, -bd)
         @// The code below stores (a, -b), i.e. the conjugate of the input.

 lastElement:
         vldm.f32 pSrc, {x0r, x0i}
         vneg.f32 x0i, x0i
         vstm.f32 argDst, {x0r, x0i}

 End:
         @// Set return value
         MOV     result, #OMX_Sts_NoErr

 FunctionEnd:
         @// Write function tail
         M_END

 @//    ENDIF                                           @//ARM1136JS


     @// Guarding implementation by the processor name


     .end
	@//
	@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
	@//
	@// Use of this source code is governed by a BSD-style license
	@// that can be found in the LICENSE file in the root of the source
	@// tree. An additional intellectual property rights grant can be found
	@// in the file PATENTS. All contributing project authors may
	@// be found in the AUTHORS file in the root of the source tree.
	@//
	@// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
	@// to support float instead of SC32.
	@//

	@//
	@// Description:
	@// Compute FFT for a real signal
	@//
	@//


	@// Include standard headers

	#include "dl/api/arm/armCOMM_s.h"
	#include "dl/api/arm/omxtypes_s.h"

	@// M_VARIANTS ARM1136JS

	@// Import symbols required from other files
	@// (For example tables)

	.extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
	.extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
	.extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
	.extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp

	@// Set debugging level
	@//DEBUG_ON SETL {TRUE}



	@// Guarding implementation by the processor name

	@// IF ARM1136JS

	@// Register aliases used by omxSP_FFTFwd_RToCCS_F32_Sfs_vfp.
	@//
	@// Several names deliberately map to the same physical register (for
	@// example N, subFFTNum, step1 and t4 are all r6), so only one name of
	@// each group can be live at any point in the code.
	@//
	@// The trailing notes on t1, t3, t5 and y1r must be real block
	@// comments. Without the asterisks the text before the double slash
	@// stays in the macro replacement list and breaks every use of the
	@// alias, y1r in particular.

	@//Input Registers

	#define pSrc r0
	#define pDst r1
	#define pFFTSpec r2


	@// Output registers
	#define result r0

	@//Local Scratch Registers

	@// N=1 case
	#define scaleMinusOne r2
	#define rnd r2
	#define zero r8
	#define Zero r9


	@// Core registers used while setting up and running the FFT stages.
	@// argTwiddle reuses r1 (pDst) and argDst reuses r2 (pFFTSpec).
	#define argTwiddle r1
	#define argDst r2
	#define argScale r4
	#define pTwiddle r4
	#define pOut r5
	#define subFFTSize r7
	#define subFFTNum r6
	#define N r6
	#define order r14
	#define diff r9
	#define count r8
	#define diffMinusOne r10
	#define round r3

	@// Core registers used by the complex-to-real fixup.
	@// The trailing notes on t1, t3 and t5 name the aliases that share the
	@// registers of each t-pair (t0/t1, t2/t3, t4/t5).
	#define step r3
	#define step1 r6
	#define twStep r12
	#define pTwiddleTmp r14
	#define t0 r12
	#define t1 r14 /*@// pTwiddleTmp*/
	#define t2 r0
	#define t3 r1 /*@// pSrc,argTwiddle*/
	#define t4 r6
	#define t5 r7 /*@// step1,subFFTSize*/

	@// Single-precision VFP registers. y0r/y0i, w1r/w1i and y1r/y1i all
	@// name s2/s3: the twiddle pair w1 is overwritten by the result y1.
	#define x0r s0
	#define x0i s1
	#define y0r s2
	#define y0i s3
	#define x1r s4
	#define x1i s5
	#define w1r s2
	#define w1i s3
	#define w0r s6
	#define w0i s7
	#define y1r s2 /*@// w1r,w1i*/
	#define y1i s3
	#define st0 s8
	#define st1 s9
	#define st2 s10
	#define st3 s11
	#define st4 s12
	#define st5 s13
	#define half s15




	@// Allocate stack memory required by the function



	@// Write function header
	@//
	@// omxSP_FFTFwd_RToCCS_F32_Sfs_vfp: forward FFT of a real signal (VFP).
	@//
	@// In:   r0 = pSrc      source data
	@//       r1 = pDst      destination buffer
	@//       r2 = pFFTSpec  spec structure; N, pTwiddle and pBuf are read
	@//                      from it at the ARMsFFTSpec_* offsets below
	@// Out:  r0 = result    OMX_Sts_NoErr on every path in this function
	@//
	@// Method: run an N/2-point complex FFT through the external
	@// armSP_FFTFwd_CToC_FC32_* stage routines, then apply the
	@// complex-to-real fixup at finalComplexToRealFixup.
	@//
	@// NOTE(review): M_START/M_END come from armCOMM_s.h; the trailing r11
	@// presumably asks them to save/restore r4-r11 and lr - confirm there.
	@// NOTE(review): the stage routines are "unsafe" helpers whose register
	@// contract is not visible here. The code below assumes that on return
	@// pSrc points at the stage output, argDst at the next destination,
	@// argTwiddle at the twiddle table, that subFFTNum/subFFTSize hold the
	@// updated counts, and that s15 (half) is preserved - confirm against
	@// those files.
	M_START omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11

	@ Structure offsets for FFTSpec
	.set ARMsFFTSpec_N, 0
	.set ARMsFFTSpec_pBitRev, 4
	.set ARMsFFTSpec_pTwiddle, 8
	.set ARMsFFTSpec_pBuf, 12

	@// Define stack arguments

	@// Setup half value: 0x3f000000 is the IEEE-754 single encoding of 0.5
	movw N, #0 @// Use N as a temp.
	movt N, #0x3f00
	vmov.f32 half, N

	@// Read the size from structure and take log
	LDR N, [pFFTSpec, #ARMsFFTSpec_N]

	@// Read other structure parameters
	LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
	LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

	@// N=1 Treat separately
	CMP N,#1
	BGT sizeGreaterThanOne
	@// N<=1 is not supported: return NoErr without writing to pDst
	@// Set return value
	MOV result, #OMX_Sts_NoErr
	B FunctionEnd

	sizeGreaterThanOne:
	@// Do a N/2 point complex FFT including the scaling

	MOV N,N,ASR #1 @// N/2 point complex FFT
	CLZ order,N @// N = 2^order
	RSB order,order,#31 @// order = 31 - clz(N/2)
	MOV subFFTSize,#1
	@//MOV subFFTNum,N


	@// Three-way dispatch on the flags of this single CMP:
	@//   order > 1  -> orderGreaterthan1
	@//   order < 1  -> LT-conditional copy below, then FFTEnd
	@//   order == 1 -> fall through to the radix-2 stage
	CMP order,#1
	BGT orderGreaterthan1 @// order > 1
	@// order == 0: copy the single complex value from pSrc to pOut and
	@// run the fixup on it, writing the result to pDst.
	vldmlt.f32 pSrc, {x0r, x0i}
	vstmlt.f32 pOut, {x0r, x0i}
	MOVLT pSrc,pOut
	MOVLT argDst,pDst
	BLT FFTEnd

	MOV argDst,pOut @// Set input args to fft stages
	MOV pOut,pDst @// Set input args to fft stages
	MOV argTwiddle,pTwiddle

	BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
	B finalComplexToRealFixup

	orderGreaterthan1:

	@// Bit 1 of order selects which buffer the first stage writes to:
	@//   clear -> argDst = pDst, pOut stays at the spec buffer
	@//   set   -> argDst = spec buffer, pOut = pDst
	TST order, #2 @// Set input args to fft stages
	MOVEQ argDst,pDst
	MOVNE argDst,pOut
	MOVNE pOut,pDst @// Pass the first stage dest in RN5
	MOV argTwiddle,pTwiddle

	@//check for even or odd order

	@// NOTE: The following combination of BLs works even though
	@// the first BL corrupts the flags. This is because the end
	@// of the "grpZeroSetLoop" loop inside
	@// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
	@// the Z flag to EQ, so the BLNE is not taken after the BLEQ.

	TST order,#0x00000001
	BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
	BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp

	@// Keep calling the radix-4 stage until subFFTNum reaches 1.
	unscaledRadix4Loop:
	CMP subFFTNum,#1
	BEQ FFTEnd
	BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
	B unscaledRadix4Loop

	FFTEnd:
	finalComplexToRealFixup:

	@// step = N/2 * 8 bytes
	MOV step,subFFTSize,LSL #3
	@// twStep = 3N/8 * 8 bytes pointing to W^1 (6 * subFFTSize bytes)
	SUB twStep,step,subFFTSize,LSL #1
	@// step1 = N/4 * 8 = N/2*4 bytes
	MOV step1,subFFTSize,LSL #2
	@// (N/4-1)*8 bytes
	SUB step1,step1,#8

	@// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
	@// 1/2 [(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
	@// 1/2 [2a+j0] - j [0+j2b]
	@// (a+b, 0)

	@// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
	@// 1/2 [(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
	@// 1/2 [2a+j0] + j [0+j2b]
	@// (a-b, 0)

	@// F(0) and F(N/2)
	vldm.f32 pSrc!, {x0r, x0i}
	vadd.f32 y0r,x0r,x0i @// F(0).re   = Z0.r + Z0.i
	vsub.f32 x0r,x0r,x0i @// F(N/2).re = Z0.r - Z0.i
	@// Force both imaginary parts to +0.0 from an integer zero.
	@// A "vsub x, x" must not be used for this: it yields NaN when x
	@// holds Inf or NaN, and y0i (s3) is not written before this point
	@// on the order == 0 path, so it may hold any stale value.
	@// r14 is free as a temp here: order is no longer needed and
	@// pTwiddleTmp is only assigned further down.
	MOV pTwiddleTmp, #0
	vmov.f32 y0i, pTwiddleTmp
	vmov.f32 x0i, pTwiddleTmp

	add argDst, step
	vstm.f32 argDst, {x0r, x0i} @// {x0r,x0i}->[argDst, step]
	sub argDst, step
	vstm.f32 argDst!, {y0r, y0i}

	@// The flags set here are consumed by the BLT/BEQ below; the two
	@// ADDs in between have no S suffix and leave them untouched.
	SUBS subFFTSize,subFFTSize,#2

	ADD pTwiddleTmp,argTwiddle,#8 @// W^2
	ADD argTwiddle,argTwiddle,twStep @// W^1
	BLT End
	BEQ lastElement


	@// F(k) = 1/2 [Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
	@// Process 2 elements at a time. E.g: F(1) and F(N/2-1) since
	@// both of them require Z(1) and Z(N/2-1)

	ASR subFFTSize,subFFTSize,#1 @// loop count = (N/2-2)/2
	evenOddButterflyLoop:

	@// pSrc and argDst advance by 8 bytes per pass, so the offset to
	@// the mirrored element shrinks by 16.
	SUB step,step,#16 @// (N/2-2)*8 bytes

	add pSrc, step
	vldm.f32 pSrc, {x1r, x1i} @// {x1r, x1i} = [pSrc, step]
	sub pSrc, step
	vldm.f32 pSrc!, {x0r, x0i}
	add argTwiddle, step1
	vldm.f32 argTwiddle, {w1r, w1i} @// {w1r, w1i} = [argTwiddle, step1]
	sub argTwiddle, step1
	vldm.f32 argTwiddle!, {w0r, w0i} @// {w0r, w0i} = [argTwiddle], #8

	SUB step1,step1,#8
	@// Loop-control flags for the BGT at the bottom; nothing between
	@// here and there is a compare or carries an S suffix.
	SUBS subFFTSize,subFFTSize,#1

	vsub.f32 st2,x0r,x1r @// a-c
	vadd.f32 st3,x0i,x1i @// b+d
	vadd.f32 st0,x0r,x1r @// a+c
	vsub.f32 st1,x0i,x1i @// b-d

	vmul.f32 x1r,w1r,st2
	vmul.f32 x1i,w1r,st3
	vmla.f32 x1r,w1i,st3 @// x1r = w1r*st2 + w1i*st3
	@//RSB x1r,x1r,#0
	vmls.f32 x1i,w1i,st2 @// x1i = w1r*st3 - w1i*st2

	@// y1 overwrites w1 (same registers s2/s3); w1 is dead by now.
	vsub.f32 y1r, st0, x1i
	vadd.f32 y1i, x1r, st1
	vneg.f32 y1i, y1i

	vmul.f32 x0r,w0r,st2
	vmul.f32 x0i,w0r,st3
	vmls.f32 x0r,w0i,st3 @// x0r = w0r*st2 - w0i*st3
	vmla.f32 x0i,w0i,st2 @// x0i = w0r*st3 + w0i*st2

	vsub.f32 st4,st0,x0i @// F(1)
	vadd.f32 st5,x0r,st1


	@// Apply the 1/2 factor of the F(k) formula to both outputs.
	vmul.f32 y1r, half
	vmul.f32 y1i, half
	vmul.f32 st4, half
	vmul.f32 st5, half

	add argDst, step
	vstm.f32 argDst, {y1r, y1i} @// {y1r,y1i} -> [argDst,step]
	sub argDst, step
	vstm.f32 argDst!, {st4, st5}


	MOV t0,argTwiddle @// swap ptr for even and odd twiddles
	MOV argTwiddle,pTwiddleTmp
	MOV pTwiddleTmp,t0

	BGT evenOddButterflyLoop

	@// Last element can be expanded as follows
	@// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
	@// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
	@// 1/2[2a+j0] + j (c+jd) [0+j2b]
	@// (a-bc, -bd)
	@// The code below stores (a, -b), i.e. the conjugate of the input.

	lastElement:
	vldm.f32 pSrc, {x0r, x0i}
	vneg.f32 x0i, x0i
	vstm.f32 argDst, {x0r, x0i}

	End:
	@// Set return value
	MOV result, #OMX_Sts_NoErr

	FunctionEnd:
	@// Write function tail
	M_END

	@// ENDIF @//ARM1136JS


	@// Guarding implementation by the processor name



	.end