// dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S - deps/third_party/openmax - Git at Google

 //
 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 //
 //  Use of this source code is governed by a BSD-style license
 //  that can be found in the LICENSE file in the root of the source
 //  tree. An additional intellectual property rights grant can be found
 //  in the file PATENTS.  All contributing project authors may
 //  be found in the AUTHORS file in the root of the source tree.
 //
 //  This is a modification of
 //  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
 //  instead of SC32.
 //

 //
 // Description:
 // Compute the "preTwiddleRadix2" stage that runs before the call to the
 // complex FFT. It computes Z(k) = Feven(k) + jW^(-k) FOdd(k) for
 // k = 0, 1, 2, ..., N/2-1.
 //
 //


 // Include standard headers

 #include "dl/api/arm/arm64COMM_s.h"
 #include "dl/api/arm/omxtypes_s.h"


 // Import symbols required from other files
 // (For example tables)


 // Set debugging level
 //DEBUG_ON    SETL {TRUE}


 // Guarding implementation by the processor name


       // Guarding implementation by the processor name


 // Input registers: the four values FFTSTAGE reads on entry.
 // (Comments are kept on their own lines, never at the end of a #define,
 // so that no comment text can end up inside a macro expansion.)

 // Source pointer. FFTSTAGE loads through it with post-increment
 // addressing, so it is modified.
 #define pSrc            x0
 // Twiddle table base. Only read, to derive pTwiddleTmp and argTwiddle1.
 #define pTwiddle        x1
 // Output base. Only read, to derive pOut1.
 #define	pOut		x2
 // Transform size N. Only read, to derive size and step.
 #define	subFFTNum	x3

 // Output registers

 // Local scratch registers
 //
 // argTwiddle, argDst and subFFTSize are not referenced by FFTSTAGE.
 // subFFTSize and size (below) both name x7.

 #define argTwiddle      x5
 #define argDst          x6
 #define subFFTSize      x7
 // N appears only in comments; it expands to subFFTNum (x3).
 #define N               subFFTNum

 // Running output pointer, initialised to pOut + step.
 #define pOut1           x13

 // size:        loop counter, starts at subFFTNum / 2
 // step:        byte distance between the front and the mirrored input pair
 // step1:       byte distance between the two twiddle loads of one iteration
 // twStep:      byte offset from pTwiddle used to initialise argTwiddle1
 // pTwiddleTmp: running twiddle pointer, starts at pTwiddle + 8
 // argTwiddle1: running twiddle pointer, starts at pTwiddle + twStep
 #define size            x7
 #define step            x8
 #define step1           x9
 #define twStep          x10
 #define pTwiddleTmp     x11
 #define argTwiddle1     x12

 // Neon registers
 //
 // Several names map to the same vector register (for example
 // dX0/dX0r -> v0, dX1/dX0i/dShift -> v1, dY0/dX1r -> v2, dY1/dX1i -> v3,
 // dW0r/dY0r/dY2 -> v4), so writing through one name overwrites the value
 // seen through its aliases.
 //
 // Names ending in "s" (v0.s, v1.s) are the element forms used by the
 // single-lane st1 stores. Names ending in "8b" are byte views of the same
 // register, used by the whole-register mov copies.
 //
 // dShift, qT0-qT3, dY2, dY3, dW0, dW1, dW0Tmp and dW1Neg are not
 // referenced by FFTSTAGE. qT0-qT3 use the 64-bit .2s arrangement despite
 // the "q" prefix.
 //
 // v8-v11 (dT0-dT3) and v13 (half) fall in the v8-v15 range that AAPCS64
 // treats as callee-saved (low 64 bits).

 #define dX0     v0.2s
 #define dX0s    v0.s
 #define dShift  v1.2s
 #define dX1     v1.2s
 #define dX1s    v1.s
 #define dY0     v2.2s
 #define dY08b   v2.8b
 #define dY1     v3.2s
 #define dX0r    v0.2s
 #define dX0rs   v0.s
 #define dX0i    v1.2s
 #define dX1r    v2.2s
 #define dX1i    v3.2s
 #define dW0r    v4.2s
 #define dW0r8b  v4.8b
 #define dW0i    v5.2s
 #define dW1r    v6.2s
 #define dW1r8b  v6.8b
 #define dW1i    v7.2s
 #define dT0     v8.2s
 #define dT1     v9.2s
 #define dT2     v10.2s
 #define dT3     v11.2s
 #define qT0     v12.2s
 #define qT1     v14.2s
 #define qT2     v16.2s
 #define qT3     v18.2s
 #define dY0r    v4.2s
 #define dY0i    v5.2s
 #define dY1r    v6.2s
 #define dY1i    v7.2s

 #define dY2     v4.2s
 #define dY3     v5.2s
 #define dW0     v6.2s
 #define dW1     v7.2s
 #define dW0Tmp  v10.2s
 #define dW1Neg  v11.2s

 // dZip is the temporary used when emulating VZIP with zip1/zip2.
 // half holds the constant 0.5 in both lanes.
 #define dZip    v19.2s
 #define dZip8b  v19.8b
 #define half    v13.2s

         // FFTSTAGE scaled, inverse, name
         //
         // Emits the pre-twiddle pass. It reads complex float pairs through
         // pSrc, combines each F(k) with the mirrored F(N/2-k) and a twiddle
         // factor read relative to pTwiddle, scales by 0.5, and writes the
         // results starting at pOut + 4*subFFTNum bytes.
         //
         // Parameters:
         //   scaled, inverse - accepted but not referenced in the macro body.
         //   name            - suffix appended to the three local labels so
         //                     that separate expansions do not collide.
         //
         // Inputs:   pSrc (x0), pTwiddle (x1), pOut (x2), subFFTNum (x3).
         // Modifies: x0 (pSrc), x7-x13, v0-v11, v13, v19 and the condition
         //           flags. x1, x2 and x3 are only read.
         //
         // Control flow on size = subFFTNum/2:
         //   size <  2 : only Z(0) is produced.
         //   size == 2 : Z(0) and the "last element" are produced.
         //   size >  2 : Z(0), then the butterfly loop, then the last element.
         .MACRO FFTSTAGE scaled, inverse, name

         // Vector immediate: both lanes of half become 0.5. Lane 0 is the
         // scale factor used by every by-element fmul below.
         fmov    half, 0.5

         asr     size, subFFTNum, #1           // preserve the contents of N = subFFTNum
         lsl     step, subFFTNum, #2           // step = N/2 * 8 bytes


         // Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
         // Note: W^(k) is stored as negated value and also need to
         // conjugate the values from the table

         // Z(0) : no need of twiddle multiply
         // Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }

         // dX0 = value at pSrc (a,b); dX1 = value at pSrc + step (c,d).
         ld1     {dX0},[pSrc],step
         ADD     pOut1,pOut,step               // pOut1 = pOut+ N/2*8 bytes

         ld1     {dX1},[pSrc], #8
         // twStep = 3N/8 * 8 bytes pointing to W^1
         SUB     twStep,step,size,LSL #1

         lsl     step1,size, #2                // step1 = N/4 * 8 = N/2*4 bytes
         SUB     step1,step1,#8                // (N/4-1)*8 bytes

         fadd    dY0,dX0,dX1                   // [b+d | a+c]
         fsub    dY1,dX0,dX1                   // [b-d | a-c]
         fmul    dY0, dY0, half[0]
         fmul    dY1, dY1, half[0]

         // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
         // VZIP    dY0,dY1
         // The two-register VZIP is emulated: zip1 goes to the dZip
         // temporary, zip2 is written in place, then dZip is copied back.
         zip1    dZip,dY0,dY1
         zip2    dY1,dY0,dY1
         mov     dY08b, dZip8b

         // The flags set by SUBS are consumed by BLT/BEQ below; none of the
         // instructions in between write the flags.
         fsub   dX0,dY0,dY1
         SUBS   size,size,#2
         fadd   dX1,dY0,dY1

         // Undo the step post-increment: pSrc is now 8 bytes past its entry
         // value, i.e. at the second complex input.
         SUB     pSrc,pSrc,step

         // Z(0): lane 0 of the difference, then lane 1 of the sum.
         st1     {dX0s}[0],[pOut1], #4
         ADD     pTwiddleTmp,pTwiddle,#8       // W^2
         st1     {dX1s}[1],[pOut1], #4
         ADD     argTwiddle1,pTwiddle,twStep   // W^1


         BLT     decrementScale\name
         BEQ     lastElement\name


         // Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
         // Note: W^k is stored as negative values in the table and also
         // need to conjugate the values from the table.
         //
         // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
         // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)


         // Inside the loop, step is the distance from the front pair of
         // elements to the mirrored pair. It shrinks by 32 per iteration.
         SUB     step,step,#24
 evenOddButterflyLoop\name :


         // Two twiddle loads step1 bytes apart. The SUB below leaves
         // argTwiddle1 advanced by a net 8 bytes per iteration.
         ld1     {dW0r},[argTwiddle1],step1
         ld1     {dW1r},[argTwiddle1], #8

         // De-interleaving loads: real parts go to the first register,
         // imaginary parts to the second. Front pair first, then the
         // mirrored pair step bytes further on.
         ld2     {dX0r,dX0i},[pSrc],step
         SUB     argTwiddle1,argTwiddle1,step1
         ld2     {dX1r,dX1i},[pSrc], #16

         SUB     step1,step1,#8                // (N/4-2)*8 bytes
         ld1     {dW0i},[pTwiddleTmp],step1
         ld1     {dW1i},[pTwiddleTmp], #8
         // Net effect on pSrc: +16 bytes (two complex values) per iteration.
         SUB     pSrc,pSrc,step

         // Net effect on pTwiddleTmp: +8 bytes per iteration.
         SUB     pTwiddleTmp,pTwiddleTmp,step1
         // Swap the two lanes of the mirrored pair so that they line up
         // with the front pair.
         rev64   dX1r,dX1r
         rev64   dX1i,dX1i
         // Loop counter. The flags are consumed by the BGT at the bottom of
         // the loop; nothing in between writes the flags.
         SUBS    size,size,#4


         fsub    dT2,dX0r,dX1r                 // a-c
         fadd    dT3,dX0i,dX1i                 // b+d
         fadd    dT0,dX0r,dX1r                 // a+c
         fsub    dT1,dX0i,dX1i                 // b-d
         // Second decrement of step1 this iteration (16 bytes in total).
         SUB     step1,step1,#8

         fmul    dT2, dT2, half[0]
         fmul    dT3, dT3, half[0]

         fmul    dT0, dT0, half[0]
         fmul    dT1, dT1, half[0]

         // VZIP    dW1r,dW1i
         // VZIP    dW0r,dW0i
         // Same zip1/zip2/mov emulation of VZIP as used for Z(0).
         zip1    dZip, dW1r,dW1i
         zip2    dW1i,dW1r,dW1i
         mov     dW1r8b, dZip8b
         zip1    dZip,dW0r,dW0i
         zip2    dW0i,dW0r,dW0i
         mov     dW0r8b, dZip8b

         // Twiddle multiply, written into dX0r/dX0i/dX1r/dX1i (v0-v3),
         // which overwrites the loaded inputs. The fmla/fmls signs are
         // opposite for the W0 and W1 products.
         fmul   dX1r,dW1r,dT2
         fmul   dX1i,dW1r,dT3
         fmul   dX0r,dW0r,dT2
         fmul   dX0i,dW0r,dT3

         fmls   dX1r,dW1i,dT3
         fmla   dX1i,dW1i,dT2

         fmla   dX0r,dW0i,dT3
         fmls   dX0i,dW0i,dT2


         // dY0r/dY0i/dY1r/dY1i are v4-v7, the registers that held the
         // twiddles, so the twiddle values are dead from here on.
         fadd    dY1r,dT0,dX1i                 // F(N/2 -1)
         fsub    dY1i,dX1r,dT1

         // Restore memory order for the mirrored pair before storing.
         rev64   dY1r,dY1r
         rev64   dY1i,dY1i


         fadd    dY0r,dT0,dX0i                 // F(1)
         fsub    dY0i,dT1,dX0r


         // Interleaving stores, mirroring the load pattern. Net effect on
         // pOut1: +16 bytes per iteration.
         st2     {dY0r,dY0i},[pOut1],step
         st2     {dY1r,dY1i},[pOut1], #16
         SUB     pOut1,pOut1,step
         SUB     step,step,#32                 // (N/2-4)*8 bytes


         BGT     evenOddButterflyLoop\name


         // set both the ptrs to the last element
         SUB     pSrc,pSrc,#8
         SUB     pOut1,pOut1,#8

         // Last element can be expanded as follows
         // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
         // -ve)
         // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
         // 1/2[2a+j0] - j (c-jd) [0+j2b]
         // (a+bc, -bd)
         // Since (c,d) = (0,1) for the last element, result is just (a,-b)

 lastElement\name :
         ld1     {dX0r},[pSrc]

         // Store the real part unchanged, negate the vector, then store
         // the (now negated) imaginary part.
         st1     {dX0rs}[0],[pOut1], #4
         fneg    dX0r,dX0r
         st1     {dX0rs}[1],[pOut1]


         // Exit label: no instructions follow inside the macro.
 decrementScale\name :

         .endm

         // Entry point armSP_FFTInv_CCSToR_F32_preTwiddleRadix2: the body is
         // a single FFTSTAGE expansion with label suffix "Inv". The first
         // two macro arguments ("FALSE", "TRUE") are not referenced by
         // FFTSTAGE.
         //
         // M_START and M_END are not defined in this file (presumably they
         // come from the included arm64COMM_s.h). The d15 argument
         // presumably makes them save and restore the callee-saved FP
         // registers, which matters because FFTSTAGE writes v8-v11 and v13
         // -- TODO confirm against the header.
         M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
             FFTSTAGE "FALSE","TRUE",Inv
         M_END

         .end
	//
	// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
	//
	// Use of this source code is governed by a BSD-style license
	// that can be found in the LICENSE file in the root of the source
	// tree. An additional intellectual property rights grant can be found
	// in the file PATENTS. All contributing project authors may
	// be found in the AUTHORS file in the root of the source tree.
	//
	// This is a modification of
	// armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
	// instead of SC32.
	//

	//
	// Description:
	// Compute the "preTwiddleRadix2" stage that runs before the call to the
	// complex FFT. It computes Z(k) = Feven(k) + jW^(-k) FOdd(k) for
	// k = 0, 1, 2, ..., N/2-1.
	//
	//


	// Include standard headers

	#include "dl/api/arm/arm64COMM_s.h"
	#include "dl/api/arm/omxtypes_s.h"


	// Import symbols required from other files
	// (For example tables)


	// Set debugging level
	//DEBUG_ON SETL {TRUE}



	// Guarding implementation by the processor name



	// Guarding implementation by the processor name



	// Input registers: the four values FFTSTAGE reads on entry.
	// (Comments are kept on their own lines, never at the end of a #define,
	// so that no comment text can end up inside a macro expansion.)

	// Source pointer. FFTSTAGE loads through it with post-increment
	// addressing, so it is modified.
	#define pSrc x0
	// Twiddle table base. Only read, to derive pTwiddleTmp and argTwiddle1.
	#define pTwiddle x1
	// Output base. Only read, to derive pOut1.
	#define pOut x2
	// Transform size N. Only read, to derive size and step.
	#define subFFTNum x3

	// Output registers

	// Local scratch registers
	//
	// argTwiddle, argDst and subFFTSize are not referenced by FFTSTAGE.
	// subFFTSize and size (below) both name x7.

	#define argTwiddle x5
	#define argDst x6
	#define subFFTSize x7
	// N appears only in comments; it expands to subFFTNum (x3).
	#define N subFFTNum

	// Running output pointer, initialised to pOut + step.
	#define pOut1 x13

	// size:        loop counter, starts at subFFTNum / 2
	// step:        byte distance between the front and the mirrored input pair
	// step1:       byte distance between the two twiddle loads of one iteration
	// twStep:      byte offset from pTwiddle used to initialise argTwiddle1
	// pTwiddleTmp: running twiddle pointer, starts at pTwiddle + 8
	// argTwiddle1: running twiddle pointer, starts at pTwiddle + twStep
	#define size x7
	#define step x8
	#define step1 x9
	#define twStep x10
	#define pTwiddleTmp x11
	#define argTwiddle1 x12

	// Neon registers
	//
	// Several names map to the same vector register (for example
	// dX0/dX0r -> v0, dX1/dX0i/dShift -> v1, dY0/dX1r -> v2, dY1/dX1i -> v3,
	// dW0r/dY0r/dY2 -> v4), so writing through one name overwrites the value
	// seen through its aliases.
	//
	// Names ending in "s" (v0.s, v1.s) are the element forms used by the
	// single-lane st1 stores. Names ending in "8b" are byte views of the same
	// register, used by the whole-register mov copies.
	//
	// dShift, qT0-qT3, dY2, dY3, dW0, dW1, dW0Tmp and dW1Neg are not
	// referenced by FFTSTAGE. qT0-qT3 use the 64-bit .2s arrangement despite
	// the "q" prefix.
	//
	// v8-v11 (dT0-dT3) and v13 (half) fall in the v8-v15 range that AAPCS64
	// treats as callee-saved (low 64 bits).

	#define dX0 v0.2s
	#define dX0s v0.s
	#define dShift v1.2s
	#define dX1 v1.2s
	#define dX1s v1.s
	#define dY0 v2.2s
	#define dY08b v2.8b
	#define dY1 v3.2s
	#define dX0r v0.2s
	#define dX0rs v0.s
	#define dX0i v1.2s
	#define dX1r v2.2s
	#define dX1i v3.2s
	#define dW0r v4.2s
	#define dW0r8b v4.8b
	#define dW0i v5.2s
	#define dW1r v6.2s
	#define dW1r8b v6.8b
	#define dW1i v7.2s
	#define dT0 v8.2s
	#define dT1 v9.2s
	#define dT2 v10.2s
	#define dT3 v11.2s
	#define qT0 v12.2s
	#define qT1 v14.2s
	#define qT2 v16.2s
	#define qT3 v18.2s
	#define dY0r v4.2s
	#define dY0i v5.2s
	#define dY1r v6.2s
	#define dY1i v7.2s

	#define dY2 v4.2s
	#define dY3 v5.2s
	#define dW0 v6.2s
	#define dW1 v7.2s
	#define dW0Tmp v10.2s
	#define dW1Neg v11.2s

	// dZip is the temporary used when emulating VZIP with zip1/zip2.
	// half holds the constant 0.5 in both lanes.
	#define dZip v19.2s
	#define dZip8b v19.8b
	#define half v13.2s

	// FFTSTAGE scaled, inverse, name
	//
	// NOTE(review): this definition sits after a .end directive and repeats
	// an FFTSTAGE definition that appears earlier in the file -- confirm
	// whether this second copy is intended.
	//
	// Emits the pre-twiddle pass. It reads complex float pairs through
	// pSrc, combines each F(k) with the mirrored F(N/2-k) and a twiddle
	// factor read relative to pTwiddle, scales by 0.5, and writes the
	// results starting at pOut + 4*subFFTNum bytes.
	//
	// Parameters:
	//   scaled, inverse - accepted but not referenced in the macro body.
	//   name            - suffix appended to the three local labels so
	//                     that separate expansions do not collide.
	//
	// Inputs:   pSrc (x0), pTwiddle (x1), pOut (x2), subFFTNum (x3).
	// Modifies: x0 (pSrc), x7-x13, v0-v11, v13, v19 and the condition
	//           flags. x1, x2 and x3 are only read.
	//
	// Control flow on size = subFFTNum/2:
	//   size <  2 : only Z(0) is produced.
	//   size == 2 : Z(0) and the "last element" are produced.
	//   size >  2 : Z(0), then the butterfly loop, then the last element.
	.MACRO FFTSTAGE scaled, inverse, name

	// Vector immediate: both lanes of half become 0.5. Lane 0 is the
	// scale factor used by every by-element fmul below.
	fmov half, 0.5

	asr size, subFFTNum, #1 // preserve the contents of N = subFFTNum
	lsl step, subFFTNum, #2 // step = N/2 * 8 bytes


	// Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
	// Note: W^(k) is stored as negated value and also need to
	// conjugate the values from the table

	// Z(0) : no need of twiddle multiply
	// Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }

	// dX0 = value at pSrc (a,b); dX1 = value at pSrc + step (c,d).
	ld1 {dX0},[pSrc],step
	ADD pOut1,pOut,step // pOut1 = pOut+ N/2*8 bytes

	ld1 {dX1},[pSrc], #8
	// twStep = 3N/8 * 8 bytes pointing to W^1
	SUB twStep,step,size,LSL #1

	lsl step1,size, #2 // step1 = N/4 * 8 = N/2*4 bytes
	SUB step1,step1,#8 // (N/4-1)*8 bytes

	fadd dY0,dX0,dX1 // [b+d | a+c]
	fsub dY1,dX0,dX1 // [b-d | a-c]
	fmul dY0, dY0, half[0]
	fmul dY1, dY1, half[0]

	// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
	// VZIP dY0,dY1
	// The two-register VZIP is emulated: zip1 goes to the dZip
	// temporary, zip2 is written in place, then dZip is copied back.
	zip1 dZip,dY0,dY1
	zip2 dY1,dY0,dY1
	mov dY08b, dZip8b

	// The flags set by SUBS are consumed by BLT/BEQ below; none of the
	// instructions in between write the flags.
	fsub dX0,dY0,dY1
	SUBS size,size,#2
	fadd dX1,dY0,dY1

	// Undo the step post-increment: pSrc is now 8 bytes past its entry
	// value, i.e. at the second complex input.
	SUB pSrc,pSrc,step

	// Z(0): lane 0 of the difference, then lane 1 of the sum.
	st1 {dX0s}[0],[pOut1], #4
	ADD pTwiddleTmp,pTwiddle,#8 // W^2
	st1 {dX1s}[1],[pOut1], #4
	ADD argTwiddle1,pTwiddle,twStep // W^1


	BLT decrementScale\name
	BEQ lastElement\name


	// Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
	// Note: W^k is stored as negative values in the table and also
	// need to conjugate the values from the table.
	//
	// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
	// since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)


	// Inside the loop, step is the distance from the front pair of
	// elements to the mirrored pair. It shrinks by 32 per iteration.
	SUB step,step,#24
	evenOddButterflyLoop\name :


	// Two twiddle loads step1 bytes apart. The SUB below leaves
	// argTwiddle1 advanced by a net 8 bytes per iteration.
	ld1 {dW0r},[argTwiddle1],step1
	ld1 {dW1r},[argTwiddle1], #8

	// De-interleaving loads: real parts go to the first register,
	// imaginary parts to the second. Front pair first, then the
	// mirrored pair step bytes further on.
	ld2 {dX0r,dX0i},[pSrc],step
	SUB argTwiddle1,argTwiddle1,step1
	ld2 {dX1r,dX1i},[pSrc], #16

	SUB step1,step1,#8 // (N/4-2)*8 bytes
	ld1 {dW0i},[pTwiddleTmp],step1
	ld1 {dW1i},[pTwiddleTmp], #8
	// Net effect on pSrc: +16 bytes (two complex values) per iteration.
	SUB pSrc,pSrc,step

	// Net effect on pTwiddleTmp: +8 bytes per iteration.
	SUB pTwiddleTmp,pTwiddleTmp,step1
	// Swap the two lanes of the mirrored pair so that they line up
	// with the front pair.
	rev64 dX1r,dX1r
	rev64 dX1i,dX1i
	// Loop counter. The flags are consumed by the BGT at the bottom of
	// the loop; nothing in between writes the flags.
	SUBS size,size,#4


	fsub dT2,dX0r,dX1r // a-c
	fadd dT3,dX0i,dX1i // b+d
	fadd dT0,dX0r,dX1r // a+c
	fsub dT1,dX0i,dX1i // b-d
	// Second decrement of step1 this iteration (16 bytes in total).
	SUB step1,step1,#8

	fmul dT2, dT2, half[0]
	fmul dT3, dT3, half[0]

	fmul dT0, dT0, half[0]
	fmul dT1, dT1, half[0]

	// VZIP dW1r,dW1i
	// VZIP dW0r,dW0i
	// Same zip1/zip2/mov emulation of VZIP as used for Z(0).
	zip1 dZip, dW1r,dW1i
	zip2 dW1i,dW1r,dW1i
	mov dW1r8b, dZip8b
	zip1 dZip,dW0r,dW0i
	zip2 dW0i,dW0r,dW0i
	mov dW0r8b, dZip8b

	// Twiddle multiply, written into dX0r/dX0i/dX1r/dX1i (v0-v3),
	// which overwrites the loaded inputs. The fmla/fmls signs are
	// opposite for the W0 and W1 products.
	fmul dX1r,dW1r,dT2
	fmul dX1i,dW1r,dT3
	fmul dX0r,dW0r,dT2
	fmul dX0i,dW0r,dT3

	fmls dX1r,dW1i,dT3
	fmla dX1i,dW1i,dT2

	fmla dX0r,dW0i,dT3
	fmls dX0i,dW0i,dT2


	// dY0r/dY0i/dY1r/dY1i are v4-v7, the registers that held the
	// twiddles, so the twiddle values are dead from here on.
	fadd dY1r,dT0,dX1i // F(N/2 -1)
	fsub dY1i,dX1r,dT1

	// Restore memory order for the mirrored pair before storing.
	rev64 dY1r,dY1r
	rev64 dY1i,dY1i


	fadd dY0r,dT0,dX0i // F(1)
	fsub dY0i,dT1,dX0r


	// Interleaving stores, mirroring the load pattern. Net effect on
	// pOut1: +16 bytes per iteration.
	st2 {dY0r,dY0i},[pOut1],step
	st2 {dY1r,dY1i},[pOut1], #16
	SUB pOut1,pOut1,step
	SUB step,step,#32 // (N/2-4)*8 bytes


	BGT evenOddButterflyLoop\name


	// set both the ptrs to the last element
	SUB pSrc,pSrc,#8
	SUB pOut1,pOut1,#8

	// Last element can be expanded as follows
	// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
	// -ve)
	// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
	// 1/2[2a+j0] - j (c-jd) [0+j2b]
	// (a+bc, -bd)
	// Since (c,d) = (0,1) for the last element, result is just (a,-b)

	lastElement\name :
	ld1 {dX0r},[pSrc]

	// Store the real part unchanged, negate the vector, then store
	// the (now negated) imaginary part.
	st1 {dX0rs}[0],[pOut1], #4
	fneg dX0r,dX0r
	st1 {dX0rs}[1],[pOut1]



	// Exit label: no instructions follow inside the macro.
	decrementScale\name :

	.endm

	// Entry point armSP_FFTInv_CCSToR_F32_preTwiddleRadix2: the body is a
	// single FFTSTAGE expansion with label suffix "Inv". The first two
	// macro arguments ("FALSE", "TRUE") are not referenced by FFTSTAGE.
	//
	// M_START and M_END are not defined in this file (presumably they come
	// from the included arm64COMM_s.h). The d15 argument presumably makes
	// them save and restore the callee-saved FP registers, which matters
	// because FFTSTAGE writes v8-v11 and v13 -- TODO confirm against the
	// header.
	//
	// NOTE(review): this repeats an entry point with the same name that is
	// defined earlier in the file, before a .end directive -- confirm
	// whether this second copy is intended.
	M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
	FFTSTAGE "FALSE","TRUE",Inv
	M_END

	.end