// dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S - deps/third_party/openmax - Git at Google

 //
 //  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 //
 //  Use of this source code is governed by a BSD-style license
 //  that can be found in the LICENSE file in the root of the source
 //  tree. An additional intellectual property rights grant can be found
 //  in the file PATENTS.  All contributing project authors may
 //  be found in the AUTHORS file in the root of the source tree.
 //
 //  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
 //  to support float instead of SC32.
 //

 //
 // Description:
 // Compute a Radix 4 FFT stage for a N point complex signal
 //
 //


 // Include standard headers

 #include "dl/api/arm/arm64COMM_s.h"
 #include "dl/api/arm/omxtypes_s.h"

 // Import symbols required from other files
 // (For example tables)


 // Set debugging level
 //DEBUG_ON    SETL {TRUE}


 // Guarding implementation by the processor name


 // Import symbols required from other files
 // (For example tables)
     //IMPORT  armAAC_constTable

 //Input Registers
 // x0-x4 are the first five AAPCS64 integer argument registers (presumably
 // the C-level arguments of the entry points at the bottom of the file).
 // The FFTSTAGE macro post-increments pSrc, pDst and pTwiddle, and loads one
 // value through each of pSubFFTNum and pSubFFTSize (they are pointers).

 #define pSrc            x0
 #define pDst            x1
 #define pTwiddle        x2
 #define	pSubFFTNum	x3
 #define pSubFFTSize	x4


 //Output Registers


 //Local Scratch Registers
 // All of these are in the caller-saved range x0-x17, so the macro does not
 // need to preserve them.  step16 and step24 only ever hold the constants
 // 16 and 24 so they can be used as register post-index amounts.

 #define subFFTNum       x5
 #define subFFTSize      x6
 #define outPointStep    x8
 #define grpCount        x9
 #define dstStep         x10
 #define grpTwStep       x13
 #define stepTwiddle     x14
 #define twStep          x15
 #define step16          x11
 #define step24          x12


 // Neon Registers
 // Every data alias is a 64-bit vector of two floats (.2s).  The names ending
 // in 8b are the same registers with a byte arrangement; they are used as the
 // operands of the vector mov instructions in FFTSTAGE.
 //
 // v0-v7 have two sets of names for the same registers:
 //   dButterfly*  - contents straight after the ld4 loads
 //   dX*          - contents after the uzp shuffles at the top of the loop

 #define dButterfly1Real02       v0.2s
 #define dButterfly1Real028b     v0.8b
 #define dButterfly1Imag02       v1.2s
 #define dButterfly1Imag028b     v1.8b
 #define dButterfly1Real13       v2.2s
 #define dButterfly1Real138b     v2.8b
 #define dButterfly1Imag13       v3.2s
 #define dButterfly1Imag138b     v3.8b
 #define dButterfly2Real02       v4.2s
 #define dButterfly2Imag02       v5.2s
 #define dButterfly2Real13       v6.2s
 #define dButterfly2Imag13       v7.2s
 #define dXr0                    v0.2s
 #define dXi0                    v1.2s
 #define dXr08b                  v0.8b
 #define dXi08b                  v1.8b
 #define dXr1                    v2.2s
 #define dXi1                    v3.2s
 #define dXr2                    v4.2s
 #define dXi2                    v5.2s
 #define dXr3                    v6.2s
 #define dXi3                    v7.2s

 // Y: results of the first add/sub level of the butterfly.
 #define dYr0                    v16.2s
 #define dYi0                    v17.2s
 #define dYr1                    v18.2s
 #define dYi1                    v19.2s
 #define dYr2                    v20.2s
 #define dYi2                    v21.2s
 #define dYr3                    v22.2s
 #define dYi3                    v23.2s

 // W: twiddle factors.  v8-v15 are callee-saved (low 64 bits) under AAPCS64.
 #define dW1r                    v8.2s
 #define dW1i                    v9.2s
 #define dW2r                    v10.2s
 #define dW2r8b                  v10.8b
 #define dW2i                    v11.2s
 #define dW3r                    v12.2s
 #define dW3r8b                  v12.8b
 #define dW3i                    v13.2s

 // Z: twiddled inputs, later reused for the values stored through pDst.
 // dZr0/dZi0 also live in the callee-saved range (v14, v15).
 #define dZr0                    v14.2s
 #define dZi0                    v15.2s
 #define dZr08b                  v14.8b
 #define dZi08b                  v15.8b
 #define dZr1                    v26.2s
 #define dZi1                    v27.2s
 #define dZr2                    v28.2s
 #define dZi2                    v29.2s
 #define dZr3                    v30.2s
 #define dZi3                    v31.2s

 // Temporary used while emulating the in-place zip/unzip operations.
 #define dZip                    v24.2s
 #define dZip8b                  v24.8b
 #define dZip8b                  v24.8b

         // FFTSTAGE scaled, inverse, name
         //
         // Expands to the complete body of one radix-4 stage of a complex
         // single-precision FFT (the file name and the "after the last stage"
         // remark below suggest it is the final stage -- confirm with callers).
         //
         //   scaled  : accepted but never referenced inside this macro.
         //   inverse : the string "TRUE" selects the inverse-transform code
         //             paths; any other value selects the forward paths.
         //   name    : suffix appended to the local labels so the macro can
         //             be expanded more than once in the same file.
         //
         // Inputs   : pSrc, pDst, pTwiddle, pSubFFTNum, pSubFFTSize.
         // Clobbers : x5, x6, x8-x15, v0-v24, v26-v31 and the condition
         //            flags; pSrc, pDst and pTwiddle are advanced.
         //
         // Every .2s register carries two butterflies, one per lane.  One
         // pass of the loop consumes 64 bytes from pSrc and issues four
         // 16-byte stores through pDst.  The loads for the next pass are
         // interleaved with the arithmetic of the current one, so the
         // instruction order inside the loop is significant.
         .MACRO FFTSTAGE scaled, inverse , name

         // Define stack arguments

         // Move args values into our work registers
         // (both arguments are dereferenced, i.e. they are pointers)
         ldr     subFFTNum, [pSubFFTNum]
         ldr     subFFTSize, [pSubFFTSize]

         // pOut0+1 increments pOut0 by 8 bytes
         // pOut0+outPointStep == increment of 8*outPointStep bytes
         lsl     outPointStep,subFFTSize, #3

         // Update grpCount and grpSize rightaway

         // Prime the twiddle registers for the first pass.  ld2 splits two
         // consecutive complex values into a real vector and an imaginary
         // vector.  Each ld1 fetches one interleaved complex value; the zips
         // at the top of the loop separate those into real and imaginary.
         ld2    {dW1r,dW1i},[pTwiddle]             // [wi|wr]
         MOV     step16,#16
         LSL     grpCount,subFFTSize,#2            // loop counter = 4*subFFTSize

         ld1    {dW2r},[pTwiddle]                  // [wi|wr]
         // NOTE(review): this overwrites the value loaded from [pSubFFTNum]
         // above before it has been used, and nothing in this macro stores
         // subFFTNum or subFFTSize back to memory.
         MOV     subFFTNum,#1                      //after the last stage

         ld1    {dW3r},[pTwiddle],step16           // [wi|wr]
         MOV     stepTwiddle,#0

         ld1    {dW2i},[pTwiddle],#8               // [wi|wr]
         SUB     grpTwStep,stepTwiddle,#8          // grpTwStep = -8 to start with

         // update subFFTSize for the next stage
         MOV     subFFTSize,grpCount
         ld1    {dW3i},[pTwiddle],grpTwStep        // [wi|wr]
         lsl     dstStep,outPointStep, #1

         // AC.r AC.i BD.r BD.i
         // ld4 de-interleaves four consecutive complex inputs A,B,C,D:
         // lane 0 of the four registers receives A.r, A.i, B.r, B.i and
         // lane 1 receives C.r, C.i, D.r, D.i.
         ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
         ADD     dstStep,dstStep,outPointStep      // dstStep = 3*outPointStep

         // rsb is not a base A64 mnemonic; presumably it is a helper macro
         // from the included arm64COMM_s.h -- confirm.
         rsb     dstStep,dstStep,#16               // dstStep = - 3*outPointStep+16
         MOV     step24,#24

         // AC.r AC.i BD.r BD.i
         // Same layout for the second butterfly of the pair.
         ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32


         // Process two groups at a time

 radix4lsGrpLoop\name :

         // VZIP    dW2r,dW2i
         // The zip1/zip2/mov triple emulates an in-place zip: zip2 overwrites
         // dW2i while still reading the old dW2r, so the zip1 result is
         // parked in dZip and copied into dW2r afterwards.  Result: dW2r
         // holds both real parts, dW2i both imaginary parts.
         zip1    dZip, dW2r, dW2i
         zip2    dW2i, dW2r, dW2i
         mov     dW2r8b, dZip8b

         // The twiddle stride grows by 16 bytes on every pass.
         ADD     stepTwiddle,stepTwiddle,#16

         // VZIP    dW3r,dW3i
         zip1    dZip, dW3r,dW3i
         zip2    dW3i, dW3r, dW3i
         mov     dW3r8b, dZip8b
         ADD     grpTwStep,stepTwiddle,#4

         // The four unzip triples below regroup the loaded data so that each
         // register holds the same input (A, B, C or D) of both butterflies:
         // uzp1 collects lane 0 of both sources, uzp2 collects lane 1.
         // Afterwards the registers are referred to by their dX* names.

         // VUZP     dButterfly1Real13, dButterfly2Real13      // B.r D.r
         uzp1     dZip, dButterfly1Real13, dButterfly2Real13   // B.r D.r
         uzp2     dButterfly2Real13, dButterfly1Real13, dButterfly2Real13   // B.r D.r
         mov      dButterfly1Real138b, dZip8b

         SUB     twStep,stepTwiddle,#16                        // -16+stepTwiddle

         // VUZP     dButterfly1Imag13, dButterfly2Imag13      // B.i D.i
         uzp1     dZip, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
         uzp2     dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
         mov      dButterfly1Imag138b, dZip8b
         lsl     grpTwStep,grpTwStep,#1

         // VUZP     dButterfly1Real02, dButterfly2Real02      // A.r C.r
         uzp1     dZip, dButterfly1Real02, dButterfly2Real02   // A.r C.r
         uzp2     dButterfly2Real02, dButterfly1Real02, dButterfly2Real02   // A.r C.r
         mov      dButterfly1Real028b, dZip8b
         rsb     grpTwStep,grpTwStep,#0                        // -8-2*stepTwiddle

         // VUZP     dButterfly1Imag02, dButterfly2Imag02      // A.i C.i
         uzp1     dZip, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
         uzp2     dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
         mov      dButterfly1Imag028b, dZip8b


         // grpCount is multiplied by 4
         // Two butterflies are handled per pass, hence the step of 8.  The
         // flags set here are consumed by the bne/beq further down and by
         // the BGT at the bottom of the loop; none of the instructions in
         // between writes the condition flags.
         SUBS    grpCount,grpCount,#8

         // Z1 = X1 * W1 (forward) or X1 * conj(W1) (inverse).
         .ifeqs  "\inverse", "TRUE"
             fmul   dZr1,dW1r,dXr1
             fmla   dZr1,dW1i,dXi1                       // real part
             fmul   dZi1,dW1r,dXi1
             fmls   dZi1,dW1i,dXr1                       // imag part

         .else

             fmul   dZr1,dW1r,dXr1
             fmls   dZr1,dW1i,dXi1                       // real part
             fmul   dZi1,dW1r,dXi1
             fmla   dZi1,dW1i,dXr1                       // imag part

         .endif

         // W1 is no longer needed for this pass: fetch it for the next one.
         ld2    {dW1r,dW1i},[pTwiddle],stepTwiddle       // [wi|wr]

         // Z2 = X2 * W2 (forward) or X2 * conj(W2) (inverse).  dW2r is
         // reloaded for the next pass right after its last use.
         .ifeqs  "\inverse", "TRUE"
             fmul   dZr2,dW2r,dXr2
             fmla   dZr2,dW2i,dXi2                       // real part
             fmul   dZi2,dW2r,dXi2
             ld1   {dW2r},[pTwiddle],step16              // [wi|wr]
             fmls   dZi2,dW2i,dXr2                       // imag part

         .else

             fmul   dZr2,dW2r,dXr2
             fmls   dZr2,dW2i,dXi2                       // real part
             fmul   dZi2,dW2r,dXi2
             ld1    {dW2r},[pTwiddle],step16             // [wi|wr]
             fmla   dZi2,dW2i,dXr2                       // imag part

         .endif


         ld1    {dW2i},[pTwiddle],twStep                 // [wi|wr]

         // move qX0 so as to load for the next iteration
         // MOV     qZ0,qX0
         // (X0 is not multiplied by a twiddle: Z0 is a plain copy)
         mov     dZr08b, dXr08b
         mov     dZi08b, dXi08b

         // Z3 = X3 * W3 (forward) or X3 * conj(W3) (inverse).
         .ifeqs  "\inverse", "TRUE"
             fmul   dZr3,dW3r,dXr3
             fmla   dZr3,dW3i,dXi3                       // real part
             fmul   dZi3,dW3r,dXi3
             ld1    {dW3r},[pTwiddle],step24
             fmls   dZi3,dW3i,dXr3                       // imag part

         .else

             fmul   dZr3,dW3r,dXr3
             fmls   dZr3,dW3i,dXi3                       // real part
             fmul   dZi3,dW3r,dXi3
             ld1    {dW3r},[pTwiddle],step24
             fmla   dZi3,dW3i,dXr3                       // imag part

         .endif

         // grpTwStep is negative: it rewinds pTwiddle for the next pass.
         ld1    {dW3i},[pTwiddle],grpTwStep              // [wi|wr]

         // Don't do the load on the last iteration so we don't read past the end
         // of pSrc.
         // When the counter reached zero (Z set) pSrc is simply advanced by
         // the 64 bytes the two skipped loads would have consumed.
         bne     skipIncrement\name
         add     pSrc, pSrc, #64
 skipIncrement\name:
         beq     radix4lsSkipRead\name
         // AC.r AC.i BD.r BD.i
         ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32

         // AC.r AC.i BD.r BD.i
         ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
 radix4lsSkipRead\name:

         // finish first stage of 4 point FFT

         // fadd    qY0,qZ0,qZ2
         fadd    dYr0,dZr0,dZr2
         fadd    dYi0,dZi0,dZi2
         // fsub    qY2,qZ0,qZ2
         fsub    dYr2,dZr0,dZr2
         fsub    dYi2,dZi0,dZi2
         // fadd    qY1,qZ1,qZ3
         fadd    dYr1,dZr1,dZr3
         fadd    dYi1,dZi1,dZi3
         // fsub    qY3,qZ1,qZ3
         fsub    dYr3,dZr1,dZr3
         fsub    dYi3,dZi1,dZi3


         // finish second stage of 4 point FFT
         // The Z registers are reused for the output values.  As written:
         //   Z0 = Y2 - Y1          Z2 = Y2 + Y1
         //   Z1 = Y0 + j*Y3        Z3 = Y0 - j*Y3
         // The forward path stores Z0, Z1, Z2, Z3; the inverse path stores
         // Z0, Z3, Z2, Z1.  Consecutive stores are outPointStep bytes apart.

         .ifeqs  "\inverse", "TRUE"

             // fsub    qZ0,qY2,qY1
             fsub    dZr0,dYr2,dYr1
             fsub    dZi0,dYi2,dYi1
             fadd    dZr3,dYr0,dYi3
             st2    {dZr0,dZi0},[pDst],outPointStep
             fsub    dZi3,dYi0,dYr3

             // fadd    qZ2,qY2,qY1
             fadd    dZr2,dYr2,dYr1
             fadd    dZi2,dYi2,dYi1

             st2    {dZr3,dZi3},[pDst],outPointStep

             fsub    dZr1,dYr0,dYi3
             st2    {dZr2,dZi2},[pDst],outPointStep
             fadd    dZi1,dYi0,dYr3

             // dstStep = -3*outPointStep + 16: undo the three strides above
             // and move on by the 16 bytes written per store, so pDst ends
             // 16 bytes past where this pass started.
             st2    {dZr1,dZi1},[pDst],dstStep


         .else

             // fsub    qZ0,qY2,qY1
             fsub    dZr0,dYr2,dYr1
             fsub    dZi0,dYi2,dYi1

             fsub    dZr1,dYr0,dYi3
             st2    {dZr0,dZi0},[pDst],outPointStep
             fadd    dZi1,dYi0,dYr3

             // fadd    qZ2,qY2,qY1
             fadd    dZr2,dYr2,dYr1
             fadd    dZi2,dYi2,dYi1

             st2    {dZr1,dZi1},[pDst],outPointStep

             fadd    dZr3,dYr0,dYi3
             st2    {dZr2,dZi2},[pDst],outPointStep
             fsub    dZi3,dYi0,dYr3

             // dstStep = -3*outPointStep + 16: undo the three strides above
             // and move on by the 16 bytes written per store, so pDst ends
             // 16 bytes past where this pass started.
             st2    {dZr3,dZi3},[pDst],dstStep


         .endif

         // Loop while the counter decremented by the SUBS above is positive.
         BGT     radix4lsGrpLoop\name

         .endm


         // Forward-transform entry point: one expansion of FFTSTAGE with
         // inverse = "FALSE" and label suffix "fwd", wrapped in M_START/M_END.
         // Those two macros are not defined in this file (presumably they come
         // from arm64COMM_s.h and emit the function prologue and epilogue).
         // The d15 argument presumably requests saving the NEON registers up
         // to d15, which the macro body overwrites (v8-v15) -- confirm.
         M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
         FFTSTAGE "FALSE","FALSE",fwd
         M_END


         // Inverse-transform entry point: one expansion of FFTSTAGE with
         // inverse = "TRUE" and label suffix "inv", wrapped in M_START/M_END.
         // Those two macros are not defined in this file (presumably they come
         // from arm64COMM_s.h and emit the function prologue and epilogue).
         // The d15 argument presumably requests saving the NEON registers up
         // to d15, which the macro body overwrites (v8-v15) -- confirm.
         M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
         FFTSTAGE "FALSE","TRUE",inv
         M_END


         .end
	//
	// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
	//
	// Use of this source code is governed by a BSD-style license
	// that can be found in the LICENSE file in the root of the source
	// tree. An additional intellectual property rights grant can be found
	// in the file PATENTS. All contributing project authors may
	// be found in the AUTHORS file in the root of the source tree.
	//
	// This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
	// to support float instead of SC32.
	//

	//
	// Description:
	// Compute a Radix 4 FFT stage for a N point complex signal
	//
	//


	// Include standard headers

	#include "dl/api/arm/arm64COMM_s.h"
	#include "dl/api/arm/omxtypes_s.h"

	// Import symbols required from other files
	// (For example tables)




	// Set debugging level
	//DEBUG_ON SETL {TRUE}


	// Guarding implementation by the processor name


	// Import symbols required from other files
	// (For example tables)
	//IMPORT armAAC_constTable

	//Input Registers
	// x0-x4 are the first five AAPCS64 integer argument registers (presumably
	// the C-level arguments of the entry points at the bottom of the file).
	// The FFTSTAGE macro post-increments pSrc, pDst and pTwiddle, and loads one
	// value through each of pSubFFTNum and pSubFFTSize (they are pointers).

	#define pSrc x0
	#define pDst x1
	#define pTwiddle x2
	#define pSubFFTNum x3
	#define pSubFFTSize x4



	//Output Registers


	//Local Scratch Registers
	// All of these are in the caller-saved range x0-x17, so the macro does not
	// need to preserve them.  step16 and step24 only ever hold the constants
	// 16 and 24 so they can be used as register post-index amounts.

	#define subFFTNum x5
	#define subFFTSize x6
	#define outPointStep x8
	#define grpCount x9
	#define dstStep x10
	#define grpTwStep x13
	#define stepTwiddle x14
	#define twStep x15
	#define step16 x11
	#define step24 x12


	// Neon Registers
	// Every data alias is a 64-bit vector of two floats (.2s).  The names ending
	// in 8b are the same registers with a byte arrangement; they are used as the
	// operands of the vector mov instructions in FFTSTAGE.
	//
	// v0-v7 have two sets of names for the same registers:
	//   dButterfly*  - contents straight after the ld4 loads
	//   dX*          - contents after the uzp shuffles at the top of the loop

	#define dButterfly1Real02 v0.2s
	#define dButterfly1Real028b v0.8b
	#define dButterfly1Imag02 v1.2s
	#define dButterfly1Imag028b v1.8b
	#define dButterfly1Real13 v2.2s
	#define dButterfly1Real138b v2.8b
	#define dButterfly1Imag13 v3.2s
	#define dButterfly1Imag138b v3.8b
	#define dButterfly2Real02 v4.2s
	#define dButterfly2Imag02 v5.2s
	#define dButterfly2Real13 v6.2s
	#define dButterfly2Imag13 v7.2s
	#define dXr0 v0.2s
	#define dXi0 v1.2s
	#define dXr08b v0.8b
	#define dXi08b v1.8b
	#define dXr1 v2.2s
	#define dXi1 v3.2s
	#define dXr2 v4.2s
	#define dXi2 v5.2s
	#define dXr3 v6.2s
	#define dXi3 v7.2s

	// Y: results of the first add/sub level of the butterfly.
	#define dYr0 v16.2s
	#define dYi0 v17.2s
	#define dYr1 v18.2s
	#define dYi1 v19.2s
	#define dYr2 v20.2s
	#define dYi2 v21.2s
	#define dYr3 v22.2s
	#define dYi3 v23.2s

	// W: twiddle factors.  v8-v15 are callee-saved (low 64 bits) under AAPCS64.
	#define dW1r v8.2s
	#define dW1i v9.2s
	#define dW2r v10.2s
	#define dW2r8b v10.8b
	#define dW2i v11.2s
	#define dW3r v12.2s
	#define dW3r8b v12.8b
	#define dW3i v13.2s

	// Z: twiddled inputs, later reused for the values stored through pDst.
	// dZr0/dZi0 also live in the callee-saved range (v14, v15).
	#define dZr0 v14.2s
	#define dZi0 v15.2s
	#define dZr08b v14.8b
	#define dZi08b v15.8b
	#define dZr1 v26.2s
	#define dZi1 v27.2s
	#define dZr2 v28.2s
	#define dZi2 v29.2s
	#define dZr3 v30.2s
	#define dZi3 v31.2s

	// Temporary used while emulating the in-place zip/unzip operations.
	#define dZip v24.2s
	#define dZip8b v24.8b

	// FFTSTAGE scaled, inverse, name
	//
	// Expands to the complete body of one radix-4 stage of a complex
	// single-precision FFT (the file name and the "after the last stage"
	// remark below suggest it is the final stage -- confirm with callers).
	//
	//   scaled  : accepted but never referenced inside this macro.
	//   inverse : the string "TRUE" selects the inverse-transform code
	//             paths; any other value selects the forward paths.
	//   name    : suffix appended to the local labels so the macro can
	//             be expanded more than once in the same file.
	//
	// Inputs   : pSrc, pDst, pTwiddle, pSubFFTNum, pSubFFTSize.
	// Clobbers : x5, x6, x8-x15, v0-v24, v26-v31 and the condition
	//            flags; pSrc, pDst and pTwiddle are advanced.
	//
	// Every .2s register carries two butterflies, one per lane.  One
	// pass of the loop consumes 64 bytes from pSrc and issues four
	// 16-byte stores through pDst.  The loads for the next pass are
	// interleaved with the arithmetic of the current one, so the
	// instruction order inside the loop is significant.
	.MACRO FFTSTAGE scaled, inverse , name

	// Define stack arguments

	// Move args values into our work registers
	// (both arguments are dereferenced, i.e. they are pointers)
	ldr subFFTNum, [pSubFFTNum]
	ldr subFFTSize, [pSubFFTSize]

	// pOut0+1 increments pOut0 by 8 bytes
	// pOut0+outPointStep == increment of 8*outPointStep bytes
	lsl outPointStep,subFFTSize, #3

	// Update grpCount and grpSize rightaway

	// Prime the twiddle registers for the first pass.  ld2 splits two
	// consecutive complex values into a real vector and an imaginary
	// vector.  Each ld1 fetches one interleaved complex value; the zips
	// at the top of the loop separate those into real and imaginary.
	ld2 {dW1r,dW1i},[pTwiddle] // [wi|wr]
	MOV step16,#16
	LSL grpCount,subFFTSize,#2 // loop counter = 4*subFFTSize

	ld1 {dW2r},[pTwiddle] // [wi|wr]
	// NOTE(review): this overwrites the value loaded from [pSubFFTNum]
	// above before it has been used, and nothing in this macro stores
	// subFFTNum or subFFTSize back to memory.
	MOV subFFTNum,#1 //after the last stage

	ld1 {dW3r},[pTwiddle],step16 // [wi|wr]
	MOV stepTwiddle,#0

	ld1 {dW2i},[pTwiddle],#8 // [wi|wr]
	SUB grpTwStep,stepTwiddle,#8 // grpTwStep = -8 to start with

	// update subFFTSize for the next stage
	MOV subFFTSize,grpCount
	ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr]
	lsl dstStep,outPointStep, #1

	// AC.r AC.i BD.r BD.i
	// ld4 de-interleaves four consecutive complex inputs A,B,C,D:
	// lane 0 of the four registers receives A.r, A.i, B.r, B.i and
	// lane 1 receives C.r, C.i, D.r, D.i.
	ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
	ADD dstStep,dstStep,outPointStep // dstStep = 3*outPointStep

	// rsb is not a base A64 mnemonic; presumably it is a helper macro
	// from the included arm64COMM_s.h -- confirm.
	rsb dstStep,dstStep,#16 // dstStep = - 3*outPointStep+16
	MOV step24,#24

	// AC.r AC.i BD.r BD.i
	// Same layout for the second butterfly of the pair.
	ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32


	// Process two groups at a time

	radix4lsGrpLoop\name :

	// VZIP dW2r,dW2i
	// The zip1/zip2/mov triple emulates an in-place zip: zip2 overwrites
	// dW2i while still reading the old dW2r, so the zip1 result is
	// parked in dZip and copied into dW2r afterwards.  Result: dW2r
	// holds both real parts, dW2i both imaginary parts.
	zip1 dZip, dW2r, dW2i
	zip2 dW2i, dW2r, dW2i
	mov dW2r8b, dZip8b

	// The twiddle stride grows by 16 bytes on every pass.
	ADD stepTwiddle,stepTwiddle,#16

	// VZIP dW3r,dW3i
	zip1 dZip, dW3r,dW3i
	zip2 dW3i, dW3r, dW3i
	mov dW3r8b, dZip8b
	ADD grpTwStep,stepTwiddle,#4

	// The four unzip triples below regroup the loaded data so that each
	// register holds the same input (A, B, C or D) of both butterflies:
	// uzp1 collects lane 0 of both sources, uzp2 collects lane 1.
	// Afterwards the registers are referred to by their dX* names.

	// VUZP dButterfly1Real13, dButterfly2Real13 // B.r D.r
	uzp1 dZip, dButterfly1Real13, dButterfly2Real13 // B.r D.r
	uzp2 dButterfly2Real13, dButterfly1Real13, dButterfly2Real13 // B.r D.r
	mov dButterfly1Real138b, dZip8b

	SUB twStep,stepTwiddle,#16 // -16+stepTwiddle

	// VUZP dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
	uzp1 dZip, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
	uzp2 dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
	mov dButterfly1Imag138b, dZip8b
	lsl grpTwStep,grpTwStep,#1

	// VUZP dButterfly1Real02, dButterfly2Real02 // A.r C.r
	uzp1 dZip, dButterfly1Real02, dButterfly2Real02 // A.r C.r
	uzp2 dButterfly2Real02, dButterfly1Real02, dButterfly2Real02 // A.r C.r
	mov dButterfly1Real028b, dZip8b
	rsb grpTwStep,grpTwStep,#0 // -8-2*stepTwiddle

	// VUZP dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
	uzp1 dZip, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
	uzp2 dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
	mov dButterfly1Imag028b, dZip8b


	// grpCount is multiplied by 4
	// Two butterflies are handled per pass, hence the step of 8.  The
	// flags set here are consumed by the bne/beq further down and by
	// the BGT at the bottom of the loop; none of the instructions in
	// between writes the condition flags.
	SUBS grpCount,grpCount,#8

	// Z1 = X1 * W1 (forward) or X1 * conj(W1) (inverse).
	.ifeqs "\inverse", "TRUE"
	fmul dZr1,dW1r,dXr1
	fmla dZr1,dW1i,dXi1 // real part
	fmul dZi1,dW1r,dXi1
	fmls dZi1,dW1i,dXr1 // imag part

	.else

	fmul dZr1,dW1r,dXr1
	fmls dZr1,dW1i,dXi1 // real part
	fmul dZi1,dW1r,dXi1
	fmla dZi1,dW1i,dXr1 // imag part

	.endif

	// W1 is no longer needed for this pass: fetch it for the next one.
	ld2 {dW1r,dW1i},[pTwiddle],stepTwiddle // [wi|wr]

	// Z2 = X2 * W2 (forward) or X2 * conj(W2) (inverse).  dW2r is
	// reloaded for the next pass right after its last use.
	.ifeqs "\inverse", "TRUE"
	fmul dZr2,dW2r,dXr2
	fmla dZr2,dW2i,dXi2 // real part
	fmul dZi2,dW2r,dXi2
	ld1 {dW2r},[pTwiddle],step16 // [wi|wr]
	fmls dZi2,dW2i,dXr2 // imag part

	.else

	fmul dZr2,dW2r,dXr2
	fmls dZr2,dW2i,dXi2 // real part
	fmul dZi2,dW2r,dXi2
	ld1 {dW2r},[pTwiddle],step16 // [wi|wr]
	fmla dZi2,dW2i,dXr2 // imag part

	.endif


	ld1 {dW2i},[pTwiddle],twStep // [wi|wr]

	// move qX0 so as to load for the next iteration
	// MOV qZ0,qX0
	// (X0 is not multiplied by a twiddle: Z0 is a plain copy)
	mov dZr08b, dXr08b
	mov dZi08b, dXi08b

	// Z3 = X3 * W3 (forward) or X3 * conj(W3) (inverse).
	.ifeqs "\inverse", "TRUE"
	fmul dZr3,dW3r,dXr3
	fmla dZr3,dW3i,dXi3 // real part
	fmul dZi3,dW3r,dXi3
	ld1 {dW3r},[pTwiddle],step24
	fmls dZi3,dW3i,dXr3 // imag part

	.else

	fmul dZr3,dW3r,dXr3
	fmls dZr3,dW3i,dXi3 // real part
	fmul dZi3,dW3r,dXi3
	ld1 {dW3r},[pTwiddle],step24
	fmla dZi3,dW3i,dXr3 // imag part

	.endif

	// grpTwStep is negative: it rewinds pTwiddle for the next pass.
	ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr]

	// Don't do the load on the last iteration so we don't read past the end
	// of pSrc.
	// When the counter reached zero (Z set) pSrc is simply advanced by
	// the 64 bytes the two skipped loads would have consumed.
	bne skipIncrement\name
	add pSrc, pSrc, #64
	skipIncrement\name:
	beq radix4lsSkipRead\name
	// AC.r AC.i BD.r BD.i
	ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32

	// AC.r AC.i BD.r BD.i
	ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
	radix4lsSkipRead\name:

	// finish first stage of 4 point FFT

	// fadd qY0,qZ0,qZ2
	fadd dYr0,dZr0,dZr2
	fadd dYi0,dZi0,dZi2
	// fsub qY2,qZ0,qZ2
	fsub dYr2,dZr0,dZr2
	fsub dYi2,dZi0,dZi2
	// fadd qY1,qZ1,qZ3
	fadd dYr1,dZr1,dZr3
	fadd dYi1,dZi1,dZi3
	// fsub qY3,qZ1,qZ3
	fsub dYr3,dZr1,dZr3
	fsub dYi3,dZi1,dZi3


	// finish second stage of 4 point FFT
	// The Z registers are reused for the output values.  As written:
	//   Z0 = Y2 - Y1          Z2 = Y2 + Y1
	//   Z1 = Y0 + j*Y3        Z3 = Y0 - j*Y3
	// The forward path stores Z0, Z1, Z2, Z3; the inverse path stores
	// Z0, Z3, Z2, Z1.  Consecutive stores are outPointStep bytes apart.

	.ifeqs "\inverse", "TRUE"

	// fsub qZ0,qY2,qY1
	fsub dZr0,dYr2,dYr1
	fsub dZi0,dYi2,dYi1
	fadd dZr3,dYr0,dYi3
	st2 {dZr0,dZi0},[pDst],outPointStep
	fsub dZi3,dYi0,dYr3

	// fadd qZ2,qY2,qY1
	fadd dZr2,dYr2,dYr1
	fadd dZi2,dYi2,dYi1

	st2 {dZr3,dZi3},[pDst],outPointStep

	fsub dZr1,dYr0,dYi3
	st2 {dZr2,dZi2},[pDst],outPointStep
	fadd dZi1,dYi0,dYr3

	// dstStep = -3*outPointStep + 16: undo the three strides above
	// and move on by the 16 bytes written per store, so pDst ends
	// 16 bytes past where this pass started.
	st2 {dZr1,dZi1},[pDst],dstStep


	.else

	// fsub qZ0,qY2,qY1
	fsub dZr0,dYr2,dYr1
	fsub dZi0,dYi2,dYi1

	fsub dZr1,dYr0,dYi3
	st2 {dZr0,dZi0},[pDst],outPointStep
	fadd dZi1,dYi0,dYr3

	// fadd qZ2,qY2,qY1
	fadd dZr2,dYr2,dYr1
	fadd dZi2,dYi2,dYi1

	st2 {dZr1,dZi1},[pDst],outPointStep

	fadd dZr3,dYr0,dYi3
	st2 {dZr2,dZi2},[pDst],outPointStep
	fsub dZi3,dYi0,dYr3

	// dstStep = -3*outPointStep + 16: undo the three strides above
	// and move on by the 16 bytes written per store, so pDst ends
	// 16 bytes past where this pass started.
	st2 {dZr3,dZi3},[pDst],dstStep


	.endif

	// Loop while the counter decremented by the SUBS above is positive.
	BGT radix4lsGrpLoop\name

	.endm


	// Forward-transform entry point: one expansion of FFTSTAGE with
	// inverse = "FALSE" and label suffix "fwd", wrapped in M_START/M_END.
	// Those two macros are not defined in this file (presumably they come
	// from arm64COMM_s.h and emit the function prologue and epilogue).
	// The d15 argument presumably requests saving the NEON registers up
	// to d15, which the macro body overwrites (v8-v15) -- confirm.
	M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
	FFTSTAGE "FALSE","FALSE",fwd
	M_END


	// Inverse-transform entry point: one expansion of FFTSTAGE with
	// inverse = "TRUE" and label suffix "inv", wrapped in M_START/M_END.
	// Those two macros are not defined in this file (presumably they come
	// from arm64COMM_s.h and emit the function prologue and epilogue).
	// The d15 argument presumably requests saving the NEON registers up
	// to d15, which the macro body overwrites (v8-v15) -- confirm.
	M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
	FFTSTAGE "FALSE","TRUE",inv
	M_END


	.end