@// blob: 355fa0ffcacf9fa08202fa0321c60bbc791ff3ce [file] [log] [blame]
@//
@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@// Use of this source code is governed by a BSD-style license
@// that can be found in the LICENSE file in the root of the source
@// tree. An additional intellectual property rights grant can be found
@// in the file PATENTS. All contributing project authors may
@// be found in the AUTHORS file in the root of the source tree.
@//
@//
@// This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
@// to support float instead of SC32.
@//
@//
@// Description:
@// Compute a Radix 4 FFT stage for a N point complex signal
@//
@//
@// Include standard headers
#include "dl/api/armCOMM_s.h"
#include "dl/api/omxtypes_s.h"
@// Import symbols required from other files
@// (For example tables)
@// Set debugging level
@//DEBUG_ON SETL {TRUE}
@// Guarding implementation by the processor name
@// Guarding implementation by the processor name
@// Import symbols required from other files
@// (For example tables)
@// ---------------------------------------------------------------------
@// Register aliases used by the FFTSTAGE macro in this file.
@// ---------------------------------------------------------------------
@// Input registers (read by FFTSTAGE on entry)
@// pSrc       : address operand of every VLD2 data load
@// pDst       : address operand of every VST2 result store
@// pTwiddle   : address operand of every VLD1 twiddle load
@// subFFTNum  : shifted right by 2 at the top of FFTSTAGE
@// subFFTSize : replaced by 4 times its entry value at the top of FFTSTAGE
#define pSrc r0
#define pDst r2
#define pTwiddle r1
#define subFFTNum r6
#define subFFTSize r7
@// Output registers
@// (no separate aliases: FFTSTAGE leaves its results in pSrc, pDst,
@//  subFFTNum and subFFTSize)
@// Local scratch registers
@// grpCount     : outer loop counter, starts at 4*subFFTSize and is
@//                decremented by 4 per pass
@// pointStep    : byte stride between data[k] and data[k+1] of one set
@// outPointStep : byte stride between the four stores of one set
@// stepTwiddle  : byte stride between twiddles, grows by pointStep per group
@// setCount     : inner loop counter; note that this is r14, the link
@//                register
@// srcStep      : post-group adjustment of pSrc (2*pointStep - 16)
@// setStep      : post-index applied after the data[3] load (-3*pointStep)
@// dstStep      : post-index applied after the fourth store
@//                (16 - 3*outPointStep)
@// twStep       : post-index applied after the third twiddle load
@//                (-3*stepTwiddle)
@// t1           : temporary for the final pointer swap; shares r3 with
@//                grpCount and is only written after the group loop ends
#define grpCount r3
#define pointStep r4
#define outPointStep r5
#define stepTwiddle r12
#define setCount r14
#define srcStep r8
#define setStep r9
#define dstStep r10
#define twStep r11
#define t1 r3
@// Neon Registers
@// dW1..dW3 : one twiddle each; lanes are addressed as dWn[0] and dWn[1]
#define dW1 D0.F32
#define dW2 D1.F32
#define dW3 D2.F32
@// dXrN / dXiN : real and imaginary parts of input point N as split by VLD2
#define dXr0 D4.F32
#define dXi0 D5.F32
#define dXr1 D6.F32
#define dXi1 D7.F32
#define dXr2 D8.F32
#define dXi2 D9.F32
#define dXr3 D10.F32
#define dXi3 D11.F32
@// dYrN / dYiN : intermediate sums and differences of the butterfly
#define dYr0 D12.F32
#define dYi0 D13.F32
#define dYr1 D14.F32
#define dYi1 D15.F32
#define dYr2 D16.F32
#define dYi2 D17.F32
#define dYr3 D18.F32
#define dYi3 D19.F32
@// qT0..qT3 : not referenced by any instruction in this file.  Despite the
@// q prefix they name D registers that coincide with dYr2, dYr3, dYr0, dYr1.
@// NOTE(review): presumably left over from the SC32 source this file was
@// derived from - confirm before removing.
#define qT0 d16.f32
#define qT1 d18.f32
#define qT2 d12.f32
#define qT3 d14.f32
@// dZrN / dZiN : twiddled inputs first, then the values handed to VST2
#define dZr0 D20.F32
#define dZi0 D21.F32
#define dZr1 D22.F32
#define dZi1 D23.F32
#define dZr2 D24.F32
#define dZi2 D25.F32
#define dZr3 D26.F32
#define dZi3 D27.F32
@// Quad views.  Architecturally Qn overlays D(2n) and D(2n+1), so
@// qY0..qY3 = {dYrN,dYiN}, qX0 = {dXr0,dXi0} and qZ0..qZ3 = {dZrN,dZiN};
@// a single Q operation therefore handles real and imaginary parts together.
#define qY0 Q6.F32
#define qY1 Q7.F32
#define qY2 Q8.F32
#define qY3 Q9.F32
#define qX0 Q2.F32
#define qZ0 Q10.F32
#define qZ1 Q11.F32
#define qZ2 Q12.F32
#define qZ3 Q13.F32
@// ---------------------------------------------------------------------
@// FFTSTAGE scaled, inverse, name
@//
@// Expands to one out-of-place radix-4 pass over single-precision complex
@// data.  VLD2/VST2 split and re-join the data as alternating real and
@// imaginary 32-bit values, two complex points per register pair.
@//
@//   scaled  : accepted but never referenced inside the macro body.
@//   inverse : "TRUE" selects the branch that multiplies by the conjugate
@//             of the twiddle and stores in the order Z0,Z3,Z2,Z1; any
@//             other string selects the other branch (order Z0,Z1,Z2,Z3).
@//   name    : pasted onto both loop labels so that every expansion gets
@//             its own labels.
@//
@// Register interface (aliases are defined at the top of this file):
@//   read    : pSrc, pTwiddle, pDst, subFFTNum, subFFTSize
@//   on exit : subFFTNum  = entry value >> 2
@//             subFFTSize = entry value << 2
@//             pSrc, pDst = exchanged and rewound (see the last three
@//                          instructions)
@//             pTwiddle   = entry value (each group adds stepTwiddle once
@//                          and the three reloads add +1, +1, -3 times
@//                          stepTwiddle)
@//   written : r3-r5, r8-r12, r14, D0-D2, D4-D27, condition flags
@// ---------------------------------------------------------------------
.MACRO FFTSTAGE scaled, inverse , name
@// Update the group count and the sub-FFT bookkeeping right away so that
@// the input registers can be reused.
@// grpCount = 4 * subFFTSize (the group loop counts down in steps of 4)
LSL grpCount,subFFTSize,#2
LSR subFFTNum,subFFTNum,#2
MOV subFFTSize,grpCount
@// Twiddles for the first group: dW1, dW2 and dW3 are all loaded from the
@// same address, the first table entry.
VLD1 dW1,[pTwiddle] @//[wi | wr]
@// pointStep is built in two steps: 2*subFFTNum here (the value fed to
@// SMULBB below), then shifted left by 2 to give 8*subFFTNum bytes.
MOV pointStep,subFFTNum,LSL #1
@// stepTwiddle accumulates pointStep once per group, starting from zero.
MOV stepTwiddle,#0
VLD1 dW2,[pTwiddle] @//[wi | wr]
@// outPointStep = grpCount * (2*subFFTNum); it is used below as the byte
@// stride between the four stores of one set.
@// NOTE(review): SMULBB multiplies only the bottom signed halfwords of its
@// operands, so both factors are assumed to fit in 16 bits - confirm the
@// largest transform size the callers use.
SMULBB outPointStep,grpCount,pointStep
LSL pointStep,pointStep,#2 @// 2*grpSize
VLD1 dW3,[pTwiddle] @//[wi | wr]
MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep
ADD setStep,srcStep,pointStep @// setStep = 3*pointStep
RSB setStep,setStep,#0 @// setStep = - 3*pointStep
SUB srcStep,srcStep,#16 @// srcStep = 2*pointStep-16
MOV dstStep,outPointStep,LSL #1
ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
@// dstStep = 16 - 3*outPointStep: applied after the fourth store it undoes
@// the three outPointStep increments and advances pDst by 16 bytes.
RSB dstStep,dstStep,#16
@// ---- Group loop: one pass per group, grpCount/4 passes in total ----
radix4GrpLoop\name :
@// Preload the four inputs of the first set of this group; the set loop
@// below always works on values loaded one pass earlier.
VLD2 {dXr0,dXi0},[pSrc],pointStep @// data[0]
ADD stepTwiddle,stepTwiddle,pointStep
VLD2 {dXr1,dXi1},[pSrc],pointStep @// data[1]
@// Point pTwiddle at the first twiddle that the reload after the set loop
@// will fetch (table start + stepTwiddle).
ADD pTwiddle,pTwiddle,stepTwiddle
VLD2 {dXr2,dXi2},[pSrc],pointStep @// data[2]
MOV twStep,stepTwiddle,LSL #2
@// data[3]; the post-index of -3*pointStep returns pSrc to data[0]
VLD2 {dXr3,dXi3},[pSrc],setStep
SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle
@// setCount = pointStep/8; it is decremented by 2 per pass because each
@// VLD2 above fetched 16 bytes.
MOV setCount,pointStep,LSR #3
@// Step to data[0] of the next set ...
ADD pSrc,pSrc,#16
@// ... and on to data[1] of the next set, which is the first load issued
@// inside the set loop.
ADD pSrc,pSrc,pointStep
@// ---- Set loop ----
@// The flags set by SUBS are consumed by the BGT at the bottom; nothing in
@// between (NEON instructions and ADD without the S suffix) alters them.
radix4SetLoop\name :
SUBS setCount,setCount,#2
.ifeqs "\inverse", "TRUE"
@// Z = X * conj(W), with W = dWn[0] + j*dWn[1]:
@//   Zr = Xr*W[0] + Xi*W[1],  Zi = Xi*W[0] - Xr*W[1]
@// The loads for the next pass are interleaved as soon as the register
@// pair they overwrite has been fully consumed.
VMUL dZr1,dXr1,dW1[0]
VMUL dZi1,dXi1,dW1[0]
VMUL dZr2,dXr2,dW2[0]
VMUL dZi2,dXi2,dW2[0]
VMUL dZr3,dXr3,dW3[0]
VMUL dZi3,dXi3,dW3[0]
VMLA dZr1,dXi1,dW1[1] @// real part
VMLS dZi1,dXr1,dW1[1] @// imag part
@// data[1] for next iteration
VLD2 {dXr1,dXi1},[pSrc],pointStep
VMLA dZr2,dXi2,dW2[1] @// real part
VMLS dZi2,dXr2,dW2[1] @// imag part
@// data[2] for next iteration
VLD2 {dXr2,dXi2},[pSrc],pointStep
VMLA dZr3,dXi3,dW3[1] @// real part
VMLS dZi3,dXr3,dW3[1] @// imag part
.else
@// Z = X * W, with W = dWn[0] + j*dWn[1]:
@//   Zr = Xr*W[0] - Xi*W[1],  Zi = Xi*W[0] + Xr*W[1]
VMUL dZr1,dXr1,dW1[0]
VMUL dZi1,dXi1,dW1[0]
VMUL dZr2,dXr2,dW2[0]
VMUL dZi2,dXi2,dW2[0]
VMUL dZr3,dXr3,dW3[0]
VMUL dZi3,dXi3,dW3[0]
VMLS dZr1,dXi1,dW1[1] @// real part
VMLA dZi1,dXr1,dW1[1] @// imag part
@// data[1] for next iteration
VLD2 {dXr1,dXi1},[pSrc],pointStep
VMLS dZr2,dXi2,dW2[1] @// real part
VMLA dZi2,dXr2,dW2[1] @// imag part
@// data[2] for next iteration
VLD2 {dXr2,dXi2},[pSrc],pointStep
VMLS dZr3,dXi3,dW3[1] @// real part
VMLA dZi3,dXr3,dW3[1] @// imag part
.endif
@// data[3] for next iteration; the post-index returns pSrc to data[0].
@// NOTE(review): on the last pass of a group this load and the two before
@// it fetch values that are never used, and this one reads 16 bytes at
@// group start + 4*pointStep, the first address after the current group -
@// confirm the source buffer tolerates that for the final group.
VLD2 {dXr3,dXi3},[pSrc],setStep
@// First butterfly level (X0 is unscaled, Z1..Z3 are the products above):
@//   Y0 = X0 + Z2,  Y2 = X0 - Z2
VADD qY0,qX0,qZ2
VSUB qY2,qX0,qZ2
@// data[0] for next iteration; the address carries a 128-bit alignment
@// hint and is advanced by 16 bytes through writeback
VLD2 {dXr0,dXi0},[pSrc :128]!
@//   Y1 = Z1 + Z3,  Y3 = Z1 - Z3
VADD qY1,qZ1,qZ3
VSUB qY3,qZ1,qZ3
@// Second butterfly level:
@//   Z0 = Y2 - Y1
@//   Z2 = Y2 + Y1
@//   one output = (Yr0 + Yi3, Yi0 - Yr3), i.e. Y0 - j*Y3
@//   one output = (Yr0 - Yi3, Yi0 + Yr3), i.e. Y0 + j*Y3
@// NOTE(review): as written the first stored value equals
@// X0 - Z1 - Z2 - Z3; this is a textbook radix-4 output only if the Zk are
@// the negated twiddled inputs - confirm against the sign convention of
@// the twiddle table supplied by the caller.
VSUB qZ0,qY2,qY1
.ifeqs "\inverse", "TRUE"
@// Store order Z0, Z3 (Y0 - j*Y3), Z2, Z1 (Y0 + j*Y3); arithmetic and
@// stores are interleaved.
VADD dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr3,dZi3},[pDst :128],outPointStep
VSUB dZr1,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VADD dZi1,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst :128],dstStep
.else
@// Store order Z0, Z1 (Y0 + j*Y3), Z2, Z3 (Y0 - j*Y3).
VSUB dZr1,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst :128],outPointStep
VADD dZi1,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr1,dZi1},[pDst :128],outPointStep
VADD dZr3,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst :128],outPointStep
VSUB dZi3,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst :128],dstStep
.endif
@// Step to data[1] of the set after the one just preloaded
ADD pSrc,pSrc,pointStep
BGT radix4SetLoop\name
@// Reload the three twiddles for the next group from pTwiddle,
@// pTwiddle + stepTwiddle and pTwiddle + 2*stepTwiddle.
VLD1 dW1,[pTwiddle :64],stepTwiddle @//[wi | wr]
@// Subtract 4 because grpCount was pre-multiplied by 4; the flags feed the
@// BGT below and are not touched by the instructions in between.
SUBS grpCount,grpCount,#4
VLD1 dW2,[pTwiddle :64],stepTwiddle @//[wi | wr]
@// Move pSrc to data[0] of the next group
ADD pSrc,pSrc,srcStep
@// The -3*stepTwiddle post-index leaves pTwiddle one stepTwiddle below the
@// address used by the dW1 load above.
VLD1 dW3,[pTwiddle :64],twStep @//[wi | wr]
BGT radix4GrpLoop\name
@// Exchange pSrc and pDst for the next stage and rewind both:
@//   new pDst = pSrc - 4*outPointStep
@//   new pSrc = old pDst - outPointStep
@// t1 is r3, which is free now that grpCount is no longer needed.
MOV t1,pDst
SUB pDst,pSrc,outPointStep,LSL #2
SUB pSrc,t1,outPointStep
.endm
@// Forward entry point: one FFTSTAGE expansion with inverse = "FALSE" and
@// loop-label suffix FWD, placed between M_START and M_END.  Arguments are
@// taken from and results are left in the registers listed in the FFTSTAGE
@// header rather than in the usual argument registers.
@// M_START and M_END are not defined in this file (presumably they come
@// from dl/api/armCOMM_s.h - confirm).
@// NOTE(review): only r4 is named in M_START while the expansion also
@// writes r5-r12, r14 and D8-D15; presumably the "unsafe" callers preserve
@// those themselves - confirm.
M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
FFTSTAGE "FALSE","FALSE",FWD
M_END
@// Inverse entry point: one FFTSTAGE expansion with inverse = "TRUE"
@// (conjugate twiddle multiply, Z0,Z3,Z2,Z1 store order) and loop-label
@// suffix INV, placed between M_START and M_END.  The register interface
@// is the one listed in the FFTSTAGE header.
@// M_START and M_END are not defined in this file (presumably they come
@// from dl/api/armCOMM_s.h - confirm).
@// NOTE(review): only r4 is named in M_START while the expansion also
@// writes r5-r12, r14 and D8-D15; presumably the "unsafe" callers preserve
@// those themselves - confirm.
M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
FFTSTAGE "FALSE","TRUE",INV
M_END
.end