// blob: 03969be325e3b6f477e32cb95cbeeff05dd8df73 [file] [log] [blame]
//
// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
//
// This is a modification of armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.s
// to support float (FC32) instead of SC32.
//
//
// Description:
// Compute a first stage Radix 8 FFT stage for a N point complex signal
//
//
// Include standard headers
#include "dl/api/arm/arm64COMM_s.h"
#include "dl/api/arm/omxtypes_s.h"
// Import symbols required from other files
// (For example tables)
// Set debugging level
//DEBUG_ON SETL {TRUE}
// Guarding implementation by the processor name
// Input registers (the first five integer argument registers, x0-x4).
// pSrc and pDst are advanced with post-indexed ld2/st2 inside FFTSTAGE.
#define pSrc x0
#define pDst x1
// pTwiddle is named here but is not referenced by FFTSTAGE in this file.
#define pTwiddle x2
// pSubFFTNum / pSubFFTSize are pointers: FFTSTAGE loads through them on
// entry and stores the updated values back through them on exit.
#define pSubFFTNum x3
#define pSubFFTSize x4
// Output registers: none are declared.
// Local scratch registers
#define subFFTNum x5
#define subFFTSize x6
#define grpSize x7
// setCount shares x7 with grpSize: the same register is first used to derive
// the byte steps and then counted down as the loop counter.
#define setCount x7
// pointStep and outPointStep share x8. Only pointStep is referenced by
// FFTSTAGE in this file.
#define pointStep x8
#define outPointStep x8
#define setStep x9
#define step1 x10
#define step2 x11
// 32-bit scratch used to build the bit pattern of the float constant
// sqrt(1/2) before it is moved into a NEON lane.
#define t0 w12
// Neon registers
//
// Naming: X = loaded input points, U / V = results of the first / second
// butterfly stage, Y = outputs. The suffix r / i selects the real or imaginary
// part and the digit is the point index 0..7. All live values are 2-lane
// single-precision vectors (.2s).
//
// The U, V and Y names deliberately overlap on v16-v31: every stage writes
// into registers whose previous contents are already dead, so instruction
// order inside FFTSTAGE must not be changed without re-checking this table.

// Input points X0..X7 occupy v0-v15. X4..X7 live in v8-v15, whose low
// 64 bits are callee-saved under AAPCS64, so the function wrapper has to
// preserve them.
#define dXr0 v0.2s
#define dXi0 v1.2s
#define dXr1 v2.2s
#define dXi1 v3.2s
#define dXr2 v4.2s
#define dXi2 v5.2s
#define dXr3 v6.2s
#define dXi3 v7.2s
#define dXr4 v8.2s
#define dXi4 v9.2s
#define dXr5 v10.2s
#define dXi5 v11.2s
#define dXr6 v12.2s
#define dXi6 v13.2s
#define dXr7 v14.2s
#define dXi7 v15.2s
// The q* (4-lane) aliases below and further down are only mentioned in
// commented-out instructions inside FFTSTAGE; no live instruction uses them.
// Their numbering does not line up with the d* aliases of the same index
// (for example qX1 is v1 while dXr1/dXi1 are v2/v3).
#define qX0 v0.4s
#define qX1 v1.4s
#define qX2 v2.4s
#define qX3 v3.4s
#define qX4 v4.4s
#define qX5 v5.4s
#define qX6 v6.4s
#define qX7 v7.4s
// First-stage results: even indices in v16-v23, odd indices in v24-v31.
#define dUr0 v16.2s
#define dUi0 v17.2s
#define dUr2 v18.2s
#define dUi2 v19.2s
#define dUr4 v20.2s
#define dUi4 v21.2s
#define dUr6 v22.2s
#define dUi6 v23.2s
#define dUr1 v24.2s
#define dUi1 v25.2s
#define dUr3 v26.2s
#define dUi3 v27.2s
#define dUr5 v28.2s
#define dUi5 v29.2s
// NOTE: in this file dUr7/dUi7 have their own registers (v30/v31); they do
// not alias dXr7/dXi7 (v14/v15).
#define dUr7 v30.2s
#define dUi7 v31.2s
#define qU0 v8.4s
#define qU1 v12.4s
#define qU2 v9.4s
#define qU3 v13.4s
#define qU4 v10.4s
#define qU5 v14.4s
#define qU6 v11.4s
#define qU7 v15.4s
// Second-stage results: even indices in v24-v31 (overlapping the odd U
// registers), odd indices in v16-v23 (overlapping the even U registers).
#define dVr0 v24.2s
#define dVi0 v25.2s
#define dVr2 v26.2s
#define dVi2 v27.2s
#define dVr4 v28.2s
#define dVi4 v29.2s
#define dVr6 v30.2s
#define dVi6 v31.2s
#define dVr1 v16.2s
#define dVi1 v17.2s
#define dVr3 v18.2s
#define dVi3 v19.2s
#define dVr5 v20.2s
#define dVi5 v21.2s
#define dVr7 v22.2s
#define dVi7 v23.2s
#define qV0 v12.4s
#define qV1 v8.4s
#define qV2 v13.4s
#define qV3 v9.4s
#define qV4 v14.4s
#define qV5 v10.4s
#define qV6 v15.4s
#define qV7 v11.4s
// Outputs: even indices in v16-v23, odd indices in v24-v31.
#define dYr0 v16.2s
#define dYi0 v17.2s
#define dYr2 v18.2s
#define dYi2 v19.2s
#define dYr4 v20.2s
#define dYi4 v21.2s
#define dYr6 v22.2s
#define dYi6 v23.2s
#define dYr1 v24.2s
#define dYi1 v25.2s
#define dYr3 v26.2s
#define dYi3 v27.2s
#define dYr5 v28.2s
#define dYi5 v29.2s
#define dYr7 v30.2s
#define dYi7 v31.2s
#define qY0 v8.4s
#define qY1 v12.4s
#define qY2 v9.4s
#define qY3 v13.4s
#define qY4 v10.4s
#define qY5 v14.4s
#define qY6 v11.4s
#define qY7 v15.4s
// Temporaries. dT0 (v14) and dT1 (v15) share registers with dXr7 / dXi7:
// FFTSTAGE writes them only after the last read of X7 in an iteration and
// reloads X7 only after the last read of dT1.
// dT0s is the same register as dT0 written without a lane count so that a
// single lane can be addressed as dT0s[0].
#define dT0 v14.2s
#define dT0s v14.s
#define dT1 v15.2s
// FFTSTAGE scaled, inverse, name
//
// Emits the first-stage radix-8 butterfly pass over complex float data.
// All twiddle factors are 1 in this stage; the only constant needed is
// sqrt(1/2).
//
// Macro arguments:
//   scaled   not referenced anywhere in the macro body.
//   inverse  the string "TRUE" selects the inverse-transform butterflies;
//            any other value selects the forward-transform code path.
//   name     suffix appended to the local labels so that the macro can be
//            expanded more than once in one file.
//
// Registers on entry:
//   pSrc         input pointer, advanced with post-indexed loads
//   pDst         output pointer, advanced with post-indexed stores
//   pSubFFTNum   pointer; the value read through it gives the set count
//   pSubFFTSize  pointer; the value read through it is not used
//
// On exit *pSubFFTNum holds the incoming value divided by 8 and *pSubFFTSize
// holds 8. pSrc and pDst have been advanced.
//
// Every loop iteration handles two sets at once: each ld2/st2 moves two
// interleaved complex values (16 bytes), and the counter is stepped by 2.
// NOTE(review): the 8-byte element size is inferred from the shift by 3 used
// for pointStep and from the 2 x .2s ld2 operands - confirm against callers.
.macro FFTSTAGE scaled, inverse, name
// Define stack arguments
// Move args values into our work registers
ldr subFFTNum, [pSubFFTNum]
// The value loaded into subFFTSize here is overwritten by the MOV below
// before anything reads it.
ldr subFFTSize, [pSubFFTSize]
// Update pSubFFTSize and pSubFFTNum regs
// subFFTSize = 1 for the first stage (on input; 8 is written back on exit)
// Build the 32-bit pattern 0x3f3504f3 in t0; it is moved into lane 0 of dT0
// inside the loop.
movz t0, 0x3f35, lsl #16 // High half word of sqrt(1/2).
movk t0, 0x04f3 // Low half word of sqrt(1/2).
MOV subFFTSize,#8
// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
LSR grpSize,subFFTNum,#3
MOV subFFTNum,grpSize
// pT0+1 increments pT0 by 8 bytes
// pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
// Note: outPointStep = pointStep for firststage
// pointStep = grpSize * 8: byte distance between consecutive input points
// of the same set.
lsl pointStep,grpSize, #3
// Calculate the step of input data for the next set
//MOV step1,pointStep,LSL #1 // step1 = 2*pointStep
// Prologue: load X0..X7 for the first iteration, interleaved with the
// computation of the step values.
ld2 {dXr0,dXi0},[pSrc],pointStep // data[0]
// step1 = grpSize * 16 = 2*pointStep: output stride between y(k) and y(k+2).
lsl step1,grpSize, #4
lsl step2,pointStep, #3
ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
SUB step2,step2,pointStep // step2 = 7*pointStep
// setStep = - 7*pointStep+16
// rsb is not provided by this file.
// NOTE(review): presumably a helper macro from arm64COMM_s.h - confirm.
rsb setStep,step2,#16
ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
ld2 {dXr3,dXi3},[pSrc],pointStep // data[3]
ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
// data[7] & update pSrc for the next set
// setStep = -7*pointStep + 16
// After this load pSrc points at data[0] of the next pair of sets.
ld2 {dXr7,dXi7},[pSrc],setStep
// grp = 0 a special case since all the twiddle factors are 1
// Loop on the sets
radix8fsGrpZeroSetLoop\name :
// Decrement setcount
// The flags set here are consumed twice further down: by the BEQ that skips
// the final data[7] load and by the BGT that closes the loop. No instruction
// in between writes the flags, and none may be added.
SUBS setCount,setCount,#2
// finish first stage of 8 point FFT
// fadd qU0,qX0,qX4
// fadd qU2,qX1,qX5
// fadd qU4,qX2,qX6
// fadd qU6,qX3,qX7
// Even U terms: U0 = X0+X4, U2 = X1+X5, U4 = X2+X6, U6 = X3+X7.
fadd dUr0,dXr0,dXr4
fadd dUr2,dXr1,dXr5
fadd dUr4,dXr2,dXr6
fadd dUr6,dXr3,dXr7
fadd dUi0,dXi0,dXi4
fadd dUi2,dXi1,dXi5
fadd dUi4,dXi2,dXi6
fadd dUi6,dXi3,dXi7
// finish second stage of 8 point FFT
// fadd qV0,qU0,qU4
// fsub qV2,qU0,qU4
// fadd qV4,qU2,qU6
// fsub qV6,qU2,qU6
// Even V terms: V0 = U0+U4, V2 = U0-U4, V4 = U2+U6, V6 = U2-U6.
fadd dVr0,dUr0,dUr4
fsub dVr2,dUr0,dUr4
fadd dVr4,dUr2,dUr6
fsub dVr6,dUr2,dUr6
fadd dVi0,dUi0,dUi4
fsub dVi2,dUi0,dUi4
fadd dVi4,dUi2,dUi6
fsub dVi6,dUi2,dUi6
// finish third stage of 8 point FFT
// fadd qY0,qV0,qV4
// fsub qY4,qV0,qV4
// Y0 = V0+V4 and Y4 = V0-V4 are the same for both directions.
fadd dYr0,dVr0,dVr4
fsub dYr4,dVr0,dVr4
fadd dYi0,dVi0,dVi4
fsub dYi4,dVi0,dVi4
st2 {dYr0,dYi0},[pDst],step1 // store y0
// Both branches below compute the same two values,
//   "Y2" = (Vr2 - Vi6, Vi2 + Vr6)  i.e. V2 + j*V6
//   "Y6" = (Vr2 + Vi6, Vi2 - Vr6)  i.e. V2 - j*V6
// and differ only in which of them is written to output slot 2 and which
// to slot 6. The odd U differences (U1, U3, U5) are computed in the gaps
// between the stores.
.ifeqs "\inverse", "TRUE"
fsub dYr2,dVr2,dVi6
fadd dYi2,dVi2,dVr6
fadd dYr6,dVr2,dVi6
st2 {dYr2,dYi2},[pDst],step1 // store y2
fsub dYi6,dVi2,dVr6
// fsub qU1,qX0,qX4
fsub dUr1,dXr0,dXr4
fsub dUi1,dXi0,dXi4
st2 {dYr4,dYi4},[pDst],step1 // store y4
// fsub qU3,qX1,qX5
// fsub qU5,qX2,qX6
fsub dUr3,dXr1,dXr5
fsub dUr5,dXr2,dXr6
fsub dUi3,dXi1,dXi5
fsub dUi5,dXi2,dXi6
st2 {dYr6,dYi6},[pDst],step1 // store y6
.else
fadd dYr6,dVr2,dVi6
fsub dYi6,dVi2,dVr6
fsub dYr2,dVr2,dVi6
// Forward direction: the registers named dY6 go to output slot 2 ...
st2 {dYr6,dYi6},[pDst],step1 // store y2
fadd dYi2,dVi2,dVr6
// fsub qU1,qX0,qX4
fsub dUr1,dXr0,dXr4
fsub dUi1,dXi0,dXi4
st2 {dYr4,dYi4},[pDst],step1 // store y4
// fsub qU3,qX1,qX5
// fsub qU5,qX2,qX6
fsub dUr3,dXr1,dXr5
fsub dUr5,dXr2,dXr6
fsub dUi3,dXi1,dXi5
fsub dUi5,dXi2,dXi6
// ... and the registers named dY2 go to output slot 6.
st2 {dYr2,dYi2},[pDst],step1 // store y6
.endif
// finish first stage of 8 point FFT
// fsub qU7,qX3,qX7
fsub dUr7,dXr3,dXr7
fsub dUi7,dXi3,dXi7
// X7 (v14/v15) is dead from here until it is reloaded, so v14 can take the
// sqrt(1/2) constant in lane 0. It has to be rewritten on every iteration
// because the data[7] load overwrites v14.
mov dT0s[0], t0
// finish second stage of 8 point FFT
// Odd V terms: V1 = U1 + j*U5, V3 = U1 - j*U5, V5 = U3 + j*U7,
// V7 = U3 - j*U7. Loads of X0..X3 for the NEXT iteration are interleaved;
// X0..X3 of the current iteration are no longer needed at this point.
fsub dVr1,dUr1,dUi5
// data[0] for next iteration
ld2 {dXr0,dXi0},[pSrc],pointStep
fadd dVi1,dUi1,dUr5
fadd dVr3,dUr1,dUi5
ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
fsub dVi3,dUi1,dUr5
fsub dVr5,dUr3,dUi7
ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
fadd dVi5,dUi3,dUr7
fadd dVr7,dUr3,dUi7
ld2 {dXr3,dXi3},[pSrc],pointStep // data[3]
fsub dVi7,dUi3,dUr7
// finish third stage of 8 point FFT
// With s = sqrt(1/2) both branches replace
//   V5 by (s*Vr5 - s*Vi5, s*Vr5 + s*Vi5)  i.e. s*(1+j)*V5
//   V7 by (s*Vr7 + s*Vi7, s*Vi7 - s*Vr7)  i.e. s*(1-j)*V7
// and then form V1 +/- V5 and V3 +/- V7. The branches differ in instruction
// scheduling and in which result is stored to which odd output slot.
.ifeqs "\inverse", "TRUE"
// calculate a*v5
// dT1 is v15 (see the register aliases), holding s*Vr5 here.
fmul dT1,dVr5,dT0[0] // use dVi0 for dT1
ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
fmul dVi5,dVi5,dT0[0]
ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
fsub dVr5,dT1,dVi5 // a * V5
fadd dVi5,dT1,dVi5
ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
// calculate b*v7
fmul dT1,dVr7,dT0[0]
fmul dVi7,dVi7,dT0[0]
// fadd qY1,qV1,qV5
// fsub qY5,qV1,qV5
fadd dYr1,dVr1,dVr5
fsub dYr5,dVr1,dVr5
fadd dYi1,dVi1,dVi5
fsub dYi5,dVi1,dVi5
fadd dVr7,dT1,dVi7 // b * V7
fsub dVi7,dVi7,dT1
// The four even stores advanced pDst by 4*step1 = 8*pointStep; stepping
// back by 7*pointStep lands on the y1 slot.
SUB pDst, pDst, step2 // set pDst to y1
// On the last iteration, this will read past the end of pSrc,
// so skip this read.
// Uses the flags from the SUBS at the top of the loop (Z set when setCount
// reached exactly 0). dT1 (v15) is dead by now, so v14/v15 can be reloaded.
BEQ radix8SkipLastUpdateInv\name
ld2 {dXr7,dXi7},[pSrc],setStep // data[7]
radix8SkipLastUpdateInv\name:
fsub dYr3,dVr3,dVr7
fsub dYi3,dVi3,dVi7
st2 {dYr1,dYi1},[pDst],step1 // store y1
fadd dYr7,dVr3,dVr7
fadd dYi7,dVi3,dVi7
st2 {dYr3,dYi3},[pDst],step1 // store y3
st2 {dYr5,dYi5},[pDst],step1 // store y5
st2 {dYr7,dYi7},[pDst] // store y7
// Same net effect as the post-indexed #16 store used in the forward branch.
ADD pDst, pDst, #16
.else
// calculate b*v7
fmul dT1,dVr7,dT0[0]
ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
fmul dVi7,dVi7,dT0[0]
ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
fadd dVr7,dT1,dVi7 // b * V7
fsub dVi7,dVi7,dT1
ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
// calculate a*v5
// dT1 is v15 (see the register aliases), holding s*Vr5 here.
fmul dT1,dVr5,dT0[0] // use dVi0 for dT1
fmul dVi5,dVi5,dT0[0]
fadd dYr7,dVr3,dVr7
fadd dYi7,dVi3,dVi7
// The four even stores advanced pDst by 4*step1 = 8*pointStep; stepping
// back by 7*pointStep lands on the y1 slot.
SUB pDst, pDst, step2 // set pDst to y1
fsub dVr5,dT1,dVi5 // a * V5
fadd dVi5,dT1,dVi5
// On the last iteration, this will read past the end of pSrc,
// so skip this read.
// Uses the flags from the SUBS at the top of the loop (Z set when setCount
// reached exactly 0). dT1 (v15) is dead by now, so v14/v15 can be reloaded.
BEQ radix8SkipLastUpdateFwd\name
ld2 {dXr7,dXi7},[pSrc],setStep // data[7]
radix8SkipLastUpdateFwd\name:
// fsub qY5,qV1,qV5
fsub dYr5,dVr1,dVr5
fsub dYi5,dVi1,dVi5
fsub dYr3,dVr3,dVr7
// Forward direction: the odd outputs are written in mirrored order, i.e. the
// registers named dY7, dY5, dY3, dY1 go to slots 1, 3, 5, 7.
st2 {dYr7,dYi7},[pDst],step1 // store y1
fsub dYi3,dVi3,dVi7
// fadd qY1,qV1,qV5
fadd dYr1,dVr1,dVr5
fadd dYi1,dVi1,dVi5
st2 {dYr5,dYi5},[pDst],step1 // store y3
st2 {dYr3,dYi3},[pDst],step1 // store y5
st2 {dYr1,dYi1},[pDst],#16 // store y7
.endif
// update pDst for the next set
// pDst is at the y7 slot + 16 here; stepping back by 7*pointStep leaves it
// 16 bytes past where this iteration started, i.e. at y0 of the next pair
// of sets.
SUB pDst, pDst, step2
// Still the flags from the SUBS at the top: loop while setCount > 0.
// NOTE(review): if setCount is odd on entry the counter skips 0 and goes
// negative; the loop still ends here but the BEQ above is never taken, so
// the final data[7] load is not skipped. Confirm callers guarantee an even
// set count.
BGT radix8fsGrpZeroSetLoop\name
// Save subFFTNum and subFFTSize for next stage
str subFFTNum, [pSubFFTNum]
str subFFTSize, [pSubFFTSize]
.endm
// Forward transform entry point: the function body is one expansion of
// FFTSTAGE with inverse = "FALSE". FWD is the label suffix that keeps the
// local labels of this expansion distinct from those of the inverse one.
// M_START / M_END are not defined in this file.
// NOTE(review): the trailing d15 argument presumably asks M_START to
// preserve the callee-saved NEON registers up to d15 (FFTSTAGE writes
// v8-v15) - confirm against arm64COMM_s.h.
// Allocate stack memory required by the function
M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace,,d15
FFTSTAGE "FALSE","FALSE",FWD
M_END
// Inverse transform entry point: one expansion of FFTSTAGE with
// inverse = "TRUE", which selects the inverse butterflies and output order.
// INV is the label suffix for this expansion.
// NOTE(review): as for the forward entry point, the d15 argument presumably
// requests preservation of the callee-saved NEON registers that FFTSTAGE
// writes (v8-v15) - confirm against arm64COMM_s.h.
M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace,,d15
FFTSTAGE "FALSE","TRUE",INV
M_END
.end