@// blob: dd1690ad10ba9160ef9a064da2049135e60fdf3e [file] [log] [blame]
@//
@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@// Use of this source code is governed by a BSD-style license
@// that can be found in the LICENSE file in the root of the source
@// tree. An additional intellectual property rights grant can be found
@// in the file PATENTS. All contributing project authors may
@// be found in the AUTHORS file in the root of the source tree.
@//
@// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
@// to support float instead of SC32.
@//
@//
@// Description:
@// Compute FFT for a real signal
@//
@//
@// Include standard headers
#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"
@// M_VARIANTS ARM1136JS
@// Import symbols required from other files
@// (For example tables)
.extern armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
.extern armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
.extern armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
.extern armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
@// Set debugging level
@//DEBUG_ON SETL {TRUE}
@// Guarding implementation by the processor name
@// IF ARM1136JS
@//Input Registers
@// Register aliases used by omxSP_FFTFwd_RToCCS_F32_Sfs_vfp.
@// These are plain preprocessor substitutions, so several names below map
@// to the same physical register (for example r6 is subFFTNum, N, step1
@// and t4; r14 is order, pTwiddleTmp and t1; s2/s3 are y0r/y0i, w1r/w1i
@// and y1r/y1i). Two aliases of one register must not be live together.
@// NOTE(review): scaleMinusOne, rnd, zero, Zero, argScale, diff, count,
@// diffMinusOne, round and t1-t5 are not referenced by the code in this
@// file; presumably left over from the S32 version this file was derived
@// from. Confirm before removing.
@//
@// Input registers (hold the three arguments on entry)
#define pSrc r0
#define pDst r1
#define pFFTSpec r2
@// Output registers
@// result carries the status code written just before the function tail.
#define result r0
@//Local Scratch Registers
@// N=1 case
#define scaleMinusOne r2
#define rnd r2
#define zero r8
#define Zero r9
@// Pointers handed to the sub-FFT stage routines and reused by the final
@// complex-to-real fixup. argTwiddle aliases pDst (r1) and argDst aliases
@// pFFTSpec (r2), so the originals are consumed before these are set.
#define argTwiddle r1
#define argDst r2
#define argScale r4
@// Values loaded from the FFTSpec structure
#define pTwiddle r4
#define pOut r5
@// FFT bookkeeping: sizes, counts and log2 of the complex FFT length
#define subFFTSize r7
#define subFFTNum r6
#define N r6
#define order r14
#define diff r9
#define count r8
#define diffMinusOne r10
#define round r3
@// Byte offsets used while walking the data and the twiddle table
#define step r3
#define step1 r6
#define twStep r12
#define pTwiddleTmp r14
@// General temporaries; only t0 is used, for the twiddle pointer swap
#define t0 r12
#define t1 r14 /*@// pTwiddleTmp*/
#define t2 r0
#define t3 r1 /*@// pSrc,argTwiddle*/
#define t4 r6
#define t5 r7 /*@// step1,subFFTSize*/
@// VFP single-precision registers.
@// x0/x1 hold the two complex inputs of a butterfly, w0/w1 the two
@// twiddle factors, y0/y1 the results; y0, w1 and y1 all share s2/s3.
#define x0r s0
#define x0i s1
#define y0r s2
#define y0i s3
#define x1r s4
#define x1i s5
#define w1r s2
#define w1i s3
#define w0r s6
#define w0i s7
#define y1r s2 /*@// w1r,w1i*/
#define y1i s3
@// Intermediate sums, differences and products of the butterfly
#define st0 s8
#define st1 s9
#define st2 s10
#define st3 s11
#define st4 s12
#define st5 s13
@// Holds the constant 0.5 for the whole function
#define half s15
@// Allocate stack memory required by the function
@// Write function header
M_START omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11
        @//
        @// omxSP_FFTFwd_RToCCS_F32_Sfs_vfp
        @//
        @// Forward FFT of a real float signal. The real input of length N is
        @// treated as N/2 complex values, an N/2 point complex FFT is run
        @// on it with the radix 2/4/8 stage routines, and a final
        @// complex-to-real fixup pass produces the bins F(0) .. F(N/2).
        @//
        @// In:   r0 = pSrc      input samples
        @//       r1 = pDst      output buffer
        @//       r2 = pFFTSpec  spec structure; N, pTwiddle and pBuf are read
        @//                      from offsets 0, 8 and 12
        @// Out:  r0 = OMX_Sts_NoErr on every path
        @//
        @// For N <= 1 nothing is read from pSrc or written to pDst; the
        @// function only returns OMX_Sts_NoErr.
        @//
        @// The register protocol of the stage routines is not visible here.
        @// The code below assumes that on return from them pSrc points at
        @// the complex FFT result, argDst at the buffer to write next,
        @// argTwiddle at the twiddle table, subFFTSize equals N/2, and that
        @// s15 (half) is preserved - TODO confirm against those routines.

        @// Structure offsets for FFTSpec
        .set ARMsFFTSpec_N, 0
        .set ARMsFFTSpec_pBitRev, 4
        .set ARMsFFTSpec_pTwiddle, 8
        .set ARMsFFTSpec_pBuf, 12

        @// Define stack arguments

        @// Setup half value: 0x3f000000 is the bit pattern of 0.5f
        movw N, #0                      @// Use N as a temp.
        movt N, #0x3f00
        vmov.f32 half, N

        @// Read the size from structure and take log
        LDR N, [pFFTSpec, #ARMsFFTSpec_N]

        @// Read other structure parameters
        LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        @// N=1 Treat separately
        CMP N,#1
        BGT sizeGreaterThanOne
        @// N<=1 is not supported: no output is produced, but the status
        @// returned is still OMX_Sts_NoErr.
        @// Set return value
        MOV result, #OMX_Sts_NoErr
        B FunctionEnd

sizeGreaterThanOne:
        @// Do a N/2 point complex FFT including the scaling
        MOV N,N,ASR #1                  @// N/2 point complex FFT
        CLZ order,N                     @// N = 2^order
        RSB order,order,#31
        MOV subFFTSize,#1
        @//MOV subFFTNum,N

        CMP order,#1
        BGT orderGreaterthan1           @// order > 1

        @// order == 0 (one complex point): the FFT is the identity, so the
        @// single element is copied to the scratch buffer and the fixup
        @// reads it from there and writes to pDst.
        vldmlt.f32 pSrc, {x0r, x0i}
        vstmlt.f32 pOut, {x0r, x0i}
        MOVLT pSrc,pOut
        MOVLT argDst,pDst
        BLT FFTEnd

        @// order == 1: a single radix-2 first stage
        MOV argDst,pOut                 @// Set input args to fft stages
        MOV pOut,pDst                   @// Set input args to fft stages
        MOV argTwiddle,pTwiddle
        BL armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
        B finalComplexToRealFixup

orderGreaterthan1:
        @// Choose the buffers from bit 1 of order before starting the
        @// out-of-place stages.
        TST order, #2                   @// Set input args to fft stages
        MOVEQ argDst,pDst
        MOVNE argDst,pOut
        MOVNE pOut,pDst                 @// Pass the first stage dest in RN5
        MOV argTwiddle,pTwiddle

        @//check for even or odd order
        @// NOTE: The following combination of BL's would work fine
        @// even though the first BL would corrupt the flags. This is
        @// because the end of the "grpZeroSetLoop" loop inside
        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
        @// the Z flag to EQ
        TST order,#0x00000001
        BLEQ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
        BLNE armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp

        @// Run radix-4 stages until subFFTNum reaches 1
unscaledRadix4Loop:
        CMP subFFTNum,#1
        BEQ FFTEnd
        BL armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
        B unscaledRadix4Loop

FFTEnd:
finalComplexToRealFixup:
        @// step = N/2 * 8 bytes
        MOV step,subFFTSize,LSL #3
        @// twStep = 3N/8 * 8 bytes pointing to W^1
        SUB twStep,step,subFFTSize,LSL #1
        @// step1 = N/4 * 8 = N/2*4 bytes
        MOV step1,subFFTSize,LSL #2
        @// (N/4-1)*8 bytes
        SUB step1,step1,#8

        @// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
        @// 1/2 [(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
        @// 1/2 [2a+j0] - j [0+j2b]
        @// (a+b, 0)
        @// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
        @// 1/2 [(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
        @// 1/2 [2a+j0] + j [0+j2b]
        @// (a-b, 0)
        @// F(0) and F(N/2)
        vldm.f32 pSrc!, {x0r, x0i}
        vadd.f32 y0r,x0r,x0i            @// F(0) = (Z0.r+Z0.i , 0)
        vsub.f32 x0r,x0r,x0i            @// F(N/2) = (Z0.r-Z0.i , 0)
        @// Both imaginary parts are exactly zero. They are formed as
        @// half - half rather than reg - reg: y0i (s3) has not been
        @// written by this function yet, and x - x is NaN, not 0.0, when
        @// the stale value is NaN or infinite.
        vsub.f32 y0i, half, half        @// Im F(0) = 0.0
        vsub.f32 x0i, half, half        @// Im F(N/2) = 0.0
        add argDst, step
        vstm.f32 argDst, {x0r, x0i}     @// {x0r,x0i}->[argDst, step]
        sub argDst, step
        vstm.f32 argDst!, {y0r, y0i}

        @// The flags set here are consumed by BLT/BEQ below; the two ADDs
        @// in between do not touch them.
        SUBS subFFTSize,subFFTSize,#2
        ADD pTwiddleTmp,argTwiddle,#8   @// W^2
        ADD argTwiddle,argTwiddle,twStep @// W^1
        BLT End                         @// N/2 == 1: only F(0) and F(N/2)
        BEQ lastElement                 @// N/2 == 2: only the middle bin left

        @// F(k) = 1/2 [Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
        @// Process 2 elements at a time. E.g: F(1) and F(N/2-1) since
        @// both of them require Z(1) and Z(N/2-1)
        ASR subFFTSize,subFFTSize,#1

evenOddButterflyLoop:
        SUB step,step,#16               @// (N/2-2)*8 bytes
        add pSrc, step
        vldm.f32 pSrc, {x1r, x1i}       @// {x1r, x1i} = [pSrc, step]
        sub pSrc, step
        vldm.f32 pSrc!, {x0r, x0i}
        add argTwiddle, step1
        vldm.f32 argTwiddle, {w1r, w1i} @// {w1r, w1i} = [argTwiddle, step1]
        sub argTwiddle, step1
        vldm.f32 argTwiddle!, {w0r, w0i} @// {w0r, w0i} = [argTwiddle], #8
        SUB step1,step1,#8

        @// Loop counter update; the result is tested by the BGT at the
        @// bottom. No instruction in between writes the APSR flags.
        SUBS subFFTSize,subFFTSize,#1

        vsub.f32 st2,x0r,x1r            @// a-c
        vadd.f32 st3,x0i,x1i            @// b+d
        vadd.f32 st0,x0r,x1r            @// a+c
        vsub.f32 st1,x0i,x1i            @// b-d

        vmul.f32 x1r,w1r,st2
        vmul.f32 x1i,w1r,st3
        vmla.f32 x1r,w1i,st3            @// x1r = w1r*st2 + w1i*st3
        @//RSB x1r,x1r,#0
        vmls.f32 x1i,w1i,st2            @// x1i = w1r*st3 - w1i*st2

        @// y1 overwrites w1 (same registers), which is no longer needed
        vsub.f32 y1r, st0, x1i
        vadd.f32 y1i, x1r, st1
        vneg.f32 y1i, y1i

        vmul.f32 x0r,w0r,st2
        vmul.f32 x0i,w0r,st3
        vmls.f32 x0r,w0i,st3            @// x0r = w0r*st2 - w0i*st3
        vmla.f32 x0i,w0i,st2            @// x0i = w0r*st3 + w0i*st2

        vsub.f32 st4,st0,x0i            @// F(1)
        vadd.f32 st5,x0r,st1

        @// Apply the 1/2 factor to both outputs
        vmul.f32 y1r, half
        vmul.f32 y1i, half
        vmul.f32 st4, half
        vmul.f32 st5, half

        add argDst, step
        vstm.f32 argDst, {y1r, y1i}     @// {y1r,y1i} -> [argDst,step]
        sub argDst, step
        vstm.f32 argDst!, {st4, st5}

        MOV t0,argTwiddle               @// swap ptr for even and odd twiddles
        MOV argTwiddle,pTwiddleTmp
        MOV pTwiddleTmp,t0
        BGT evenOddButterflyLoop

        @// Last element can be expanded as follows
        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
        @// (a-bc, -bd)
        @// The code below stores the complex conjugate of the loaded
        @// element, i.e. (a, -b).
lastElement:
        vldm.f32 pSrc, {x0r, x0i}
        vneg.f32 x0i, x0i
        vstm.f32 argDst, {x0r, x0i}

End:
        @// Set return value
        MOV result, #OMX_Sts_NoErr

FunctionEnd:
        @// Write function tail
M_END
@// ENDIF @//ARM1136JS
@// Guarding implementation by the processor name
.end