Implementation of real value 16 bit FFT with 16 bit complex FFT routines, for ARM Neon platforms.
Verified with SNR testing code in Openmax folder.
R=aedla@chromium.org, rtoy@google.com
Review URL: https://webrtc-codereview.appspot.com/1323010
git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@4362 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/dl.gyp b/dl/dl.gyp
index 0573ce2..40cbdc4 100644
--- a/dl/dl.gyp
+++ b/dl/dl.gyp
@@ -54,6 +54,7 @@
'sp/src/omxSP_FFTInit_R_S32.c',
'sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
# Complex 16-bit fixed-point FFT
+ 'sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S',
'sp/src/omxSP_FFTInit_C_SC16.c',
'sp/src/omxSP_FFTGetBufSize_C_SC16.c',
'sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
@@ -67,6 +68,10 @@
'sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
'sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
# Real 16-bit fixed-point FFT
+ 'sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S',
+ 'sp/src/omxSP_FFTGetBufSize_R_S16.c',
+ 'sp/src/omxSP_FFTInit_R_S16.c',
+ 'sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S',
'sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
'sp/src/omxSP_FFTGetBufSize_R_S16S32.c',
'sp/src/omxSP_FFTInit_R_S16S32.c',
diff --git a/dl/sp/api/armSP.h b/dl/sp/api/armSP.h
index f615a87..4972f09 100644
--- a/dl/sp/api/armSP.h
+++ b/dl/sp/api/armSP.h
@@ -64,6 +64,14 @@
OMX_S32 *pBuf;
}ARMsFFTSpec_R_SC32;
+typedef struct ARMsFFTSpec_R_SC16_Tag
+{
+ OMX_U32 N;
+ OMX_U16 *pBitRev;
+ OMX_SC16 *pTwiddle;
+ OMX_S16 *pBuf;
+} ARMsFFTSpec_R_SC16;
+
typedef struct ARMsFFTSpec_R_FC32_Tag
{
OMX_U32 N;
diff --git a/dl/sp/api/omxSP.h b/dl/sp/api/omxSP.h
index 695fa90..c910363 100644
--- a/dl/sp/api/omxSP.h
+++ b/dl/sp/api/omxSP.h
@@ -44,6 +44,7 @@
typedef void OMXFFTSpec_C_SC16;
typedef void OMXFFTSpec_C_SC32;
typedef void OMXFFTSpec_R_S16S32;
+ typedef void OMXFFTSpec_R_S16;
typedef void OMXFFTSpec_R_S32;
typedef void OMXFFTSpec_R_F32;
typedef void OMXFFTSpec_C_FC32;
@@ -1487,6 +1488,45 @@
/**
+ * Function: omxSP_FFTInit_R_S16
+ *
+ * Description:
+ * These functions initialize specification structures required for the real
+ * FFT and IFFT functions. The function <FFTInit_R_S16> is used
+ * to initialize the specification structures for functions
+ * <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>.
+ *
+ * Memory for *pFFTFwdSpec must be allocated before calling these functions
+ * and should be 8-byte aligned.
+ *
+ * The number of bytes required for *pFFTFwdSpec can be
+ * determined using <FFTGetBufSize_R_S16>.
+ *
+ * Input Arguments:
+ *
+ * order - base-2 logarithm of the desired block length; valid in the range
+ * [1,12]
+ *
+ * Output Arguments:
+ *
+ * pFFTFwdSpec - pointer to the initialized specification structure
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if one or more of the
+ * following is true:
+ * - pFFTFwdSpec is either NULL or violates the 8-byte alignment
+ * restrictions
+ * - order < 1 or order > 12
+ *
+ */
+OMXResult omxSP_FFTInit_R_S16 (
+ OMXFFTSpec_R_S32*pFFTFwdSpec,
+ OMX_INT order
+);
+
+/**
* Function: omxSP_FFTInit_R_S32 (2.2.4.1.4)
*
* Description:
@@ -1699,6 +1739,38 @@
);
+/**
+ * Function: omxSP_FFTGetBufSize_R_S16
+ *
+ * Description:
+ * These functions compute the size of the specification structure
+ * required for the length 2^order real FFT and IFFT functions. The function
+ * <FFTGetBufSize_R_S16> is used in conjunction with the 16-bit
+ * functions <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>.
+ *
+ * Input Arguments:
+ *
+ * order - base-2 logarithm of the length; valid in the range
+ * [1,12]
+ *
+ * Output Arguments:
+ *
+ * pSize - pointer to the number of bytes required for the specification
+ * structure
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments The function returns
+ * OMX_Sts_BadArgErr if one or more of the following is true:
+ * pSize is NULL
+ * order < 1 or order > 12
+ *
+ */
+OMXResult omxSP_FFTGetBufSize_R_S16 (
+ OMX_INT order,
+ OMX_INT *pSize
+);
/**
* Function: omxSP_FFTGetBufSize_R_S32 (2.2.4.1.8)
@@ -2023,6 +2095,59 @@
);
+/**
+ * Function: omxSP_FFTFwd_RToCCS_S16_Sfs
+ *
+ * Description:
+ * These functions compute an FFT for a real-valued signal of length of 2^order,
+ * where 0 < order <= 12. Transform length is determined by the
+ * specification structure, which must be initialized prior to calling the FFT
+ * function using the appropriate helper, i.e., <FFTInit_R_S16>.
+ * The relationship between the input and output sequences can
+ * be expressed in terms of the DFT, i.e.:
+ *
+ * x[n] = (2^(-scalefactor)/N) . SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N)
+ * n=0,1,2,...N-1
+ * N=2^order.
+ *
+ * The conjugate-symmetric output sequence is represented using a CCS vector,
+ * which is of length N+2, and is organized as follows:
+ *
+ * Index: 0 1 2 3 4 5 . . . N-2 N-1 N N+1
+ * Component: R0 0 R1 I1 R2 I2 . . . R[N/2-1] I[N/2-1] R[N/2] 0
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary components
+ * for FFT bin 'n'. Bins are numbered from 0 to N/2, where N is the FFT length.
+ * Bin index 0 corresponds to the DC component, and bin index N/2 corresponds to
+ * the foldover frequency.
+ *
+ * Input Arguments:
+ * pSrc - pointer to the real-valued input sequence, of length 2^order;
+ * must be aligned on a 32-byte boundary.
+ * pFFTSpec - pointer to the preallocated and initialized specification
+ * structure
+ * scaleFactor - output scale factor; valid range is [0, 16]
+ *
+ * Output Arguments:
+ * pDst - pointer to output sequence, represented using CCS format, of
+ * length (2^order)+2; must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments, if one or more of followings is true:
+ * - one of the pointers pSrc, pDst, or pFFTSpec is NULL
+ * - pSrc or pDst is not aligned on a 32-byte boundary
+ * - scaleFactor<0 or scaleFactor >16
+ *
+ */
+OMXResult omxSP_FFTFwd_RToCCS_S16_Sfs (
+ const OMX_S16* pSrc,
+ OMX_S16* pDst,
+ const OMXFFTSpec_R_S16* pFFTSpec,
+ OMX_INT scaleFactor
+);
+
/**
* Function: omxSP_FFTFwd_RToCCS_S32_Sfs (2.2.4.4.2)
@@ -2178,6 +2303,53 @@
);
+/**
+ * Function: omxSP_FFTInv_CCSToR_S16_Sfs
+ *
+ * Description:
+ * These functions compute the inverse FFT for a conjugate-symmetric input
+ * sequence. Transform length is determined by the specification structure,
+ * which must be initialized prior to calling the FFT function using
+ * <FFTInit_R_S16>. For a transform of length M, the input
+ * sequence is represented using a packed CCS vector of length
+ * M+2, and is organized as follows:
+ *
+ * Index: 0 1 2 3 4 5 . . . M-2 M-1 M M+1
+ * Component R[0] 0 R[1] I[1] R[2] I[2] . . . R[M/2-1] I[M/2-1] R[M/2] 0
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary components
+ * for FFT bin n.
+ * Bins are numbered from 0 to M/2, where M is the FFT length. Bin index 0
+ * corresponds to the DC component, and bin index M/2 corresponds to the
+ * foldover frequency.
+ *
+ * Input Arguments:
+ * pSrc - pointer to the complex-valued input sequence represented using
+ * CCS format, of length (2^order) + 2; must be aligned on a 32-byte
+ * boundary.
+ * pFFTSpec - pointer to the preallocated and initialized specification
+ * structure
+ * scaleFactor - output scalefactor; range is [0,16]
+ *
+ * Output Arguments:
+ * pDst - pointer to the real-valued output sequence, of length 2^order ; must
+ * be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments if one or more of the following is true:
+ * - pSrc, pDst, or pFFTSpec is NULL
+ * - pSrc or pDst is not aligned on a 32-byte boundary
+ * - scaleFactor<0 or scaleFactor >16
+ *
+ */
+OMXResult omxSP_FFTInv_CCSToR_S16_Sfs (
+ const OMX_S16* pSrc,
+ OMX_S16* pDst,
+ const OMXFFTSpec_R_S16* pFFTSpec,
+ OMX_INT scaleFactor
+);
/**
* Function: omxSP_FFTInv_CCSToR_S32_Sfs (2.2.4.4.4)
diff --git a/dl/sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S b/dl/sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S
new file mode 100644
index 0000000..31638dd
--- /dev/null
+++ b/dl/sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S
@@ -0,0 +1,409 @@
+@
+@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Some code in this file was originally from file
+@ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
+@ follows. It has been relicensed with permission from the copyright holders.
+@
+
+@
+@ OpenMAX DL: v1.0.2
+@ Last Modified Revision: 7485
+@ Last Modified Date: Fri, 21 Sep 2007
+@
+@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@
+
+@
+@ Description:
+@ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
+@ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation.
+@ It implements both "scaled"(by 1/2) and "unscaled" versions of the above
+@ formula.
+@
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@//Input Registers
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+#define scale r3
+
+@ Output registers
+#define result r0
+
+@//Local Scratch Registers
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define tmpOrder r4
+#define pTwiddle r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+@ Total num of radix stages to comple the FFT.
+#define count r8
+#define x0r r4
+#define x0i r5
+#define diffMinusOne r2
+#define round r3
+#define pOut1 r2
+#define size r7
+#define step r8
+#define step1 r9
+#define step2 r10
+#define twStep r10
+#define pTwiddleTmp r11
+#define argTwiddle1 r12
+#define zero r14
+
+@ Neon registers
+#define dX0 D0.S16
+#define dX0S32 D0.S32
+#define dShift D1.S16
+#define dX1 D1.S16
+#define dX1S32 D1.S32
+#define dY0 D2.S16
+#define dY1 D3.S16
+#define dX0r D0.S16
+#define dX0rS32 D0.S32
+#define dX0i D1.S16
+#define dX1r D2.S16
+#define dX1i D3.S16
+#define qX1 Q1.S16
+#define dW0r D4.S16
+#define dW0i D5.S16
+#define dW1r D6.S16
+#define dW1i D7.S16
+#define dW0rS32 D4.S32
+#define dW0iS32 D5.S32
+#define dW1rS32 D6.S32
+#define dW1iS32 D7.S32
+#define dT0 D8.S16
+#define dT1 D9.S16
+#define dT2 D10.S16
+#define dT3 D11.S16
+#define qT0 Q6.S32
+#define qT1 Q7.S32
+#define qT2 Q8.S32
+#define qT3 Q9.S32
+#define dY0r D4.S16
+#define dY0i D5.S16
+#define dY1r D6.S16
+#define dY1i D7.S16
+#define qY1 Q3.S16
+#define dY2 D4.S16
+#define dY3 D5.S16
+#define dW0 D6.S16
+#define dW1 D7.S16
+#define dW0Tmp D10.S16
+#define dW1Neg D11.S16
+
+ @ Structure offsets for the FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ @ Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @ Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ MOV size,N,ASR #1 @ preserve the contents of N
+ MOV step,N,LSL #1 @ step = N/2 * 4 bytes
+
+ @ Process different FFT sizes with different loops.
+ CMP size,#4
+ BLE smallFFTSize\name
+
+ @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
+ @ Note: W^(k) is stored as negated value and also need to
+ @ conjugate the values from the table.
+
+ @ Z(0) : no need of twiddle multiply
+ @ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
+
+ VLD1 dX0S32[0],[pSrc],step
+ ADD pOut1,pOut,step @ pOut1 = pOut+ N/2*4 bytes
+
+ VLD1 dX1S32[0],[pSrc]!
+ SUB twStep,step,size @ twStep = 3N/8 * 4 bytes pointing to W^1
+
+ MOV step1,size,LSL #1 @ step1 = N/4 * 4 = N/2*2 bytes
+ SUB step1,step1,#4 @ (N/4-1)*4 bytes
+
+ VHADD dY0,dX0,dX1 @ [b+d | a+c]
+ VHSUB dY1,dX0,dX1 @ [b-d | a-c]
+ VTRN dY0,dY1 @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+
+ .ifeqs "\scaled", "TRUE"
+ VHSUB dX0,dY0,dY1
+ SUBS size,size,#2
+ VHADD dX1,dY0,dY1
+ .else
+ VSUB dX0,dY0,dY1
+ SUBS size,size,#2
+ VADD dX1,dY0,dY1
+ .endif
+
+ SUB pSrc,pSrc,step
+ VST1 dX0[0],[pOut1]!
+ ADD pTwiddleTmp,pTwiddle,#4 @ W^2
+ VST1 dX1[1],[pOut1]!
+ ADD argTwiddle1,pTwiddle,twStep @ W^1
+
+ BLT decrementScale\name
+ BEQ lastElement\name
+
+ SUB step,step,#20
+ SUB step1,step1,#4 @ (N/4-1)*8 bytes
+ SUB step2, step1, #4
+
+ @ Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
+ @ Note: W^k is stored as negative values in the table and also need to
+ @ conjugate the values from the table.
+ @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+ @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
+
+evenOddButterflyLoop\name:
+ VLD2 {dX0r,dX0i},[pSrc],step
+ VLD2 {dX1r,dX1i},[pSrc]!
+ SUB pSrc, pSrc, step
+
+ VLD1 dW0r,[argTwiddle1],step1
+ VREV64 qX1,qX1
+ VLD1 dW1r,[argTwiddle1]!
+ VHSUB dT2,dX0r,dX1r @ a-c
+ SUB argTwiddle1, argTwiddle1, step1
+ SUB step1,step1,#16
+
+ VLD1 dW0i,[pTwiddleTmp],step2
+ VHADD dT3,dX0i,dX1i @ b+d
+ VLD1 dW1i,[pTwiddleTmp]!
+ VHADD dT0,dX0r,dX1r @ a+c
+ VHSUB dT1,dX0i,dX1i @ b-d
+ SUB pTwiddleTmp, pTwiddleTmp, step2
+ SUB step2,step2,#16
+
+ SUBS size,size,#8
+
+ VZIP dW1r,dW1i
+ VTRN dW0r,dW0i
+ VZIP dW1iS32, dW1rS32
+
+ VMULL qT0,dW1i,dT2
+ VMLSL qT0,dW1r,dT3
+ VMULL qT1,dW1i,dT3
+ VMLAL qT1,dW1r,dT2
+ VMULL qT2,dW0r,dT2
+ VMLAL qT2,dW0i,dT3
+ VMULL qT3,dW0r,dT3
+ VMLSL qT3,dW0i,dT2
+
+ VRSHRN dX1r,qT0,#15
+ VRSHRN dX1i,qT1,#15
+ VRSHRN dX0r,qT2,#15
+ VRSHRN dX0i,qT3,#15
+
+ .ifeqs "\scaled", "TRUE"
+ VHADD dY1r,dT0,dX1i @ F(N/2 -1)
+ VHSUB dY1i,dX1r,dT1
+ .else
+ VADD dY1r,dT0,dX1i @ F(N/2 -1)
+ VSUB dY1i,dX1r,dT1
+ .endif
+
+ .ifeqs "\scaled", "TRUE"
+ VHADD dY0r,dT0,dX0i @ F(1)
+ VHSUB dY0i,dT1,dX0r
+ .else
+ VADD dY0r,dT0,dX0i @ F(1)
+ VSUB dY0i,dT1,dX0r
+ .endif
+
+ VREV64 qY1,qY1
+
+ VST2 {dY0r,dY0i},[pOut1],step
+ VST2 {dY1r,dY1i},[pOut1]
+ ADD pOut1,pOut1,#16
+ SUB pOut1, pOut1, step
+ SUB step,step,#32
+
+ BGT evenOddButterflyLoop\name
+
+ SUB pSrc,pSrc,#4 @ set both the ptrs to the last element
+ SUB pOut1,pOut1,#4
+ B lastElement\name
+
+smallFFTSize\name:
+ @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
+ @ Note: W^(k) is stored as negated value and also need to
+ @ conjugate the values from the table.
+
+ @ Z(0) : no need of twiddle multiply
+ @ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
+
+ VLD1 dX0S32[0],[pSrc],step
+ ADD pOut1,pOut,step @ pOut1 = pOut+ N/2*4 bytes
+
+ VLD1 dX1S32[0],[pSrc]!
+ SUB twStep,step,size @ twStep = 3N/8 * 4 bytes pointing to W^1
+
+ MOV step1,size,LSL #1 @ step1 = N/4 * 4 = N/2*2 bytes
+ SUB step1,step1,#4 @ (N/4-1)*4 bytes
+
+ VHADD dY0,dX0,dX1 @ [b+d | a+c]
+ VHSUB dY1,dX0,dX1 @ [b-d | a-c]
+ VTRN dY0,dY1 @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+
+ .ifeqs "\scaled", "TRUE"
+ VHSUB dX0,dY0,dY1
+ SUBS size,size,#2
+ VHADD dX1,dY0,dY1
+ .else
+ VSUB dX0,dY0,dY1
+ SUBS size,size,#2
+ VADD dX1,dY0,dY1
+ .endif
+
+ SUB pSrc,pSrc,step
+ VST1 dX0[0],[pOut1]!
+ ADD pTwiddleTmp,pTwiddle,#4 @ W^2
+ VST1 dX1[1],[pOut1]!
+ ADD argTwiddle1,pTwiddle,twStep @ W^1
+
+ BLT decrementScale\name
+ BEQ lastElement\name
+
+ @ Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
+ @ Note: W^k is stored as negative values in the table and also need to
+ @ conjugate the values from the table.
+ @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+ @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
+
+ SUB step,step,#12
+
+evenOddButterflyLoopSize4\name:
+ VLD1 dW0rS32[0],[argTwiddle1],step1
+ VLD1 dW1rS32[0],[argTwiddle1]!
+
+ VLD2 {dX0r[0],dX0i[0]},[pSrc]!
+ VLD2 {dX0r[1],dX0i[1]},[pSrc],step
+ SUB pSrc,pSrc,#4
+ SUB argTwiddle1,argTwiddle1,step1
+ VLD2 {dX1r[0],dX1i[0]},[pSrc]!
+ VLD2 {dX1r[1],dX1i[1]},[pSrc]!
+
+ SUB step1,step1,#4 @ (N/4-2)*4 bytes
+ VLD1 dW0iS32[0],[pTwiddleTmp],step1
+ VLD1 dW1iS32[0],[pTwiddleTmp]!
+ SUB pSrc,pSrc,step
+
+ SUB pTwiddleTmp,pTwiddleTmp,step1
+ VREV32 dX1r,dX1r
+ VREV32 dX1i,dX1i
+ SUBS size,size,#4
+
+ VHSUB dT2,dX0r,dX1r @ a-c
+ VHADD dT3,dX0i,dX1i @ b+d
+ SUB step1,step1,#4
+ VHADD dT0,dX0r,dX1r @ a+c
+ VHSUB dT1,dX0i,dX1i @ b-d
+
+ VTRN dW1r,dW1i
+ VTRN dW0r,dW0i
+
+ VMULL qT0,dW1r,dT2
+ VMLSL qT0,dW1i,dT3
+ VMULL qT1,dW1r,dT3
+ VMLAL qT1,dW1i,dT2
+ VMULL qT2,dW0r,dT2
+ VMLAL qT2,dW0i,dT3
+ VMULL qT3,dW0r,dT3
+ VMLSL qT3,dW0i,dT2
+
+ VRSHRN dX1r,qT0,#15
+ VRSHRN dX1i,qT1,#15
+
+ .ifeqs "\scaled", "TRUE"
+ VHADD dY1r,dT0,dX1i @ F(N/2 -1)
+ VHSUB dY1i,dX1r,dT1
+ .else
+ VADD dY1r,dT0,dX1i @ F(N/2 -1)
+ VSUB dY1i,dX1r,dT1
+ .endif
+
+ VREV32 dY1r,dY1r
+ VREV32 dY1i,dY1i
+
+ VRSHRN dX0r,qT2,#15
+ VRSHRN dX0i,qT3,#15
+
+ .ifeqs "\scaled", "TRUE"
+ VHADD dY0r,dT0,dX0i @ F(1)
+ VHSUB dY0i,dT1,dX0r
+ .else
+ VADD dY0r,dT0,dX0i @ F(1)
+ VSUB dY0i,dT1,dX0r
+ .endif
+
+ VST2 {dY0r[0],dY0i[0]},[pOut1]!
+ VST2 {dY0r[1],dY0i[1]},[pOut1],step
+ SUB pOut1, #4
+ VST2 {dY1r[0],dY1i[0]},[pOut1]!
+ VST2 {dY1r[1],dY1i[1]},[pOut1]!
+ SUB pOut1,pOut1,step
+ SUB pSrc,pSrc,#4 @ set both the ptrs to the last element
+ SUB pOut1,pOut1,#4
+
+ @ Last element can be expanded as follows
+ @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
+ @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+ @ 1/2[2a+j0] - j (c-jd) [0+j2b]
+ @ (a+bc, -bd)
+ @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name:
+ VLD1 dX0rS32[0],[pSrc]
+
+ .ifeqs "\scaled", "TRUE"
+ VSHR dX0r,dX0r,#1
+ .endif
+
+ VST1 dX0r[0],[pOut1]!
+ VNEG dX0r,dX0r
+ VST1 dX0r[1],[pOut1]
+
+decrementScale\name:
+ .ifeqs "\scaled", "TRUE"
+ SUB scale,scale,#1
+ .endif
+
+ .endm
+
+ M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
+ FFTSTAGE "FALSE","TRUE",Inv
+ M_END
+
+ M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
+ FFTSTAGE "TRUE","TRUE",InvSfs
+ M_END
+
+
+ .end
diff --git a/dl/sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S b/dl/sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S
new file mode 100644
index 0000000..f97fe5a
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S
@@ -0,0 +1,639 @@
+@
+@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Some code in this file was originally from file
+@ omxSP_FFTFwd_RToCCS_S32_Sfs_s.S which was licensed as follows.
+@ It has been relicensed with permission from the copyright holders.
+@
+
+@
+@ OpenMAX DL: v1.0.2
+@ Last Modified Revision: 7810
+@ Last Modified Date: Thu, 04 Oct 2007
+@
+@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@
+
+@
+@ Description:
+@ Compute a forward FFT for a real signal, using 16 bit complex FFT routines.
+@
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+.extern armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@Input Registers
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+#define scale r3
+
+@ Output registers
+#define result r0
+
+@Local Scratch Registers
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define tmpOrder r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+@ Total num of radix stages to comple the FFT
+#define count r8
+#define x0r r4
+#define x0i r5
+#define diffMinusOne r2
+#define round r3
+#define subFFTSizeTmp r6
+#define step r3
+#define stepr r11
+#define step1 r10
+#define step1r r6
+#define step2 r8
+#define step2r r9
+#define twStep r8
+#define zero r9
+#define pTwiddleTmp r5
+#define t0 r10
+
+@ Neon registers
+#define dX0 d0.s16
+#define dX0S32 d0.s32
+#define dzero d1.s16
+#define dZero d2.s16
+#define dShift d3.s16
+#define qShift q1.s16
+#define dX0r d2.s16
+#define dX0i d3.s16
+#define dX1r d4.s16
+#define dX1i d5.s16
+#define qX1 q2.s16
+#define dX0rS32 d2.s32
+#define dX0iS32 d3.s32
+#define dX1rS32 d4.s32
+#define dX1iS32 d5.s32
+#define dT0 d6.s16
+#define dT1 d7.s16
+#define dT2 d8.s16
+#define dT3 d9.s16
+#define qT0 q5.s32
+#define qT1 q6.s32
+#define qT0s q5.s16
+#define qT1s q6.s16
+#define dW0r d14.s16
+#define dW0i d15.s16
+#define dW1r d16.s16
+#define dW1i d17.s16
+#define dW0rS32 d14.s32
+#define dW0iS32 d15.s32
+#define dW1rS32 d16.s32
+#define dW1iS32 d17.s32
+#define dY0r d14.s16
+#define dY0i d15.s16
+#define dY0rS32 d14.s32
+#define dY0iS32 d15.s32
+#define dY1r d16.s16
+#define dY1i d17.s16
+#define qY1 q8.s16
+#define dY1rS32 d16.s32
+#define dY1iS32 d17.s32
+#define dY0rS64 d14.s32
+#define dY0iS64 d15.s32
+#define qT2 q9.s32
+#define qT3 q10.s32
+#define d18s16 d18.s16
+#define d19s16 d19.s16
+#define d20s16 d20.s16
+#define d21s16 d21.s16
+@ lastThreeelements
+#define dX1 d3.s16
+#define dW0 d4.s16
+#define dW1 d5.s16
+#define dY0 d10.s16
+#define dY1 d11.s16
+#define dY2 d12.s16
+#define dY3 d13.s16
+
+ @ Allocate stack memory required by the function
+ M_ALLOC4 diffOnStack, 4
+
+ @ Write function header
+ M_START omxSP_FFTFwd_RToCCS_S16_Sfs,r11,d15
+
+ @ Structure offsets for the FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @ Define stack arguments
+
+ @ Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @ Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ @ Do a N/2 point complex FFT including the scaling
+
+ MOV N,N,ASR #1 @ N/2 point complex FFT
+
+ CLZ order,N @ N = 2^order
+ RSB order,order,#31
+ MOV subFFTSize,#1
+
+ CMP order,#3
+ BGT orderGreaterthan3 @ order > 3
+
+ CMP order,#1
+ BGE orderGreaterthan0 @ order > 0
+ M_STR scale, diffOnStack,LT @ order = 0
+ LDR x0r,[pSrc]
+ STR x0r,[pOut]
+ MOV pSrc,pOut
+ MOV argDst,pDst
+ B FFTEnd
+
+orderGreaterthan0:
+ @ set the buffers appropriately for various orders
+ CMP order,#2
+ MOVEQ argDst,pDst
+ MOVNE argDst,pOut
+ MOVNE pOut,pDst @ Pass 1st stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ SUBS diff,scale,order
+ M_STR diff,diffOnStack
+ MOVGT scale,order
+ @ Now scale <= order
+
+ CMP order,#1
+ BGT orderGreaterthan1
+ @ order = 1:
+ SUBS scale,scale,#1
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ B FFTEnd
+
+orderGreaterthan1:
+ CMP order,#2
+ MOV argScale,scale
+ BGT orderGreaterthan2
+ @ order = 2:
+ SUBS argScale,argScale,#1
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+orderGreaterthan2: @ order = 3
+ SUBS argScale,argScale,#1
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+ BLGE armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ BLLT armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+
+orderGreaterthan3:
+ @ check scale = 0 or scale = order
+ SUBS diff, scale, order @ scale > order
+ MOVGT scale,order
+ BGE specialScaleCase @ scale = 0 or scale = order
+ CMP scale,#0
+ BEQ specialScaleCase
+ B generalScaleCase
+
+specialScaleCase: @ scale = 0, or, scale = order && order > 3
+ TST order, #2 @ Set input args to fft stages
+ MOVEQ argDst,pDst
+ MOVNE argDst,pOut
+ MOVNE pOut,pDst @ Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ CMP diff,#0
+ M_STR diff, diffOnStack
+ BGE scaleEqualsOrder
+
+ @ check for even or odd order.
+ @ NOTE: The following combination of BL's would work fine even though
+ @ the first BL would corrupt the flags. This is because the end of the
+ @ "grpZeroSetLoop" loop inside
+ @ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+unscaledRadix4Loop:
+ BEQ lastStageUnscaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+ BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+scaleEqualsOrder:
+ @ check for even or odd order
+ @ NOTE: The following combination of BL's would work fine even though
+ @ the first BL would corrupt the flags. This is because the end of the
+ @ "grpZeroSetLoop" loop inside
+ @ armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+scaledRadix4Loop:
+ BEQ lastStageScaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B scaledRadix4Loop
+
+lastStageScaledRadix4:
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+generalScaleCase: @ 0 < scale < order and order > 3
+ @ Determine the correct destination buffer
+ SUB diff,order,scale
+ TST diff,#0x01
+ ADDEQ count,scale,diff,LSR #1 @ count = scale + (order - scale)/2
+ MOVNE count,order
+ TST count,#0x01 @ Is count even or odd ?
+
+ MOVEQ argDst,pDst @ Set input args to fft stages
+ MOVNE argDst,pOut
+ MOVNE pOut,pDst @ Pass 1st stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ CMP diff,#1
+ M_STR diff, diffOnStack
+ BEQ scaleps @ scaling including a radix2_ps stage
+
+ MOV argScale,scale @ Put scale in RN4 to save and restore
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+
+scaledRadix2Loop:
+ BLGT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1 @ save, restore scale in scaled stages
+ BGT scaledRadix2Loop
+ B outScale
+
+scaleps:
+ SUB argScale,scale,#1 @ order>3 and diff=1 => scale >= 3
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1
+
+scaledRadix2psLoop:
+ BEQ scaledRadix2psStage
+ BLGT armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+ SUBS argScale,argScale,#1 @ save, restore scale in scaled stages
+ BGE scaledRadix2psLoop
+
+scaledRadix2psStage:
+ BL armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ B generalLastStageUnscaledRadix2
+
+outScale:
+ M_LDR diff, diffOnStack
+ @check for even or odd order
+ TST diff,#0x00000001
+ BEQ generalUnscaledRadix4Loop
+ B unscaledRadix2Loop
+
+generalUnscaledRadix4Loop:
+ CMP subFFTNum,#4
+ BEQ generalLastStageUnscaledRadix4
+ BL armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+ B generalUnscaledRadix4Loop
+
+generalLastStageUnscaledRadix4:
+ BL armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B End
+
+unscaledRadix2Loop:
+ CMP subFFTNum,#4
+ BEQ generalLastTwoStagesUnscaledRadix2
+ BL armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+ B unscaledRadix2Loop
+
+generalLastTwoStagesUnscaledRadix2:
+ BL armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2:
+ BL armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+ B End
+
+FFTEnd: @ Does only the scaling
+ M_LDR diff, diffOnStack
+ CMP diff,#0
+ BLE finalComplexToRealFixup
+
+ RSB diff,diff,#0 @ for right shift by a variable
+ VDUP qShift,diff
+
+ @ save subFFTSize and use subFFTSizeTmp in the following loop
+ MOV subFFTSizeTmp,subFFTSize @ subFFTSizeTmp same reg as subFFTNum
+
+ @ Use parallel loads for bigger FFT size.
+ CMP subFFTSizeTmp, #8
+ BLT scaleLessFFTData
+
+scaleFFTData:
+ VLD1 {qT0s, qT1s},[pSrc:256] @ pSrc contains pDst pointer
+ SUBS subFFTSizeTmp,subFFTSizeTmp,#8
+ VSHL qT0s,qShift
+ VSHL qT1s,qShift
+ VST1 {qT0s, qT1s},[pSrc:256]!
+ BGT scaleFFTData
+ B afterScaling
+
+scaleLessFFTData:
+ VLD1 {dX0S32[0]},[pSrc] @ pSrc contains pDst pointer
+ SUBS subFFTSizeTmp,subFFTSizeTmp,#1
+ VSHL dX0,dShift
+ VST1 {dX0S32[0]},[pSrc]!
+ BGT scaleLessFFTData
+
+afterScaling:
+ SUB pSrc,pSrc,subFFTSize,LSL #2 @ reset pSrc for final fixup
+
+ @ change the logic so that output after scaling is in pOut and not in pDst
+ @ finally store from pOut to pDst
+ @ change branch "End" to branch "finalComplexToRealFixup" in the above
+ @ chk the code below for multiplication by j factor
+
+finalComplexToRealFixup:
+ @ F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+ @ 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
+ @ 1/2[2a+j0] - j [0+j2b]
+ @ (a+b, 0)
+
+ @ F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+ @ 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
+ @ 1/2[2a+j0] + j [0+j2b]
+ @ (a-b, 0)
+
+ CMP subFFTSize,#4
+ BLE smallFFTSize
+
+@ SubSize > 3:
+ @ F(0) and F(N/2)
+ VLD2 {dX0r[0],dX0i[0]},[pSrc]!
+ MOV zero,#0
+ VMOV dX0r[1],zero
+ MOV step,subFFTSize,LSL #2 @ step = N/2 * 4 bytes
+ VMOV dX0i[1],zero
+ SUB twStep,step,subFFTSize @ twStep = 3N/8 * 8 bytes
+
+ VADD dY0r,dX0r,dX0i @ F(0) = ((Z0.r+Z0.i) , 0)
+ MOV step1,subFFTSize,LSL #1 @ step1 = N/2 * 2 bytes
+ VSUB dY0i,dX0r,dX0i @ F(N/2) = ((Z0.r-Z0.i) , 0)
+ SUBS subFFTSize,subFFTSize,#2
+
+ VST1 dY0rS32[0],[argDst], step
+ ADD pTwiddleTmp,argTwiddle,#4 @ W^2
+ VST1 dY0iS32[0],[argDst]!
+ ADD argTwiddle,argTwiddle,twStep @ W^1
+
+ VDUP dzero,zero
+ SUB argDst,argDst,step
+ SUB step,step,#20
+ RSB stepr, step, #16
+ SUB step1,step1,#8 @ (N/4-1)*8 bytes
+ RSB step1r,step1,#8
+
+ SUB step2, step1, #4
+ RSB step2r, step2, #8
+
+ @ F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
+ @ Note: W^k is stored as negative values in the table.
+ @ Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
+ @ since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1).
+
+evenOddButterflyLoop:
+ VLD2 {dX0r,dX0i},[pSrc],step
+ VLD2 {dX1r,dX1i},[pSrc],stepr
+
+ VLD1 dW0r,[argTwiddle],step1
+ SUB step1, step1, #16
+ VREV64 qX1,qX1
+
+ VLD1 dW1r,[argTwiddle],step1r
+ ADD step1r, step1r, #16
+ VSUB dT2,dX0r,dX1r @ a-c
+
+ VLD1 dW0i,[pTwiddleTmp],step2
+ SUB step2, step2, #16
+ VADD dT3,dX0i,dX1i @ b+d
+
+ VLD1 dW1i,[pTwiddleTmp],step2r
+ ADD step2r, step2r, #16
+
+ VTRN dW0r,dW0i
+ VZIP dW1r, dW1i
+
+ SUBS subFFTSize,subFFTSize,#8
+
+ VHADD dT0,dX0r,dX1r @ (a+c)/2
+ VZIP dW1iS32, dW1rS32
+ VHSUB dT1,dX0i,dX1i @ (b-d)/2
+
+ VQDMULH dY0,dW1i,dT2
+ VQDMULH dY1,dW1r,dT3
+ VQDMULH dY2,dW1i,dT3
+ VQDMULH dY3,dW1r,dT2
+
+ VQDMULH d18s16,dW0r,dT2
+ VQDMULH d19s16,dW0i,dT3
+ VQDMULH d20s16,dW0r,dT3
+ VQDMULH d21s16,dW0i,dT2
+
+ VRHADD dX1r, dY0, dY1
+ VHSUB dX1i, dY2, dY3
+ VHSUB dX0r, d18s16, d19s16
+ VADD dY1i,dT1,dX1r
+ VRHADD dX0i, d20s16, d21s16
+ VSUB dY1r,dT0,dX1i @ F(N/2 -1)
+ VSUB dY0r,dT0,dX0i @ F(1)
+ VADD dY0i,dT1,dX0r
+
+ VNEG dY1i,dY1i
+ VREV64 qY1, qY1
+
+ VST2 {dY0r,dY0i},[argDst],step
+ SUB step,step,#32 @ (N/2-4)*4 bytes
+ VST2 {dY1r,dY1i},[argDst],stepr
+ ADD stepr,stepr,#32
+
+ BGT evenOddButterflyLoop
+
+ SUB pSrc,pSrc,#4 @ points to the last element.
+ SUB argDst,argDst,#4 @ points to the last element.
+
+ b lastElement
+
+smallFFTSize:
+
+ @ F(0) and F(N/2)
+ VLD2 {dX0r[0],dX0i[0]},[pSrc]!
+ MOV zero,#0
+ VMOV dX0r[1],zero
+ MOV step,subFFTSize,LSL #2 @ step = N/2 * 4 bytes
+ VMOV dX0i[1],zero
+ SUB twStep,step,subFFTSize @ twStep = 3N/8 * 8 bytes
+
+ VADD dY0r,dX0r,dX0i @ F(0) = ((Z0.r+Z0.i) , 0)
+ MOV step1,subFFTSize,LSL #1 @ step1 = N/2 * 2 bytes
+ VSUB dY0i,dX0r,dX0i @ F(N/2) = ((Z0.r-Z0.i) , 0)
+ SUBS subFFTSize,subFFTSize,#2
+
+
+ VST1 dY0rS32[0],[argDst], step
+ ADD pTwiddleTmp,argTwiddle,#4 @ W^2
+ VST1 dY0iS32[0],[argDst]!
+ ADD argTwiddle,argTwiddle,twStep @ W^1
+
+ VDUP dzero,zero
+ SUB argDst,argDst,step
+
+ BLT End
+ BEQ lastElement
+
+ SUB step,step,#12
+ SUB step1,step1,#4 @ (N/4-1)*8 bytes
+
+ @ F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
+
+butterflyLoopSubFFTSize4:
+ VLD1 dW0rS32[0], [argTwiddle],step1
+ VLD1 dW1rS32[0],[argTwiddle]!
+
+ VLD2 {dX0r[0],dX0i[0]},[pSrc]!
+ VLD2 {dX0r[1],dX0i[1]},[pSrc],step
+ SUB pSrc,pSrc,#4
+ SUB argTwiddle,argTwiddle,step1
+ VLD2 {dX1r[0],dX1i[0]},[pSrc]!
+ VLD2 {dX1r[1],dX1i[1]},[pSrc]!
+
+ SUB step1,step1,#4 @ (N/4-2)*4 bytes
+ VLD1 dW0iS32[0],[pTwiddleTmp],step1
+ VLD1 dW1iS32[0],[pTwiddleTmp]!
+ SUB pSrc,pSrc,step
+
+ SUB pTwiddleTmp,pTwiddleTmp,step1
+ VREV32 dX1r,dX1r
+ VREV32 dX1i,dX1i
+ SUBS subFFTSize,subFFTSize,#4
+
+ VSUB dT2,dX0r,dX1r @ a-c
+ SUB step1,step1,#4
+ VADD dT3,dX0i,dX1i @ b+d
+ VADD dT0,dX0r,dX1r @ a+c
+ VSUB dT1,dX0i,dX1i @ b-d
+ VHADD dT0,dT0,dzero
+ VHADD dT1,dT1,dzero
+
+ VTRN dW1r,dW1i
+ VTRN dW0r,dW0i
+
+ VMULL qT0,dW1r,dT2
+ VMLAL qT0,dW1i,dT3
+ VMULL qT1,dW1r,dT3
+ VMLSL qT1,dW1i,dT2
+
+ VMULL qT2,dW0r,dT2
+ VMLSL qT2,dW0i,dT3
+ VMULL qT3,dW0r,dT3
+ VMLAL qT3,dW0i,dT2
+
+ VRSHRN dX1r,qT0,#16
+ VRSHRN dX1i,qT1,#16
+
+ VSUB dY1r,dT0,dX1i @ F(N/2 -1)
+ VADD dY1i,dT1,dX1r
+ VNEG dY1i,dY1i
+
+ VREV32 dY1r,dY1r
+ VREV32 dY1i,dY1i
+
+ VRSHRN dX0r,qT2,#16
+ VRSHRN dX0i,qT3,#16
+
+ VSUB dY0r,dT0,dX0i @ F(1)
+ VADD dY0i,dT1,dX0r
+
+ VST2 {dY0r[0],dY0i[0]},[argDst]!
+ VST2 {dY0r[1],dY0i[1]},[argDst],step
+ SUB argDst, #4
+ VST2 {dY1r[0],dY1i[0]},[argDst]!
+ VST2 {dY1r[1],dY1i[1]},[argDst]!
+ SUB argDst,argDst,step
+ SUB pSrc,pSrc,#4 @ points to the last element.
+ SUB argDst,argDst,#4 @ points to the last element.
+
+lastElement:
+ @ Last element can be expanded as follows
+ @ 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+ @ 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+ @ 1/2[2a+j0] + j (c+jd) [0+j2b]
+ @ (a-bc, -bd)
+ @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+ VLD1 dX0rS32[0],[pSrc]
+ VST1 dX0r[0],[argDst]!
+ VNEG dX0r,dX0r
+ VST1 dX0r[1],[argDst]!
+
+End:
+ @ Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @ Write function tail
+ M_END
+
+ .END
diff --git a/dl/sp/src/omxSP_FFTGetBufSize_R_S16.c b/dl/sp/src/omxSP_FFTGetBufSize_R_S16.c
new file mode 100644
index 0000000..cc318a3
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTGetBufSize_R_S16.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Some code in this file was originally from file omxSP_FFTGetBufSize_R_S32.c
+ * which was licensed as follows.
+ * It has been relicensed with permission from the copyright holders.
+ */
+
+/*
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:
+ * Last Modified Date:
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTGetBufSize_R_S16
+ *
+ * Description:
+ * Computes the size of the specification structure required for the length
+ * 2^order real FFT and IFFT functions.
+ *
+ * Remarks:
+ * This function is used in conjunction with the 16-bit functions
+ * <FFTFwd_RToCCS_S16_Sfs> and <FFTInv_CCSToR_S16_Sfs>.
+ *
+ * Parameters:
+ * [in] order base-2 logarithm of the length; valid in the range
+ * [1,12].
+ * [out] pSize pointer to the number of bytes required for the
+ * specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTGetBufSize_R_S16(OMX_INT order, OMX_INT *pSize) {
+ OMX_INT NBy2,N,twiddleSize;
+
+ /* Order zero not allowed */
+ if (order == 0) {
+ return OMX_Sts_BadArgErr;
+ }
+
+ NBy2 = 1 << (order - 1);
+ N = NBy2 << 1;
+ twiddleSize = 5 * N / 8; /* 3 / 4 (N / 2) + N / 4 */
+
+ /* 2 pointers to store bitreversed array and twiddle factor array */
+ *pSize = sizeof(ARMsFFTSpec_R_SC16)
+ /* Twiddle factors */
+ + sizeof(OMX_SC16) * twiddleSize
+ /* Ping Pong buffer for doing the N/2 point complex FFT; */
+ /* extra size 'N' as a temporary buf for FFTInv_CCSToR_S16_Sfs */
+ + sizeof(OMX_S16) * (N << 1)
+ /* Extra bytes to get 32 byte alignment of ptwiddle and pBuf */
+ + 62 ;
+
+
+ return OMX_Sts_NoErr;
+}
+
+/*****************************************************************************
+ * END OF FILE
+ *****************************************************************************/
+
diff --git a/dl/sp/src/omxSP_FFTInit_R_S16.c b/dl/sp/src/omxSP_FFTInit_R_S16.c
new file mode 100644
index 0000000..f44dff3
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTInit_R_S16.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Some code in this file was originally from file omxSP_FFTInit_R_S16S32.c
+ * which was licensed as follows.
+ * It has been relicensed with permission from the copyright holders.
+ */
+
+/*
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:
+ * Last Modified Date:
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTInit_R_S16
+ *
+ * Description:
+ * Initialize the real forward-FFT specification information struct.
+ *
+ * Remarks:
+ * This function is used to initialize the specification structures
+ * for functions <ippsFFTFwd_RToCCS_S16_Sfs> and
+ * <ippsFFTInv_CCSToR_S16_Sfs>. Memory for *pFFTSpec must be
+ * allocated prior to calling this function. The number of bytes
+ * required for *pFFTSpec can be determined using
+ * <FFTGetBufSize_R_S16>.
+ *
+ * Parameters:
+ * [in] order base-2 logarithm of the desired block length;
+ * valid in the range [1,12].
+ * [out] pFFTFwdSpec pointer to the initialized specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTInit_R_S16(OMXFFTSpec_R_S16* pFFTSpec, OMX_INT order) {
+ OMX_INT i = 0, j = 0;
+ OMX_SC16 *pTwiddle = NULL, *pTwiddle1 = NULL, *pTwiddle2 = NULL;
+ OMX_SC16 *pTwiddle3 = NULL, *pTwiddle4 = NULL;
+ OMX_S16 *pBuf = NULL;
+ OMX_U16 *pBitRev = NULL;
+ OMX_U32 pTmp = 0;
+ OMX_INT Nby2 = 0, N = 0, M = 0, diff = 0, step = 0;
+ OMX_S16 x = 0, y = 0, xNeg = 0;
+ OMX_S32 xS32 = 0, yS32 = 0;
+ ARMsFFTSpec_R_SC16 *pFFTStruct = NULL;
+
+ /* Order zero not allowed */
+ if (order == 0) {
+ return OMX_Sts_BadArgErr;
+ }
+
+ /* Do the initializations */
+ pFFTStruct = (ARMsFFTSpec_R_SC16*) pFFTSpec;
+ Nby2 = 1 << (order - 1);
+ N = Nby2 << 1;
+ pBitRev = NULL ; /* optimized implementations don't use bitreversal */
+ pTwiddle = (OMX_SC16*) (sizeof(ARMsFFTSpec_R_SC16) + (OMX_S8*)pFFTSpec);
+
+ /* Align to 32 byte boundary */
+ pTmp = ((OMX_U32)pTwiddle)&31; /* (OMX_U32)pTwiddle % 32 */
+ if(pTmp != 0) {
+ pTwiddle = (OMX_SC16*) ((OMX_S8*)pTwiddle + (32 - pTmp));
+ }
+
+ pBuf = (OMX_S16*) (sizeof(OMX_SC16) * (5 * N / 8) + (OMX_S8*)pTwiddle);
+
+ /* Align to 32 byte boundary */
+ pTmp = ((OMX_U32)pBuf)&31; /* (OMX_U32)pBuf % 32 */
+ if(pTmp != 0) {
+ pBuf = (OMX_S16*)((OMX_S8*)pBuf + (32 - pTmp));
+ }
+
+ /*
+ * Filling Twiddle factors : exp^(-j*2*PI*k/ (N/2) ) ; k=0,1,2,...,3/4(N/2).
+ * N/2 point complex FFT is used to compute N point real FFT.
+ * The original twiddle table "armSP_FFT_S32TwiddleTable" is of size
+ * (MaxSize/8 + 1). Rest of the values i.e., up to MaxSize are calculated
+ * using the symmetries of sin and cos.
+ * The max size of the twiddle table needed is 3/4(N/2) for a radix-4 stage.
+ *
+ * W = (-2 * PI) / N
+ * N = 1 << order
+ * W = -PI >> (order - 1)
+ *
+ * Note we use S32 twiddle factor table and round the values to 16 bits.
+ */
+
+ M = Nby2 >> 3;
+ diff = 12 - (order - 1);
+ step = 1 << diff; /* Step into the twiddle table for the current order */
+
+ xS32 = armSP_FFT_S32TwiddleTable[0];
+ yS32 = armSP_FFT_S32TwiddleTable[1];
+ x = (xS32 + 0x8000) >> 16;
+ y = (yS32 + 0x8000) >> 16;
+ xNeg = 0x7FFF;
+
+ if((order-1) >= 3) {
+ /* i = 0 case */
+ pTwiddle[0].Re = x;
+ pTwiddle[0].Im = y;
+ pTwiddle[2 * M].Re = -y;
+ pTwiddle[2 * M].Im = xNeg;
+ pTwiddle[4 * M].Re = xNeg;
+ pTwiddle[4 * M].Im = y;
+
+ for (i=1; i<=M; i++){
+ OMX_S16 x_neg = 0, y_neg = 0;
+ j = i * step;
+
+ xS32 = armSP_FFT_S32TwiddleTable[2 * j];
+ yS32 = armSP_FFT_S32TwiddleTable[2 * j + 1];
+ x = (xS32 + 0x8000) >> 16;
+ y = (yS32 + 0x8000) >> 16;
+ /* |x_neg = -x| doesn't work when x is 0x8000. */
+ x_neg = (-(xS32 + 0x8000)) >> 16;
+ y_neg = (-(yS32 + 0x8000)) >> 16;
+
+ pTwiddle[i].Re = x;
+ pTwiddle[i].Im = y;
+ pTwiddle[2 * M - i].Re = y_neg;
+ pTwiddle[2 * M - i].Im = x_neg;
+ pTwiddle[2 * M + i].Re = y;
+ pTwiddle[2 * M + i].Im = x_neg;
+ pTwiddle[4 * M - i].Re = x_neg;
+ pTwiddle[4 * M - i].Im = y;
+ pTwiddle[4 * M + i].Re = x_neg;
+ pTwiddle[4 * M + i].Im = y_neg;
+ pTwiddle[6 * M - i].Re = y;
+ pTwiddle[6 * M - i].Im = x;
+ }
+ }
+ else {
+ if ((order - 1) == 2) {
+ pTwiddle[0].Re = x;
+ pTwiddle[0].Im = y;
+ pTwiddle[1].Re = -y;
+ pTwiddle[1].Im = xNeg;
+ pTwiddle[2].Re = xNeg;
+ pTwiddle[2].Im = y;
+ }
+ if ((order-1) == 1) {
+ pTwiddle[0].Re = x;
+ pTwiddle[0].Im = y;
+ }
+ }
+
+ /*
+ * Now fill the last N/4 values : exp^(-j*2*PI*k/N); k=1,3,5,...,N/2-1.
+ * These are used for the final twiddle fix-up for converting complex to
+ * real FFT.
+ */
+
+ M = N >> 3;
+ diff = 12 - order;
+ step = 1 << diff;
+
+ pTwiddle1 = pTwiddle + 3 * N / 8;
+ pTwiddle4 = pTwiddle1 + (N / 4 - 1);
+ pTwiddle3 = pTwiddle1 + N / 8;
+ pTwiddle2 = pTwiddle1 + (N / 8 - 1);
+
+ xS32 = armSP_FFT_S32TwiddleTable[0];
+ yS32 = armSP_FFT_S32TwiddleTable[1];
+ x = (xS32 + 0x8000) >> 16;
+ y = (yS32 + 0x8000) >> 16;
+ xNeg = 0x7FFF;
+
+ if((order) >= 3) {
+ for (i = 1; i <= M; i += 2 ) {
+ OMX_S16 x_neg = 0, y_neg = 0;
+
+ j = i*step;
+
+ xS32 = armSP_FFT_S32TwiddleTable[2 * j];
+ yS32 = armSP_FFT_S32TwiddleTable[2 * j + 1];
+ x = (xS32 + 0x8000) >> 16;
+ y = (yS32 + 0x8000) >> 16;
+ /* |x_neg = -x| doesn't work when x is 0x8000. */
+ x_neg = (-(xS32 + 0x8000)) >> 16;
+ y_neg = (-(yS32 + 0x8000)) >> 16;
+
+ pTwiddle1[0].Re = x;
+ pTwiddle1[0].Im = y;
+ pTwiddle1 += 1;
+ pTwiddle2[0].Re = y_neg;
+ pTwiddle2[0].Im = x_neg;
+ pTwiddle2 -= 1;
+ pTwiddle3[0].Re = y;
+ pTwiddle3[0].Im = x_neg;
+ pTwiddle3 += 1;
+ pTwiddle4[0].Re = x_neg;
+ pTwiddle4[0].Im = y;
+ pTwiddle4 -= 1;
+ }
+ }
+ else {
+ if (order == 2) {
+ pTwiddle1[0].Re = -y;
+ pTwiddle1[0].Im = xNeg;
+ }
+ }
+
+ /* Update the structure */
+ pFFTStruct->N = N;
+ pFFTStruct->pTwiddle = pTwiddle;
+ pFFTStruct->pBitRev = pBitRev;
+ pFFTStruct->pBuf = pBuf;
+
+ return OMX_Sts_NoErr;
+}
+/*****************************************************************************
+ * END OF FILE
+ *****************************************************************************/
+
diff --git a/dl/sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S b/dl/sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S
new file mode 100644
index 0000000..553f280
--- /dev/null
+++ b/dl/sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S
@@ -0,0 +1,301 @@
+@
+@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+@ Some code in this file was originally from file
+@ omxSP_FFTInv_CToC_SC16_Sfs_s.S which was licensed as follows.
+@ It has been relicensed with permission from the copyright holders.
+@
+
+@
+@ File Name: omxSP_FFTInv_CToC_SC16_Sfs_s.s
+@ OpenMAX DL: v1.0.2
+@ Last Modified Revision: 6729
+@ Last Modified Date: Tue, 17 Jul 2007
+@
+@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@
+
+@
+@ Description:
+@ Compute an inverse FFT for a 16-bit real signal, with complex FFT routines.
+@
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+.extern armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@Input Registers
+#define pSrc r0
+#define pDst r1
+#define pFFTSpec r2
+#define scale r3
+
+@ Output registers
+#define result r0
+
+@Local Scratch Registers
+#define argTwiddle r1
+#define argDst r2
+#define argScale r4
+#define pTwiddle r4
+#define tmpOrder r4
+#define pOut r5
+#define subFFTSize r7
+#define subFFTNum r6
+#define N r6
+#define order r14
+#define diff r9
+@ Total num of radix stages to comple the FFT
+#define count r8
+#define x0r r4
+#define x0i r5
+#define diffMinusOne r2
+#define round r3
+#define pOut1 r2
+#define size r7
+#define step r8
+#define step1 r9
+#define twStep r10
+#define pTwiddleTmp r11
+#define argTwiddle1 r12
+#define zero r14
+
+@ Neon registers
+#define dX0 D0.S32
+#define dShift D1.S32
+#define qShift Q0.s16
+#define dX1 D1.S32
+#define dY0 D2.S32
+#define dY1 D3.S32
+#define dX0r D0.S32
+#define dX0i D1.S32
+#define dX1r D2.S32
+#define dX1i D3.S32
+#define dW0r D4.S32
+#define dW0i D5.S32
+#define dW1r D6.S32
+#define dW1i D7.S32
+#define dT0 D8.S32
+#define dT1 D9.S32
+#define dT2 D10.S32
+#define dT3 D11.S32
+#define qT0 Q6.S64
+#define qT1 Q7.S64
+#define qT0s Q6.S16
+#define qT1s Q7.S16
+#define qT2 Q8.S64
+#define qT3 Q9.S64
+#define dY0r D4.S32
+#define dY0i D5.S32
+#define dY1r D6.S32
+#define dY1i D7.S32
+#define dzero D20.S32
+#define dY2 D4.S32
+#define dY3 D5.S32
+#define dW0 D6.S32
+#define dW1 D7.S32
+#define dW0Tmp D10.S32
+#define dW1Neg D11.S32
+
+
+
+ @ Allocate stack memory required by the function
+ M_ALLOC4 diffOnStack, 4
+
+ @ Write function header
+ M_START omxSP_FFTInv_CCSToR_S16_Sfs,r11,d15
+
+@ Structure offsets for the FFTSpec
+ .set ARMsFFTSpec_N, 0
+ .set ARMsFFTSpec_pBitRev, 4
+ .set ARMsFFTSpec_pTwiddle, 8
+ .set ARMsFFTSpec_pBuf, 12
+
+ @ Define stack arguments
+
+ @ Read the size from structure and take log
+ LDR N, [pFFTSpec, #ARMsFFTSpec_N]
+
+ @ Read other structure parameters
+ LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+ LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+ @ Call the preTwiddle Radix2 stage before doing the complex IFFT
+
+ @ The following conditional BL combination would work since
+ @ evenOddButterflyLoop in the first call would set Z flag to zero
+
+ CMP scale,#0
+ BLEQ armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe
+ BLGT armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe
+
+complexIFFT:
+
+ ASR N,N,#1 @ N/2 point complex IFFT
+ ADD pSrc,pOut,N,LSL #2 @ set pSrc as pOut1
+
+ CLZ order,N @ N = 2^order
+ RSB order,order,#31
+ MOV subFFTSize,#1
+
+ ADD scale,scale,order @ FFTInverse has a final scaling factor by N
+
+ CMP order,#3
+ BGT orderGreaterthan3 @ order > 3
+
+ CMP order,#1
+ BGE orderGreaterthan0 @ order > 0
+ M_STR scale, diffOnStack,LT @ order = 0
+ LDRLT x0r,[pSrc]
+ STRLT x0r,[pDst]
+ MOVLT pSrc,pDst
+ BLT FFTEnd
+
+orderGreaterthan0:
+ @ set the buffers appropriately for various orders
+ CMP order,#2
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst @ Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+ @ Store the scale factor and scale at the end
+ SUB diff,scale,order
+ M_STR diff, diffOnStack
+ BGE orderGreaterthan1
+ BLLT armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe @ order = 1
+ B FFTEnd
+
+
+orderGreaterthan1:
+ MOV tmpOrder,order @ tmpOrder = RN 4
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+ CMP tmpOrder,#2
+ BLGT armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+
+
+
+orderGreaterthan3:
+ @ check scale = 0 or scale = order
+ SUB diff, scale, order @ scale > order
+
+ TST order, #2 @ Set input args to fft stages
+ MOVNE argDst,pDst
+ MOVEQ argDst,pOut
+ MOVEQ pOut,pDst @ Pass the first stage destination in RN5
+ MOV argTwiddle,pTwiddle
+
+ CMP diff,#0
+ M_STR diff, diffOnStack
+ BGE scaleEqualsOrder
+
+ @check for even or odd order
+ @ NOTE: The following combination of BL's would work fine eventhough the first
+ @ BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ @ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+unscaledRadix4Loop:
+ BEQ lastStageUnscaledRadix4
+ BL armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+ BL armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+ B FFTEnd
+
+scaleEqualsOrder:
+ @check for even or odd order
+ @ NOTE: The following combination of BL's would work fine eventhough the first
+ @ BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+ @ armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+ TST order,#0x00000001
+ BLEQ armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+ BLNE armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
+ CMP subFFTNum,#4
+ BLT FFTEnd
+
+scaledRadix4Loop:
+ BEQ lastStageScaledRadix4
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+ CMP subFFTNum,#4
+ B scaledRadix4Loop
+
+lastStageScaledRadix4:
+ BL armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+
+FFTEnd: @ Does only the scaling
+
+ M_LDR diff, diffOnStack
+ CMP diff,#0
+ BLE End
+
+ RSB diff,diff,#0 @ to use VRSHL for right shift by a variable
+ VDUP qShift,diff
+
+ @ Use parallel loads for bigger FFT size.
+ CMP subFFTSize, #8
+ BLT scaleLessFFTData
+
+scaleFFTData:
+ VLD1 {qT0s, qT1s},[pSrc:256] @ pSrc contains pDst pointer
+ SUBS subFFTSize,subFFTSize,#8
+ VSHL qT0s,qShift
+ VSHL qT1s,qShift
+ VST1 {qT0s, qT1s},[pSrc:256]!
+ BGT scaleFFTData
+ B End
+
+scaleLessFFTData: @ N = subFFTSize ; dataptr = pDst ; scale = diff
+ VLD1 {dX0[0]},[pSrc] @ pSrc contains pDst pointer
+ SUBS subFFTSize,subFFTSize,#1
+ VRSHL dX0,dShift
+ VST1 {dX0[0]},[pSrc]!
+ BGT scaleLessFFTData
+
+End:
+ @ Set return value
+ MOV result, #OMX_Sts_NoErr
+
+ @ Write function tail
+ M_END
+
+
+
+
+
+
+ .END
diff --git a/dl/sp/src/test/test_fft.gyp b/dl/sp/src/test/test_fft.gyp
index 99b3774..3290550 100644
--- a/dl/sp/src/test/test_fft.gyp
+++ b/dl/sp/src/test/test_fft.gyp
@@ -67,11 +67,19 @@
],
},
{
- # Test real 16-bit fixed-point FFT
- 'target_name': 'test_rfft16',
+ # Test real 16-bit fixed-point FFT implemented with S32 routines.
+ 'target_name': 'test_rfft16_s32',
'type': 'executable',
'sources': [
- 'test_rfft16.c',
+ 'test_rfft16_s32.c',
+ ],
+ },
+ {
+ # Test real 16-bit fixed-point FFT implemented with S16 routines.
+ 'target_name': 'test_rfft16_s16',
+ 'type': 'executable',
+ 'sources': [
+ 'test_rfft16_s16.c',
],
},
{
@@ -107,7 +115,8 @@
'test_fft32',
'test_float_fft',
'test_float_rfft',
- 'test_rfft16',
+ 'test_rfft16_s32',
+ 'test_rfft16_s16',
'test_rfft32',
'test_fft_time',
],
diff --git a/dl/sp/src/test/test_fft16.c b/dl/sp/src/test/test_fft16.c
index 081bf23..bedf278 100644
--- a/dl/sp/src/test/test_fft16.c
+++ b/dl/sp/src/test/test_fft16.c
@@ -24,7 +24,7 @@
#define MAX_FFT_ORDER 12
int verbose = 0;
-int signal_value = 1024;
+int signal_value = 32767;
int scale_factor = 0;
struct KnownTestFailures known_failures[] = {
diff --git a/dl/sp/src/test/test_fft_time.c b/dl/sp/src/test/test_fft_time.c
index a401594..05f51d0 100644
--- a/dl/sp/src/test/test_fft_time.c
+++ b/dl/sp/src/test/test_fft_time.c
@@ -20,9 +20,14 @@
#include "dl/sp/src/test/aligned_ptr.h"
#include "dl/sp/src/test/gensig.h"
-#define MAX_FFT_ORDER TWIDDLE_TABLE_ORDER
+#define MAX_FFT_ORDER TWIDDLE_TABLE_ORDER
#define MAX_FFT_ORDER_FIXED_POINT 12
+typedef enum {
+ S16,
+ S32,
+} s16_s32;
+
void TimeOneFloatFFT(int count, int fft_log_size, float signal_value,
int signal_type);
void TimeFloatFFT(int count, float signal_value, int signal_type);
@@ -32,8 +37,11 @@
void TimeOneSC32FFT(int count, int fft_log_size, float signal_value,
int signal_type);
void TimeSC32FFT(int count, float signal_value, int signal_type);
+void TimeOneSC16FFT(int count, int fft_log_size, float signal_value,
+ int signal_type);
+void TimeSC16FFT(int count, float signal_value, int signal_type);
void TimeOneRFFT16(int count, int fft_log_size, float signal_value,
- int signal_type);
+ int signal_type, s16_s32 s16s32);
void TimeRFFT16(int count, float signal_value, int signal_type);
void TimeOneRFFT32(int count, int fft_log_size, float signal_value,
int signal_type);
@@ -49,57 +57,58 @@
void TimeFFTUsage(const char* prog) {
fprintf(stderr,
- "%s: [-hTFICA] [-f fft] [-c count] [-n logsize] [-s scale]\n"
- " [-g signal-type] [-S signal value]\n"
- " [-m minFFTsize] [-M maxFFTsize]\n",
+ "%s: [-hTFICA] [-f fft] [-c count] [-n logsize] [-s scale]\n"
+ " [-g signal-type] [-S signal value]\n"
+ " [-m minFFTsize] [-M maxFFTsize]\n",
ProgramName(prog));
fprintf(stderr,
- "Simple FFT timing tests\n"
- " -h This help\n"
- " -v level Verbose output level (default = 1)\n"
- " -F Skip forward FFT tests\n"
- " -I Skip inverse FFT tests\n"
- " -C Include float-to-fixed and fixed-to-float cost for"
- " real\n"
- " 16-bit FFT (forward and inverse)\n"
- " -c count Number of FFTs to compute for timing. This is a"
- " lower\n"
- " lower limit; shorter FFTs will do more FFTs such"
- " that the\n"
- " elapsed time is very roughly constant, if -A is"
- " not given.\n"
- " -A Don't adapt the count given by -c; use specified"
- " value\n"
- " -m min Mininum FFT order to test\n"
- " -M max Maximum FFT order to test\n"
- " -T Run just one FFT timing test\n"
- " -f FFT type:\n"
- " 0 - Complex Float\n"
- " 1 - Real Float\n"
- " 2 - Complex 32-bit\n"
- " 3 - Real 16-bit\n"
- " 4 - Real 32-bit\n"
- " -n logsize Log2 of FFT size\n"
- " -s scale Scale factor for forward FFT (default = 0)\n"
- " -S signal Base value for the test signal (default = 1024)\n"
- " -g type Input signal type:\n"
- " 0 - Constant signal S + i*S. (Default value.)\n"
- " 1 - Real ramp starting at S/N, N = FFT size\n"
- " 2 - Sine wave of amplitude S\n"
- " 3 - Complex signal whose transform is a sine wave.\n"
- "\n"
- "Use -v 0 in combination with -F or -I to get output that can\n"
- "be pasted into a spreadsheet.\n"
- "\n"
- "Most of the options listed after -T above are only applicable\n"
- "when -T is given to test just one FFT size and FFT type.\n"
- "\n");
+ "Simple FFT timing tests\n"
+ " -h This help\n"
+ " -v level Verbose output level (default = 1)\n"
+ " -F Skip forward FFT tests\n"
+ " -I Skip inverse FFT tests\n"
+ " -C Include float-to-fixed and fixed-to-float cost for"
+ " real\n"
+ " 16-bit FFT (forward and inverse)\n"
+ " -c count Number of FFTs to compute for timing. This is a"
+ " lower\n"
+ " lower limit; shorter FFTs will do more FFTs such"
+ " that the\n"
+ " elapsed time is very roughly constant, if -A is"
+ " not given.\n"
+ " -A Don't adapt the count given by -c; use specified"
+ " value\n"
+ " -m min Mininum FFT order to test\n"
+ " -M max Maximum FFT order to test\n"
+ " -T Run just one FFT timing test\n"
+ " -f FFT type:\n"
+ " 0 - Complex Float\n"
+ " 1 - Real Float\n"
+ " 2 - Complex 16-bit\n"
+ " 3 - Real 16-bit\n"
+ " 4 - Complex 32-bit\n"
+ " 5 - Real 32-bit\n"
+ " -n logsize Log2 of FFT size\n"
+ " -s scale Scale factor for forward FFT (default = 0)\n"
+ " -S signal Base value for the test signal (default = 1024)\n"
+ " -g type Input signal type:\n"
+ " 0 - Constant signal S + i*S. (Default value.)\n"
+ " 1 - Real ramp starting at S/N, N = FFT size\n"
+ " 2 - Sine wave of amplitude S\n"
+ " 3 - Complex signal whose transform is a sine wave.\n"
+ "\n"
+ "Use -v 0 in combination with -F or -I to get output that can\n"
+ "be pasted into a spreadsheet.\n"
+ "\n"
+ "Most of the options listed after -T above are only applicable\n"
+ "when -T is given to test just one FFT size and FFT type.\n"
+ "\n");
exit(0);
}
void main(int argc, char* argv[]) {
int fft_log_size = 4;
- float signal_value = 1024;
+ float signal_value = 32767;
int signal_type = 0;
int test_mode = 1;
int count = 100;
@@ -175,8 +184,9 @@
if (test_mode) {
TimeFloatFFT(count, signal_value, signal_type);
TimeFloatRFFT(count, signal_value, signal_type);
- TimeSC32FFT(count, signal_value, signal_type);
+ TimeSC16FFT(count, signal_value, signal_type);
TimeRFFT16(count, signal_value, signal_type);
+ TimeSC32FFT(count, signal_value, signal_type);
TimeRFFT32(count, signal_value, signal_type);
} else {
switch (fft_type) {
@@ -187,12 +197,16 @@
TimeOneFloatRFFT(count, fft_log_size, signal_value, signal_type);
break;
case 2:
- TimeOneSC32FFT(count, fft_log_size, signal_value, signal_type);
+ TimeOneSC16FFT(count, fft_log_size, signal_value, signal_type);
break;
case 3:
- TimeOneRFFT16(count, fft_log_size, signal_value, signal_type);
+ TimeOneRFFT16(count, fft_log_size, signal_value, signal_type, S32);
+ TimeOneRFFT16(count, fft_log_size, signal_value, signal_type, S16);
break;
case 4:
+ TimeOneSC32FFT(count, fft_log_size, signal_value, signal_type);
+ break;
+ case 5:
TimeOneRFFT32(count, fft_log_size, signal_value, signal_type);
break;
default:
@@ -638,6 +652,189 @@
}
}
+void generateSC16Signal(OMX_SC16* x, OMX_SC16* fft, int size, int signal_type,
+ float signal_value) {
+ int k;
+ struct ComplexFloat *test_signal;
+ struct ComplexFloat *true_fft;
+
+ test_signal = (struct ComplexFloat*) malloc(sizeof(*test_signal) * size);
+ true_fft = (struct ComplexFloat*) malloc(sizeof(*true_fft) * size);
+ GenerateTestSignalAndFFT(test_signal, true_fft, size, signal_type,
+ signal_value, 0);
+
+ /*
+ * Convert the complex result to what we want
+ */
+
+ for (k = 0; k < size; ++k) {
+ x[k].Re = 0.5 + test_signal[k].Re;
+ x[k].Im = 0.5 + test_signal[k].Im;
+ fft[k].Re = 0.5 + true_fft[k].Re;
+ fft[k].Im = 0.5 + true_fft[k].Im;
+ }
+
+ free(test_signal);
+ free(true_fft);
+}
+
+void TimeOneSC16FFT(int count, int fft_log_size, float signal_value,
+ int signal_type) {
+ OMX_SC16* x;
+ OMX_SC16* y;
+ OMX_SC16* z;
+
+ struct AlignedPtr* x_aligned;
+ struct AlignedPtr* y_aligned;
+ struct AlignedPtr* z_aligned;
+
+ OMX_SC16* y_true;
+ OMX_SC16* temp16a;
+ OMX_SC16* temp16b;
+
+ OMX_INT n, fft_spec_buffer_size;
+ OMXResult status;
+ OMXFFTSpec_C_SC16 * fft_fwd_spec = NULL;
+ OMXFFTSpec_C_SC16 * fft_inv_spec = NULL;
+ int fft_size;
+ struct timeval start_time;
+ struct timeval end_time;
+ double elapsed_time;
+
+ fft_size = 1 << fft_log_size;
+
+ x_aligned = AllocAlignedPointer(32, sizeof(*x) * fft_size);
+ y_aligned = AllocAlignedPointer(32, sizeof(*y) * fft_size);
+ z_aligned = AllocAlignedPointer(32, sizeof(*z) * fft_size);
+ y_true = (OMX_SC16*) malloc(sizeof(*y_true) * fft_size);
+ temp16a = (OMX_SC16*) malloc(sizeof(*temp16a) * fft_size);
+ temp16b = (OMX_SC16*) malloc(sizeof(*temp16b) * fft_size);
+
+ x = x_aligned->aligned_pointer_;
+ y = y_aligned->aligned_pointer_;
+ z = z_aligned->aligned_pointer_;
+
+ generateSC16Signal(x, y_true, fft_size, signal_type, signal_value);
+
+ status = omxSP_FFTGetBufSize_C_SC16(fft_log_size, &fft_spec_buffer_size);
+
+ fft_fwd_spec = (OMXFFTSpec_C_SC16*) malloc(fft_spec_buffer_size);
+ fft_inv_spec = (OMXFFTSpec_C_SC16*) malloc(fft_spec_buffer_size);
+ status = omxSP_FFTInit_C_SC16(fft_fwd_spec, fft_log_size);
+
+ status = omxSP_FFTInit_C_SC16(fft_inv_spec, fft_log_size);
+
+ if (do_forward_test) {
+ if (include_conversion) {
+ int k;
+ float factor = -1;
+
+ GetUserTime(&start_time);
+ for (k = 0; k < count; ++k) {
+ for (n = 0; n < fft_size; ++n) {
+ if (fabs(x[n].Re) > factor) {
+ factor = fabs(x[n].Re);
+ }
+ if (fabs(x[n].Im) > factor) {
+ factor = fabs(x[n].Im);
+ }
+ }
+
+ factor = ((1 << 18) - 1) / factor;
+ for (n = 0; n < fft_size; ++n) {
+ temp16a[n].Re = factor * x[n].Re;
+ temp16a[n].Im = factor * x[n].Im;
+ }
+
+ omxSP_FFTFwd_CToC_SC16_Sfs(x, y, fft_fwd_spec, 0);
+
+ factor = 1 / factor;
+ for (n = 0; n < fft_size; ++n) {
+ temp16b[n].Re = y[n].Re * factor;
+ temp16b[n].Im = y[n].Im * factor;
+ }
+ }
+ GetUserTime(&end_time);
+ } else {
+ GetUserTime(&start_time);
+ for (n = 0; n < count; ++n) {
+ omxSP_FFTFwd_CToC_SC16_Sfs(x, y, fft_fwd_spec, 0);
+ }
+ GetUserTime(&end_time);
+ }
+
+ elapsed_time = TimeDifference(&start_time, &end_time);
+
+ PrintResult("Forward SC16 FFT", fft_log_size, elapsed_time, count);
+ }
+
+ if (do_inverse_test) {
+ if (include_conversion) {
+ int k;
+ float factor = -1;
+
+ GetUserTime(&start_time);
+ for (k = 0; k < count; ++k) {
+ for (n = 0; n < fft_size; ++n) {
+ if (fabs(x[n].Re) > factor) {
+ factor = fabs(x[n].Re);
+ }
+ if (fabs(x[n].Im) > factor) {
+ factor = fabs(x[n].Im);
+ }
+ }
+ factor = ((1 << 18) - 1) / factor;
+ for (n = 0; n < fft_size; ++n) {
+ temp16a[n].Re = factor * x[n].Re;
+ temp16a[n].Im = factor * x[n].Im;
+ }
+
+ status = omxSP_FFTInv_CToC_SC16_Sfs(y, z, fft_inv_spec, 0);
+
+ factor = 1 / factor;
+ for (n = 0; n < fft_size; ++n) {
+ temp16b[n].Re = y[n].Re * factor;
+ temp16b[n].Im = y[n].Im * factor;
+ }
+ }
+ GetUserTime(&end_time);
+ } else {
+ GetUserTime(&start_time);
+ for (n = 0; n < count; ++n) {
+ status = omxSP_FFTInv_CToC_SC16_Sfs(y, z, fft_inv_spec, 0);
+ }
+ GetUserTime(&end_time);
+ }
+
+ elapsed_time = TimeDifference(&start_time, &end_time);
+
+ PrintResult("Inverse SC16 FFT", fft_log_size, elapsed_time, count);
+ }
+
+ FreeAlignedPointer(x_aligned);
+ FreeAlignedPointer(y_aligned);
+ FreeAlignedPointer(z_aligned);
+ free(temp16a);
+ free(temp16b);
+ free(fft_fwd_spec);
+ free(fft_inv_spec);
+}
+
+void TimeSC16FFT(int count, float signal_value, int signal_type) {
+ int k;
+ int max_order = (max_fft_order > MAX_FFT_ORDER_FIXED_POINT)
+ ? MAX_FFT_ORDER_FIXED_POINT : max_fft_order;
+
+ if (verbose == 0)
+ printf("SC16 FFT\n");
+
+ for (k = min_fft_order; k <= max_order; ++k) {
+ //for (k = 7; k <= 8; ++k) {
+ int testCount = ComputeCount(count, k);
+ TimeOneSC16FFT(testCount, k, signal_value, signal_type);
+ }
+}
+
void GenerateRFFT16Signal(OMX_S16* x, OMX_SC32* fft, int size, int signal_type,
float signal_value) {
int k;
@@ -666,8 +863,12 @@
free(true_fft);
}
+/* Argument s16s32:
+ * S32: Calculate RFFT16 with 32 bit complex FFT;
+ * otherwise: Calculate RFFT16 with 16 bit complex FFT.
+ */
void TimeOneRFFT16(int count, int fft_log_size, float signal_value,
- int signal_type) {
+ int signal_type, s16_s32 s16s32) {
OMX_S16* x;
OMX_S32* y;
OMX_S16* z;
@@ -689,8 +890,8 @@
OMX_INT n, fft_spec_buffer_size;
OMXResult status;
- OMXFFTSpec_R_S16S32 * fft_fwd_spec = NULL;
- OMXFFTSpec_R_S16S32 * fft_inv_spec = NULL;
+ OMXFFTSpec_R_S16 * fft_fwd_spec = NULL;
+ OMXFFTSpec_R_S16 * fft_inv_spec = NULL;
int fft_size;
struct timeval start_time;
struct timeval end_time;
@@ -728,13 +929,20 @@
GenerateRealFloatSignal(xr, (OMX_FC32*) yrTrue, fft_size, signal_type,
signal_value);
- status = omxSP_FFTGetBufSize_R_S16S32(fft_log_size, &fft_spec_buffer_size);
-
- fft_fwd_spec = (OMXFFTSpec_R_S16S32*) malloc(fft_spec_buffer_size);
- fft_inv_spec = (OMXFFTSpec_R_S16S32*) malloc(fft_spec_buffer_size);
- status = omxSP_FFTInit_R_S16S32(fft_fwd_spec, fft_log_size);
-
- status = omxSP_FFTInit_R_S16S32(fft_inv_spec, fft_log_size);
+ if(s16s32 == S32) {
+ status = omxSP_FFTGetBufSize_R_S16S32(fft_log_size, &fft_spec_buffer_size);
+ fft_fwd_spec = malloc(fft_spec_buffer_size);
+ fft_inv_spec = malloc(fft_spec_buffer_size);
+ status = omxSP_FFTInit_R_S16S32(fft_fwd_spec, fft_log_size);
+ status = omxSP_FFTInit_R_S16S32(fft_inv_spec, fft_log_size);
+ }
+ else {
+ status = omxSP_FFTGetBufSize_R_S16(fft_log_size, &fft_spec_buffer_size);
+ fft_fwd_spec = malloc(fft_spec_buffer_size);
+ fft_inv_spec = malloc(fft_spec_buffer_size);
+ status = omxSP_FFTInit_R_S16(fft_fwd_spec, fft_log_size);
+ status = omxSP_FFTInit_R_S16(fft_inv_spec, fft_log_size);
+ }
if (do_forward_test) {
if (include_conversion) {
@@ -757,9 +965,14 @@
temp16[n] = factor * xr[n];
}
- status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y, fft_fwd_spec,
- (OMX_INT) scaleFactor);
-
+ if(s16s32 == S32) {
+ status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y,
+ (OMXFFTSpec_R_S16S32*)fft_fwd_spec, (OMX_INT) scaleFactor);
+ }
+ else {
+ status = omxSP_FFTFwd_RToCCS_S16_Sfs(x, (OMX_S16*)y,
+ (OMXFFTSpec_R_S16*)fft_fwd_spec, (OMX_INT) scaleFactor);
+ }
/*
* Now spend some time converting the fixed-point FFT back to float.
*/
@@ -774,15 +987,26 @@
GetUserTime(&start_time);
for (n = 0; n < count; ++n) {
- status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y, fft_fwd_spec,
- (OMX_INT) scaleFactor);
+ if(s16s32 == S32) {
+ status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y,
+ (OMXFFTSpec_R_S16S32*)fft_fwd_spec, (OMX_INT) scaleFactor);
+ }
+ else {
+ status = omxSP_FFTFwd_RToCCS_S16_Sfs(x, (OMX_S16*)y,
+ (OMXFFTSpec_R_S16*)fft_fwd_spec, (OMX_INT) scaleFactor);
+ }
}
GetUserTime(&end_time);
}
elapsed_time = TimeDifference(&start_time, &end_time);
- PrintResult("Forward RFFT16", fft_log_size, elapsed_time, count);
+ if(s16s32 == S32) {
+ PrintResult("Forward RFFT16 (with S32)", fft_log_size, elapsed_time, count);
+ }
+ else {
+ PrintResult("Forward RFFT16 (with S16)", fft_log_size, elapsed_time, count);
+ }
}
if (do_inverse_test) {
@@ -804,9 +1028,14 @@
temp32[n] = factor * yrTrue[n];
}
- status = omxSP_FFTFwd_RToCCS_S16S32_Sfs(x, y, fft_fwd_spec,
- (OMX_INT) scaleFactor);
-
+ if(s16s32 == S32) {
+ status = omxSP_FFTInv_CCSToR_S32S16_Sfs(y, z,
+ (OMXFFTSpec_R_S16S32*)fft_inv_spec, 0);
+ }
+ else {
+ status = omxSP_FFTInv_CCSToR_S16_Sfs((OMX_S16*)y, z,
+ (OMXFFTSpec_R_S16*)fft_inv_spec, 0);
+ }
/*
* Spend some time converting the result back to float
*/
@@ -819,14 +1048,26 @@
} else {
GetUserTime(&start_time);
for (n = 0; n < count; ++n) {
- status = omxSP_FFTInv_CCSToR_S32S16_Sfs(y, z, fft_inv_spec, 0);
+ if(s16s32 == S32) {
+ status = omxSP_FFTInv_CCSToR_S32S16_Sfs(y, z,
+ (OMXFFTSpec_R_S16S32*)fft_inv_spec, 0);
+ }
+ else {
+ status = omxSP_FFTInv_CCSToR_S16_Sfs((OMX_S16*)y, z,
+ (OMXFFTSpec_R_S16*)fft_inv_spec, 0);
+ }
}
GetUserTime(&end_time);
}
elapsed_time = TimeDifference(&start_time, &end_time);
- PrintResult("Inverse RFFT16", fft_log_size, elapsed_time, count);
+ if(s16s32 == S32) {
+ PrintResult("Inverse RFFT16 (with S32)", fft_log_size, elapsed_time, count);
+ }
+ else {
+ PrintResult("Inverse RFFT16 (with S16)", fft_log_size, elapsed_time, count);
+ }
}
FreeAlignedPointer(x_aligned);
@@ -845,11 +1086,18 @@
? MAX_FFT_ORDER_FIXED_POINT : max_fft_order;
if (verbose == 0)
- printf("RFFT16\n");
-
+ printf("RFFT16 (with S32)\n");
for (k = min_fft_order; k <= max_order; ++k) {
int testCount = ComputeCount(count, k);
- TimeOneRFFT16(testCount, k, signal_value, signal_type);
+ TimeOneRFFT16(testCount, k, signal_value, signal_type, 1);
+ }
+
+ if (verbose == 0)
+ printf("RFFT16 (with S16)\n");
+ //for (k = min_fft_order; k <= max_order; ++k) {
+ for (k = 7; k <= max_order; ++k) {
+ int testCount = ComputeCount(count, k);
+ TimeOneRFFT16(testCount, k, signal_value, signal_type, 0);
}
}
diff --git a/dl/sp/src/test/test_float_rfft.c b/dl/sp/src/test/test_float_rfft.c
index cb3262f..20b5e33 100644
--- a/dl/sp/src/test/test_float_rfft.c
+++ b/dl/sp/src/test/test_float_rfft.c
@@ -36,8 +36,6 @@
SetDefaultOptions(&options, 1, MAX_FFT_ORDER);
- options.signal_value_ = 1024;
-
ProcessCommandLine(&options, argc, argv,
"Test forward and inverse real floating-point FFT\n");
diff --git a/dl/sp/src/test/test_rfft16_s16.c b/dl/sp/src/test/test_rfft16_s16.c
new file mode 100644
index 0000000..c5520f8
--- /dev/null
+++ b/dl/sp/src/test/test_rfft16_s16.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/src/test/aligned_ptr.h"
+#include "dl/sp/src/test/compare.h"
+#include "dl/sp/src/test/gensig.h"
+#include "dl/sp/src/test/test_util.h"
+
+#define MAX_FFT_ORDER 12
+
+int verbose = 0;
+int signal_value = 32767;
+int scale_factor = 0;
+
+void TestFFT(int fftLogSize, int scale_factor, int signalType);
+
+void main(int argc, char* argv[]) {
+ struct Options options;
+
+ SetDefaultOptions(&options, 1, MAX_FFT_ORDER);
+
+ options.signal_value_ = signal_value;
+ options.scale_factor_ = scale_factor;
+
+ ProcessCommandLine(&options, argc, argv, "Test forward and inverse real 16 \
+ -bit fixed-point FFT, with 16-bit complex FFT routines\n");
+
+ verbose = options.verbose_;
+ signal_value = options.signal_value_;
+ scale_factor = options.scale_factor_;
+
+ if (verbose > 255)
+ DumpOptions(stderr, &options);
+
+ if (options.test_mode_) {
+ struct TestInfo info;
+
+ info.real_only_ = options.real_only_;
+ info.max_fft_order_ = options.max_fft_order_;
+ info.min_fft_order_ = options.min_fft_order_;
+ info.do_forward_tests_ = options.do_forward_tests_;
+ info.do_inverse_tests_ = options.do_inverse_tests_;
+ /* No known failures */
+ info.known_failures_ = 0;
+ info.forward_threshold_ = 45;
+ info.inverse_threshold_ = 14;
+
+ RunAllTests(&info);
+ } else {
+ TestFFT(options.fft_log_size_,
+ options.signal_type_,
+ options.scale_factor_);
+ }
+}
+
+void GenerateSignal(struct ComplexFloat* fft,
+ float* x_true, int size, int sigtype) {
+ int k;
+ struct ComplexFloat *test_signal;
+
+ test_signal = (struct ComplexFloat*) malloc(sizeof(*test_signal) * size);
+ GenerateTestSignalAndFFT(test_signal, fft, size, sigtype, signal_value, 1);
+
+ /*
+ * Convert the complex result to what we want
+ */
+
+ for (k = 0; k < size; ++k) {
+ x_true[k] = test_signal[k].Re;
+ }
+
+ free(test_signal);
+}
+
+void TestFFT(int fft_log_size, int signal_type, int scale_factor) {
+ struct SnrResult snr;
+
+ RunOneForwardTest(fft_log_size, signal_type, signal_value, &snr);
+ printf("Forward float FFT\n");
+ printf("SNR: real part %f dB\n", snr.real_snr_);
+ printf(" imag part %f dB\n", snr.imag_snr_);
+ printf(" complex part %f dB\n", snr.complex_snr_);
+
+ RunOneInverseTest(fft_log_size, signal_type, signal_value, &snr);
+ printf("Inverse float FFT\n");
+ printf("SNR: %f dB\n", snr.real_snr_);
+}
+
+float RunOneForwardTest(int fft_log_size, int signal_type,
+ float unused_signal_value,
+ struct SnrResult* snr) {
+ OMX_S16* x;
+ OMX_SC16* y;
+
+ struct AlignedPtr* x_aligned;
+ struct AlignedPtr* y_aligned;
+
+ float* x_true;
+ struct ComplexFloat* y_true;
+ OMX_SC16* y_scaled;
+
+ OMX_INT n, fft_spec_buffer_size;
+ OMXResult status;
+ OMXFFTSpec_R_S16 * fft_fwd_spec = NULL;
+ int fft_size;
+
+ /*
+ * To get good FFT results, set the forward FFT scale factor
+ * to be the same as the order.
+ */
+ scale_factor = fft_log_size;
+
+ fft_size = 1 << fft_log_size;
+
+ status = omxSP_FFTGetBufSize_R_S16(fft_log_size, &fft_spec_buffer_size);
+ if (verbose > 63) {
+ printf("fft_spec_buffer_size = %d\n", fft_spec_buffer_size);
+ }
+
+ fft_fwd_spec = (OMXFFTSpec_R_S16*) malloc(fft_spec_buffer_size);
+ status = omxSP_FFTInit_R_S16(fft_fwd_spec, fft_log_size);
+ if (status) {
+ fprintf(stderr, "Failed to init forward FFT: status = %d\n", status);
+ exit(1);
+ }
+
+ x_aligned = AllocAlignedPointer(32, sizeof(*x) * fft_size);
+ y_aligned = AllocAlignedPointer(32, sizeof(*y) * (fft_size + 2));
+
+ x = x_aligned->aligned_pointer_;
+ y = y_aligned->aligned_pointer_;
+
+ x_true = (float*) malloc(sizeof(*x_true) * fft_size);
+ y_true = (struct ComplexFloat*) malloc(sizeof(*y_true) * (fft_size / 2 + 1));
+ y_scaled = (OMX_SC16*) malloc(sizeof(*y_true) * (fft_size / 2 + 1));
+
+ GenerateSignal(y_true, x_true, fft_size, signal_type);
+ for (n = 0; n < fft_size; ++n) {
+ x[n] = 0.5 + x_true[n];
+ }
+
+ {
+ float scale = 1 << fft_log_size;
+
+ for (n = 0; n < fft_size; ++n) {
+ y_scaled[n].Re = 0.5 + y_true[n].Re / scale;
+ y_scaled[n].Im = 0.5 + y_true[n].Im / scale;
+ }
+ }
+
+ if (verbose > 63) {
+ printf("Signal\n");
+ DumpArrayReal16("x", fft_size, x);
+
+ printf("Expected FFT output\n");
+ DumpArrayComplex16("y", fft_size / 2 + 1, y_scaled);
+ }
+
+ status = omxSP_FFTFwd_RToCCS_S16_Sfs(x, (OMX_S16*) y, fft_fwd_spec, scale_factor);
+ if (status) {
+ fprintf(stderr, "Forward FFT failed: status = %d\n", status);
+ exit(1);
+ }
+
+ if (verbose > 63) {
+ printf("FFT Output\n");
+ DumpArrayComplex16("y", fft_size / 2 + 1, y);
+ }
+
+ CompareComplex16(snr, y, y_scaled, fft_size / 2 + 1);
+
+ FreeAlignedPointer(x_aligned);
+ FreeAlignedPointer(y_aligned);
+ free(fft_fwd_spec);
+
+ return snr->complex_snr_;
+}
+
+float RunOneInverseTest(int fft_log_size, int signal_type,
+ float unused_signal_value,
+ struct SnrResult* snr) {
+ OMX_S16* x_scaled;
+ OMX_S16* z;
+ OMX_SC16* y;
+ OMX_SC16* y_scaled;
+
+ struct AlignedPtr* y_aligned;
+ struct AlignedPtr* z_aligned;
+
+ float* x_true;
+ struct ComplexFloat* y_true;
+
+ OMX_INT n, fft_spec_buffer_size;
+ OMXResult status;
+ OMXFFTSpec_R_S16 * fft_inv_spec = NULL;
+ int fft_size;
+
+ fft_size = 1 << fft_log_size;
+
+ status = omxSP_FFTGetBufSize_R_S16(fft_log_size, &fft_spec_buffer_size);
+ if (verbose > 3) {
+ printf("fft_spec_buffer_size = %d\n", fft_spec_buffer_size);
+ }
+
+ fft_inv_spec = (OMXFFTSpec_R_S16*)malloc(fft_spec_buffer_size);
+ status = omxSP_FFTInit_R_S16(fft_inv_spec, fft_log_size);
+ if (status) {
+ fprintf(stderr, "Failed to init backward FFT: status = %d\n", status);
+ exit(1);
+ }
+
+ y_aligned = AllocAlignedPointer(32, sizeof(*y) * (fft_size / 2 + 1));
+ z_aligned = AllocAlignedPointer(32, sizeof(*z) * fft_size);
+
+ x_true = (float*) malloc(sizeof(*x_true) * fft_size);
+ x_scaled = (OMX_S16*) malloc(sizeof(*x_scaled) * fft_size);
+ y_true = (struct ComplexFloat*) malloc(sizeof(*y_true) * fft_size);
+ y_scaled = y_aligned->aligned_pointer_;
+ z = z_aligned->aligned_pointer_;
+
+ GenerateSignal(y_true, x_true, fft_size, signal_type);
+
+ {
+ /*
+ * To get max accuracy, scale the input to the inverse FFT up
+ * to use as many bits as we can.
+ */
+ float scale = 1;
+ float max = 0;
+
+ for (n = 0; n < fft_size / 2 + 1; ++n) {
+ float val;
+ val = fabs(y_true[n].Re);
+ if (val > max) {
+ max = val;
+ }
+ val = fabs(y_true[n].Im);
+ if (val > max) {
+ max = val;
+ }
+ }
+
+ scale = 16384 / max;
+ if (verbose > 63)
+ printf("Inverse FFT input scaled factor %g\n", scale);
+
+ /*
+ * Scale both the true FFT signal and the input so we can
+ * compare them correctly later
+ */
+ for (n = 0; n < fft_size / 2 + 1; ++n) {
+ y_scaled[n].Re = (OMX_S16)(0.5 + y_true[n].Re * scale);
+ y_scaled[n].Im = (OMX_S16)(0.5 + y_true[n].Im * scale);
+ }
+ for (n = 0; n < fft_size; ++n) {
+ x_scaled[n] = 0.5 + x_true[n] * scale;
+ }
+ }
+
+
+ if (verbose > 63) {
+ printf("Inverse FFT Input Signal\n");
+ DumpArrayComplex16("y", fft_size / 2 + 1, y_scaled);
+
+ printf("Expected Inverse FFT output\n");
+ DumpArrayReal16("x", fft_size, x_scaled);
+ }
+
+ status = omxSP_FFTInv_CCSToR_S16_Sfs((OMX_S16 const *)y_scaled, z, fft_inv_spec, 0);
+ if (status) {
+ fprintf(stderr, "Inverse FFT failed: status = %d\n", status);
+ exit(1);
+ }
+
+ if (verbose > 63) {
+ printf("Actual Inverse FFT Output\n");
+ DumpArrayReal16("z", fft_size, z);
+ }
+
+ CompareReal16(snr, z, x_scaled, fft_size);
+
+ FreeAlignedPointer(y_aligned);
+ FreeAlignedPointer(z_aligned);
+ free(fft_inv_spec);
+
+ return snr->real_snr_;
+}
diff --git a/dl/sp/src/test/test_rfft16.c b/dl/sp/src/test/test_rfft16_s32.c
similarity index 96%
rename from dl/sp/src/test/test_rfft16.c
rename to dl/sp/src/test/test_rfft16_s32.c
index 171ccdc..2d825c6 100644
--- a/dl/sp/src/test/test_rfft16.c
+++ b/dl/sp/src/test/test_rfft16_s32.c
@@ -33,8 +33,8 @@
SetDefaultOptions(&options, 1, MAX_FFT_ORDER);
- ProcessCommandLine(&options, argc, argv,
- "Test forward and inverse real 16-bit fixed-point FFT\n");
+ ProcessCommandLine(&options, argc, argv, "Test forward and inverse real 16 \
+ -bit fixed-point FFT, with 32-bit complex FFT routines\n");
verbose = options.verbose_;
signal_value = options.signal_value_;
@@ -54,7 +54,6 @@
info.known_failures_ = 0;
info.forward_threshold_ = 90.12;
info.inverse_threshold_ = 89.28;
- signal_value = 32767;
RunAllTests(&info);
} else {
TestFFT(options.fft_log_size_,
@@ -82,8 +81,8 @@
}
for (k = 0; k < size / 2 + 1; ++k) {
- fft[k].Re = true_fft[k].Re;
- fft[k].Im = true_fft[k].Im;
+ fft[k].Re = true_fft[k].Re + 0.5;
+ fft[k].Im = true_fft[k].Im + 0.5;
}
free(test_signal);
diff --git a/dl/sp/src/test/test_util.c b/dl/sp/src/test/test_util.c
index 88d697b..69830b6 100644
--- a/dl/sp/src/test/test_util.c
+++ b/dl/sp/src/test/test_util.c
@@ -97,7 +97,7 @@
options->fft_log_size_ = 4;
options->scale_factor_ = 0;
options->signal_type_ = 0;
- options->signal_value_ = 1024;
+ options->signal_value_ = 32767;
options->signal_value_given_ = 0;
}
@@ -382,7 +382,7 @@
const OMX_SC16* array) {
int n;
- printf("%4s\t%10s.re[n]\t%10s.im[n]\n", "n", array_name);
+ printf("%4s\t%10s.re[n]\t%10s.im[n]\n", "n", array_name, array_name);
for (n = 0; n < count; ++n) {
printf("%4d\t%16d\t%16d\n", n, array[n].Re, array[n].Im);
}