Implement ARM64 version of OpenMAX DL
This is a conversion of the existing ARM NEON OpenMAX DL FFT routines
to arm64. The translation was done by hand and mostly just uses the
correct register names and instructions for ARM64.
The test_float_fft and test_float_rfft programs pass with SNRs
basically equivalent to the original ARM NEON version.
BUG=
R=andrew@webrtc.org, leecam@google.com
Review URL: https://webrtc-codereview.appspot.com/14539004
git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@6477 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/api/arm/arm64COMM_s.h b/dl/api/arm/arm64COMM_s.h
new file mode 100644
index 0000000..e19ceee
--- /dev/null
+++ b/dl/api/arm/arm64COMM_s.h
@@ -0,0 +1,258 @@
+// -*- Mode: asm; -*-
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This file was originally licensed as follows. It has been
+// relicensed with permission from the copyright holders.
+//
+
+//
+// File Name: armCOMM_s.h
+// OpenMAX DL: v1.0.2
+// Last Modified Revision: 13871
+// Last Modified Date: Fri, 09 May 2008
+//
+// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+//
+//
+//
+// ARM optimized OpenMAX common header file
+//
+
+ .set _SBytes, 0 // Number of scratch bytes on stack
+ .set _Workspace, 0 // Stack offset of scratch workspace
+
+ .set _RRegList, 0 // R saved register list (last register number)
+ .set _DRegList, 0 // D saved register list (last register number)
+
+ // Work out list of D saved registers, like for R registers.
+ .macro _M_GETDREGLIST dreg
+ .ifeqs "\dreg", ""
+ .set _DRegList, 0
+ .exitm
+ .endif
+
+ .ifeqs "\dreg", "d8"
+ .set _DRegList, 8
+ .exitm
+ .endif
+
+ .ifeqs "\dreg", "d9"
+ .set _DRegList, 9
+ .exitm
+ .endif
+
+ .ifeqs "\dreg", "d10"
+ .set _DRegList, 10
+ .exitm
+ .endif
+
+ .ifeqs "\dreg", "d11"
+ .set _DRegList, 11
+ .exitm
+ .endif
+
+ .ifeqs "\dreg", "d12"
+ .set _DRegList, 12
+ .exitm
+ .endif
+
+ .ifeqs "\dreg", "d13"
+ .set _DRegList, 13
+ .exitm
+ .endif
+
+ .ifeqs "\dreg", "d14"
+ .set _DRegList, 14
+ .exitm
+ .endif
+
+ .ifeqs "\dreg", "d15"
+ .set _DRegList, 15
+ .exitm
+ .endif
+
+ .warning "Unrecognized saved d register limit: \rreg"
+ .endm
+
+//////////////////////////////////////////////////////////
+// Function header and footer macros
+//////////////////////////////////////////////////////////
+
+ // Function Header Macro
+ // Generates the function prologue
+ // Note that functions should all be "stack-moves-once"
+ // The FNSTART and FNEND macros should be the only places
+ // where the stack moves.
+ //
+ // name = function name
+ // rreg = "" don't stack any registers
+ // "lr" stack "lr" only
+ // "rN" stack registers "r4-rN,lr"
+ // dreg = "" don't stack any D registers
+ // "dN" stack registers "d8-dN"
+ //
+ // Note: ARM Archicture procedure call standard AAPCS
+ // states that r4-r11, sp, d8-d15 must be preserved by
+ // a compliant function.
+ .macro M_START name, rreg, dreg
+ .set _Workspace, 0
+
+ // Define the function and make it external.
+ .global \name
+ .func \name
+ .section .text.\name,"ax",%progbits
+ .align 4
+\name :
+//.fnstart
+ // Save specified R registers
+ _M_PUSH_RREG
+
+ // Save specified D registers
+ _M_GETDREGLIST \dreg
+ _M_PUSH_DREG
+
+ // Ensure size claimed on stack is 16-byte aligned for ARM64
+ .if (_SBytes & 15) != 0
+ .set _SBytes, _SBytes + (16 - (_SBytes & 15))
+ .endif
+ .if _SBytes != 0
+ sub sp, sp, #_SBytes
+ .endif
+ .endm
+
+ // Function Footer Macro
+ // Generates the function epilogue
+ .macro M_END
+ // Restore the stack pointer to its original value on function entry
+ .if _SBytes != 0
+ add sp, sp, #_SBytes
+ .endif
+ // Restore any saved R or D registers.
+ _M_RET
+ //.fnend
+ .endfunc
+ // Reset the global stack tracking variables back to their
+ // initial values.
+ .set _SBytes, 0
+ .endm
+
+ // Based on the value of _DRegList, push the specified set of registers
+ // to the stack.
+ // The ARM64 ABI says only v8-v15 needs to be saved across calls and only
+ // the lower 64 bits need to be saved.
+ .macro _M_PUSH_DREG
+ .if _DRegList >= 8
+ sub sp, sp, (_DRegList - 7) * 16 // 16-byte alignment
+ str q8, [sp]
+ .endif
+
+ .if _DRegList >= 9
+ str q9, [sp, #16]
+ .endif
+
+ .if _DRegList >= 10
+ str q10, [sp, #32]
+ .endif
+
+ .if _DRegList >= 11
+ str q11, [sp, #48]
+ .endif
+
+ .if _DRegList >= 12
+ str q12, [sp, #64]
+ .endif
+
+ .if _DRegList >= 13
+ str q13, [sp, #80]
+ .endif
+
+ .if _DRegList >= 14
+ str q14, [sp, #96]
+ .endif
+
+ .if _DRegList >= 15
+ str q15, [sp, #112]
+ .endif
+
+ .exitm
+ .endm
+
+ // Based on the value of _RRegList, push the specified set of registers
+ // to the stack.
+ // The ARM64 ABI says registers r19-r29 needs to be saved across calls.
+ // But for the FFT routines, we don't need to save anything, so just
+ // preserve the SP and LR.
+ .macro _M_PUSH_RREG
+ sub sp, sp, #16
+ str x30, [sp]
+ str x29, [sp, #8]
+ .exitm
+ .endm
+
+ // The opposite of _M_PUSH_DREG
+ .macro _M_POP_DREG
+ .if _DRegList >= 8
+ ldr q8, [sp]
+ .endif
+
+ .if _DRegList >= 9
+ ldr q9, [sp, #16]
+ .endif
+
+ .if _DRegList >= 10
+ ldr q10, [sp, #32]
+ .endif
+
+ .if _DRegList >= 11
+ ldr q11, [sp, #48]
+ .endif
+
+ .if _DRegList >= 12
+ ldr q12, [sp, #64]
+ .endif
+
+ .if _DRegList >= 13
+ ldr q13, [sp, #80]
+ .endif
+
+ .if _DRegList >= 14
+ ldr q14, [sp, #96]
+ .endif
+
+ .if _DRegList >= 15
+ ldr q15, [sp, #112]
+ .endif
+
+ .if _DRegList >= 8
+ add sp, sp, (_DRegList - 7) * 16 // 16-byte alignment
+ .endif
+ .exitm
+ .endm
+
+ // The opposite of _M_PUSH_RREG
+ .macro _M_POP_RREG cc
+ ldr x29, [sp, #8]
+ ldr x30, [sp]
+ add sp, sp, #16
+ .exitm
+ .endm
+
+ // Produce function return instructions
+ .macro _M_RET cc
+ _M_POP_DREG \cc
+ _M_POP_RREG \cc
+ ret
+ .endm
+ // rsb - reverse subtract
+ // compute dst = src2 - src1, useful when src2 is an immediate value
+ .macro rsb dst, src1, src2
+ sub \dst, \src1, \src2
+ neg \dst, \dst
+ .endm
diff --git a/dl/api/arm/omxtypes_s.h b/dl/api/arm/omxtypes_s.h
index d880d35..b27f72d 100644
--- a/dl/api/arm/omxtypes_s.h
+++ b/dl/api/arm/omxtypes_s.h
@@ -1,76 +1,76 @@
-@//
-@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
-@//
-@// Use of this source code is governed by a BSD-style license
-@// that can be found in the LICENSE file in the root of the source
-@// tree. An additional intellectual property rights grant can be found
-@// in the file PATENTS. All contributing project authors may
-@// be found in the AUTHORS file in the root of the source tree.
-@//
-@// This file was originally licensed as follows. It has been
-@// relicensed with permission from the copyright holders.
-@//
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This file was originally licensed as follows. It has been
+// relicensed with permission from the copyright holders.
+//
-@//
-@// File Name: omxtypes_s.h
-@// OpenMAX DL: v1.0.2
-@// Last Modified Revision: 9622
-@// Last Modified Date: Wed, 06 Feb 2008
-@//
-@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-@//
-@//
+//
+// File Name: omxtypes_s.h
+// OpenMAX DL: v1.0.2
+// Last Modified Revision: 9622
+// Last Modified Date: Wed, 06 Feb 2008
+//
+// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+//
+//
-@// Mandatory return codes - use cases are explicitly described for each function
- .equ OMX_Sts_NoErr, 0 @// No error the function completed successfully
- .equ OMX_Sts_Err, -2 @// Unknown/unspecified error
- .equ OMX_Sts_InvalidBitstreamValErr, -182 @// Invalid value detected during bitstream processing
- .equ OMX_Sts_MemAllocErr, -9 @// Not enough memory allocated for the operation
- .equ OMX_StsACAAC_GainCtrErr, -159 @// AAC: Unsupported gain control data detected
- .equ OMX_StsACAAC_PrgNumErr, -167 @// AAC: Invalid number of elements for one program
- .equ OMX_StsACAAC_CoefValErr, -163 @// AAC: Invalid quantized coefficient value
- .equ OMX_StsACAAC_MaxSfbErr, -162 @// AAC: Invalid maxSfb value in relation to numSwb
- .equ OMX_StsACAAC_PlsDataErr, -160 @// AAC: pulse escape sequence data error
+// Mandatory return codes - use cases are explicitly described for each function
+ .equ OMX_Sts_NoErr, 0 // No error the function completed successfully
+ .equ OMX_Sts_Err, -2 // Unknown/unspecified error
+ .equ OMX_Sts_InvalidBitstreamValErr, -182 // Invalid value detected during bitstream processing
+ .equ OMX_Sts_MemAllocErr, -9 // Not enough memory allocated for the operation
+ .equ OMX_StsACAAC_GainCtrErr, -159 // AAC: Unsupported gain control data detected
+ .equ OMX_StsACAAC_PrgNumErr, -167 // AAC: Invalid number of elements for one program
+ .equ OMX_StsACAAC_CoefValErr, -163 // AAC: Invalid quantized coefficient value
+ .equ OMX_StsACAAC_MaxSfbErr, -162 // AAC: Invalid maxSfb value in relation to numSwb
+ .equ OMX_StsACAAC_PlsDataErr, -160 // AAC: pulse escape sequence data error
-@// Optional return codes - use cases are explicitly described for each function
- .equ OMX_Sts_BadArgErr, -5 @// Bad Arguments
+// Optional return codes - use cases are explicitly described for each function
+ .equ OMX_Sts_BadArgErr, -5 // Bad Arguments
- .equ OMX_StsACAAC_TnsNumFiltErr, -157 @// AAC: Invalid number of TNS filters
- .equ OMX_StsACAAC_TnsLenErr, -156 @// AAC: Invalid TNS region length
- .equ OMX_StsACAAC_TnsOrderErr, -155 @// AAC: Invalid order of TNS filter
- .equ OMX_StsACAAC_TnsCoefResErr, -154 @// AAC: Invalid bit-resolution for TNS filter coefficients
- .equ OMX_StsACAAC_TnsCoefErr, -153 @// AAC: Invalid TNS filter coefficients
- .equ OMX_StsACAAC_TnsDirectErr, -152 @// AAC: Invalid TNS filter direction
- .equ OMX_StsICJP_JPEGMarkerErr, -183 @// JPEG marker encountered within an entropy-coded block;
- @// Huffman decoding operation terminated early.
- .equ OMX_StsICJP_JPEGMarker, -181 @// JPEG marker encountered; Huffman decoding
- @// operation terminated early.
- .equ OMX_StsIPPP_ContextMatchErr, -17 @// Context parameter doesn't match to the operation
+ .equ OMX_StsACAAC_TnsNumFiltErr, -157 // AAC: Invalid number of TNS filters
+ .equ OMX_StsACAAC_TnsLenErr, -156 // AAC: Invalid TNS region length
+ .equ OMX_StsACAAC_TnsOrderErr, -155 // AAC: Invalid order of TNS filter
+ .equ OMX_StsACAAC_TnsCoefResErr, -154 // AAC: Invalid bit-resolution for TNS filter coefficients
+ .equ OMX_StsACAAC_TnsCoefErr, -153 // AAC: Invalid TNS filter coefficients
+ .equ OMX_StsACAAC_TnsDirectErr, -152 // AAC: Invalid TNS filter direction
+ .equ OMX_StsICJP_JPEGMarkerErr, -183 // JPEG marker encountered within an entropy-coded block;
+ // Huffman decoding operation terminated early.
+ .equ OMX_StsICJP_JPEGMarker, -181 // JPEG marker encountered; Huffman decoding
+ // operation terminated early.
+ .equ OMX_StsIPPP_ContextMatchErr, -17 // Context parameter doesn't match to the operation
- .equ OMX_StsSP_EvenMedianMaskSizeErr, -180 @// Even size of the Median Filter mask was replaced by the odd one
+ .equ OMX_StsSP_EvenMedianMaskSizeErr, -180 // Even size of the Median Filter mask was replaced by the odd one
- .equ OMX_Sts_MaximumEnumeration, 0x7FFFFFFF
+ .equ OMX_Sts_MaximumEnumeration, 0x7FFFFFFF
- .equ OMX_MIN_S8, (-128)
- .equ OMX_MIN_U8, 0
- .equ OMX_MIN_S16, (-32768)
- .equ OMX_MIN_U16, 0
+ .equ OMX_MIN_S8, (-128)
+ .equ OMX_MIN_U8, 0
+ .equ OMX_MIN_S16, (-32768)
+ .equ OMX_MIN_U16, 0
- .equ OMX_MIN_S32, (-2147483647-1)
- .equ OMX_MIN_U32, 0
+ .equ OMX_MIN_S32, (-2147483647-1)
+ .equ OMX_MIN_U32, 0
- .equ OMX_MAX_S8, (127)
- .equ OMX_MAX_U8, (255)
- .equ OMX_MAX_S16, (32767)
- .equ OMX_MAX_U16, (0xFFFF)
- .equ OMX_MAX_S32, (2147483647)
- .equ OMX_MAX_U32, (0xFFFFFFFF)
+ .equ OMX_MAX_S8, (127)
+ .equ OMX_MAX_U8, (255)
+ .equ OMX_MAX_S16, (32767)
+ .equ OMX_MAX_U16, (0xFFFF)
+ .equ OMX_MAX_S32, (2147483647)
+ .equ OMX_MAX_U32, (0xFFFFFFFF)
- .equ OMX_VC_UPPER, 0x1 @// Used by the PredictIntra functions
- .equ OMX_VC_LEFT, 0x2 @// Used by the PredictIntra functions
- .equ OMX_VC_UPPER_RIGHT, 0x40 @// Used by the PredictIntra functions
+ .equ OMX_VC_UPPER, 0x1 // Used by the PredictIntra functions
+ .equ OMX_VC_LEFT, 0x2 // Used by the PredictIntra functions
+ .equ OMX_VC_UPPER_RIGHT, 0x40 // Used by the PredictIntra functions
- .equ NULL, 0
+ .equ NULL, 0
diff --git a/dl/dl.gyp b/dl/dl.gyp
index d4c812b..8972d07 100644
--- a/dl/dl.gyp
+++ b/dl/dl.gyp
@@ -34,6 +34,21 @@
'BIG_FFT_TABLE',
],
}],
+ ['target_arch=="arm" or target_arch=="arm64"', {
+ 'sources':[
+ # Common files that are used by both arm and arm64 code.
+ 'api/arm/armOMX.h',
+ 'api/arm/omxtypes_s.h',
+ 'sp/api/armSP.h',
+ 'sp/src/arm/armSP_FFT_S32TwiddleTable.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_R_F32.c',
+ 'sp/src/arm/omxSP_FFTGetBufSize_R_S32.c',
+ 'sp/src/arm/omxSP_FFTInit_C_FC32.c',
+ 'sp/src/arm/omxSP_FFTInit_R_F32.c',
+ ],
+ }],
['target_arch=="arm"', {
'cflags!': [
'-mfpu=vfpv3-d16',
@@ -49,21 +64,11 @@
'sources': [
# Common files that are used by both the NEON and non-NEON code.
'api/armCOMM_s.h',
- 'api/armOMX.h',
- 'api/omxtypes_s.h',
- 'sp/api/armSP.h',
- 'sp/src/arm/armSP_FFT_S32TwiddleTable.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c',
'sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_R_F32.c',
'sp/src/arm/omxSP_FFTGetBufSize_R_S16.c',
'sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_R_S32.c',
- 'sp/src/arm/omxSP_FFTInit_C_FC32.c',
'sp/src/arm/omxSP_FFTInit_C_SC16.c',
'sp/src/arm/omxSP_FFTInit_C_SC32.c',
- 'sp/src/arm/omxSP_FFTInit_R_F32.c',
'sp/src/arm/omxSP_FFTInit_R_S16.c',
'sp/src/arm/omxSP_FFTInit_R_S16S32.c',
'sp/src/arm/omxSP_FFTInit_R_S32.c',
@@ -153,6 +158,27 @@
'sp/src/x86/x86SP_SSE_Math.h',
],
}],
+ ['target_arch=="arm64"', {
+ 'sources':[
+ 'api/arm/arm64COMM_s.h',
+
+ # Complex floating-point FFT
+ 'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S',
+ 'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S',
+ 'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S',
+ 'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S',
+ 'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S',
+ 'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S',
+ 'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S',
+ 'sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c',
+ 'sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c',
+ # Real floating-point FFT
+ 'sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S',
+ 'sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c',
+ 'sp/src/arm/arm64/ComplexToRealFixup.S',
+ 'sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c',
+ ],
+ }],
],
},
],
diff --git a/dl/sp/api/armSP.h b/dl/sp/api/armSP.h
index 4972f09..cf17ec5 100644
--- a/dl/sp/api/armSP.h
+++ b/dl/sp/api/armSP.h
@@ -29,6 +29,8 @@
#ifndef _armSP_H_
#define _armSP_H_
+#include <stdint.h>
+
#include "dl/api/omxtypes.h"
#ifdef __cplusplus
@@ -88,6 +90,42 @@
OMX_FC32* pBuf;
} ARMsFFTSpec_FC32;
+/*
+ * Compute log2(x), where x must be a power of 2.
+ */
+static inline OMX_U32 fastlog2(long x) {
+ OMX_U32 out;
+ asm("clz %0,%1\n\t"
+ "sub %0, %0, #63\n\t"
+ "neg %0, %0\n\t"
+ : "=r"(out)
+ : "r"(x)
+ :);
+ return out;
+}
+
+/*
+ * Validate args. All pointers must be non-NULL; the source and
+ * destination pointers must be aligned on a 32-byte boundary; the
+ * FFT spec must have non-NULL pointers; and the FFT size must be
+ * within range.
+ */
+static inline int validateParametersFC32(const void* pSrc,
+ const void* pDst,
+ const ARMsFFTSpec_FC32* pFFTSpec) {
+ return pSrc && pDst && pFFTSpec && !(((uintptr_t)pSrc) & 31) &&
+ !(((uintptr_t)pDst) & 31) && pFFTSpec->pTwiddle && pFFTSpec->pBuf &&
+ (pFFTSpec->N >= 2) && (pFFTSpec->N <= (1 << TWIDDLE_TABLE_ORDER));
+}
+
+static inline int validateParametersF32(const void* pSrc,
+ const void* pDst,
+ const ARMsFFTSpec_R_FC32* pFFTSpec) {
+ return pSrc && pDst && pFFTSpec && !(((uintptr_t)pSrc) & 31) &&
+ !(((uintptr_t)pDst) & 31) && pFFTSpec->pTwiddle && pFFTSpec->pBuf &&
+ (pFFTSpec->N >= 2) && (pFFTSpec->N <= (1 << TWIDDLE_TABLE_ORDER));
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/dl/sp/src/arm/arm64/ComplexToRealFixup.S b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
new file mode 100644
index 0000000..9b30093
--- /dev/null
+++ b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
@@ -0,0 +1,261 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute FFT for a real signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+ // Guarding implementation by the processor name
+
+// Import symbols required from other files
+
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pOut x3
+#define subFFTNum x4
+
+// Output registers
+
+//Local Scratch Registers
+
+#define argTwiddle x5
+#define argDst x6
+#define subFFTSize x7
+#define N subFFTNum
+#define order x14
+#define step x8
+#define step1 pTwiddle
+#define twStep x9
+#define zero w10
+#define pTwiddleTmp pOut
+
+// Neon registers
+
+#define dX0 v0.2s
+#define dX0s v0.s
+#define dX0r v2.2s
+#define dX0rs v2.s
+#define dX0i v3.2s
+#define dX0is v3.s
+#define dX1r v4.2s
+#define dX1i v5.2s
+#define dT0 v6.2s
+#define dT1 v7.2s
+#define dT2 v8.2s
+#define dT3 v9.2s
+#define qT0 v10.2s
+#define qT1 v12.2s
+#define dW0r v14.2s
+#define dW0r8b v14.8b
+#define dW0i v15.2s
+#define dW1r v16.2s
+#define dW1r8b v16.8b
+#define dW1i v17.2s
+#define dY0r v14.2s
+#define dY0i v15.2s
+#define dY1r v16.2s
+#define dY1i v17.2s
+#define qT2 v18.2s
+#define qT3 v20.2s
+
+#define half v0.2s
+#define dZip v21.2s
+#define dZip8b v21.8b
+
+ // Allocate stack memory required by the function
+
+ // Write function header
+ M_START ComplexToRealFixup,,d15
+
+ asr N, N, #1
+
+ clz order, subFFTNum // N = 2^order
+
+ RSB order,order,#63
+ MOV subFFTSize,subFFTNum // subFFTSize = N/2
+ //MOV subFFTNum,N
+ mov argDst, pDst
+ mov argTwiddle, pTwiddle
+
+ // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+ // 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
+ // 1/2[2a+j0] - j [0+j2b]
+ // (a+b, 0)
+
+ // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+ // 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
+ // 1/2[2a+j0] + j [0+j2b]
+ // (a-b, 0)
+
+ // F(0) and F(N/2)
+ ld2 {dX0rs,dX0is}[0],[pSrc], #8
+ MOV zero,#0
+ mov dX0rs[1],zero
+ lsl step,subFFTSize, #3 // step = N/2 * 8 bytes
+ mov dX0i[1],zero
+ // twStep = 3N/8 * 8 bytes pointing to W^1
+ SUB twStep,step,subFFTSize,LSL #1
+
+ fadd dY0r,dX0r,dX0i // F(0) = ((Z0.r+Z0.i) , 0)
+ lsl step1,subFFTSize, #2 // step1 = N/2 * 4 bytes
+ fsub dY0i,dX0r,dX0i // F(N/2) = ((Z0.r-Z0.i) , 0)
+ SUBS subFFTSize,subFFTSize,#2
+
+ st1 {dY0r},[argDst],step
+ ADD pTwiddleTmp,argTwiddle,#8 // W^2
+ st1 {dY0i},[argDst], #8
+ ADD argTwiddle,argTwiddle,twStep // W^1
+
+// dup dzero,zero
+ SUB argDst,argDst,step
+
+ BLT End
+ BEQ lastElement
+ SUB step,step,#24
+ SUB step1,step1,#8 // (N/4-1)*8 bytes
+
+ // F(k) = 1/2[Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
+ // Note: W^k is stored as negative values in the table
+ // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
+ // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+
+ fmov half, #0.5
+
+evenOddButterflyLoop:
+
+
+ ld1 {dW0r},[argTwiddle],step1
+ ld1 {dW1r},[argTwiddle], #8
+
+ ld2 {dX0r,dX0i},[pSrc],step
+ SUB argTwiddle,argTwiddle,step1
+ ld2 {dX1r,dX1i},[pSrc], #16
+
+
+
+ SUB step1,step1,#8 // (N/4-2)*8 bytes
+ ld1 {dW0i},[pTwiddleTmp],step1
+ ld1 {dW1i},[pTwiddleTmp], #8
+ SUB pSrc,pSrc,step
+
+ SUB pTwiddleTmp,pTwiddleTmp,step1
+ rev64 dX1r,dX1r
+ rev64 dX1i,dX1i
+ SUBS subFFTSize,subFFTSize,#4
+
+
+
+ fsub dT2,dX0r,dX1r // a-c
+ SUB step1,step1,#8
+ fadd dT0,dX0r,dX1r // a+c
+ fsub dT1,dX0i,dX1i // b-d
+ fadd dT3,dX0i,dX1i // b+d
+ fmul dT0,dT0,half[0]
+ fmul dT1,dT1,half[0]
+ // VZIP dW1r,dW1i
+ // VZIP dW0r,dW0i
+ zip1 dZip, dW1r, dW1i
+ zip2 dW1i, dW1r, dW1i
+ mov dW1r8b, dZip8b
+ zip1 dZip, dW0r, dW0i
+ zip2 dW0i, dW0r, dW0i
+ mov dW0r8b, dZip8b
+
+ fmul qT0,dW1r,dT2
+ fmul qT1,dW1r,dT3
+ fmul qT2,dW0r,dT2
+ fmul qT3,dW0r,dT3
+
+ fmla qT0,dW1i,dT3
+ fmls qT1,dW1i,dT2
+
+ fmls qT2,dW0i,dT3
+ fmla qT3,dW0i,dT2
+
+
+ fmul dX1r,qT0,half[0]
+ fmul dX1i,qT1,half[0]
+
+ fsub dY1r,dT0,dX1i // F(N/2 -1)
+ fadd dY1i,dT1,dX1r
+ fneg dY1i,dY1i
+
+ rev64 dY1r,dY1r
+ rev64 dY1i,dY1i
+
+
+ fmul dX0r,qT2,half[0]
+ fmul dX0i,qT3,half[0]
+
+ fsub dY0r,dT0,dX0i // F(1)
+ fadd dY0i,dT1,dX0r
+
+
+ st2 {dY0r,dY0i},[argDst],step
+ st2 {dY1r,dY1i},[argDst], #16
+ SUB argDst,argDst,step
+ SUB step,step,#32 // (N/2-4)*8 bytes
+
+
+ BGT evenOddButterflyLoop
+
+ // set both the ptrs to the last element
+ SUB pSrc,pSrc,#8
+ SUB argDst,argDst,#8
+
+
+
+ // Last element can be expanded as follows
+ // 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+ // 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+ // 1/2[2a+j0] + j (c+jd) [0+j2b]
+ // (a-bc, -bd)
+ // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement:
+ ld1 {dX0r},[pSrc]
+
+ st1 {dX0rs}[0],[argDst], #4
+ fneg dX0r,dX0r
+ st1 {dX0rs}[1],[argDst], #4
+End:
+
+ // Write function tail
+ M_END
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
new file mode 100644
index 0000000..da68314
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
@@ -0,0 +1,280 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of
+// armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
+// instead of SC32.
+//
+
+//
+// Description:
+// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
+// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+ // Guarding implementation by the processor name
+
+
+
+//Input Registers
+
+#define pSrc x0
+#define pTwiddle x1
+#define pOut x2
+#define subFFTNum x3
+
+// Output registers
+
+//Local Scratch Registers
+
+#define argTwiddle x5
+#define argDst x6
+#define subFFTSize x7
+#define N subFFTNum
+
+#define pOut1 x13
+
+#define size x7
+#define step x8
+#define step1 x9
+#define twStep x10
+#define pTwiddleTmp x11
+#define argTwiddle1 x12
+
+// Neon registers
+
+#define dX0 v0.2s
+#define dX0s v0.s
+#define dShift v1.2s
+#define dX1 v1.2s
+#define dX1s v1.s
+#define dY0 v2.2s
+#define dY08b v2.8b
+#define dY1 v3.2s
+#define dX0r v0.2s
+#define dX0rs v0.s
+#define dX0i v1.2s
+#define dX1r v2.2s
+#define dX1i v3.2s
+#define dW0r v4.2s
+#define dW0r8b v4.8b
+#define dW0i v5.2s
+#define dW1r v6.2s
+#define dW1r8b v6.8b
+#define dW1i v7.2s
+#define dT0 v8.2s
+#define dT1 v9.2s
+#define dT2 v10.2s
+#define dT3 v11.2s
+#define qT0 v12.2s
+#define qT1 v14.2s
+#define qT2 v16.2s
+#define qT3 v18.2s
+#define dY0r v4.2s
+#define dY0i v5.2s
+#define dY1r v6.2s
+#define dY1i v7.2s
+
+#define dY2 v4.2s
+#define dY3 v5.2s
+#define dW0 v6.2s
+#define dW1 v7.2s
+#define dW0Tmp v10.2s
+#define dW1Neg v11.2s
+
+#define dZip v19.2s
+#define dZip8b v19.8b
+#define half v13.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ fmov half, 0.5
+
+ asr size, subFFTNum, #1 // preserve the contents of N = subFFTNum
+ lsl step, subFFTNum, #2 // step = N/2 * 8 bytes
+
+
+ // Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
+ // Note: W^(k) is stored as negated value and also need to
+ // conjugate the values from the table
+
+ // Z(0) : no need of twiddle multiply
+ // Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
+
+ ld1 {dX0},[pSrc],step
+ ADD pOut1,pOut,step // pOut1 = pOut+ N/2*8 bytes
+
+ ld1 {dX1},[pSrc], #8
+ // twStep = 3N/8 * 8 bytes pointing to W^1
+ SUB twStep,step,size,LSL #1
+
+ lsl step1,size, #2 // step1 = N/4 * 8 = N/2*4 bytes
+ SUB step1,step1,#8 // (N/4-1)*8 bytes
+
+ fadd dY0,dX0,dX1 // [b+d | a+c]
+ fsub dY1,dX0,dX1 // [b-d | a-c]
+ fmul dY0, dY0, half[0]
+ fmul dY1, dY1, half[0]
+
+ // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+ // VZIP dY0,dY1
+ zip1 dZip,dY0,dY1
+ zip2 dY1,dY0,dY1
+ mov dY08b, dZip8b
+
+ fsub dX0,dY0,dY1
+ SUBS size,size,#2
+ fadd dX1,dY0,dY1
+
+ SUB pSrc,pSrc,step
+
+ st1 {dX0s}[0],[pOut1], #4
+ ADD pTwiddleTmp,pTwiddle,#8 // W^2
+ st1 {dX1s}[1],[pOut1], #4
+ ADD argTwiddle1,pTwiddle,twStep // W^1
+
+
+ BLT decrementScale\name
+ BEQ lastElement\name
+
+
+ // Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
+ // Note: W^k is stored as negative values in the table and also
+ // need to conjugate the values from the table.
+ //
+ // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+ // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
+
+
+ SUB step,step,#24
+evenOddButterflyLoop\name :
+
+
+ ld1 {dW0r},[argTwiddle1],step1
+ ld1 {dW1r},[argTwiddle1], #8
+
+ ld2 {dX0r,dX0i},[pSrc],step
+ SUB argTwiddle1,argTwiddle1,step1
+ ld2 {dX1r,dX1i},[pSrc], #16
+
+ SUB step1,step1,#8 // (N/4-2)*8 bytes
+ ld1 {dW0i},[pTwiddleTmp],step1
+ ld1 {dW1i},[pTwiddleTmp], #8
+ SUB pSrc,pSrc,step
+
+ SUB pTwiddleTmp,pTwiddleTmp,step1
+ rev64 dX1r,dX1r
+ rev64 dX1i,dX1i
+ SUBS size,size,#4
+
+
+ fsub dT2,dX0r,dX1r // a-c
+ fadd dT3,dX0i,dX1i // b+d
+ fadd dT0,dX0r,dX1r // a+c
+ fsub dT1,dX0i,dX1i // b-d
+ SUB step1,step1,#8
+
+ fmul dT2, dT2, half[0]
+ fmul dT3, dT3, half[0]
+
+ fmul dT0, dT0, half[0]
+ fmul dT1, dT1, half[0]
+
+ // VZIP dW1r,dW1i
+ // VZIP dW0r,dW0i
+ zip1 dZip, dW1r,dW1i
+ zip2 dW1i,dW1r,dW1i
+ mov dW1r8b, dZip8b
+ zip1 dZip,dW0r,dW0i
+ zip2 dW0i,dW0r,dW0i
+ mov dW0r8b, dZip8b
+
+ fmul dX1r,dW1r,dT2
+ fmul dX1i,dW1r,dT3
+ fmul dX0r,dW0r,dT2
+ fmul dX0i,dW0r,dT3
+
+ fmls dX1r,dW1i,dT3
+ fmla dX1i,dW1i,dT2
+
+ fmla dX0r,dW0i,dT3
+ fmls dX0i,dW0i,dT2
+
+
+ fadd dY1r,dT0,dX1i // F(N/2 -1)
+ fsub dY1i,dX1r,dT1
+
+ rev64 dY1r,dY1r
+ rev64 dY1i,dY1i
+
+
+ fadd dY0r,dT0,dX0i // F(1)
+ fsub dY0i,dT1,dX0r
+
+
+ st2 {dY0r,dY0i},[pOut1],step
+ st2 {dY1r,dY1i},[pOut1], #16
+ SUB pOut1,pOut1,step
+ SUB step,step,#32 // (N/2-4)*8 bytes
+
+
+ BGT evenOddButterflyLoop\name
+
+
+ // set both the ptrs to the last element
+ SUB pSrc,pSrc,#8
+ SUB pOut1,pOut1,#8
+
+ // Last element can be expanded as follows
+ // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
+ // -ve)
+ // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+ // 1/2[2a+j0] - j (c-jd) [0+j2b]
+ // (a+bc, -bd)
+ // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name :
+ ld1 {dX0r},[pSrc]
+
+ st1 {dX0rs}[0],[pOut1], #4
+ fneg dX0r,dX0r
+ st1 {dX0rs}[1],[pOut1]
+
+
+
+decrementScale\name :
+
+ .endm
+
+ M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
+ FFTSTAGE "FALSE","TRUE",Inv
+ M_END
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S
new file mode 100644
index 0000000..b22912d
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S
@@ -0,0 +1,136 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the first stage of a Radix 2 DIT in-order out-of-place FFT
+// stage for a N point complex signal.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define pointStep x7
+#define outPointStep x7
+#define grpSize x8
+#define setCount x8
+#define step x9
+#define dstStep x9
+
+// Neon Registers
+#define dX0 v0.2s
+#define dX1 v1.2s
+#define dY0 v2.2s
+#define dY1 v3.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
+
+
+ MOV subFFTSize,#2
+ LSR grpSize,subFFTNum,#1
+ MOV subFFTNum,grpSize
+
+
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+ // Note: outPointStep = pointStep for firststage
+ // Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
+
+ lsl pointStep, grpSize, #3
+ rsb step, pointStep, #8
+
+ // Loop on the sets for grp zero
+
+grpZeroSetLoop\name :
+
+ LD1 {dX0},[pSrc],pointStep
+ LD1 {dX1},[pSrc],step // step = -pointStep + 8
+
+ SUBS setCount,setCount,#1
+
+ fadd dY0,dX0,dX1
+ fsub dY1,dX0,dX1
+
+ ST1 {dY0},[pDst],outPointStep
+ // dstStep = step = -pointStep + 8
+ ST1 {dY1},[pDst],dstStep
+
+ BGT grpZeroSetLoop\name
+
+
+ // Save subFFTNum and subFFTSize for next stage
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+
+ .endm
+
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace
+ FFTSTAGE "FALSE","FALSE",fwd
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace
+ FFTSTAGE "FALSE","TRUE",inv
+ M_END
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S
new file mode 100644
index 0000000..e7de11e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S
@@ -0,0 +1,149 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT
+// stage for a N point complex signal.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define outPointStep x8
+#define grpCount x9
+#define dstStep x10
+
+// Neon Registers
+
+#define dWr v0.2s
+#define dWi v1.2s
+#define dXr0 v2.2s
+#define dXi0 v3.2s
+#define dXr1 v4.2s
+#define dXi1 v5.2s
+#define dYr0 v6.2s
+#define dYi0 v7.2s
+#define dYr1 v8.2s
+#define dYi1 v9.2s
+#define qT0 v10.2s
+#define qT1 v12.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Move parameters into our work registers
+ ldr subFFTSize, [pSubFFTSize]
+
+ lsl outPointStep, subFFTSize, #3
+
+ // Update grpCount and grpSize rightaway
+
+ MOV subFFTNum,#1 //after the last stage
+ LSL grpCount,subFFTSize,#1
+
+ // update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+
+ rsb dstStep,outPointStep,#16
+
+ // Loop on 2 grps at a time for the last stage
+
+radix2lsGrpLoop\name :
+ // dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
+ // dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
+ ld2 {dWr,dWi},[pTwiddle], #16
+
+ // dXr0 = [pSrc[0].Re, pSrc[2].Re]
+ // dXi0 = [pSrc[0].Im, pSrc[2].Im]
+ // dXr1 = [pSrc[1].Re, pSrc[3].Re]
+ // dXi1 = [pSrc[1].Im, pSrc[3].Im]
+ ld4 {dXr0,dXi0,dXr1,dXi1}, [pSrc], #32
+
+ SUBS grpCount,grpCount,#4 // grpCount is multiplied by 2
+
+ .ifeqs "\inverse", "TRUE"
+ fmul qT0,dWr,dXr1
+ fmla qT0,dWi,dXi1 // real part
+ fmul qT1,dWr,dXi1
+ fmls qT1,dWi,dXr1 // imag part
+
+ .else
+
+ fmul qT0,dWr,dXr1
+ fmls qT0,dWi,dXi1 // real part
+ fmul qT1,dWr,dXi1
+ fmla qT1,dWi,dXr1 // imag part
+
+ .endif
+
+ fsub dYr0,dXr0,qT0
+ fsub dYi0,dXi0,qT1
+ fadd dYr1,dXr0,qT0
+ fadd dYi1,dXi0,qT1
+
+ st2 {dYr0,dYi0},[pDst],outPointStep
+ st2 {dYr1,dYi1},[pDst],dstStep // dstStep = step = -outPointStep + 16
+
+ BGT radix2lsGrpLoop\name
+
+
+ .endm
+
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace,,d12
+ FFTSTAGE "FALSE","FALSE",fwd
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace,,d12
+ FFTSTAGE "FALSE","TRUE",inv
+ M_END
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
new file mode 100644
index 0000000..530a815
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
@@ -0,0 +1,185 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
+// to support float instead of SC32.
+//
+
+// Description:
+// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
+// complex signal. This handles the general stage, not the first or last
+// stage.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define outPointStep x8
+#define pointStep x9
+#define pointStep32 w9
+#define grpCount x10
+#define grpCount32 w10
+#define setCount x13
+#define step x15
+#define dstStep x11
+
+// Neon Registers
+
+#define dW v0.2s
+#define dX0 v2.2s
+#define dX1 v3.2s
+#define dX2 v4.2s
+#define dX3 v5.2s
+#define dY0 v6.2s
+#define dY1 v7.2s
+#define dY2 v8.2s
+#define dY3 v9.2s
+#define qT0 v10.2s
+#define qT1 v11.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // Update grpCount and grpSize rightaway inorder to reuse pGrpCount
+ // and pGrpSize regs
+
+ LSR subFFTNum,subFFTNum,#1 //grpSize
+ LSL grpCount,subFFTSize,#1
+
+
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+ lsl pointStep, subFFTNum, #2
+
+ // update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+
+ // pOut0+1 increments pOut0 by 8 bytes
+ // pOut0+outPointStep == increment of 8*outPointStep bytes =
+ // 4*size bytes
+ smull outPointStep, grpCount32, pointStep32
+
+ LSL pointStep,pointStep,#1
+
+
+ rsb step,pointStep,#16
+ rsb dstStep,outPointStep,#16
+
+ // Loop on the groups
+
+radix2GrpLoop\name :
+ lsr setCount, pointStep, #3
+ LD1 {dW},[pTwiddle],pointStep //[wi | wr]
+
+
+ // Loop on the sets
+
+
+radix2SetLoop\name :
+
+
+ // point0: dX0-real part dX1-img part
+ LD2 {dX0,dX1},[pSrc],pointStep
+ // point1: dX2-real part dX3-img part
+ LD2 {dX2,dX3},[pSrc],step
+
+ SUBS setCount,setCount,#2
+
+ .ifeqs "\inverse", "TRUE"
+ fmul qT0,dX2,dW[0]
+ fmla qT0,dX3,dW[1] // real part
+ fmul qT1,dX3,dW[0]
+ fmls qT1,dX2,dW[1] // imag part
+
+ .else
+
+ fmul qT0,dX2,dW[0]
+ fmls qT0,dX3,dW[1] // real part
+ fmul qT1,dX3,dW[0]
+ fmla qT1,dX2,dW[1] // imag part
+
+ .endif
+
+ fsub dY0,dX0,qT0
+ fsub dY1,dX1,qT1
+ fadd dY2,dX0,qT0
+ fadd dY3,dX1,qT1
+
+ st2 {dY0,dY1},[pDst],outPointStep
+ // dstStep = -outPointStep + 16
+ st2 {dY2,dY3},[pDst],dstStep
+
+ BGT radix2SetLoop\name
+
+ SUBS grpCount,grpCount,#2
+ ADD pSrc,pSrc,pointStep
+ BGT radix2GrpLoop\name
+
+
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+ .endm
+
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace,,d11
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace,,d11
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
new file mode 100644
index 0000000..624ef3e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
@@ -0,0 +1,266 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define grpSize x7
+// Reuse grpSize as setCount
+#define setCount x7
+#define pointStep x8
+#define outPointStep x8
+#define setStep x9
+#define step1 x10
+#define step3 x11
+
+// Neon Registers
+
+#define dXr0 v0.2s
+#define dXi0 v1.2s
+#define dXr1 v2.2s
+#define dXi1 v3.2s
+#define dXr2 v4.2s
+#define dXi2 v5.2s
+#define dXr3 v6.2s
+#define dXi3 v7.2s
+#define dYr0 v8.2s
+#define dYi0 v9.2s
+#define dYr1 v10.2s
+#define dYi1 v11.2s
+#define dYr2 v12.2s
+#define dYi2 v13.2s
+#define dYr3 v14.2s
+#define dYi3 v15.2s
+#define dZr0 v16.2s
+#define dZi0 v17.2s
+#define dZr1 v18.2s
+#define dZi1 v19.2s
+#define dZr2 v20.2s
+#define dZi2 v21.2s
+#define dZr3 v22.2s
+#define dZi3 v23.2s
+
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+ // Note: outPointStep = pointStep for firststage
+
+ lsl pointStep, subFFTNum, #1
+
+ // Update pSubFFTSize and pSubFFTNum regs
+ ld2 {dXr0,dXi0}, [pSrc], pointStep // data[0]
+
+ // subFFTSize = 1 for the first stage
+ MOV subFFTSize,#4
+
+ // Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+ LSR grpSize,subFFTNum,#2
+ ld2 {dXr1,dXi1}, [pSrc], pointStep // data[1]
+ MOV subFFTNum,grpSize
+
+
+ // Calculate the step of input data for the next set
+ //MOV setStep,pointStep,LSL #1
+ lsl setStep, grpSize, #4
+ ld2 {dXr2,dXi2}, [pSrc], pointStep // data[2]
+
+ // setStep = 3*pointStep
+ ADD setStep,setStep,pointStep
+ // setStep = - 3*pointStep+16
+
+ rsb setStep,setStep,#16
+ // data[3] & update pSrc for the next set
+ ld2 {dXr3,dXi3}, [pSrc], setStep
+
+ // step1 = 2*pointStep
+ lsl step1, pointStep, #1
+
+ // fadd qY0, qX0, qX2
+ fadd dYr0, dXr0, dXr2
+ fadd dYi0, dXi0, dXi2
+ // step3 = -pointStep
+ neg step3, pointStep
+
+ // grp = 0 a special case since all the twiddle factors are 1
+ // Loop on the sets : 2 sets at a time
+
+radix4fsGrpZeroSetLoop\name :
+
+
+
+ // Decrement setcount
+ SUBS setCount,setCount,#2
+
+
+ // finish first stage of 4 point FFT
+
+
+ // fsub qy2,qx0,qx2
+ fsub dYr2, dXr0, dXr2
+ fsub dYi2, dXi0, dXi2
+
+ ld2 {dXr0,dXi0}, [pSrc], step1 // data[0]
+ // fadd qy1,qx1,qx3
+ fadd dYr1, dXr1, dXr3
+ fadd dYi1, dXi1, dXi3
+ ld2 {dXr2,dXi2}, [pSrc], step3 // data[2]
+ // fsub qy3,qx1,qx3
+ fsub dYr3, dXr1, dXr3
+ fsub dYi3, dXi1, dXi3
+
+
+ // finish second stage of 4 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
+ ld2 {dXr1,dXi1}, [pSrc], step1 // data[1]
+ // fadd qz0,qy0,qy1
+ fadd dZr0, dYr0, dYr1
+ fadd dZi0, dYi0, dYi1
+
+ // data[3] & update pSrc for the next set, but not if it's the
+ // last iteration so that we don't read past the end of the
+ // input array.
+ BEQ radix4SkipLastUpdateInv\name
+ ld2 {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateInv\name:
+ FSUB dZr3,dYr2,dYi3
+
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ FADD dZi3,dYi2,dYr3
+
+ // fsub qZ1,qY0,qY1
+ FSUB dZr1, dYr0, dYr1
+ FSUB dZi1, dYi0, dYi1
+ st2 {dZr3,dZi3},[pDst],outPointStep
+
+ FADD dZr2,dYr2,dYi3
+ st2 {dZr1,dZi1},[pDst],outPointStep
+ FSUB dZi2,dYi2,dYr3
+
+ // fadd qY0, qX0, qX2
+ FADD dYr0, dXr0, dXr2 // u0 for next iteration
+ FADD dYi0, dXi0, dXi2
+ st2 {dZr2,dZi2},[pDst],setStep
+
+
+ .else
+
+ ld2 {dXr1,dXi1}, [pSrc], step1 // data[1]
+ // fadd qZ0,qY0,qY1
+ fadd dZr0, dYr0, dYr1
+ fadd dZi0, dYi0, dYi1
+
+ // data[3] & update pSrc for the next set, but not if it's the
+ // last iteration so that we don't read past the end of the
+ // input array.
+ BEQ radix4SkipLastUpdateFwd\name
+ ld2 {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateFwd\name:
+ FADD dZr2,dYr2,dYi3
+
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ FSUB dZi2,dYi2,dYr3
+
+ // fsub qz1,qy0,qy1
+ fsub dZr1, dYr0, dYr1
+ fsub dZi1, dYi0, dYi1
+ st2 {dZr2,dZi2},[pDst],outPointStep
+
+ FSUB dZr3,dYr2,dYi3
+ st2 {dZr1,dZi1},[pDst],outPointStep
+ FADD dZi3,dYi2,dYr3
+
+ // fadd qy0,qx0,qx2
+ fadd dYr0, dXr0, dXr2 // u0 for next iteration
+ fadd dYi0, dXi0, dXi2
+
+ st2 {dZr3,dZi3},[pDst],setStep
+
+ .endif
+
+ BGT radix4fsGrpZeroSetLoop\name
+
+ // Save subFFTNum and subFFTSize for next stage
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+
+ .endm
+
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",fwd
+ M_END
+
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",inv
+ M_END
+
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
new file mode 100644
index 0000000..2fc2e60
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
@@ -0,0 +1,371 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+ //IMPORT armAAC_constTable
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define outPointStep x8
+#define grpCount x9
+#define dstStep x10
+#define grpTwStep x13
+#define stepTwiddle x14
+#define twStep x15
+#define step16 x11
+#define step24 x12
+
+
+// Neon Registers
+
+#define dButterfly1Real02 v0.2s
+#define dButterfly1Real028b v0.8b
+#define dButterfly1Imag02 v1.2s
+#define dButterfly1Imag028b v1.8b
+#define dButterfly1Real13 v2.2s
+#define dButterfly1Real138b v2.8b
+#define dButterfly1Imag13 v3.2s
+#define dButterfly1Imag138b v3.8b
+#define dButterfly2Real02 v4.2s
+#define dButterfly2Imag02 v5.2s
+#define dButterfly2Real13 v6.2s
+#define dButterfly2Imag13 v7.2s
+#define dXr0 v0.2s
+#define dXi0 v1.2s
+#define dXr08b v0.8b
+#define dXi08b v1.8b
+#define dXr1 v2.2s
+#define dXi1 v3.2s
+#define dXr2 v4.2s
+#define dXi2 v5.2s
+#define dXr3 v6.2s
+#define dXi3 v7.2s
+
+#define dYr0 v16.2s
+#define dYi0 v17.2s
+#define dYr1 v18.2s
+#define dYi1 v19.2s
+#define dYr2 v20.2s
+#define dYi2 v21.2s
+#define dYr3 v22.2s
+#define dYi3 v23.2s
+
+#define dW1r v8.2s
+#define dW1i v9.2s
+#define dW2r v10.2s
+#define dW2r8b v10.8b
+#define dW2i v11.2s
+#define dW3r v12.2s
+#define dW3r8b v12.8b
+#define dW3i v13.2s
+
+#define dZr0 v14.2s
+#define dZi0 v15.2s
+#define dZr08b v14.8b
+#define dZi08b v15.8b
+#define dZr1 v26.2s
+#define dZi1 v27.2s
+#define dZr2 v28.2s
+#define dZi2 v29.2s
+#define dZr3 v30.2s
+#define dZi3 v31.2s
+
+#define dZip v24.2s
+#define dZip8b v24.8b
+
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // pOut0+1 increments pOut0 by 8 bytes
+ // pOut0+outPointStep == increment of 8*outPointStep bytes
+ lsl outPointStep,subFFTSize, #3
+
+ // Update grpCount and grpSize rightaway
+
+ ld2 {dW1r,dW1i},[pTwiddle] // [wi|wr]
+ MOV step16,#16
+ LSL grpCount,subFFTSize,#2
+
+ ld1 {dW2r},[pTwiddle] // [wi|wr]
+ MOV subFFTNum,#1 //after the last stage
+
+ ld1 {dW3r},[pTwiddle],step16 // [wi|wr]
+ MOV stepTwiddle,#0
+
+ ld1 {dW2i},[pTwiddle],#8 // [wi|wr]
+ SUB grpTwStep,stepTwiddle,#8 // grpTwStep = -8 to start with
+
+ // update subFFTSize for the next stage
+ MOV subFFTSize,grpCount
+ ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr]
+ lsl dstStep,outPointStep, #1
+
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+ ADD dstStep,dstStep,outPointStep // dstStep = 3*outPointStep
+
+ rsb dstStep,dstStep,#16 // dstStep = - 3*outPointStep+16
+ MOV step24,#24
+
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+
+
+ // Process two groups at a time
+
+radix4lsGrpLoop\name :
+
+ // VZIP dW2r,dW2i
+ zip1 dZip, dW2r, dW2i
+ zip2 dW2i, dW2r, dW2i
+ mov dW2r8b, dZip8b
+
+ ADD stepTwiddle,stepTwiddle,#16
+
+ // VZIP dW3r,dW3i
+ zip1 dZip, dW3r,dW3i
+ zip2 dW3i, dW3r, dW3i
+ mov dW3r8b, dZip8b
+ ADD grpTwStep,stepTwiddle,#4
+
+ // VUZP dButterfly1Real13, dButterfly2Real13 // B.r D.r
+ uzp1 dZip, dButterfly1Real13, dButterfly2Real13 // B.r D.r
+ uzp2 dButterfly2Real13, dButterfly1Real13, dButterfly2Real13 // B.r D.r
+ mov dButterfly1Real138b, dZip8b
+
+ SUB twStep,stepTwiddle,#16 // -16+stepTwiddle
+
+ // VUZP dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
+ uzp1 dZip, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
+ uzp2 dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13 // B.i D.i
+ mov dButterfly1Imag138b, dZip8b
+ lsl grpTwStep,grpTwStep,#1
+
+ // VUZP dButterfly1Real02, dButterfly2Real02 // A.r C.r
+ uzp1 dZip, dButterfly1Real02, dButterfly2Real02 // A.r C.r
+ uzp2 dButterfly2Real02, dButterfly1Real02, dButterfly2Real02 // A.r C.r
+ mov dButterfly1Real028b, dZip8b
+ rsb grpTwStep,grpTwStep,#0 // -8-2*stepTwiddle
+
+ // VUZP dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
+ uzp1 dZip, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
+ uzp2 dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02 // A.i C.i
+ mov dButterfly1Imag028b, dZip8b
+
+
+ // grpCount is multiplied by 4
+ SUBS grpCount,grpCount,#8
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr1,dW1r,dXr1
+ fmla dZr1,dW1i,dXi1 // real part
+ fmul dZi1,dW1r,dXi1
+ fmls dZi1,dW1i,dXr1 // imag part
+
+ .else
+
+ fmul dZr1,dW1r,dXr1
+ fmls dZr1,dW1i,dXi1 // real part
+ fmul dZi1,dW1r,dXi1
+ fmla dZi1,dW1i,dXr1 // imag part
+
+ .endif
+
+ ld2 {dW1r,dW1i},[pTwiddle],stepTwiddle // [wi|wr]
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr2,dW2r,dXr2
+ fmla dZr2,dW2i,dXi2 // real part
+ fmul dZi2,dW2r,dXi2
+ ld1 {dW2r},[pTwiddle],step16 // [wi|wr]
+ fmls dZi2,dW2i,dXr2 // imag part
+
+ .else
+
+ fmul dZr2,dW2r,dXr2
+ fmls dZr2,dW2i,dXi2 // real part
+ fmul dZi2,dW2r,dXi2
+ ld1 {dW2r},[pTwiddle],step16 // [wi|wr]
+ fmla dZi2,dW2i,dXr2 // imag part
+
+ .endif
+
+
+ ld1 {dW2i},[pTwiddle],twStep // [wi|wr]
+
+ // move qX0 so as to load for the next iteration
+ // MOV qZ0,qX0
+ mov dZr08b, dXr08b
+ mov dZi08b, dXi08b
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr3,dW3r,dXr3
+ fmla dZr3,dW3i,dXi3 // real part
+ fmul dZi3,dW3r,dXi3
+ ld1 {dW3r},[pTwiddle],step24
+ fmls dZi3,dW3i,dXr3 // imag part
+
+ .else
+
+ fmul dZr3,dW3r,dXr3
+ fmls dZr3,dW3i,dXi3 // real part
+ fmul dZi3,dW3r,dXi3
+ ld1 {dW3r},[pTwiddle],step24
+ fmla dZi3,dW3i,dXr3 // imag part
+
+ .endif
+
+ ld1 {dW3i},[pTwiddle],grpTwStep // [wi|wr]
+
+ // Don't do the load on the last iteration so we don't read past the end
+ // of pSrc.
+ bne skipIncrement\name
+ add pSrc, pSrc, #64
+skipIncrement\name:
+ beq radix4lsSkipRead\name
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+
+ // AC.r AC.i BD.r BD.i
+ ld4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+radix4lsSkipRead\name:
+
+ // finish first stage of 4 point FFT
+
+ // fadd qY0,qZ0,qZ2
+ fadd dYr0,dZr0,dZr2
+ fadd dYi0,dZi0,dZi2
+ // fsub qY2,qZ0,qZ2
+ fsub dYr2,dZr0,dZr2
+ fsub dYi2,dZi0,dZi2
+ // fadd qY1,qZ1,qZ3
+ fadd dYr1,dZr1,dZr3
+ fadd dYi1,dZi1,dZi3
+ // fsub qY3,qZ1,qZ3
+ fsub dYr3,dZr1,dZr3
+ fsub dYi3,dZi1,dZi3
+
+
+ // finish second stage of 4 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
+ // fsub qZ0,qY2,qY1
+ fsub dZr0,dYr2,dYr1
+ fsub dZi0,dYi2,dYi1
+ fadd dZr3,dYr0,dYi3
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ fsub dZi3,dYi0,dYr3
+
+ // fadd qZ2,qY2,qY1
+ fadd dZr2,dYr2,dYr1
+ fadd dZi2,dYi2,dYi1
+
+ st2 {dZr3,dZi3},[pDst],outPointStep
+
+ fsub dZr1,dYr0,dYi3
+ st2 {dZr2,dZi2},[pDst],outPointStep
+ fadd dZi1,dYi0,dYr3
+
+ // dstStep = -outPointStep + 16
+ st2 {dZr1,dZi1},[pDst],dstStep
+
+
+ .else
+
+ // fsub qZ0,qY2,qY1
+ fsub dZr0,dYr2,dYr1
+ fsub dZi0,dYi2,dYi1
+
+ fsub dZr1,dYr0,dYi3
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ fadd dZi1,dYi0,dYr3
+
+ // fadd qZ2,qY2,qY1
+ fadd dZr2,dYr2,dYr1
+ fadd dZi2,dYi2,dYi1
+
+ st2 {dZr1,dZi1},[pDst],outPointStep
+
+ fadd dZr3,dYr0,dYi3
+ st2 {dZr2,dZi2},[pDst],outPointStep
+ fsub dZi3,dYi0,dYr3
+
+ // dstStep = -outPointStep + 16
+ st2 {dZr3,dZi3},[pDst],dstStep
+
+
+ .endif
+
+ BGT radix4lsGrpLoop\name
+
+ .endm
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",fwd
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",inv
+ M_END
+
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
new file mode 100644
index 0000000..830fd16
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
@@ -0,0 +1,339 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+//
+// This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define grpCount x7
+#define grpCount32 w7
+#define pointStep x8
+#define pointStep32 w8
+#define outPointStep x9
+#define stepTwiddle x10
+#define setCount x11
+#define srcStep x12
+#define setStep x13
+#define dstStep x14
+#define twStep x15
+
+// Neon Registers
+
+#define dW1 v0.2s
+#define dW2 v1.2s
+#define dW3 v2.2s
+
+#define dXr0 v4.2s
+#define dXi0 v5.2s
+#define dXr1 v6.2s
+#define dXi1 v7.2s
+#define dXr2 v8.2s
+#define dXi2 v9.2s
+#define dXr3 v10.2s
+#define dXi3 v11.2s
+#define dYr0 v12.2s
+#define dYi0 v13.2s
+#define dYr1 v14.2s
+#define dYi1 v15.2s
+#define dYr2 v16.2s
+#define dYi2 v17.2s
+#define dYr3 v18.2s
+#define dYi3 v19.2s
+#define dZr0 v20.2s
+#define dZi0 v21.2s
+#define dZr1 v22.2s
+#define dZi1 v23.2s
+#define dZr2 v24.2s
+#define dZi2 v25.2s
+#define dZr3 v26.2s
+#define dZi3 v27.2s
+
+ .MACRO FFTSTAGE scaled, inverse , name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // Update grpCount and grpSize rightaway inorder to reuse
+ // pGrpCount and pGrpSize regs
+
+ LSL grpCount,subFFTSize,#2
+ LSR subFFTNum,subFFTNum,#2
+ MOV subFFTSize,grpCount
+
+ ld1 {dW1},[pTwiddle] //[wi | wr]
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+ lsl pointStep,subFFTNum, #1
+
+ // pOut0+1 increments pOut0 by 8 bytes
+ // pOut0+outPointStep == increment of 8*outPointStep bytes
+ // = 2*size bytes
+
+ MOV stepTwiddle,#0
+ ld1 {dW2},[pTwiddle] //[wi | wr]
+ smull outPointStep,grpCount32,pointStep32
+
+ LSL pointStep,pointStep,#2 // 2*grpSize
+
+ ld1 {dW3},[pTwiddle] //[wi | wr]
+ lsl srcStep,pointStep, #1 // srcStep = 2*pointStep
+
+ ADD setStep,srcStep,pointStep // setStep = 3*pointStep
+
+ rsb setStep,setStep,#0 // setStep = - 3*pointStep
+ SUB srcStep,srcStep,#16 // srcStep = 2*pointStep-16
+
+ lsl dstStep,outPointStep, #1
+
+ ADD dstStep,dstStep,outPointStep // dstStep = 3*outPointStep
+ // dstStep = - 3*outPointStep+16
+ rsb dstStep,dstStep,#16
+
+
+radix4GrpLoop\name :
+
+ ld2 {dXr0,dXi0},[pSrc],pointStep // data[0]
+ ADD stepTwiddle,stepTwiddle,pointStep
+ ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
+ // set pTwiddle to the first point
+ ADD pTwiddle,pTwiddle,stepTwiddle
+ ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
+ lsl twStep,stepTwiddle, #2
+
+ // data[3] & update pSrc for the next set
+ ld2 {dXr3,dXi3},[pSrc],setStep
+ SUB twStep,stepTwiddle,twStep // twStep = -3*stepTwiddle
+
+ lsr setCount,pointStep, #3
+
+ // set pSrc to data[0] of the next set
+ ADD pSrc,pSrc,#16
+ // increment to data[1] of the next set
+ ADD pSrc,pSrc,pointStep
+
+
+ // Loop on the sets
+
+radix4SetLoop\name :
+
+
+
+ .ifeqs "\inverse", "TRUE"
+ fmul dZr1,dXr1,dW1[0]
+ fmul dZi1,dXi1,dW1[0]
+ fmul dZr2,dXr2,dW2[0]
+ fmul dZi2,dXi2,dW2[0]
+ fmul dZr3,dXr3,dW3[0]
+ fmul dZi3,dXi3,dW3[0]
+
+ fmla dZr1,dXi1,dW1[1] // real part
+ fmls dZi1,dXr1,dW1[1] // imag part
+
+ // data[1] for next iteration
+ ld2 {dXr1,dXi1},[pSrc],pointStep
+
+ fmla dZr2,dXi2,dW2[1] // real part
+ fmls dZi2,dXr2,dW2[1] // imag part
+
+ // data[2] for next iteration
+ ld2 {dXr2,dXi2},[pSrc],pointStep
+
+ fmla dZr3,dXi3,dW3[1] // real part
+ fmls dZi3,dXr3,dW3[1] // imag part
+ .else
+ fmul dZr1,dXr1,dW1[0]
+ fmul dZi1,dXi1,dW1[0]
+ fmul dZr2,dXr2,dW2[0]
+ fmul dZi2,dXi2,dW2[0]
+ fmul dZr3,dXr3,dW3[0]
+ fmul dZi3,dXi3,dW3[0]
+
+ fmls dZr1,dXi1,dW1[1] // real part
+ fmla dZi1,dXr1,dW1[1] // imag part
+
+ // data[1] for next iteration
+ ld2 {dXr1,dXi1},[pSrc],pointStep
+
+ fmls dZr2,dXi2,dW2[1] // real part
+ fmla dZi2,dXr2,dW2[1] // imag part
+
+ // data[2] for next iteration
+ ld2 {dXr2,dXi2},[pSrc],pointStep
+
+ fmls dZr3,dXi3,dW3[1] // real part
+ fmla dZi3,dXr3,dW3[1] // imag part
+ .endif
+
+ // data[3] & update pSrc to data[0]
+ // But don't read on the very last iteration because that reads past
+ // the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
+ cmp grpCount, #4
+
+ b.ne skipUpdate\name
+ cmp setCount, #2
+ b.ne skipUpdate\name
+ add pSrc, pSrc, setStep
+ beq radix4SkipRead\name
+skipUpdate\name:
+ ld2 {dXr3,dXi3},[pSrc],setStep
+radix4SkipRead\name:
+
+ SUBS setCount,setCount,#2
+
+ // finish first stage of 4 point FFT
+ // fadd qY0,qX0,qZ2
+ // fsub qY2,qX0,qZ2
+ fadd dYr0,dXr0,dZr2
+ fsub dYr2,dXr0,dZr2
+ fadd dYi0,dXi0,dZi2
+ fsub dYi2,dXi0,dZi2
+
+ // data[0] for next iteration
+ ld2 {dXr0,dXi0},[pSrc], #16
+ // fadd qY1,qZ1,qZ3
+ // fsub qY3,qZ1,qZ3
+ fadd dYr1,dZr1,dZr3
+ fsub dYr3,dZr1,dZr3
+ fadd dYi1,dZi1,dZi3
+ fsub dYi3,dZi1,dZi3
+
+ // finish second stage of 4 point FFT
+
+ // fsub qZ0,qY2,qY1
+ fsub dZr0,dYr2,dYr1
+ fsub dZi0,dYi2,dYi1
+
+ .ifeqs "\inverse", "TRUE"
+
+ fadd dZr3,dYr0,dYi3
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ fsub dZi3,dYi0,dYr3
+
+ // fadd qZ2,qY2,qY1
+ fadd dZr2,dYr2,dYr1
+ fadd dZi2,dYi2,dYi1
+
+ st2 {dZr3,dZi3},[pDst],outPointStep
+
+ fsub dZr1,dYr0,dYi3
+ st2 {dZr2,dZi2},[pDst],outPointStep
+ fadd dZi1,dYi0,dYr3
+
+ st2 {dZr1,dZi1},[pDst],dstStep
+
+
+ .else
+
+ fsub dZr1,dYr0,dYi3
+ st2 {dZr0,dZi0},[pDst],outPointStep
+ fadd dZi1,dYi0,dYr3
+
+ // fadd qZ2,qY2,qY1
+ fadd dZr2,dYr2,dYr1
+ fadd dZi2,dYi2,dYi1
+
+ st2 {dZr1,dZi1},[pDst],outPointStep
+
+ fadd dZr3,dYr0,dYi3
+ st2 {dZr2,dZi2},[pDst],outPointStep
+ fsub dZi3,dYi0,dYr3
+
+ st2 {dZr3,dZi3},[pDst],dstStep
+
+
+ .endif
+
+ // increment to data[1] of the next set
+ ADD pSrc,pSrc,pointStep
+ BGT radix4SetLoop\name
+
+
+ ld1 {dW1},[pTwiddle],stepTwiddle //[wi | wr]
+ // subtract 4 since grpCount multiplied by 4
+ SUBS grpCount,grpCount,#4
+ ld1 {dW2},[pTwiddle],stepTwiddle //[wi | wr]
+ // increment pSrc for the next grp
+ ADD pSrc,pSrc,srcStep
+ ld1 {dW3},[pTwiddle],twStep //[wi | wr]
+ BGT radix4GrpLoop\name
+
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+
+ .endm
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+ .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
new file mode 100644
index 0000000..f348e6a
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
@@ -0,0 +1,473 @@
+//
+// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+//
+// This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s
+// to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 8 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc x0
+#define pDst x1
+#define pTwiddle x2
+#define pSubFFTNum x3
+#define pSubFFTSize x4
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum x5
+#define subFFTSize x6
+#define grpSize x7
+// Reuse grpSize as setCount
+#define setCount x7
+#define pointStep x8
+#define outPointStep x8
+#define setStep x9
+#define step1 x10
+#define step2 x11
+#define t0 w12
+
+
+// Neon Registers
+
+#define dXr0 v0.2s
+#define dXi0 v1.2s
+#define dXr1 v2.2s
+#define dXi1 v3.2s
+#define dXr2 v4.2s
+#define dXi2 v5.2s
+#define dXr3 v6.2s
+#define dXi3 v7.2s
+#define dXr4 v8.2s
+#define dXi4 v9.2s
+#define dXr5 v10.2s
+#define dXi5 v11.2s
+#define dXr6 v12.2s
+#define dXi6 v13.2s
+#define dXr7 v14.2s
+#define dXi7 v15.2s
+#define qX0 v0.4s
+#define qX1 v1.4s
+#define qX2 v2.4s
+#define qX3 v3.4s
+#define qX4 v4.4s
+#define qX5 v5.4s
+#define qX6 v6.4s
+#define qX7 v7.4s
+
+#define dUr0 v16.2s
+#define dUi0 v17.2s
+#define dUr2 v18.2s
+#define dUi2 v19.2s
+#define dUr4 v20.2s
+#define dUi4 v21.2s
+#define dUr6 v22.2s
+#define dUi6 v23.2s
+#define dUr1 v24.2s
+#define dUi1 v25.2s
+#define dUr3 v26.2s
+#define dUi3 v27.2s
+#define dUr5 v28.2s
+#define dUi5 v29.2s
+// reuse dXr7 and dXi7
+#define dUr7 v30.2s
+#define dUi7 v31.2s
+#define qU0 v8.4s
+#define qU1 v12.4s
+#define qU2 v9.4s
+#define qU3 v13.4s
+#define qU4 v10.4s
+#define qU5 v14.4s
+#define qU6 v11.4s
+#define qU7 v15.4s
+
+
+#define dVr0 v24.2s
+#define dVi0 v25.2s
+#define dVr2 v26.2s
+#define dVi2 v27.2s
+#define dVr4 v28.2s
+#define dVi4 v29.2s
+#define dVr6 v30.2s
+#define dVi6 v31.2s
+#define dVr1 v16.2s
+#define dVi1 v17.2s
+#define dVr3 v18.2s
+#define dVi3 v19.2s
+#define dVr5 v20.2s
+#define dVi5 v21.2s
+#define dVr7 v22.2s
+#define dVi7 v23.2s
+#define qV0 v12.4s
+#define qV1 v8.4s
+#define qV2 v13.4s
+#define qV3 v9.4s
+#define qV4 v14.4s
+#define qV5 v10.4s
+#define qV6 v15.4s
+#define qV7 v11.4s
+
+#define dYr0 v16.2s
+#define dYi0 v17.2s
+#define dYr2 v18.2s
+#define dYi2 v19.2s
+#define dYr4 v20.2s
+#define dYi4 v21.2s
+#define dYr6 v22.2s
+#define dYi6 v23.2s
+#define dYr1 v24.2s
+#define dYi1 v25.2s
+#define dYr3 v26.2s
+#define dYi3 v27.2s
+#define dYr5 v28.2s
+#define dYi5 v29.2s
+#define dYr7 v30.2s
+#define dYi7 v31.2s
+#define qY0 v8.4s
+#define qY1 v12.4s
+#define qY2 v9.4s
+#define qY3 v13.4s
+#define qY4 v10.4s
+#define qY5 v14.4s
+#define qY6 v11.4s
+#define qY7 v15.4s
+
+#define dT0 v14.2s
+#define dT0s v14.s
+#define dT1 v15.2s
+
+ .MACRO FFTSTAGE scaled, inverse, name
+
+ // Define stack arguments
+
+ // Move args values into our work registers
+ ldr subFFTNum, [pSubFFTNum]
+ ldr subFFTSize, [pSubFFTSize]
+
+ // Update pSubFFTSize and pSubFFTNum regs
+ // subFFTSize = 1 for the first stage
+
+ movz t0, 0x3f35, lsl #16 // High half word of sqrt(1/2).
+ movk t0, 0x04f3 // Low half word of sqrt(1/2).
+ MOV subFFTSize,#8
+
+ // Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+ LSR grpSize,subFFTNum,#3
+ MOV subFFTNum,grpSize
+
+
+ // pT0+1 increments pT0 by 8 bytes
+ // pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
+ // Note: outPointStep = pointStep for firststage
+
+ lsl pointStep,grpSize, #3
+
+
+ // Calculate the step of input data for the next set
+ //MOV step1,pointStep,LSL #1 // step1 = 2*pointStep
+ ld2 {dXr0,dXi0},[pSrc],pointStep // data[0]
+ lsl step1,grpSize, #4
+ lsl step2,pointStep, #3
+
+ ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
+ SUB step2,step2,pointStep // step2 = 7*pointStep
+ // setStep = - 7*pointStep+16
+ rsb setStep,step2,#16
+
+ ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
+ ld2 {dXr3,dXi3},[pSrc],pointStep // data[3]
+ ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
+ ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
+ ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
+ // data[7] & update pSrc for the next set
+ // setStep = -7*pointStep + 16
+ ld2 {dXr7,dXi7},[pSrc],setStep
+ // grp = 0 a special case since all the twiddle factors are 1
+ // Loop on the sets
+
+radix8fsGrpZeroSetLoop\name :
+
+ // Decrement setcount
+ SUBS setCount,setCount,#2
+
+
+ // finish first stage of 8 point FFT
+
+ // fadd qU0,qX0,qX4
+ // fadd qU2,qX1,qX5
+ // fadd qU4,qX2,qX6
+ // fadd qU6,qX3,qX7
+ fadd dUr0,dXr0,dXr4
+ fadd dUr2,dXr1,dXr5
+ fadd dUr4,dXr2,dXr6
+ fadd dUr6,dXr3,dXr7
+ fadd dUi0,dXi0,dXi4
+ fadd dUi2,dXi1,dXi5
+ fadd dUi4,dXi2,dXi6
+ fadd dUi6,dXi3,dXi7
+
+ // finish second stage of 8 point FFT
+
+ // fadd qV0,qU0,qU4
+ // fsub qV2,qU0,qU4
+ // fadd qV4,qU2,qU6
+ // fsub qV6,qU2,qU6
+ fadd dVr0,dUr0,dUr4
+ fsub dVr2,dUr0,dUr4
+ fadd dVr4,dUr2,dUr6
+ fsub dVr6,dUr2,dUr6
+ fadd dVi0,dUi0,dUi4
+ fsub dVi2,dUi0,dUi4
+ fadd dVi4,dUi2,dUi6
+ fsub dVi6,dUi2,dUi6
+
+ // finish third stage of 8 point FFT
+
+ // fadd qY0,qV0,qV4
+ // fsub qY4,qV0,qV4
+ fadd dYr0,dVr0,dVr4
+ fsub dYr4,dVr0,dVr4
+ fadd dYi0,dVi0,dVi4
+ fsub dYi4,dVi0,dVi4
+
+ st2 {dYr0,dYi0},[pDst],step1 // store y0
+
+ .ifeqs "\inverse", "TRUE"
+
+ fsub dYr2,dVr2,dVi6
+ fadd dYi2,dVi2,dVr6
+
+ fadd dYr6,dVr2,dVi6
+ st2 {dYr2,dYi2},[pDst],step1 // store y2
+ fsub dYi6,dVi2,dVr6
+
+ // fsub qU1,qX0,qX4
+ fsub dUr1,dXr0,dXr4
+ fsub dUi1,dXi0,dXi4
+
+ st2 {dYr4,dYi4},[pDst],step1 // store y4
+
+ // fsub qU3,qX1,qX5
+ // fsub qU5,qX2,qX6
+ fsub dUr3,dXr1,dXr5
+ fsub dUr5,dXr2,dXr6
+ fsub dUi3,dXi1,dXi5
+ fsub dUi5,dXi2,dXi6
+
+ st2 {dYr6,dYi6},[pDst],step1 // store y6
+
+ .ELSE
+
+ fadd dYr6,dVr2,dVi6
+ fsub dYi6,dVi2,dVr6
+
+ fsub dYr2,dVr2,dVi6
+ st2 {dYr6,dYi6},[pDst],step1 // store y2
+ fadd dYi2,dVi2,dVr6
+
+
+ // fsub qU1,qX0,qX4
+ fsub dUr1,dXr0,dXr4
+ fsub dUi1,dXi0,dXi4
+
+ st2 {dYr4,dYi4},[pDst],step1 // store y4
+
+ // fsub qU3,qX1,qX5
+ // fsub qU5,qX2,qX6
+ fsub dUr3,dXr1,dXr5
+ fsub dUr5,dXr2,dXr6
+ fsub dUi3,dXi1,dXi5
+ fsub dUi5,dXi2,dXi6
+
+ st2 {dYr2,dYi2},[pDst],step1 // store y6
+
+
+ .ENDIF
+
+ // finish first stage of 8 point FFT
+
+ // fsub qU7,qX3,qX7
+ fsub dUr7,dXr3,dXr7
+ fsub dUi7,dXi3,dXi7
+
+ mov dT0s[0], t0
+
+ // finish second stage of 8 point FFT
+
+ fsub dVr1,dUr1,dUi5
+ // data[0] for next iteration
+ ld2 {dXr0,dXi0},[pSrc],pointStep
+ fadd dVi1,dUi1,dUr5
+ fadd dVr3,dUr1,dUi5
+ ld2 {dXr1,dXi1},[pSrc],pointStep // data[1]
+ fsub dVi3,dUi1,dUr5
+
+ fsub dVr5,dUr3,dUi7
+ ld2 {dXr2,dXi2},[pSrc],pointStep // data[2]
+ fadd dVi5,dUi3,dUr7
+ fadd dVr7,dUr3,dUi7
+ ld2 {dXr3,dXi3},[pSrc],pointStep // data[3]
+ fsub dVi7,dUi3,dUr7
+
+ // finish third stage of 8 point FFT
+
+ .ifeqs "\inverse", "TRUE"
+
+ // calculate a*v5
+ fmul dT1,dVr5,dT0[0] // use dVi0 for dT1
+
+ ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
+ fmul dVi5,dVi5,dT0[0]
+
+ ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
+ fsub dVr5,dT1,dVi5 // a * V5
+ fadd dVi5,dT1,dVi5
+
+ ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
+
+ // calculate b*v7
+ fmul dT1,dVr7,dT0[0]
+ fmul dVi7,dVi7,dT0[0]
+
+ // fadd qY1,qV1,qV5
+ // fsub qY5,qV1,qV5
+ fadd dYr1,dVr1,dVr5
+ fsub dYr5,dVr1,dVr5
+ fadd dYi1,dVi1,dVi5
+ fsub dYi5,dVi1,dVi5
+
+ fadd dVr7,dT1,dVi7 // b * V7
+ fsub dVi7,dVi7,dT1
+ SUB pDst, pDst, step2 // set pDst to y1
+
+ // On the last iteration, this will read past the end of pSrc,
+ // so skip this read.
+ BEQ radix8SkipLastUpdateInv\name
+ ld2 {dXr7,dXi7},[pSrc],setStep // data[7]
+radix8SkipLastUpdateInv\name:
+
+ fsub dYr3,dVr3,dVr7
+ fsub dYi3,dVi3,dVi7
+ st2 {dYr1,dYi1},[pDst],step1 // store y1
+ fadd dYr7,dVr3,dVr7
+ fadd dYi7,dVi3,dVi7
+
+
+ st2 {dYr3,dYi3},[pDst],step1 // store y3
+ st2 {dYr5,dYi5},[pDst],step1 // store y5
+ st2 {dYr7,dYi7},[pDst] // store y7
+ ADD pDst, pDst, #16
+
+ .ELSE
+
+ // calculate b*v7
+ fmul dT1,dVr7,dT0[0]
+ ld2 {dXr4,dXi4},[pSrc],pointStep // data[4]
+ fmul dVi7,dVi7,dT0[0]
+
+ ld2 {dXr5,dXi5},[pSrc],pointStep // data[5]
+ fadd dVr7,dT1,dVi7 // b * V7
+ fsub dVi7,dVi7,dT1
+
+ ld2 {dXr6,dXi6},[pSrc],pointStep // data[6]
+
+ // calculate a*v5
+ fmul dT1,dVr5,dT0[0] // use dVi0 for dT1
+ fmul dVi5,dVi5,dT0[0]
+
+ fadd dYr7,dVr3,dVr7
+ fadd dYi7,dVi3,dVi7
+ SUB pDst, pDst, step2 // set pDst to y1
+
+ fsub dVr5,dT1,dVi5 // a * V5
+ fadd dVi5,dT1,dVi5
+
+ // On the last iteration, this will read past the end of pSrc,
+ // so skip this read.
+ BEQ radix8SkipLastUpdateFwd\name
+ ld2 {dXr7,dXi7},[pSrc],setStep // data[7]
+radix8SkipLastUpdateFwd\name:
+
+ // fsub qY5,qV1,qV5
+ fsub dYr5,dVr1,dVr5
+ fsub dYi5,dVi1,dVi5
+
+ fsub dYr3,dVr3,dVr7
+ st2 {dYr7,dYi7},[pDst],step1 // store y1
+ fsub dYi3,dVi3,dVi7
+
+ // fadd qY1,qV1,qV5
+ fadd dYr1,dVr1,dVr5
+ fadd dYi1,dVi1,dVi5
+
+ st2 {dYr5,dYi5},[pDst],step1 // store y3
+ st2 {dYr3,dYi3},[pDst],step1 // store y5
+ st2 {dYr1,dYi1},[pDst],#16 // store y7
+
+ .ENDIF
+
+
+ // update pDst for the next set
+ SUB pDst, pDst, step2
+ BGT radix8fsGrpZeroSetLoop\name
+
+ // Save subFFTNum and subFFTSize for next stage
+ str subFFTNum, [pSubFFTNum]
+ str subFFTSize, [pSubFFTSize]
+
+ .endm
+
+
+ // Allocate stack memory required by the function
+
+
+ M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","FALSE",FWD
+ M_END
+
+
+ M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+ FFTSTAGE "FALSE","TRUE",INV
+ M_END
+
+
+
+ .end
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c
new file mode 100644
index 0000000..f29796b
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+/**
+ * Function: omxSP_FFTFwd_CToC_FC32_Sfs (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order,
+ * where 0 <= order <= 15.
+ * Transform length is determined by the specification structure, which
+ * must be initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship
+ * between the input and output sequences can be expressed in terms of the
+ * DFT, i.e.,
+ *
+ * X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ * k = 0,1,2,..., N-1
+ * N = 2^order
+ *
+ * Input Arguments:
+ * pSrc - pointer to the input signal, a complex-valued vector of length
+ * 2^order; must be aligned on a 32 byte boundary.
+ * pFFTSpec - pointer to the preallocated and initialized specification
+ * structure
+ *
+ * Output Arguments:
+ * pDst - pointer to the complex-valued output vector, of length 2^order;
+ * must be aligned on an 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ * is true:
+ * - one or more of the following pointers is NULL: pSrc, pDst, or
+ * pFFTSpec.
+ * - pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTFwd_CToC_FC32_Sfs(const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ const OMXFFTSpec_C_FC32* pFFTSpec) {
+ ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec;
+ int order;
+ long subFFTSize;
+ long subFFTNum;
+ OMX_FC32* pTwiddle;
+ OMX_FC32* pOut;
+
+ /*
+ * Check args are not NULL and the source and destination pointers
+ * are properly aligned.
+ */
+ if (!validateParametersFC32(pSrc, pDst, spec))
+ return OMX_Sts_BadArgErr;
+
+ order = fastlog2(spec->N);
+
+ subFFTSize = 1;
+ subFFTNum = spec->N;
+ pTwiddle = spec->pTwiddle;
+ pOut = spec->pBuf;
+
+ if (order > 3) {
+ OMX_FC32* argDst;
+
+ /*
+ * Set up argDst and pOut appropriately so that pOut = pDst for
+ * the very last FFT stage.
+ */
+ if ((order & 2) == 0) {
+ argDst = pOut;
+ pOut = pDst;
+ } else {
+ argDst = pDst;
+ }
+
+ /*
+ * Odd order uses a radix 8 first stage; even order, a radix 4
+ * first stage.
+ */
+ if (order & 1) {
+ armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+ pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+ pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ /*
+ * Now use radix 4 stages to finish rest of the FFT
+ */
+ if (subFFTNum >= 4) {
+ while (subFFTNum > 4) {
+ OMX_FC32* tmp;
+
+ armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ /*
+ * Swap argDst and pOut
+ */
+ tmp = pOut;
+ pOut = argDst;
+ argDst = tmp;
+ }
+
+ armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+ } else if (order == 3) {
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+ pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 2) {
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ /* Order = 1 */
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c
new file mode 100644
index 0000000..f1e503e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void ComplexToRealFixup(OMX_FC32* pSrc,
+ OMX_F32* pDst,
+ const OMX_FC32* pTwiddle,
+ OMX_F32* pBuf,
+ long N);
+
+/**
+ * Function: omxSP_FFTFwd_CToC_FC32_Sfs (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order,
+ * where 0 <= order <= 15.
+ * Transform length is determined by the specification structure, which
+ * must be initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship
+ * between the input and output sequences can be expressed in terms of the
+ * DFT, i.e.,
+ *
+ * X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ * k = 0,1,2,..., N-1
+ * N = 2^order
+ *
+ * Input Arguments:
+ * pSrc - pointer to the input signal, a complex-valued vector of length
+ * 2^order; must be aligned on a 32 byte boundary.
+ * pFFTSpec - pointer to the preallocated and initialized specification
+ * structure
+ *
+ * Output Arguments:
+ * pDst - pointer to the complex-valued output vector, of length 2^order;
+ * must be aligned on an 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ * is true:
+ * - one or more of the following pointers is NULL: pSrc, pDst, or
+ * pFFTSpec.
+ * - pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec) {
+ ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
+ int order;
+ long subFFTSize;
+ long subFFTNum;
+ OMX_FC32* pTwiddle;
+ OMX_FC32* pOut;
+ OMX_FC32* pComplexSrc = (OMX_FC32*) pSrc;
+ OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+ /*
+ * Check args are not NULL and the source and destination pointers
+ * are properly aligned.
+ */
+ if (!validateParametersF32(pSrc, pDst, spec))
+ return OMX_Sts_BadArgErr;
+
+ /*
+ * Compute the RFFT using a complex FFT of one less order, so set
+ * order to be the order of the complex FFT.
+ */
+ order = fastlog2(spec->N) - 1;
+
+ subFFTSize = 1;
+ subFFTNum = spec->N >> 1;
+ pTwiddle = spec->pTwiddle;
+ pOut = (OMX_FC32*) spec->pBuf;
+
+ if (order > 3) {
+ OMX_FC32* argDst;
+ OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+ /*
+ * Set up argDst and pOut appropriately so that pOut = pDst for
+ * ComplexToRealFixup.
+ */
+ if ((order & 2) != 0) {
+ argDst = pOut;
+ pOut = pComplexDst;
+ } else {
+ argDst = pComplexDst;
+ }
+
+ /*
+ * Odd order uses a radix 8 first stage; even order, a radix 4
+ * first stage.
+ */
+ if (order & 1) {
+ armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+ pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+ pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ /*
+ * Now use radix 4 stages to finish rest of the FFT
+ */
+ if (subFFTNum >= 4) {
+ while (subFFTNum > 4) {
+ OMX_FC32* tmp;
+
+ armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ /*
+ * Swap argDst and pOut
+ */
+ tmp = pOut;
+ pOut = argDst;
+ argDst = tmp;
+ }
+
+ armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+ } else if (order == 3) {
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+ pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 2) {
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+ pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 1) {
+ armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ /* Handle complex order 0 specially */
+ pOut->Re = pSrc[0];
+ pOut->Im = pSrc[1];
+ }
+
+ /*
+ * Complex FFT done. Fix up the complex result to give the correct
+ * RFFT.
+ */
+
+ ComplexToRealFixup(pOut, pDst, pTwiddle, spec->pBuf, spec->N);
+
+ return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c
new file mode 100644
index 0000000..84de9cf
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
+ const OMX_F32* pSrc,
+ const OMX_FC32* pTwiddle,
+ OMX_F32* pBuf,
+ long N);
+
+/*
+ * Scale FFT data by 1/|length|. |length| must be a power of two
+ */
+static inline ScaleRFFTData(OMX_F32* fftData, unsigned length) {
+ float32_t* data = (float32_t*)fftData;
+ float32_t scale = 2.0f / length;
+
+ if (length >= 4) {
+ /*
+ * Do 4 float elements at a time because |length| is always a
+ * multiple of 4 when |length| >= 4.
+ *
+ * TODO(rtoy): Figure out how to process 8 elements at a time
+ * using intrinsics or replace this with inline assembly.
+ */
+ do {
+ float32x4_t x = vld1q_f32(data);
+
+ length -= 4;
+ x = vmulq_n_f32(x, scale);
+ vst1q_f32(data, x);
+ data += 4;
+ } while (length > 0);
+ } else if (length == 2) {
+ float32x2_t x = vld1_f32(data);
+ x = vmul_n_f32(x, scale);
+ vst1_f32(data, x);
+ } else {
+ fftData[0] *= scale;
+ }
+}
+
+/**
+ * Function: omxSP_FFTInv_CCSToR_F32_Sfs
+ *
+ * Description:
+ * These functions compute the inverse FFT for a conjugate-symmetric input
+ * sequence. Transform length is determined by the specification structure,
+ * which must be initialized prior to calling the FFT function using
+ * <FFTInit_R_F32>. For a transform of length M, the input sequence is
+ * represented using a packed CCS vector of length M+2, and is organized
+ * as follows:
+ *
+ * Index: 0 1 2 3 4 5 . . . M-2 M-1 M M+1
+ * Comp: R[0] 0 R[1] I[1] R[2] I[2] . . . R[M/2-1] I[M/2-1] R[M/2] 0
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary
+ * components for FFT bin n. Bins are numbered from 0 to M/2, where M
+ * is the FFT length. Bin index 0 corresponds to the DC component,
+ * and bin index M/2 corresponds to the foldover frequency.
+ *
+ * Input Arguments:
+ * pSrc - pointer to the complex-valued input sequence represented
+ * using CCS format, of length (2^order) + 2; must be aligned on a
+ * 32-byte boundary.
+ * pFFTSpec - pointer to the preallocated and initialized
+ * specification structure
+ *
+ * Output Arguments:
+ * pDst - pointer to the real-valued output sequence, of length
+ * 2^order ; must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+
+ * OMX_Sts_BadArgErr - bad arguments if one or more of the
+ * following is true:
+ * - pSrc, pDst, or pFFTSpec is NULL
+ * - pSrc or pDst is not aligned on a 32-byte boundary
+ *
+ */
+OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(
+ const OMX_F32* pSrc,
+ OMX_F32* pDst,
+ const OMXFFTSpec_R_F32* pFFTSpec) {
+ ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
+ int order;
+ long subFFTSize;
+ long subFFTNum;
+ OMX_FC32* pTwiddle;
+ OMX_FC32* pOut;
+ OMX_FC32* pComplexSrc;
+ OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+ /*
+ * Check args are not NULL and the source and destination pointers
+ * are properly aligned.
+ */
+ if (!validateParametersF32(pSrc, pDst, spec))
+ return OMX_Sts_BadArgErr;
+
+ /*
+ * Preprocess the input before calling the complex inverse FFT. The
+ * result is actually stored in the second half of the temp buffer
+ * in pFFTSpec.
+ */
+ if (spec->N > 1)
+ armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
+ pSrc, spec->pTwiddle, spec->pBuf, spec->N);
+
+ /*
+ * Do a complex inverse FFT of half size.
+ */
+ order = fastlog2(spec->N) - 1;
+
+ subFFTSize = 1;
+ subFFTNum = spec->N >> 1;
+ pTwiddle = spec->pTwiddle;
+ /*
+ * The pBuf is split in half. The first half is the temp buffer. The
+ * second half holds the source data that was placed there by
+ * armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe.
+ */
+ pOut = (OMX_FC32*) spec->pBuf;
+ pComplexSrc = pOut + (1 << order);
+
+
+ if (order > 3) {
+ OMX_FC32* argDst;
+
+ /*
+ * Set up argDst and pOut appropriately so that pOut = pDst for
+ * the very last FFT stage.
+ */
+ if ((order & 2) == 0) {
+ argDst = pOut;
+ pOut = pComplexDst;
+ } else {
+ argDst = pComplexDst;
+ }
+
+ /*
+ * Odd order uses a radix 8 first stage; even order, a radix 4
+ * first stage.
+ */
+ if (order & 1) {
+ armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+ pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+ pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ /*
+ * Now use radix 4 stages to finish rest of the FFT
+ */
+ if (subFFTNum >= 4) {
+ while (subFFTNum > 4) {
+ OMX_FC32* tmp;
+
+ armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ /*
+ * Swap argDst and pOut
+ */
+ tmp = pOut;
+ pOut = argDst;
+ argDst = tmp;
+ }
+
+ armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+ } else if (order == 3) {
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+ pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 2) {
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 1) {
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ /* Order = 0 */
+ *pComplexDst = *pComplexSrc;
+ }
+
+ ScaleRFFTData(pDst, spec->N);
+ return OMX_Sts_NoErr;
+}
+
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c
new file mode 100644
index 0000000..eec05e9
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+ const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ OMX_FC32* pTwiddle,
+ long* subFFTNum,
+ long* subFFTSize);
+
+/*
+ * Scale FFT data by 1/|length|. |length| must be a power of two
+ */
+static inline ScaleFFTData(OMX_FC32* fftData, unsigned length) {
+ float32_t* data = (float32_t*)fftData;
+ float32_t scale = 1.0f / length;
+
+ /*
+ * Do two complex elements at a time because |length| is always
+ * greater than or equal to 2 (order >= 1)
+ */
+ do {
+ float32x4_t x = vld1q_f32(data);
+
+ length -= 2;
+ x = vmulq_n_f32(x, scale);
+ vst1q_f32(data, x);
+ data += 4;
+ } while (length > 0);
+}
+
+/**
+ * Function: omxSP_FFTInv_CToC_FC32
+ *
+ * Description:
+ * These functions compute an inverse FFT for a complex signal of
+ * length of 2^order, where 0 <= order <= 15. Transform length is
+ * determined by the specification structure, which must be
+ * initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_FC32>. The relationship between the input
+ * and output sequences can be expressed in terms of the IDFT, i.e.:
+ *
+ * x[n] = SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N)
+ * n=0,1,2,...N-1
+ * N=2^order.
+ *
+ * Input Arguments:
+ * pSrc - pointer to the complex-valued input signal, of length 2^order ;
+ * must be aligned on a 32-byte boundary.
+ * pFFTSpec - pointer to the preallocated and initialized specification
+ * structure
+ *
+ * Output Arguments:
+ * order
+ * pDst - pointer to the complex-valued output signal, of length 2^order;
+ * must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ * is true:
+ * - one or more of the following pointers is NULL: pSrc, pDst, or
+ * pFFTSpec.
+ * - pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTInv_CToC_FC32_Sfs(const OMX_FC32* pSrc,
+ OMX_FC32* pDst,
+ const OMXFFTSpec_C_FC32* pFFTSpec) {
+ ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec;
+ int order;
+ long subFFTSize;
+ long subFFTNum;
+ OMX_FC32* pTwiddle;
+ OMX_FC32* pOut;
+
+ /*
+ * Check args are not NULL and the source and destination pointers
+ * are properly aligned.
+ */
+ if (!validateParametersFC32(pSrc, pDst, spec))
+ return OMX_Sts_BadArgErr;
+
+ order = fastlog2(spec->N);
+
+ subFFTSize = 1;
+ subFFTNum = spec->N;
+ pTwiddle = spec->pTwiddle;
+ pOut = spec->pBuf;
+
+ if (order > 3) {
+ OMX_FC32* argDst;
+
+ /*
+ * Set up argDst and pOut appropriately so that pOut = pDst for
+ * the very last FFT stage.
+ */
+ if ((order & 2) == 0) {
+ argDst = pOut;
+ pOut = pDst;
+ } else {
+ argDst = pDst;
+ }
+
+ /*
+ * Odd order uses a radix 8 first stage; even order, a radix 4
+ * first stage.
+ */
+ if (order & 1) {
+ armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+ pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+ pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ /*
+ * Now use radix 4 stages to finish rest of the FFT
+ */
+ if (subFFTNum >= 4) {
+ while (subFFTNum > 4) {
+ OMX_FC32* tmp;
+
+ armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ /*
+ * Swap argDst and pOut
+ */
+ tmp = pOut;
+ pOut = argDst;
+ argDst = tmp;
+ }
+
+ armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+ argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+ } else if (order == 3) {
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+ pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else if (order == 2) {
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+ armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+ pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ } else {
+ /* Order = 1 */
+ armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+ pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+ }
+
+ ScaleFFTData(pDst, spec->N);
+ return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/test/support/float_fft_neon.c b/dl/sp/src/test/support/float_fft_neon.c
index 3f0cf16..a10d803 100644
--- a/dl/sp/src/test/support/float_fft_neon.c
+++ b/dl/sp/src/test/support/float_fft_neon.c
@@ -8,22 +8,37 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
+
#include "dl/sp/api/armSP.h"
#include "dl/sp/api/omxSP.h"
#include "dl/sp/src/test/test_util.h"
static const char* message =
- "Test forward and inverse floating-point FFT (NEON)\n";
+ "Test forward and inverse floating-point FFT"
+#if defined(__aarch64__)
+ " (ARM64)\n"
+#else
+ " (NEON)\n"
+#endif
+ ;
const char* UsageMessage() {
return message;
}
+#if defined(__aarch64__)
+#define FINISHED_MESSAGE "ARM64 tests finished.\n"
+#else
+#define FINISHED_MESSAGE "NEON tests finished.\n"
+#endif
+
void FinishedMessage() {
- printf("NEON tests finished.\n");
+ printf(FINISHED_MESSAGE);
}
void SetThresholds(struct TestInfo* info) {
+#if defined(__arm__)
#ifdef BIG_FFT_TABLE
info->forward_threshold_ = 138.81;
info->inverse_threshold_ = 137.81;
@@ -31,6 +46,15 @@
info->forward_threshold_ = 138.81;
info->inverse_threshold_ = 138.81;
#endif
+#else
+#ifdef BIG_FFT_TABLE
+ info->forward_threshold_ = 138.96;
+ info->inverse_threshold_ = 138.96;
+#else
+ info->forward_threshold_ = 138.96;
+ info->inverse_threshold_ = 138.96;
+#endif
+#endif
}
OMXResult ForwardFFT(OMX_FC32* x,
diff --git a/dl/sp/src/test/support/float_rfft_thresholds.h b/dl/sp/src/test/support/float_rfft_thresholds.h
index 2d84eec..6bbc937 100644
--- a/dl/sp/src/test/support/float_rfft_thresholds.h
+++ b/dl/sp/src/test/support/float_rfft_thresholds.h
@@ -23,6 +23,14 @@
#define FLOAT_RFFT_FORWARD_THRESHOLD_ARMV7 (134.95)
#define FLOAT_RFFT_INVERSE_THRESHOLD_ARMV7 (142.25)
#endif
+#elif defined(__aarch64__)
+#ifdef BIG_FFT_TABLE
+#define FLOAT_RFFT_FORWARD_THRESHOLD_NEON (136.55)
+#define FLOAT_RFFT_INVERSE_THRESHOLD_NEON (141.55)
+#else
+#define FLOAT_RFFT_FORWARD_THRESHOLD_NEON (136.55)
+#define FLOAT_RFFT_INVERSE_THRESHOLD_NEON (142.74)
+#endif
#else
#ifdef BIG_FFT_TABLE
#define FLOAT_RFFT_FORWARD_THRESHOLD_X86 (135.97)
diff --git a/dl/sp/src/test/test_fft.gyp b/dl/sp/src/test/test_fft.gyp
index b1d88e3..3c739cf 100644
--- a/dl/sp/src/test/test_fft.gyp
+++ b/dl/sp/src/test/test_fft.gyp
@@ -128,6 +128,19 @@
},
],
}],
+ ['target_arch == "arm64"', {
+ 'targets': [
+ {
+ # Test complex floating-point FFT
+ 'target_name': 'test_float_fft',
+ 'type': 'executable',
+ 'sources': [
+ 'test_float_fft.c',
+ 'support/float_fft_neon.c',
+ ],
+ },
+ ],
+ }],
],
'targets': [
# Targets that should be supported by all architectures
@@ -155,7 +168,7 @@
'support/float_rfft_thresholds.h',
],
'conditions': [
- ['target_arch == "arm"', {
+ ['target_arch == "arm" or target_arch == "arm64"', {
'sources': [
'support/float_rfft_neon.c',
],
@@ -175,9 +188,9 @@
'test_fft_time.c',
],
'conditions': [
- ['target_arch == "ia32"', {
+ ['target_arch == "ia32" or target_arch == "arm64"', {
'defines': [
- # Timing test only for float FFTs on x86
+ # Timing test only for float FFTs on x86 and arm64.
'FLOAT_ONLY',
],
}],
@@ -206,6 +219,12 @@
'test_float_rfft_detect',
],
}],
+ ['target_arch == "arm64"', {
+ # Supported test programs for ARM64
+ 'dependencies': [
+ 'test_float_fft',
+ ],
+ }],
],
'dependencies' : [
# All architectures must support at least the float rfft test
diff --git a/dl/sp/src/test/test_fft_time.c b/dl/sp/src/test/test_fft_time.c
index db65134..6154228 100644
--- a/dl/sp/src/test/test_fft_time.c
+++ b/dl/sp/src/test/test_fft_time.c
@@ -51,7 +51,7 @@
S32,
} s16_s32;
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
void TimeOneFloatFFT(int count, int fft_log_size, float signal_value,
int signal_type);
void TimeFloatFFT(int count, float signal_value, int signal_type);
@@ -112,7 +112,7 @@
" -T Run just one FFT timing test\n"
" -f FFT type:\n"
" 0 - Complex Float\n"
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
" 1 - Real Float\n"
#endif
#ifdef ENABLE_FIXED_POINT_FFT_TESTS
@@ -217,7 +217,7 @@
printf("Warning: -f ignored when -T not specified\n");
if (test_mode) {
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
TimeFloatFFT(count, signal_value, signal_type);
#endif
TimeFloatRFFT(count, signal_value, signal_type);
@@ -229,7 +229,7 @@
#endif
} else {
switch (fft_type) {
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
case 0:
TimeOneFloatFFT(count, fft_log_size, signal_value, signal_type);
break;
@@ -323,7 +323,7 @@
return count;
}
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
void TimeOneFloatFFT(int count, int fft_log_size, float signal_value,
int signal_type) {
struct AlignedPtr* x_aligned;
@@ -1061,11 +1061,11 @@
if(s16s32 == S32) {
PrintResult("Forward RFFT16 (with S32)",
- fft_log_size, elapsed_time, count);
+ fft_log_size, elapsed_time, count);
}
else {
PrintResult("Forward RFFT16 (with S16)",
- fft_log_size, elapsed_time, count);
+ fft_log_size, elapsed_time, count);
}
}
@@ -1124,11 +1124,11 @@
if(s16s32 == S32) {
PrintResult("Inverse RFFT16 (with S32)",
- fft_log_size, elapsed_time, count);
+ fft_log_size, elapsed_time, count);
}
else {
PrintResult("Inverse RFFT16 (with S16)",
- fft_log_size, elapsed_time, count);
+ fft_log_size, elapsed_time, count);
}
}
diff --git a/dl/sp/src/test/test_float_fft.c b/dl/sp/src/test/test_float_fft.c
index 289b197..6aac3a4 100644
--- a/dl/sp/src/test/test_float_fft.c
+++ b/dl/sp/src/test/test_float_fft.c
@@ -154,7 +154,7 @@
status = ForwardFFT(x, y, fft_fwd_spec);
- if (status) {
+ if (status != OMX_Sts_NoErr) {
fprintf(stderr, "Forward FFT failed: status = %d\n", status);
exit(1);
}
@@ -223,7 +223,7 @@
status = InverseFFT(y, z, fft_inv_spec);
- if (status) {
+ if (status != OMX_Sts_NoErr) {
fprintf(stderr, "Inverse FFT failed: status = %d\n", status);
exit(1);
}
diff --git a/dl/sp/src/test/test_float_rfft.c b/dl/sp/src/test/test_float_rfft.c
index a976162..3b37ece 100644
--- a/dl/sp/src/test/test_float_rfft.c
+++ b/dl/sp/src/test/test_float_rfft.c
@@ -177,7 +177,7 @@
}
status = ForwardRFFT(x, (OMX_F32*) y, fft_fwd_spec);
- if (status) {
+ if (status != OMX_Sts_NoErr) {
fprintf(stderr, "Forward FFT failed: status = %d\n", status);
exit(1);
}
@@ -257,7 +257,7 @@
}
status = InverseRFFT((OMX_F32 *) yTrue, z, fft_inv_spec);
- if (status) {
+ if (status != OMX_Sts_NoErr) {
fprintf(stderr, "Inverse FFT failed: status = %d\n", status);
exit(1);
}