Implement ARM64 version of OpenMAX DL

This is a conversion of the existing ARM NEON OpenMAX DL FFT routines
to arm64. The translation was done by hand and mostly just uses the
correct register names and instructions for ARM64.

The test_float_fft and test_float_rfft programs pass with SNRs
basically equivalent to the original ARM NEON version.

BUG=
R=andrew@webrtc.org, leecam@google.com

Review URL: https://webrtc-codereview.appspot.com/14539004

git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@6477 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/api/arm/arm64COMM_s.h b/dl/api/arm/arm64COMM_s.h
new file mode 100644
index 0000000..e19ceee
--- /dev/null
+++ b/dl/api/arm/arm64COMM_s.h
@@ -0,0 +1,258 @@
+// -*- Mode: asm; -*-
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This file was originally licensed as follows. It has been
+//  relicensed with permission from the copyright holders.
+//
+	
+// 
+// File Name:  armCOMM_s.h
+// OpenMAX DL: v1.0.2
+// Last Modified Revision:   13871
+// Last Modified Date:       Fri, 09 May 2008
+// 
+// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+// 
+// 
+//
+// ARM optimized OpenMAX common header file
+//
+
+	.set	_SBytes, 0	// Number of scratch bytes on stack
+	.set	_Workspace, 0	// Stack offset of scratch workspace
+
+	.set	_RRegList, 0	// R saved register list (last register number)
+	.set	_DRegList, 0	// D saved register list (last register number)
+
+	// Work out list of D saved registers, like for R registers.
+	.macro	_M_GETDREGLIST dreg
+	.ifeqs "\dreg", ""
+	.set	_DRegList, 0
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d8"
+	.set	_DRegList, 8
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d9"
+	.set	_DRegList, 9
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d10"
+	.set	_DRegList, 10
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d11"
+	.set	_DRegList, 11
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d12"
+	.set	_DRegList, 12
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d13"
+	.set	_DRegList, 13
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d14"
+	.set	_DRegList, 14
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d15"
+	.set	_DRegList, 15
+	.exitm
+	.endif
+
+	.warning "Unrecognized saved d register limit: \rreg"
+	.endm
+
+//////////////////////////////////////////////////////////
+// Function header and footer macros
+//////////////////////////////////////////////////////////      
+	
+        // Function Header Macro    
+        // Generates the function prologue
+        // Note that functions should all be "stack-moves-once"
+        // The FNSTART and FNEND macros should be the only places
+        // where the stack moves.
+        //    
+        //  name  = function name
+        //  rreg  = ""   don't stack any registers
+        //          "lr" stack "lr" only
+        //          "rN" stack registers "r4-rN,lr"
+        //  dreg  = ""   don't stack any D registers
+        //          "dN" stack registers "d8-dN"
+        //
+        // Note: ARM Archicture procedure call standard AAPCS
+        // states that r4-r11, sp, d8-d15 must be preserved by
+        // a compliant function.
+	.macro	M_START name, rreg, dreg
+	.set	_Workspace, 0
+
+	// Define the function and make it external.
+	.global	\name
+	.func	\name
+	.section	.text.\name,"ax",%progbits
+	.align	4
+\name :		
+//.fnstart
+	// Save specified R registers
+	_M_PUSH_RREG
+
+	// Save specified D registers
+        _M_GETDREGLIST  \dreg
+	_M_PUSH_DREG
+
+	// Ensure size claimed on stack is 16-byte aligned for ARM64
+	.if (_SBytes & 15) != 0
+	.set	_SBytes, _SBytes + (16 - (_SBytes & 15))
+	.endif
+	.if _SBytes != 0
+		sub	sp, sp, #_SBytes
+	.endif	
+	.endm
+
+        // Function Footer Macro        
+        // Generates the function epilogue
+	.macro M_END
+	// Restore the stack pointer to its original value on function entry
+	.if _SBytes != 0
+		add	sp, sp, #_SBytes
+	.endif
+	// Restore any saved R or D registers.
+	_M_RET
+	//.fnend	
+	.endfunc
+        // Reset the global stack tracking variables back to their
+	// initial values.
+	.set _SBytes, 0
+	.endm
+
+	// Based on the value of _DRegList, push the specified set of registers 
+	// to the stack.
+	// The ARM64 ABI says only v8-v15 needs to be saved across calls and only 
+	// the lower 64 bits need to be saved.
+	.macro _M_PUSH_DREG
+	.if _DRegList >= 8
+		sub	sp, sp, (_DRegList - 7) * 16	// 16-byte alignment
+		str	q8, [sp]
+	.endif
+	
+	.if _DRegList >= 9
+		str	q9, [sp, #16]
+	.endif
+	
+	.if _DRegList >= 10
+		str	q10, [sp, #32]
+	.endif
+	
+	.if _DRegList >= 11
+		str	q11, [sp, #48]
+	.endif
+	
+	.if _DRegList >= 12
+		str	q12, [sp, #64]
+	.endif
+	
+	.if _DRegList >= 13
+		str	q13, [sp, #80]
+	.endif
+	
+	.if _DRegList >= 14
+		str	q14, [sp, #96]
+	.endif
+	
+	.if _DRegList >= 15
+		str	q15, [sp, #112]
+	.endif
+	
+	.exitm
+	.endm
+
+	// Based on the value of _RRegList, push the specified set of registers 
+	// to the stack.
+	// The ARM64 ABI says registers r19-r29 needs to be saved across calls.
+	// But for the FFT routines, we don't need to save anything, so just
+	// preserve the SP and LR.
+	.macro _M_PUSH_RREG
+	sub	sp, sp, #16
+	str	x30, [sp]
+	str	x29, [sp, #8]
+	.exitm
+	.endm
+
+	// The opposite of _M_PUSH_DREG
+	.macro  _M_POP_DREG
+	.if _DRegList >= 8
+		ldr	q8, [sp]
+	.endif
+	
+	.if _DRegList >= 9
+		ldr	q9, [sp, #16]
+	.endif
+	
+	.if _DRegList >= 10
+		ldr	q10, [sp, #32]
+	.endif
+	
+	.if _DRegList >= 11
+		ldr	q11, [sp, #48]
+	.endif
+	
+	.if _DRegList >= 12
+		ldr	q12, [sp, #64]
+	.endif
+	
+	.if _DRegList >= 13
+		ldr	q13, [sp, #80]
+	.endif
+	
+	.if _DRegList >= 14
+		ldr	q14, [sp, #96]
+	.endif
+	
+	.if _DRegList >= 15
+		ldr	q15, [sp, #112]
+	.endif
+
+	.if _DRegList >= 8
+		add	sp, sp, (_DRegList - 7) * 16	// 16-byte alignment
+	.endif
+	.exitm
+	.endm
+
+	// The opposite of _M_PUSH_RREG
+	.macro _M_POP_RREG cc
+	ldr	x29, [sp, #8]
+	ldr	x30, [sp]
+	add	sp, sp, #16
+	.exitm
+	.endm
+	
+        // Produce function return instructions
+	.macro	_M_RET cc
+	_M_POP_DREG \cc
+	_M_POP_RREG \cc
+	ret
+	.endm	
+	// rsb - reverse subtract
+	// compute dst = src2 - src1, useful when src2 is an immediate value
+	.macro	rsb	dst, src1, src2
+	sub	\dst, \src1, \src2
+	neg	\dst, \dst
+	.endm
diff --git a/dl/api/arm/omxtypes_s.h b/dl/api/arm/omxtypes_s.h
index d880d35..b27f72d 100644
--- a/dl/api/arm/omxtypes_s.h
+++ b/dl/api/arm/omxtypes_s.h
@@ -1,76 +1,76 @@
-@//
-@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
-@//
-@//  Use of this source code is governed by a BSD-style license
-@//  that can be found in the LICENSE file in the root of the source
-@//  tree. An additional intellectual property rights grant can be found
-@//  in the file PATENTS.  All contributing project authors may
-@//  be found in the AUTHORS file in the root of the source tree.
-@//
-@//  This file was originally licensed as follows. It has been
-@//  relicensed with permission from the copyright holders.
-@//
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This file was originally licensed as follows. It has been
+//  relicensed with permission from the copyright holders.
+//
 
-@// 
-@// File Name:  omxtypes_s.h
-@// OpenMAX DL: v1.0.2
-@// Last Modified Revision:   9622
-@// Last Modified Date:       Wed, 06 Feb 2008
-@// 
-@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-@// 
-@//
+// 
+// File Name:  omxtypes_s.h
+// OpenMAX DL: v1.0.2
+// Last Modified Revision:   9622
+// Last Modified Date:       Wed, 06 Feb 2008
+// 
+// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+// 
+//
 
-@// Mandatory return codes - use cases are explicitly described for each function 
-	.equ	OMX_Sts_NoErr, 0    @// No error the function completed successfully 
-	.equ	OMX_Sts_Err, -2    @// Unknown/unspecified error     
-	.equ	OMX_Sts_InvalidBitstreamValErr, -182  @// Invalid value detected during bitstream processing     
-	.equ	OMX_Sts_MemAllocErr, -9    @// Not enough memory allocated for the operation 
-	.equ	OMX_StsACAAC_GainCtrErr, -159  @// AAC: Unsupported gain control data detected 
-	.equ	OMX_StsACAAC_PrgNumErr, -167  @// AAC: Invalid number of elements for one program   
-	.equ	OMX_StsACAAC_CoefValErr, -163  @// AAC: Invalid quantized coefficient value               
-	.equ	OMX_StsACAAC_MaxSfbErr, -162  @// AAC: Invalid maxSfb value in relation to numSwb     
-	.equ	OMX_StsACAAC_PlsDataErr, -160  @// AAC: pulse escape sequence data error 
+// Mandatory return codes - use cases are explicitly described for each function 
+        .equ    OMX_Sts_NoErr, 0    // No error the function completed successfully 
+        .equ    OMX_Sts_Err, -2    // Unknown/unspecified error     
+        .equ    OMX_Sts_InvalidBitstreamValErr, -182  // Invalid value detected during bitstream processing     
+        .equ    OMX_Sts_MemAllocErr, -9    // Not enough memory allocated for the operation 
+        .equ    OMX_StsACAAC_GainCtrErr, -159  // AAC: Unsupported gain control data detected 
+        .equ    OMX_StsACAAC_PrgNumErr, -167  // AAC: Invalid number of elements for one program   
+        .equ    OMX_StsACAAC_CoefValErr, -163  // AAC: Invalid quantized coefficient value               
+        .equ    OMX_StsACAAC_MaxSfbErr, -162  // AAC: Invalid maxSfb value in relation to numSwb     
+        .equ    OMX_StsACAAC_PlsDataErr, -160  // AAC: pulse escape sequence data error 
 
-@// Optional return codes - use cases are explicitly described for each function
-	.equ	OMX_Sts_BadArgErr, -5    @// Bad Arguments 
+// Optional return codes - use cases are explicitly described for each function
+        .equ    OMX_Sts_BadArgErr, -5    // Bad Arguments 
 
-	.equ	OMX_StsACAAC_TnsNumFiltErr, -157  @// AAC: Invalid number of TNS filters  
-	.equ	OMX_StsACAAC_TnsLenErr, -156  @// AAC: Invalid TNS region length     
-	.equ	OMX_StsACAAC_TnsOrderErr, -155  @// AAC: Invalid order of TNS filter                    
-	.equ	OMX_StsACAAC_TnsCoefResErr, -154  @// AAC: Invalid bit-resolution for TNS filter coefficients  
-	.equ	OMX_StsACAAC_TnsCoefErr, -153  @// AAC: Invalid TNS filter coefficients                    
-	.equ	OMX_StsACAAC_TnsDirectErr, -152  @// AAC: Invalid TNS filter direction    
-	.equ	OMX_StsICJP_JPEGMarkerErr, -183  @// JPEG marker encountered within an entropy-coded block; 
-                                            @// Huffman decoding operation terminated early.           
-	.equ	OMX_StsICJP_JPEGMarker, -181  @// JPEG marker encountered; Huffman decoding 
-                                            @// operation terminated early.                         
-	.equ	OMX_StsIPPP_ContextMatchErr, -17   @// Context parameter doesn't match to the operation 
+        .equ    OMX_StsACAAC_TnsNumFiltErr, -157  // AAC: Invalid number of TNS filters  
+        .equ    OMX_StsACAAC_TnsLenErr, -156  // AAC: Invalid TNS region length     
+        .equ    OMX_StsACAAC_TnsOrderErr, -155  // AAC: Invalid order of TNS filter                    
+        .equ    OMX_StsACAAC_TnsCoefResErr, -154  // AAC: Invalid bit-resolution for TNS filter coefficients  
+        .equ    OMX_StsACAAC_TnsCoefErr, -153  // AAC: Invalid TNS filter coefficients                    
+        .equ    OMX_StsACAAC_TnsDirectErr, -152  // AAC: Invalid TNS filter direction    
+        .equ    OMX_StsICJP_JPEGMarkerErr, -183  // JPEG marker encountered within an entropy-coded block; 
+                                            // Huffman decoding operation terminated early.           
+        .equ    OMX_StsICJP_JPEGMarker, -181  // JPEG marker encountered; Huffman decoding 
+                                            // operation terminated early.                         
+        .equ    OMX_StsIPPP_ContextMatchErr, -17   // Context parameter doesn't match to the operation 
 
-	.equ	OMX_StsSP_EvenMedianMaskSizeErr, -180  @// Even size of the Median Filter mask was replaced by the odd one 
+        .equ    OMX_StsSP_EvenMedianMaskSizeErr, -180  // Even size of the Median Filter mask was replaced by the odd one 
 
-	.equ	OMX_Sts_MaximumEnumeration, 0x7FFFFFFF
+        .equ    OMX_Sts_MaximumEnumeration, 0x7FFFFFFF
 
 
 
-	.equ	OMX_MIN_S8, (-128)
-	.equ	OMX_MIN_U8, 0
-	.equ	OMX_MIN_S16, (-32768)
-	.equ	OMX_MIN_U16, 0
+        .equ    OMX_MIN_S8, (-128)
+        .equ    OMX_MIN_U8, 0
+        .equ    OMX_MIN_S16, (-32768)
+        .equ    OMX_MIN_U16, 0
 
 
-	.equ	OMX_MIN_S32, (-2147483647-1)
-	.equ	OMX_MIN_U32, 0
+        .equ    OMX_MIN_S32, (-2147483647-1)
+        .equ    OMX_MIN_U32, 0
 
-	.equ	OMX_MAX_S8, (127)
-	.equ	OMX_MAX_U8, (255)
-	.equ	OMX_MAX_S16, (32767)
-	.equ	OMX_MAX_U16, (0xFFFF)
-	.equ	OMX_MAX_S32, (2147483647)
-	.equ	OMX_MAX_U32, (0xFFFFFFFF)
+        .equ    OMX_MAX_S8, (127)
+        .equ    OMX_MAX_U8, (255)
+        .equ    OMX_MAX_S16, (32767)
+        .equ    OMX_MAX_U16, (0xFFFF)
+        .equ    OMX_MAX_S32, (2147483647)
+        .equ    OMX_MAX_U32, (0xFFFFFFFF)
 
-	.equ	OMX_VC_UPPER, 0x1                 @// Used by the PredictIntra functions   
-	.equ	OMX_VC_LEFT, 0x2                 @// Used by the PredictIntra functions 
-	.equ	OMX_VC_UPPER_RIGHT, 0x40          @// Used by the PredictIntra functions   
+        .equ    OMX_VC_UPPER, 0x1                 // Used by the PredictIntra functions   
+        .equ    OMX_VC_LEFT, 0x2                 // Used by the PredictIntra functions 
+        .equ    OMX_VC_UPPER_RIGHT, 0x40          // Used by the PredictIntra functions   
 
-	.equ	NULL, 0
+        .equ    NULL, 0
diff --git a/dl/dl.gyp b/dl/dl.gyp
index d4c812b..8972d07 100644
--- a/dl/dl.gyp
+++ b/dl/dl.gyp
@@ -34,6 +34,21 @@
             'BIG_FFT_TABLE',
           ],
         }],
+        ['target_arch=="arm" or target_arch=="arm64"', {
+          'sources':[
+            # Common files that are used by both arm and arm64 code.
+            'api/arm/armOMX.h',
+            'api/arm/omxtypes_s.h',
+            'sp/api/armSP.h',
+            'sp/src/arm/armSP_FFT_S32TwiddleTable.c',
+            'sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c',
+            'sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c',
+            'sp/src/arm/omxSP_FFTGetBufSize_R_F32.c',
+            'sp/src/arm/omxSP_FFTGetBufSize_R_S32.c',
+            'sp/src/arm/omxSP_FFTInit_C_FC32.c',
+            'sp/src/arm/omxSP_FFTInit_R_F32.c',
+          ],
+        }],
         ['target_arch=="arm"', {
           'cflags!': [
             '-mfpu=vfpv3-d16',
@@ -49,21 +64,11 @@
           'sources': [
             # Common files that are used by both the NEON and non-NEON code.
             'api/armCOMM_s.h',
-            'api/armOMX.h',
-            'api/omxtypes_s.h',
-            'sp/api/armSP.h',
-            'sp/src/arm/armSP_FFT_S32TwiddleTable.c',
-            'sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c',
             'sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c',
-            'sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c',
-            'sp/src/arm/omxSP_FFTGetBufSize_R_F32.c',
             'sp/src/arm/omxSP_FFTGetBufSize_R_S16.c',
             'sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c',
-            'sp/src/arm/omxSP_FFTGetBufSize_R_S32.c',
-            'sp/src/arm/omxSP_FFTInit_C_FC32.c',
             'sp/src/arm/omxSP_FFTInit_C_SC16.c',
             'sp/src/arm/omxSP_FFTInit_C_SC32.c',
-            'sp/src/arm/omxSP_FFTInit_R_F32.c',
             'sp/src/arm/omxSP_FFTInit_R_S16.c',
             'sp/src/arm/omxSP_FFTInit_R_S16S32.c',
             'sp/src/arm/omxSP_FFTInit_R_S32.c',
@@ -153,6 +158,27 @@
             'sp/src/x86/x86SP_SSE_Math.h',
           ],
         }],
+        ['target_arch=="arm64"', {
+          'sources':[
+            'api/arm/arm64COMM_s.h',
+
+            # Complex floating-point FFT
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S',
+            'sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c',
+            'sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c',
+            # Real floating-point FFT
+            'sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S',
+            'sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c',
+            'sp/src/arm/arm64/ComplexToRealFixup.S',
+            'sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c',
+          ],
+        }],
       ],
     },
   ],
diff --git a/dl/sp/api/armSP.h b/dl/sp/api/armSP.h
index 4972f09..cf17ec5 100644
--- a/dl/sp/api/armSP.h
+++ b/dl/sp/api/armSP.h
@@ -29,6 +29,8 @@
 #ifndef _armSP_H_
 #define _armSP_H_
 
+#include <stdint.h>
+
 #include "dl/api/omxtypes.h"
 
 #ifdef __cplusplus
@@ -88,6 +90,42 @@
     OMX_FC32* pBuf;
 } ARMsFFTSpec_FC32;
 
+/*
+ * Compute log2(x), where x must be a power of 2.
+ */
+static inline OMX_U32 fastlog2(long x) {
+  OMX_U32 out;
+  asm("clz %0,%1\n\t"
+      "sub %0, %0, #63\n\t"
+      "neg %0, %0\n\t"
+      : "=r"(out)
+      : "r"(x)
+      :);
+  return out;
+}
+
+/*
+ *  Validate args. All pointers must be non-NULL; the source and
+ *  destination pointers must be aligned on a 32-byte boundary; the
+ *  FFT spec must have non-NULL pointers; and the FFT size must be
+ *  within range.
+ */
+static inline int validateParametersFC32(const void* pSrc,
+					 const void* pDst,
+					 const ARMsFFTSpec_FC32* pFFTSpec) {
+  return pSrc && pDst && pFFTSpec && !(((uintptr_t)pSrc) & 31) &&
+         !(((uintptr_t)pDst) & 31) && pFFTSpec->pTwiddle && pFFTSpec->pBuf &&
+         (pFFTSpec->N >= 2) && (pFFTSpec->N <= (1 << TWIDDLE_TABLE_ORDER));
+}
+
+static inline int validateParametersF32(const void* pSrc,
+					const void* pDst,
+					const ARMsFFTSpec_R_FC32* pFFTSpec) {
+  return pSrc && pDst && pFFTSpec && !(((uintptr_t)pSrc) & 31) &&
+         !(((uintptr_t)pDst) & 31) && pFFTSpec->pTwiddle && pFFTSpec->pBuf &&
+         (pFFTSpec->N >= 2) && (pFFTSpec->N <= (1 << TWIDDLE_TABLE_ORDER));
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/dl/sp/src/arm/arm64/ComplexToRealFixup.S b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
new file mode 100644
index 0000000..9b30093
--- /dev/null
+++ b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
@@ -0,0 +1,261 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute FFT for a real signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+    // Guarding implementation by the processor name
+
+// Import symbols required from other files
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pOut		x3	
+#define	subFFTNum	x4
+
+// Output registers
+
+//Local Scratch Registers
+
+#define argTwiddle      x5
+#define argDst          x6
+#define subFFTSize      x7
+#define N               subFFTNum
+#define order           x14
+#define step            x8
+#define step1           pTwiddle
+#define twStep          x9
+#define zero            w10
+#define pTwiddleTmp     pOut
+
+// Neon registers
+
+#define dX0       v0.2s
+#define dX0s      v0.s
+#define dX0r      v2.2s
+#define dX0rs     v2.s
+#define dX0i      v3.2s
+#define dX0is     v3.s
+#define dX1r      v4.2s
+#define dX1i      v5.2s
+#define dT0       v6.2s
+#define dT1       v7.2s
+#define dT2       v8.2s
+#define dT3       v9.2s
+#define qT0       v10.2s
+#define qT1       v12.2s
+#define dW0r      v14.2s
+#define dW0r8b    v14.8b
+#define dW0i      v15.2s
+#define dW1r      v16.2s
+#define dW1r8b    v16.8b
+#define dW1i      v17.2s
+#define dY0r      v14.2s
+#define dY0i      v15.2s
+#define dY1r      v16.2s
+#define dY1i      v17.2s
+#define qT2       v18.2s
+#define qT3       v20.2s
+
+#define half      v0.2s
+#define dZip      v21.2s
+#define dZip8b    v21.8b
+        
+    // Allocate stack memory required by the function
+
+    // Write function header
+        M_START     ComplexToRealFixup,,d15
+
+        asr     N, N, #1
+
+        clz     order, subFFTNum                    // N = 2^order
+
+        RSB     order,order,#63
+        MOV     subFFTSize,subFFTNum            // subFFTSize = N/2
+        //MOV     subFFTNum,N
+        mov     argDst, pDst
+        mov     argTwiddle, pTwiddle
+
+        // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+        // 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] - j [0+j2b]
+        // (a+b, 0)
+
+        // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+        // 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] + j [0+j2b]
+        // (a-b, 0)
+
+        // F(0) and F(N/2)
+        ld2     {dX0rs,dX0is}[0],[pSrc], #8
+        MOV     zero,#0
+        mov    dX0rs[1],zero
+        lsl     step,subFFTSize, #3               // step = N/2 * 8 bytes
+        mov    dX0i[1],zero
+        // twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,subFFTSize,LSL #1
+
+        fadd    dY0r,dX0r,dX0i                    // F(0) = ((Z0.r+Z0.i) , 0)
+        lsl     step1,subFFTSize, #2              // step1 = N/2 * 4 bytes
+        fsub    dY0i,dX0r,dX0i                    // F(N/2) = ((Z0.r-Z0.i) , 0)
+        SUBS    subFFTSize,subFFTSize,#2
+
+        st1     {dY0r},[argDst],step
+        ADD     pTwiddleTmp,argTwiddle,#8         // W^2
+        st1     {dY0i},[argDst], #8
+        ADD     argTwiddle,argTwiddle,twStep      // W^1
+
+//        dup     dzero,zero
+        SUB     argDst,argDst,step
+
+        BLT     End
+        BEQ     lastElement
+        SUB     step,step,#24
+        SUB     step1,step1,#8                    // (N/4-1)*8 bytes
+
+        // F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
+        // Note: W^k is stored as negative values in the table
+        // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
+        // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+
+        fmov     half, #0.5
+
+evenOddButterflyLoop:
+
+
+        ld1     {dW0r},[argTwiddle],step1
+        ld1     {dW1r},[argTwiddle], #8
+
+        ld2     {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle,argTwiddle,step1
+        ld2     {dX1r,dX1i},[pSrc], #16
+
+
+
+        SUB     step1,step1,#8                    // (N/4-2)*8 bytes
+        ld1     {dW0i},[pTwiddleTmp],step1
+        ld1     {dW1i},[pTwiddleTmp], #8
+        SUB     pSrc,pSrc,step
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        rev64   dX1r,dX1r
+        rev64   dX1i,dX1i
+        SUBS    subFFTSize,subFFTSize,#4
+
+
+
+        fsub    dT2,dX0r,dX1r                     // a-c
+        SUB     step1,step1,#8
+        fadd    dT0,dX0r,dX1r                     // a+c
+        fsub    dT1,dX0i,dX1i                     // b-d
+        fadd    dT3,dX0i,dX1i                     // b+d
+        fmul   dT0,dT0,half[0]
+        fmul   dT1,dT1,half[0]
+        // VZIP    dW1r,dW1i
+        // VZIP    dW0r,dW0i
+        zip1    dZip, dW1r, dW1i
+        zip2    dW1i, dW1r, dW1i
+        mov     dW1r8b, dZip8b
+        zip1    dZip, dW0r, dW0i
+        zip2    dW0i, dW0r, dW0i
+        mov     dW0r8b, dZip8b
+
+        fmul   qT0,dW1r,dT2
+        fmul   qT1,dW1r,dT3
+        fmul   qT2,dW0r,dT2
+        fmul   qT3,dW0r,dT3
+
+        fmla   qT0,dW1i,dT3
+        fmls   qT1,dW1i,dT2
+
+        fmls   qT2,dW0i,dT3
+        fmla   qT3,dW0i,dT2
+
+
+        fmul  dX1r,qT0,half[0]
+        fmul  dX1i,qT1,half[0]
+
+        fsub    dY1r,dT0,dX1i                     // F(N/2 -1)
+        fadd    dY1i,dT1,dX1r
+        fneg    dY1i,dY1i
+
+        rev64   dY1r,dY1r
+        rev64   dY1i,dY1i
+
+
+        fmul  dX0r,qT2,half[0]
+        fmul  dX0i,qT3,half[0]
+
+        fsub    dY0r,dT0,dX0i                     // F(1)
+        fadd    dY0i,dT1,dX0r
+
+
+        st2     {dY0r,dY0i},[argDst],step
+        st2     {dY1r,dY1i},[argDst], #16
+        SUB     argDst,argDst,step
+        SUB     step,step,#32                     // (N/2-4)*8 bytes
+
+
+        BGT     evenOddButterflyLoop
+
+        // set both the ptrs to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     argDst,argDst,#8
+
+
+
+        // Last element can be expanded as follows
+        // 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+        // 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] + j (c+jd) [0+j2b]
+        // (a-bc, -bd)
+        // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement:
+        ld1     {dX0r},[pSrc]
+
+        st1     {dX0rs}[0],[argDst], #4
+        fneg    dX0r,dX0r
+        st1     {dX0rs}[1],[argDst], #4
+End:
+
+        // Write function tail
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
new file mode 100644
index 0000000..da68314
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
@@ -0,0 +1,280 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of
+//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
+//  instead of SC32.
+//
+
+//
+// Description:
+// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
+// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+      // Guarding implementation by the processor name
+
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pTwiddle        x1
+#define	pOut		x2	
+#define	subFFTNum	x3
+
+// Output registers
+
+//Local Scratch Registers
+
+#define argTwiddle      x5
+#define argDst          x6
+#define subFFTSize      x7
+#define N               subFFTNum
+
+#define pOut1           x13
+	
+#define size            x7
+#define step            x8
+#define step1           x9
+#define twStep          x10
+#define pTwiddleTmp     x11
+#define argTwiddle1     x12
+
+// Neon registers
+
+#define dX0     v0.2s
+#define dX0s    v0.s
+#define dShift  v1.2s
+#define dX1     v1.2s
+#define dX1s    v1.s
+#define dY0     v2.2s
+#define dY08b   v2.8b
+#define dY1     v3.2s
+#define dX0r    v0.2s
+#define dX0rs   v0.s
+#define dX0i    v1.2s
+#define dX1r    v2.2s
+#define dX1i    v3.2s
+#define dW0r    v4.2s
+#define dW0r8b  v4.8b
+#define dW0i    v5.2s
+#define dW1r    v6.2s
+#define dW1r8b  v6.8b
+#define dW1i    v7.2s
+#define dT0     v8.2s
+#define dT1     v9.2s
+#define dT2     v10.2s
+#define dT3     v11.2s
+#define qT0     v12.2s
+#define qT1     v14.2s
+#define qT2     v16.2s
+#define qT3     v18.2s
+#define dY0r    v4.2s
+#define dY0i    v5.2s
+#define dY1r    v6.2s
+#define dY1i    v7.2s
+
+#define dY2     v4.2s
+#define dY3     v5.2s
+#define dW0     v6.2s
+#define dW1     v7.2s
+#define dW0Tmp  v10.2s
+#define dW1Neg  v11.2s
+
+#define dZip    v19.2s
+#define dZip8b  v19.8b
+#define half    v13.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        fmov    half, 0.5
+
+        asr     size, subFFTNum, #1           // preserve the contents of N = subFFTNum
+        lsl     step, subFFTNum, #2           // step = N/2 * 8 bytes
+
+
+        // Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        // Note: W^(k) is stored as negated value and also need to
+        // conjugate the values from the table
+
+        // Z(0) : no need of twiddle multiply
+        // Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+
+        ld1     {dX0},[pSrc],step
+        ADD     pOut1,pOut,step               // pOut1 = pOut+ N/2*8 bytes
+
+        ld1     {dX1},[pSrc], #8
+        // twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,size,LSL #1
+
+        lsl     step1,size, #2                // step1 = N/4 * 8 = N/2*4 bytes
+        SUB     step1,step1,#8                // (N/4-1)*8 bytes
+
+        fadd    dY0,dX0,dX1                   // [b+d | a+c]
+        fsub    dY1,dX0,dX1                   // [b-d | a-c]
+        fmul    dY0, dY0, half[0]
+        fmul    dY1, dY1, half[0]
+
+        // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+        // VZIP    dY0,dY1
+        zip1    dZip,dY0,dY1
+        zip2    dY1,dY0,dY1
+        mov     dY08b, dZip8b
+
+        fsub   dX0,dY0,dY1
+        SUBS   size,size,#2
+        fadd   dX1,dY0,dY1
+
+        SUB     pSrc,pSrc,step
+
+        st1     {dX0s}[0],[pOut1], #4
+        ADD     pTwiddleTmp,pTwiddle,#8       // W^2
+        st1     {dX1s}[1],[pOut1], #4
+        ADD     argTwiddle1,pTwiddle,twStep   // W^1
+
+
+        BLT     decrementScale\name
+        BEQ     lastElement\name
+
+
+        // Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
+        // Note: W^k is stored as negative values in the table and also
+        // need to conjugate the values from the table.
+        //
+        // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+        // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
+
+
+        SUB     step,step,#24
+evenOddButterflyLoop\name :
+
+
+        ld1     {dW0r},[argTwiddle1],step1
+        ld1     {dW1r},[argTwiddle1], #8
+
+        ld2     {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle1,argTwiddle1,step1
+        ld2     {dX1r,dX1i},[pSrc], #16
+
+        SUB     step1,step1,#8                // (N/4-2)*8 bytes
+        ld1     {dW0i},[pTwiddleTmp],step1
+        ld1     {dW1i},[pTwiddleTmp], #8
+        SUB     pSrc,pSrc,step
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        rev64   dX1r,dX1r
+        rev64   dX1i,dX1i
+        SUBS    size,size,#4
+
+
+        fsub    dT2,dX0r,dX1r                 // a-c
+        fadd    dT3,dX0i,dX1i                 // b+d
+        fadd    dT0,dX0r,dX1r                 // a+c
+        fsub    dT1,dX0i,dX1i                 // b-d
+        SUB     step1,step1,#8
+
+        fmul    dT2, dT2, half[0]
+        fmul    dT3, dT3, half[0]
+
+        fmul    dT0, dT0, half[0]
+        fmul    dT1, dT1, half[0]
+
+        // VZIP    dW1r,dW1i
+        // VZIP    dW0r,dW0i
+        zip1    dZip, dW1r,dW1i
+        zip2    dW1i,dW1r,dW1i
+        mov     dW1r8b, dZip8b
+        zip1    dZip,dW0r,dW0i
+        zip2    dW0i,dW0r,dW0i
+        mov     dW0r8b, dZip8b
+
+        fmul   dX1r,dW1r,dT2
+        fmul   dX1i,dW1r,dT3
+        fmul   dX0r,dW0r,dT2
+        fmul   dX0i,dW0r,dT3
+
+        fmls   dX1r,dW1i,dT3
+        fmla   dX1i,dW1i,dT2
+
+        fmla   dX0r,dW0i,dT3
+        fmls   dX0i,dW0i,dT2
+
+
+        fadd    dY1r,dT0,dX1i                 // F(N/2 -1)
+        fsub    dY1i,dX1r,dT1
+
+        rev64   dY1r,dY1r
+        rev64   dY1i,dY1i
+
+
+        fadd    dY0r,dT0,dX0i                 // F(1)
+        fsub    dY0i,dT1,dX0r
+
+
+        st2     {dY0r,dY0i},[pOut1],step
+        st2     {dY1r,dY1i},[pOut1], #16
+        SUB     pOut1,pOut1,step
+        SUB     step,step,#32                 // (N/2-4)*8 bytes
+
+
+        BGT     evenOddButterflyLoop\name
+
+
+        // set both the ptrs to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     pOut1,pOut1,#8
+
+        // Last element can be expanded as follows
+        // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
+        // -ve)
+        // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] - j (c-jd) [0+j2b]
+        // (a+bc, -bd)
+        // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name :
+        ld1     {dX0r},[pSrc]
+
+        st1     {dX0rs}[0],[pOut1], #4
+        fneg    dX0r,dX0r
+        st1     {dX0rs}[1],[pOut1]
+
+
+
+decrementScale\name :
+
+        .endm
+
+        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
+            FFTSTAGE "FALSE","TRUE",Inv
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S
new file mode 100644
index 0000000..b22912d
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S
@@ -0,0 +1,136 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the first stage of a Radix 2 DIT in-order out-of-place FFT
+// stage for a N point complex signal.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define pointStep       x7
+#define outPointStep    x7
+#define grpSize         x8
+#define setCount        x8
+#define step            x9
+#define dstStep         x9
+
+// Neon Registers
+#define dX0     v0.2s
+#define dX1     v1.2s
+#define dY0     v2.2s
+#define dY1     v3.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
+
+
+        MOV        subFFTSize,#2
+        LSR        grpSize,subFFTNum,#1
+        MOV        subFFTNum,grpSize
+
+
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+        // Note: outPointStep = pointStep for firststage
+        // Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
+
+        lsl     pointStep, grpSize, #3
+        rsb     step, pointStep, #8
+
+        // Loop on the sets for grp zero
+
+grpZeroSetLoop\name :
+
+        LD1    {dX0},[pSrc],pointStep
+        LD1    {dX1},[pSrc],step                   // step = -pointStep + 8
+
+        SUBS    setCount,setCount,#1
+
+        fadd    dY0,dX0,dX1
+        fsub    dY1,dX0,dX1
+
+        ST1    {dY0},[pDst],outPointStep
+        // dstStep =  step = -pointStep + 8
+        ST1    {dY1},[pDst],dstStep
+
+        BGT     grpZeroSetLoop\name
+
+
+        // Save subFFTNum and subFFTSize for next stage
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S
new file mode 100644
index 0000000..e7de11e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S
@@ -0,0 +1,149 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT
+// stage for a N point complex signal.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define outPointStep    x8
+#define grpCount        x9
+#define dstStep         x10
+
+// Neon Registers
+
+#define dWr     v0.2s
+#define dWi     v1.2s
+#define dXr0    v2.2s
+#define dXi0    v3.2s
+#define dXr1    v4.2s
+#define dXi1    v5.2s
+#define dYr0    v6.2s
+#define dYi0    v7.2s
+#define dYr1    v8.2s
+#define dYi1    v9.2s
+#define qT0     v10.2s
+#define qT1     v12.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Move parameters into our work registers
+        ldr     subFFTSize, [pSubFFTSize]
+
+        lsl     outPointStep, subFFTSize, #3
+
+        // Update grpCount and grpSize rightaway
+
+        MOV     subFFTNum,#1                          //after the last stage
+        LSL     grpCount,subFFTSize,#1
+
+        // update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        rsb     dstStep,outPointStep,#16
+
+        // Loop on 2 grps at a time for the last stage
+
+radix2lsGrpLoop\name :
+        // dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
+        // dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
+        ld2     {dWr,dWi},[pTwiddle], #16
+
+        // dXr0 = [pSrc[0].Re, pSrc[2].Re]
+        // dXi0 = [pSrc[0].Im, pSrc[2].Im]
+        // dXr1 = [pSrc[1].Re, pSrc[3].Re]
+        // dXi1 = [pSrc[1].Im, pSrc[3].Im]
+        ld4     {dXr0,dXi0,dXr1,dXi1}, [pSrc], #32
+
+        SUBS    grpCount,grpCount,#4                  // grpCount is multiplied by 2
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   qT0,dWr,dXr1
+            fmla   qT0,dWi,dXi1                       // real part
+            fmul   qT1,dWr,dXi1
+            fmls   qT1,dWi,dXr1                       // imag part
+
+        .else
+
+            fmul   qT0,dWr,dXr1
+            fmls   qT0,dWi,dXi1                       // real part
+            fmul   qT1,dWr,dXi1
+            fmla   qT1,dWi,dXr1                       // imag part
+
+        .endif
+
+        fsub    dYr0,dXr0,qT0
+        fsub    dYi0,dXi0,qT1
+        fadd    dYr1,dXr0,qT0
+        fadd    dYi1,dXi0,qT1
+
+        st2     {dYr0,dYi0},[pDst],outPointStep
+        st2     {dYr1,dYi1},[pDst],dstStep            // dstStep =  step = -outPointStep + 16
+
+        BGT     radix2lsGrpLoop\name
+
+
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace,,d12
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace,,d12
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
new file mode 100644
index 0000000..530a815
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
@@ -0,0 +1,185 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+// Description:
+// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
+// complex signal.  This handles the general stage, not the first or last
+// stage.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define outPointStep    x8
+#define pointStep       x9
+#define pointStep32     w9
+#define grpCount        x10
+#define grpCount32      w10
+#define setCount        x13
+#define step            x15
+#define dstStep         x11
+
+// Neon Registers
+
+#define dW      v0.2s
+#define dX0     v2.2s
+#define dX1     v3.2s
+#define dX2     v4.2s
+#define dX3     v5.2s
+#define dY0     v6.2s
+#define dY1     v7.2s
+#define dY2     v8.2s
+#define dY3     v9.2s
+#define qT0     v10.2s
+#define qT1     v11.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // Update grpCount and grpSize rightaway inorder to reuse pGrpCount
+        // and pGrpSize regs
+
+        LSR     subFFTNum,subFFTNum,#1                 //grpSize
+        LSL     grpCount,subFFTSize,#1
+
+
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+        lsl     pointStep, subFFTNum, #2
+
+        // update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        // pOut0+1 increments pOut0 by 8 bytes
+        // pOut0+outPointStep == increment of 8*outPointStep bytes =
+        //    4*size bytes
+        smull   outPointStep, grpCount32, pointStep32
+
+        LSL     pointStep,pointStep,#1
+
+
+        rsb      step,pointStep,#16
+        rsb      dstStep,outPointStep,#16
+
+        // Loop on the groups
+
+radix2GrpLoop\name :
+        lsr     setCount, pointStep, #3
+        LD1     {dW},[pTwiddle],pointStep              //[wi | wr]
+
+
+        // Loop on the sets
+
+
+radix2SetLoop\name :
+
+
+        // point0: dX0-real part dX1-img part
+        LD2    {dX0,dX1},[pSrc],pointStep
+        // point1: dX2-real part dX3-img part
+        LD2    {dX2,dX3},[pSrc],step
+
+        SUBS    setCount,setCount,#2
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   qT0,dX2,dW[0]
+            fmla   qT0,dX3,dW[1]                       // real part
+            fmul   qT1,dX3,dW[0]
+            fmls   qT1,dX2,dW[1]                       // imag part
+
+        .else
+
+            fmul   qT0,dX2,dW[0]
+            fmls   qT0,dX3,dW[1]                       // real part
+            fmul   qT1,dX3,dW[0]
+            fmla   qT1,dX2,dW[1]                       // imag part
+
+        .endif
+
+        fsub    dY0,dX0,qT0
+        fsub    dY1,dX1,qT1
+        fadd    dY2,dX0,qT0
+        fadd    dY3,dX1,qT1
+
+        st2    {dY0,dY1},[pDst],outPointStep
+        // dstStep = -outPointStep + 16
+        st2    {dY2,dY3},[pDst],dstStep
+
+        BGT     radix2SetLoop\name
+
+        SUBS    grpCount,grpCount,#2
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix2GrpLoop\name
+
+
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace,,d11
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace,,d11
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
new file mode 100644
index 0000000..624ef3e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
@@ -0,0 +1,266 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define grpSize         x7
+// Reuse grpSize as setCount
+#define setCount        x7
+#define pointStep       x8
+#define outPointStep    x8
+#define setStep         x9
+#define step1           x10
+#define step3           x11
+
+// Neon Registers
+
+#define dXr0    v0.2s
+#define dXi0    v1.2s
+#define dXr1    v2.2s
+#define dXi1    v3.2s
+#define dXr2    v4.2s
+#define dXi2    v5.2s
+#define dXr3    v6.2s
+#define dXi3    v7.2s
+#define dYr0    v8.2s
+#define dYi0    v9.2s
+#define dYr1    v10.2s
+#define dYi1    v11.2s
+#define dYr2    v12.2s
+#define dYi2    v13.2s
+#define dYr3    v14.2s
+#define dYi3    v15.2s
+#define dZr0    v16.2s
+#define dZi0    v17.2s
+#define dZr1    v18.2s
+#define dZi1    v19.2s
+#define dZr2    v20.2s
+#define dZi2    v21.2s
+#define dZr3    v22.2s
+#define dZi3    v23.2s
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+        
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        // Note: outPointStep = pointStep for firststage
+
+        lsl     pointStep, subFFTNum, #1
+
+        // Update pSubFFTSize and pSubFFTNum regs
+        ld2     {dXr0,dXi0}, [pSrc], pointStep          // data[0]
+
+        // subFFTSize = 1 for the first stage
+        MOV     subFFTSize,#4
+
+        // Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2
+        ld2     {dXr1,dXi1}, [pSrc], pointStep          //  data[1]
+        MOV     subFFTNum,grpSize
+
+
+        // Calculate the step of input data for the next set
+        //MOV     setStep,pointStep,LSL #1
+        lsl     setStep, grpSize, #4
+        ld2     {dXr2,dXi2}, [pSrc], pointStep          //  data[2]
+
+        // setStep = 3*pointStep
+        ADD     setStep,setStep,pointStep
+        // setStep = - 3*pointStep+16
+
+        rsb     setStep,setStep,#16
+        //  data[3] & update pSrc for the next set
+        ld2     {dXr3,dXi3}, [pSrc], setStep
+
+        // step1 = 2*pointStep
+        lsl     step1, pointStep, #1
+
+        // fadd qY0, qX0, qX2
+        fadd    dYr0, dXr0, dXr2        
+        fadd    dYi0, dXi0, dXi2
+        // step3 = -pointStep
+        neg     step3, pointStep
+
+        // grp = 0 a special case since all the twiddle factors are 1
+        // Loop on the sets : 2 sets at a time
+
+radix4fsGrpZeroSetLoop\name :
+
+
+
+        // Decrement setcount
+        SUBS    setCount,setCount,#2
+
+
+        // finish first stage of 4 point FFT
+
+
+        // fsub qy2,qx0,qx2
+        fsub    dYr2, dXr0, dXr2
+        fsub    dYi2, dXi0, dXi2
+
+        ld2     {dXr0,dXi0}, [pSrc], step1              //  data[0]
+        // fadd qy1,qx1,qx3     
+        fadd    dYr1, dXr1, dXr3                    
+        fadd    dYi1, dXi1, dXi3
+        ld2     {dXr2,dXi2}, [pSrc], step3              //  data[2]
+        // fsub qy3,qx1,qx3     
+        fsub    dYr3, dXr1, dXr3                    
+        fsub    dYi3, dXi1, dXi3
+
+
+        // finish second stage of 4 point FFT
+
+        .ifeqs "\inverse", "TRUE"
+
+            ld2     {dXr1,dXi1}, [pSrc], step1          //  data[1]
+            // fadd  qz0,qy0,qy1
+            fadd    dZr0, dYr0, dYr1                    
+            fadd    dZi0, dYi0, dYi1
+
+            //  data[3] & update pSrc for the next set, but not if it's the
+            //  last iteration so that we don't read past the end of the 
+            //  input array.
+            BEQ     radix4SkipLastUpdateInv\name
+            ld2     {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateInv\name:
+            FSUB    dZr3,dYr2,dYi3
+
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            FADD    dZi3,dYi2,dYr3
+
+            // fsub qZ1,qY0,qY1 
+            FSUB    dZr1, dYr0, dYr1
+            FSUB    dZi1, dYi0, dYi1
+            st2    {dZr3,dZi3},[pDst],outPointStep
+
+            FADD    dZr2,dYr2,dYi3
+            st2    {dZr1,dZi1},[pDst],outPointStep
+            FSUB    dZi2,dYi2,dYr3
+
+            //  fadd qY0, qX0, qX2
+            FADD    dYr0, dXr0, dXr2                    // u0 for next iteration
+            FADD    dYi0, dXi0, dXi2
+            st2    {dZr2,dZi2},[pDst],setStep
+
+
+        .else
+
+            ld2     {dXr1,dXi1}, [pSrc], step1          //  data[1]
+            // fadd qZ0,qY0,qY1
+            fadd    dZr0, dYr0, dYr1
+            fadd    dZi0, dYi0, dYi1
+
+            //  data[3] & update pSrc for the next set, but not if it's the
+            //  last iteration so that we don't read past the end of the 
+            //  input array.
+            BEQ     radix4SkipLastUpdateFwd\name
+            ld2     {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateFwd\name:
+            FADD    dZr2,dYr2,dYi3
+
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            FSUB    dZi2,dYi2,dYr3
+
+            // fsub  qz1,qy0,qy1
+            fsub    dZr1, dYr0, dYr1
+            fsub    dZi1, dYi0, dYi1
+            st2    {dZr2,dZi2},[pDst],outPointStep
+
+            FSUB    dZr3,dYr2,dYi3
+            st2    {dZr1,dZi1},[pDst],outPointStep
+            FADD    dZi3,dYi2,dYr3
+
+            //  fadd  qy0,qx0,qx2
+            fadd    dYr0, dXr0, dXr2                    // u0 for next iteration
+            fadd    dYi0, dXi0, dXi2
+
+            st2    {dZr3,dZi3},[pDst],setStep
+
+        .endif
+
+        BGT     radix4fsGrpZeroSetLoop\name
+
+        // Save subFFTNum and subFFTSize for next stage
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+        
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
new file mode 100644
index 0000000..2fc2e60
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
@@ -0,0 +1,371 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+    //IMPORT  armAAC_constTable
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define outPointStep    x8
+#define grpCount        x9
+#define dstStep         x10
+#define grpTwStep       x13
+#define stepTwiddle     x14
+#define twStep          x15
+#define step16          x11
+#define step24          x12
+
+
+// Neon Registers
+
+#define dButterfly1Real02       v0.2s
+#define dButterfly1Real028b     v0.8b
+#define dButterfly1Imag02       v1.2s
+#define dButterfly1Imag028b     v1.8b
+#define dButterfly1Real13       v2.2s
+#define dButterfly1Real138b     v2.8b
+#define dButterfly1Imag13       v3.2s
+#define dButterfly1Imag138b     v3.8b
+#define dButterfly2Real02       v4.2s
+#define dButterfly2Imag02       v5.2s
+#define dButterfly2Real13       v6.2s
+#define dButterfly2Imag13       v7.2s
+#define dXr0                    v0.2s
+#define dXi0                    v1.2s
+#define dXr08b                  v0.8b
+#define dXi08b                  v1.8b
+#define dXr1                    v2.2s
+#define dXi1                    v3.2s
+#define dXr2                    v4.2s
+#define dXi2                    v5.2s
+#define dXr3                    v6.2s
+#define dXi3                    v7.2s
+
+#define dYr0                    v16.2s
+#define dYi0                    v17.2s
+#define dYr1                    v18.2s
+#define dYi1                    v19.2s
+#define dYr2                    v20.2s
+#define dYi2                    v21.2s
+#define dYr3                    v22.2s
+#define dYi3                    v23.2s
+
+#define dW1r                    v8.2s
+#define dW1i                    v9.2s
+#define dW2r                    v10.2s
+#define dW2r8b                  v10.8b
+#define dW2i                    v11.2s
+#define dW3r                    v12.2s
+#define dW3r8b                  v12.8b
+#define dW3i                    v13.2s
+
+#define dZr0                    v14.2s
+#define dZi0                    v15.2s
+#define dZr08b                  v14.8b
+#define dZi08b                  v15.8b
+#define dZr1                    v26.2s
+#define dZi1                    v27.2s
+#define dZr2                    v28.2s
+#define dZi2                    v29.2s
+#define dZr3                    v30.2s
+#define dZi3                    v31.2s
+
+#define dZip                    v24.2s
+#define dZip8b                  v24.8b
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // pOut0+1 increments pOut0 by 8 bytes
+        // pOut0+outPointStep == increment of 8*outPointStep bytes
+        lsl     outPointStep,subFFTSize, #3
+
+        // Update grpCount and grpSize rightaway
+
+        ld2    {dW1r,dW1i},[pTwiddle]             // [wi|wr]
+        MOV     step16,#16
+        LSL     grpCount,subFFTSize,#2
+
+        ld1    {dW2r},[pTwiddle]                  // [wi|wr]
+        MOV     subFFTNum,#1                      //after the last stage
+
+        ld1    {dW3r},[pTwiddle],step16           // [wi|wr]
+        MOV     stepTwiddle,#0
+
+        ld1    {dW2i},[pTwiddle],#8               // [wi|wr]
+        SUB     grpTwStep,stepTwiddle,#8          // grpTwStep = -8 to start with
+
+        // update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        ld1    {dW3i},[pTwiddle],grpTwStep        // [wi|wr]
+        lsl     dstStep,outPointStep, #1
+
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+        ADD     dstStep,dstStep,outPointStep      // dstStep = 3*outPointStep
+
+        rsb     dstStep,dstStep,#16               // dstStep = - 3*outPointStep+16
+        MOV     step24,#24
+
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+
+
+        // Process two groups at a time
+
+radix4lsGrpLoop\name :
+
+        // VZIP    dW2r,dW2i
+        zip1    dZip, dW2r, dW2i
+        zip2    dW2i, dW2r, dW2i
+        mov     dW2r8b, dZip8b
+
+        ADD     stepTwiddle,stepTwiddle,#16
+
+        // VZIP    dW3r,dW3i
+        zip1    dZip, dW3r,dW3i
+        zip2    dW3i, dW3r, dW3i
+        mov     dW3r8b, dZip8b
+        ADD     grpTwStep,stepTwiddle,#4
+
+        // VUZP     dButterfly1Real13, dButterfly2Real13      // B.r D.r
+        uzp1     dZip, dButterfly1Real13, dButterfly2Real13   // B.r D.r
+        uzp2     dButterfly2Real13, dButterfly1Real13, dButterfly2Real13   // B.r D.r
+        mov      dButterfly1Real138b, dZip8b
+
+        SUB     twStep,stepTwiddle,#16                        // -16+stepTwiddle
+
+        // VUZP     dButterfly1Imag13, dButterfly2Imag13      // B.i D.i
+        uzp1     dZip, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
+        uzp2     dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
+        mov      dButterfly1Imag138b, dZip8b
+        lsl     grpTwStep,grpTwStep,#1
+
+        // VUZP     dButterfly1Real02, dButterfly2Real02      // A.r C.r
+        uzp1     dZip, dButterfly1Real02, dButterfly2Real02   // A.r C.r
+        uzp2     dButterfly2Real02, dButterfly1Real02, dButterfly2Real02   // A.r C.r
+        mov      dButterfly1Real028b, dZip8b
+        rsb     grpTwStep,grpTwStep,#0                        // -8-2*stepTwiddle
+
+        // VUZP     dButterfly1Imag02, dButterfly2Imag02      // A.i C.i
+        uzp1     dZip, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
+        uzp2     dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
+        mov      dButterfly1Imag028b, dZip8b
+
+
+        // grpCount is multiplied by 4
+        SUBS    grpCount,grpCount,#8
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr1,dW1r,dXr1
+            fmla   dZr1,dW1i,dXi1                       // real part
+            fmul   dZi1,dW1r,dXi1
+            fmls   dZi1,dW1i,dXr1                       // imag part
+
+        .else
+
+            fmul   dZr1,dW1r,dXr1
+            fmls   dZr1,dW1i,dXi1                       // real part
+            fmul   dZi1,dW1r,dXi1
+            fmla   dZi1,dW1i,dXr1                       // imag part
+
+        .endif
+
+        ld2    {dW1r,dW1i},[pTwiddle],stepTwiddle       // [wi|wr]
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr2,dW2r,dXr2
+            fmla   dZr2,dW2i,dXi2                       // real part
+            fmul   dZi2,dW2r,dXi2
+            ld1   {dW2r},[pTwiddle],step16              // [wi|wr]
+            fmls   dZi2,dW2i,dXr2                       // imag part
+
+        .else
+
+            fmul   dZr2,dW2r,dXr2
+            fmls   dZr2,dW2i,dXi2                       // real part
+            fmul   dZi2,dW2r,dXi2
+            ld1    {dW2r},[pTwiddle],step16             // [wi|wr]
+            fmla   dZi2,dW2i,dXr2                       // imag part
+
+        .endif
+
+
+        ld1    {dW2i},[pTwiddle],twStep                 // [wi|wr]
+
+        // move qX0 so as to load for the next iteration
+        // MOV     qZ0,qX0
+        mov     dZr08b, dXr08b
+        mov     dZi08b, dXi08b
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr3,dW3r,dXr3
+            fmla   dZr3,dW3i,dXi3                       // real part
+            fmul   dZi3,dW3r,dXi3
+            ld1    {dW3r},[pTwiddle],step24
+            fmls   dZi3,dW3i,dXr3                       // imag part
+
+        .else
+
+            fmul   dZr3,dW3r,dXr3
+            fmls   dZr3,dW3i,dXi3                       // real part
+            fmul   dZi3,dW3r,dXi3
+            ld1    {dW3r},[pTwiddle],step24
+            fmla   dZi3,dW3i,dXr3                       // imag part
+
+        .endif
+
+        ld1    {dW3i},[pTwiddle],grpTwStep              // [wi|wr]
+
+        // Don't do the load on the last iteration so we don't read past the end
+        // of pSrc.
+        bne     skipIncrement\name
+        add     pSrc, pSrc, #64
+skipIncrement\name:     
+        beq     radix4lsSkipRead\name
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+radix4lsSkipRead\name:
+
+        // finish first stage of 4 point FFT
+
+        // fadd    qY0,qZ0,qZ2
+        fadd    dYr0,dZr0,dZr2
+        fadd    dYi0,dZi0,dZi2
+        // fsub    qY2,qZ0,qZ2
+        fsub    dYr2,dZr0,dZr2
+        fsub    dYi2,dZi0,dZi2
+        // fadd    qY1,qZ1,qZ3
+        fadd    dYr1,dZr1,dZr3
+        fadd    dYi1,dZi1,dZi3
+        // fsub    qY3,qZ1,qZ3
+        fsub    dYr3,dZr1,dZr3
+        fsub    dYi3,dZi1,dZi3
+
+
+        // finish second stage of 4 point FFT
+
+        .ifeqs  "\inverse", "TRUE"
+
+            // fsub    qZ0,qY2,qY1
+            fsub    dZr0,dYr2,dYr1
+            fsub    dZi0,dYi2,dYi1
+            fadd    dZr3,dYr0,dYi3
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2    {dZr3,dZi3},[pDst],outPointStep
+
+            fsub    dZr1,dYr0,dYi3
+            st2    {dZr2,dZi2},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            // dstStep = -outPointStep + 16
+            st2    {dZr1,dZi1},[pDst],dstStep
+
+
+        .else
+
+            // fsub    qZ0,qY2,qY1
+            fsub    dZr0,dYr2,dYr1
+            fsub    dZi0,dYi2,dYi1
+
+            fsub    dZr1,dYr0,dYi3
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2    {dZr1,dZi1},[pDst],outPointStep
+
+            fadd    dZr3,dYr0,dYi3
+            st2    {dZr2,dZi2},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            // dstStep = -outPointStep + 16
+            st2    {dZr3,dZi3},[pDst],dstStep
+
+
+        .endif
+
+        BGT     radix4lsGrpLoop\name
+
+        .endm
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
new file mode 100644
index 0000000..830fd16
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
@@ -0,0 +1,339 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define grpCount        x7
+#define grpCount32      w7
+#define pointStep       x8
+#define pointStep32     w8
+#define outPointStep    x9
+#define stepTwiddle     x10
+#define setCount        x11
+#define srcStep         x12
+#define setStep         x13
+#define dstStep         x14
+#define twStep          x15
+
+// Neon Registers
+
+#define dW1     v0.2s
+#define dW2     v1.2s
+#define dW3     v2.2s
+
+#define dXr0    v4.2s
+#define dXi0    v5.2s
+#define dXr1    v6.2s
+#define dXi1    v7.2s
+#define dXr2    v8.2s
+#define dXi2    v9.2s
+#define dXr3    v10.2s
+#define dXi3    v11.2s
+#define dYr0    v12.2s
+#define dYi0    v13.2s
+#define dYr1    v14.2s
+#define dYi1    v15.2s
+#define dYr2    v16.2s
+#define dYi2    v17.2s
+#define dYr3    v18.2s
+#define dYi3    v19.2s
+#define dZr0    v20.2s
+#define dZi0    v21.2s
+#define dZr1    v22.2s
+#define dZi1    v23.2s
+#define dZr2    v24.2s
+#define dZi2    v25.2s
+#define dZr3    v26.2s
+#define dZi3    v27.2s
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // Update grpCount and grpSize rightaway inorder to reuse
+        // pGrpCount and pGrpSize regs
+
+        LSL     grpCount,subFFTSize,#2
+        LSR     subFFTNum,subFFTNum,#2
+        MOV     subFFTSize,grpCount
+
+        ld1      {dW1},[pTwiddle]                    //[wi | wr]
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        lsl     pointStep,subFFTNum, #1
+
+        // pOut0+1 increments pOut0 by 8 bytes
+        // pOut0+outPointStep == increment of 8*outPointStep bytes
+        //   = 2*size bytes
+
+        MOV     stepTwiddle,#0
+        ld1      {dW2},[pTwiddle]                    //[wi | wr]
+        smull   outPointStep,grpCount32,pointStep32
+
+        LSL     pointStep,pointStep,#2             // 2*grpSize
+
+        ld1      {dW3},[pTwiddle]                  //[wi | wr]
+        lsl     srcStep,pointStep, #1              // srcStep = 2*pointStep
+
+        ADD     setStep,srcStep,pointStep          // setStep = 3*pointStep
+
+        rsb     setStep,setStep,#0                 // setStep = - 3*pointStep
+        SUB     srcStep,srcStep,#16                // srcStep = 2*pointStep-16
+
+        lsl     dstStep,outPointStep, #1
+
+        ADD     dstStep,dstStep,outPointStep       // dstStep = 3*outPointStep
+        // dstStep = - 3*outPointStep+16
+        rsb     dstStep,dstStep,#16
+
+
+radix4GrpLoop\name :
+
+        ld2     {dXr0,dXi0},[pSrc],pointStep       //  data[0]
+        ADD      stepTwiddle,stepTwiddle,pointStep
+        ld2     {dXr1,dXi1},[pSrc],pointStep       //  data[1]
+        // set pTwiddle to the first point
+        ADD      pTwiddle,pTwiddle,stepTwiddle
+        ld2     {dXr2,dXi2},[pSrc],pointStep       //  data[2]
+        lsl      twStep,stepTwiddle, #2
+
+        //  data[3] & update pSrc for the next set
+        ld2     {dXr3,dXi3},[pSrc],setStep
+        SUB      twStep,stepTwiddle,twStep         // twStep = -3*stepTwiddle
+
+        lsr      setCount,pointStep, #3
+
+        // set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,#16
+        // increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+
+
+        // Loop on the sets
+
+radix4SetLoop\name :
+
+
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr1,dXr1,dW1[0]
+            fmul   dZi1,dXi1,dW1[0]
+            fmul   dZr2,dXr2,dW2[0]
+            fmul   dZi2,dXi2,dW2[0]
+            fmul   dZr3,dXr3,dW3[0]
+            fmul   dZi3,dXi3,dW3[0]
+
+            fmla   dZr1,dXi1,dW1[1]                // real part
+            fmls   dZi1,dXr1,dW1[1]                // imag part
+
+            //  data[1] for next iteration
+            ld2     {dXr1,dXi1},[pSrc],pointStep
+
+            fmla   dZr2,dXi2,dW2[1]                // real part
+            fmls   dZi2,dXr2,dW2[1]                // imag part
+
+            //  data[2] for next iteration
+            ld2     {dXr2,dXi2},[pSrc],pointStep
+
+            fmla   dZr3,dXi3,dW3[1]                // real part
+            fmls   dZi3,dXr3,dW3[1]                // imag part
+        .else
+            fmul   dZr1,dXr1,dW1[0]
+            fmul   dZi1,dXi1,dW1[0]
+            fmul   dZr2,dXr2,dW2[0]
+            fmul   dZi2,dXi2,dW2[0]
+            fmul   dZr3,dXr3,dW3[0]
+            fmul   dZi3,dXi3,dW3[0]
+
+            fmls   dZr1,dXi1,dW1[1]                // real part
+            fmla   dZi1,dXr1,dW1[1]                // imag part
+
+            //  data[1] for next iteration
+            ld2     {dXr1,dXi1},[pSrc],pointStep
+
+            fmls   dZr2,dXi2,dW2[1]                // real part
+            fmla   dZi2,dXr2,dW2[1]                // imag part
+
+            //  data[2] for next iteration
+            ld2     {dXr2,dXi2},[pSrc],pointStep
+
+            fmls   dZr3,dXi3,dW3[1]                // real part
+            fmla   dZi3,dXr3,dW3[1]                // imag part
+        .endif
+
+        //  data[3] & update pSrc to data[0]
+        // But don't read on the very last iteration because that reads past 
+        // the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
+        cmp     grpCount, #4
+        
+        b.ne    skipUpdate\name
+        cmp     setCount, #2
+        b.ne    skipUpdate\name
+        add     pSrc, pSrc, setStep
+        beq     radix4SkipRead\name
+skipUpdate\name:
+        ld2     {dXr3,dXi3},[pSrc],setStep
+radix4SkipRead\name:
+
+        SUBS    setCount,setCount,#2
+
+        // finish first stage of 4 point FFT
+        // fadd    qY0,qX0,qZ2
+        // fsub    qY2,qX0,qZ2
+        fadd    dYr0,dXr0,dZr2
+        fsub    dYr2,dXr0,dZr2
+        fadd    dYi0,dXi0,dZi2
+        fsub    dYi2,dXi0,dZi2
+
+        //  data[0] for next iteration
+        ld2     {dXr0,dXi0},[pSrc], #16
+        // fadd    qY1,qZ1,qZ3
+        // fsub    qY3,qZ1,qZ3
+        fadd    dYr1,dZr1,dZr3
+        fsub    dYr3,dZr1,dZr3
+        fadd    dYi1,dZi1,dZi3
+        fsub    dYi3,dZi1,dZi3
+
+        // finish second stage of 4 point FFT
+
+        // fsub    qZ0,qY2,qY1
+        fsub    dZr0,dYr2,dYr1
+        fsub    dZi0,dYi2,dYi1
+
+        .ifeqs  "\inverse", "TRUE"
+
+            fadd    dZr3,dYr0,dYi3
+            st2     {dZr0,dZi0},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2     {dZr3,dZi3},[pDst],outPointStep
+
+            fsub    dZr1,dYr0,dYi3
+            st2     {dZr2,dZi2},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            st2     {dZr1,dZi1},[pDst],dstStep
+
+
+        .else
+
+            fsub    dZr1,dYr0,dYi3
+            st2     {dZr0,dZi0},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2     {dZr1,dZi1},[pDst],outPointStep
+
+            fadd    dZr3,dYr0,dYi3
+            st2     {dZr2,dZi2},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            st2     {dZr3,dZi3},[pDst],dstStep
+
+
+        .endif
+
+        // increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix4SetLoop\name
+
+
+        ld1      {dW1},[pTwiddle],stepTwiddle    //[wi | wr]
+        // subtract 4 since grpCount multiplied by 4
+        SUBS    grpCount,grpCount,#4
+        ld1      {dW2},[pTwiddle],stepTwiddle    //[wi | wr]
+        // increment pSrc for the next grp
+        ADD     pSrc,pSrc,srcStep
+        ld1      {dW3},[pTwiddle],twStep         //[wi | wr]
+        BGT     radix4GrpLoop\name
+
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+
+        .endm
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace,,d15
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace,,d15
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
new file mode 100644
index 0000000..f348e6a
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
@@ -0,0 +1,473 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 8 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define grpSize         x7
+// Reuse grpSize as setCount
+#define setCount        x7
+#define pointStep       x8
+#define outPointStep    x8
+#define setStep         x9
+#define step1           x10
+#define step2           x11
+#define t0              w12
+
+
+// Neon Registers
+
+#define dXr0    v0.2s
+#define dXi0    v1.2s
+#define dXr1    v2.2s
+#define dXi1    v3.2s
+#define dXr2    v4.2s
+#define dXi2    v5.2s
+#define dXr3    v6.2s
+#define dXi3    v7.2s
+#define dXr4    v8.2s
+#define dXi4    v9.2s
+#define dXr5    v10.2s
+#define dXi5    v11.2s
+#define dXr6    v12.2s
+#define dXi6    v13.2s
+#define dXr7    v14.2s
+#define dXi7    v15.2s
+#define qX0     v0.4s
+#define qX1     v1.4s
+#define qX2     v2.4s
+#define qX3     v3.4s
+#define qX4     v4.4s
+#define qX5     v5.4s
+#define qX6     v6.4s
+#define qX7     v7.4s
+
+#define dUr0    v16.2s
+#define dUi0    v17.2s
+#define dUr2    v18.2s
+#define dUi2    v19.2s
+#define dUr4    v20.2s
+#define dUi4    v21.2s
+#define dUr6    v22.2s
+#define dUi6    v23.2s
+#define dUr1    v24.2s
+#define dUi1    v25.2s
+#define dUr3    v26.2s
+#define dUi3    v27.2s
+#define dUr5    v28.2s
+#define dUi5    v29.2s
+// reuse dXr7 and dXi7
+#define dUr7    v30.2s
+#define dUi7    v31.2s
+#define qU0     v8.4s
+#define qU1     v12.4s
+#define qU2     v9.4s
+#define qU3     v13.4s
+#define qU4     v10.4s
+#define qU5     v14.4s
+#define qU6     v11.4s
+#define qU7     v15.4s
+
+
+#define dVr0    v24.2s
+#define dVi0    v25.2s
+#define dVr2    v26.2s
+#define dVi2    v27.2s
+#define dVr4    v28.2s
+#define dVi4    v29.2s
+#define dVr6    v30.2s
+#define dVi6    v31.2s
+#define dVr1    v16.2s
+#define dVi1    v17.2s
+#define dVr3    v18.2s
+#define dVi3    v19.2s
+#define dVr5    v20.2s
+#define dVi5    v21.2s
+#define dVr7    v22.2s
+#define dVi7    v23.2s
+#define qV0     v12.4s
+#define qV1     v8.4s
+#define qV2     v13.4s
+#define qV3     v9.4s
+#define qV4     v14.4s
+#define qV5     v10.4s
+#define qV6     v15.4s
+#define qV7     v11.4s
+
+#define dYr0    v16.2s
+#define dYi0    v17.2s
+#define dYr2    v18.2s
+#define dYi2    v19.2s
+#define dYr4    v20.2s
+#define dYi4    v21.2s
+#define dYr6    v22.2s
+#define dYi6    v23.2s
+#define dYr1    v24.2s
+#define dYi1    v25.2s
+#define dYr3    v26.2s
+#define dYi3    v27.2s
+#define dYr5    v28.2s
+#define dYi5    v29.2s
+#define dYr7    v30.2s
+#define dYi7    v31.2s
+#define qY0     v8.4s
+#define qY1     v12.4s
+#define qY2     v9.4s
+#define qY3     v13.4s
+#define qY4     v10.4s
+#define qY5     v14.4s
+#define qY6     v11.4s
+#define qY7     v15.4s
+
+#define dT0     v14.2s
+#define dT0s    v14.s
+#define dT1     v15.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // Update pSubFFTSize and pSubFFTNum regs
+        // subFFTSize = 1 for the first stage
+
+        movz    t0, 0x3f35, lsl #16               // High half word of sqrt(1/2).
+        movk    t0, 0x04f3                        // Low half word of sqrt(1/2).
+        MOV     subFFTSize,#8
+
+        // Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#3
+        MOV     subFFTNum,grpSize
+
+
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
+        // Note: outPointStep = pointStep for firststage
+
+        lsl     pointStep,grpSize, #3
+
+
+        // Calculate the step of input data for the next set
+        //MOV     step1,pointStep,LSL #1             // step1 = 2*pointStep
+        ld2     {dXr0,dXi0},[pSrc],pointStep         //  data[0]
+        lsl     step1,grpSize, #4
+        lsl     step2,pointStep, #3
+
+        ld2     {dXr1,dXi1},[pSrc],pointStep         //  data[1]
+        SUB     step2,step2,pointStep                // step2 = 7*pointStep
+        // setStep = - 7*pointStep+16
+        rsb     setStep,step2,#16
+
+        ld2     {dXr2,dXi2},[pSrc],pointStep         //  data[2]
+        ld2     {dXr3,dXi3},[pSrc],pointStep         //  data[3]
+        ld2     {dXr4,dXi4},[pSrc],pointStep         //  data[4]
+        ld2     {dXr5,dXi5},[pSrc],pointStep         //  data[5]
+        ld2     {dXr6,dXi6},[pSrc],pointStep         //  data[6]
+        //  data[7] & update pSrc for the next set
+        //  setStep = -7*pointStep + 16
+        ld2     {dXr7,dXi7},[pSrc],setStep
+        // grp = 0 a special case since all the twiddle factors are 1
+        // Loop on the sets
+
+radix8fsGrpZeroSetLoop\name :
+
+        // Decrement setcount
+        SUBS    setCount,setCount,#2
+
+
+        // finish first stage of 8 point FFT
+
+        // fadd    qU0,qX0,qX4
+        // fadd    qU2,qX1,qX5
+        // fadd    qU4,qX2,qX6
+        // fadd    qU6,qX3,qX7
+        fadd    dUr0,dXr0,dXr4
+        fadd    dUr2,dXr1,dXr5
+        fadd    dUr4,dXr2,dXr6
+        fadd    dUr6,dXr3,dXr7
+        fadd    dUi0,dXi0,dXi4
+        fadd    dUi2,dXi1,dXi5
+        fadd    dUi4,dXi2,dXi6
+        fadd    dUi6,dXi3,dXi7
+
+        // finish second stage of 8 point FFT
+
+        // fadd    qV0,qU0,qU4
+        // fsub    qV2,qU0,qU4
+        // fadd    qV4,qU2,qU6
+        // fsub    qV6,qU2,qU6
+        fadd    dVr0,dUr0,dUr4
+        fsub    dVr2,dUr0,dUr4
+        fadd    dVr4,dUr2,dUr6
+        fsub    dVr6,dUr2,dUr6
+        fadd    dVi0,dUi0,dUi4
+        fsub    dVi2,dUi0,dUi4
+        fadd    dVi4,dUi2,dUi6
+        fsub    dVi6,dUi2,dUi6
+
+        // finish third stage of 8 point FFT
+
+        // fadd    qY0,qV0,qV4
+        // fsub    qY4,qV0,qV4
+        fadd    dYr0,dVr0,dVr4
+        fsub    dYr4,dVr0,dVr4
+        fadd    dYi0,dVi0,dVi4
+        fsub    dYi4,dVi0,dVi4
+
+        st2     {dYr0,dYi0},[pDst],step1         // store y0
+
+        .ifeqs  "\inverse", "TRUE"
+
+            fsub    dYr2,dVr2,dVi6
+            fadd    dYi2,dVi2,dVr6
+
+            fadd    dYr6,dVr2,dVi6
+            st2     {dYr2,dYi2},[pDst],step1     // store y2
+            fsub    dYi6,dVi2,dVr6
+
+            // fsub    qU1,qX0,qX4
+            fsub    dUr1,dXr0,dXr4
+            fsub    dUi1,dXi0,dXi4
+
+            st2     {dYr4,dYi4},[pDst],step1     // store y4
+
+            // fsub    qU3,qX1,qX5
+            // fsub    qU5,qX2,qX6
+            fsub    dUr3,dXr1,dXr5
+            fsub    dUr5,dXr2,dXr6
+            fsub    dUi3,dXi1,dXi5
+            fsub    dUi5,dXi2,dXi6
+
+            st2     {dYr6,dYi6},[pDst],step1     // store y6
+
+        .ELSE
+
+            fadd    dYr6,dVr2,dVi6
+            fsub    dYi6,dVi2,dVr6
+
+            fsub    dYr2,dVr2,dVi6
+            st2     {dYr6,dYi6},[pDst],step1     // store y2
+            fadd    dYi2,dVi2,dVr6
+
+
+            // fsub    qU1,qX0,qX4
+            fsub    dUr1,dXr0,dXr4
+            fsub    dUi1,dXi0,dXi4
+
+            st2     {dYr4,dYi4},[pDst],step1     // store y4
+
+            // fsub    qU3,qX1,qX5
+            // fsub    qU5,qX2,qX6
+            fsub    dUr3,dXr1,dXr5
+            fsub    dUr5,dXr2,dXr6
+            fsub    dUi3,dXi1,dXi5
+            fsub    dUi5,dXi2,dXi6
+
+            st2     {dYr2,dYi2},[pDst],step1     // store y6
+
+
+        .ENDIF
+
+        // finish first stage of 8 point FFT
+
+        // fsub    qU7,qX3,qX7
+        fsub    dUr7,dXr3,dXr7
+        fsub    dUi7,dXi3,dXi7
+
+        mov     dT0s[0], t0
+
+        // finish second stage of 8 point FFT
+
+        fsub    dVr1,dUr1,dUi5
+        //  data[0] for next iteration
+        ld2     {dXr0,dXi0},[pSrc],pointStep
+        fadd    dVi1,dUi1,dUr5
+        fadd    dVr3,dUr1,dUi5
+        ld2     {dXr1,dXi1},[pSrc],pointStep     //  data[1]
+        fsub    dVi3,dUi1,dUr5
+
+        fsub    dVr5,dUr3,dUi7
+        ld2     {dXr2,dXi2},[pSrc],pointStep     //  data[2]
+        fadd    dVi5,dUi3,dUr7
+        fadd    dVr7,dUr3,dUi7
+        ld2     {dXr3,dXi3},[pSrc],pointStep     //  data[3]
+        fsub    dVi7,dUi3,dUr7
+
+        // finish third stage of 8 point FFT
+
+        .ifeqs  "\inverse", "TRUE"
+
+            // calculate a*v5
+            fmul    dT1,dVr5,dT0[0]              // use dVi0 for dT1
+
+            ld2     {dXr4,dXi4},[pSrc],pointStep //  data[4]
+            fmul    dVi5,dVi5,dT0[0]
+
+            ld2     {dXr5,dXi5},[pSrc],pointStep //  data[5]
+            fsub    dVr5,dT1,dVi5                // a * V5
+            fadd    dVi5,dT1,dVi5
+
+            ld2     {dXr6,dXi6},[pSrc],pointStep //  data[6]
+
+            // calculate  b*v7
+            fmul    dT1,dVr7,dT0[0]
+            fmul    dVi7,dVi7,dT0[0]
+
+            // fadd    qY1,qV1,qV5
+            // fsub    qY5,qV1,qV5
+            fadd    dYr1,dVr1,dVr5
+            fsub    dYr5,dVr1,dVr5
+            fadd    dYi1,dVi1,dVi5
+            fsub    dYi5,dVi1,dVi5
+
+            fadd    dVr7,dT1,dVi7                // b * V7
+            fsub    dVi7,dVi7,dT1
+            SUB     pDst, pDst, step2            // set pDst to y1
+
+            // On the last iteration,  this will read past the end of pSrc, 
+            // so skip this read.
+            BEQ     radix8SkipLastUpdateInv\name
+            ld2     {dXr7,dXi7},[pSrc],setStep   //  data[7]
+radix8SkipLastUpdateInv\name:
+
+            fsub    dYr3,dVr3,dVr7
+            fsub    dYi3,dVi3,dVi7
+            st2     {dYr1,dYi1},[pDst],step1     // store y1
+            fadd    dYr7,dVr3,dVr7
+            fadd    dYi7,dVi3,dVi7
+
+
+            st2     {dYr3,dYi3},[pDst],step1     // store y3
+            st2     {dYr5,dYi5},[pDst],step1     // store y5
+            st2     {dYr7,dYi7},[pDst]           // store y7
+            ADD pDst, pDst, #16
+
+        .ELSE
+
+            // calculate  b*v7
+            fmul    dT1,dVr7,dT0[0]
+            ld2     {dXr4,dXi4},[pSrc],pointStep //  data[4]
+            fmul    dVi7,dVi7,dT0[0]
+
+            ld2     {dXr5,dXi5},[pSrc],pointStep //  data[5]
+            fadd    dVr7,dT1,dVi7                     // b * V7
+            fsub    dVi7,dVi7,dT1
+
+            ld2     {dXr6,dXi6},[pSrc],pointStep //  data[6]
+
+            // calculate a*v5
+            fmul    dT1,dVr5,dT0[0]              // use dVi0 for dT1
+            fmul    dVi5,dVi5,dT0[0]
+
+            fadd    dYr7,dVr3,dVr7
+            fadd    dYi7,dVi3,dVi7
+            SUB     pDst, pDst, step2            // set pDst to y1
+
+            fsub    dVr5,dT1,dVi5                // a * V5
+            fadd    dVi5,dT1,dVi5
+
+            // On the last iteration,  this will read past the end of pSrc, 
+            // so skip this read.
+            BEQ     radix8SkipLastUpdateFwd\name
+            ld2     {dXr7,dXi7},[pSrc],setStep   //  data[7]
+radix8SkipLastUpdateFwd\name:
+
+            // fsub    qY5,qV1,qV5
+            fsub    dYr5,dVr1,dVr5
+            fsub    dYi5,dVi1,dVi5
+
+            fsub    dYr3,dVr3,dVr7
+            st2     {dYr7,dYi7},[pDst],step1     // store y1
+            fsub    dYi3,dVi3,dVi7
+
+            // fadd    qY1,qV1,qV5
+            fadd    dYr1,dVr1,dVr5
+            fadd    dYi1,dVi1,dVi5
+
+            st2     {dYr5,dYi5},[pDst],step1     // store y3
+            st2     {dYr3,dYi3},[pDst],step1     // store y5
+            st2     {dYr1,dYi1},[pDst],#16       // store y7
+
+        .ENDIF
+
+
+        // update pDst for the next set
+        SUB     pDst, pDst, step2
+        BGT     radix8fsGrpZeroSetLoop\name
+
+        // Save subFFTNum and subFFTSize for next stage
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+        
+        .endm
+
+
+        // Allocate stack memory required by the function
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c
new file mode 100644
index 0000000..f29796b
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c
@@ -0,0 +1,190 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+/**
+ * Function:  omxSP_FFTFwd_CToC_FC32_Sfs   (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order, 
+ * where 0 <= order <= 15. 
+ * Transform length is determined by the specification structure, which 
+ * must be initialized prior to calling the FFT function using the appropriate 
+ * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship 
+ * between the input and output sequences can be expressed in terms of the 
+ * DFT, i.e., 
+ *
+ *      X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ *      k = 0,1,2,..., N-1
+ *      N = 2^order
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the input signal, a complex-valued vector of length
+ *            2^order; must be aligned on a 32 byte boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *
+ * Output Arguments:
+ *   pDst - pointer to the complex-valued output vector, of length 2^order;
+ *            must be aligned on an 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions 
+ *              is true: 
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or 
+ *              pFFTSpec. 
+ *    -    pSrc or pDst is not 32-byte aligned 
+ *
+ */
+
+OMXResult omxSP_FFTFwd_CToC_FC32_Sfs(const OMX_FC32* pSrc,
+                                     OMX_FC32* pDst,
+                                     const OMXFFTSpec_C_FC32* pFFTSpec) {
+  ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersFC32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  order = fastlog2(spec->N);
+
+  subFFTSize = 1;
+  subFFTNum = spec->N;
+  pTwiddle = spec->pTwiddle;
+  pOut = spec->pBuf;
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * the very last FFT stage.
+     */
+    if ((order & 2) == 0) {
+      argDst = pOut;
+      pOut = pDst;
+    } else {
+      argDst = pDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+        pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Order = 1 */
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  }
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c
new file mode 100644
index 0000000..f1e503e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c
@@ -0,0 +1,213 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void ComplexToRealFixup(OMX_FC32* pSrc,
+                               OMX_F32* pDst,
+                               const OMX_FC32* pTwiddle,
+                               OMX_F32* pBuf,
+                               long N);
+
+/**
+ * Function:  omxSP_FFTFwd_CToC_FC32_Sfs   (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order,
+ * where 0 <= order <= 15.
+ * Transform length is determined by the specification structure, which
+ * must be initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship
+ * between the input and output sequences can be expressed in terms of the
+ * DFT, i.e.,
+ *
+ *      X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ *      k = 0,1,2,..., N-1
+ *      N = 2^order
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the input signal, a complex-valued vector of length
+ *            2^order; must be aligned on a 32 byte boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized specification
+ *            structure
+ *
+ * Output Arguments:
+ *   pDst - pointer to the complex-valued output vector, of length 2^order;
+ *            must be aligned on an 32-byte boundary.
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ *              is true:
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or
+ *              pFFTSpec.
+ *    -    pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32* pSrc,
+                                      OMX_F32* pDst,
+                                      const OMXFFTSpec_R_F32* pFFTSpec) {
+  ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+  OMX_FC32* pComplexSrc = (OMX_FC32*) pSrc;
+  OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersF32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  /*
+   * Compute the RFFT using a complex FFT of one less order, so set
+   * order to be the order of the complex FFT.
+   */
+  order = fastlog2(spec->N) - 1;
+
+  subFFTSize = 1;
+  subFFTNum = spec->N >> 1;
+  pTwiddle = spec->pTwiddle;
+  pOut = (OMX_FC32*) spec->pBuf;
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+    OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * ComplexToRealFixup.
+     */
+    if ((order & 2) != 0) {
+      argDst = pOut;
+      pOut = pComplexDst;
+    } else {
+      argDst = pComplexDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+        pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 1) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Handle complex order 0 specially */
+    pOut->Re = pSrc[0];
+    pOut->Im = pSrc[1];
+  }
+
+  /*
+   * Complex FFT done. Fix up the complex result to give the correct
+   * RFFT.
+   */
+
+  ComplexToRealFixup(pOut, pDst, pTwiddle, spec->pBuf, spec->N);
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c
new file mode 100644
index 0000000..84de9cf
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c
@@ -0,0 +1,259 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
+    const OMX_F32* pSrc,
+    const OMX_FC32* pTwiddle,
+    OMX_F32* pBuf,
+    long N);
+
+/*
+ * Scale FFT data by 1/|length|. |length| must be a power of two
+ */
+static inline ScaleRFFTData(OMX_F32* fftData, unsigned length) {
+  float32_t* data = (float32_t*)fftData;
+  float32_t scale = 2.0f / length;
+
+  if (length >= 4) {
+    /*
+     * Do 4 float elements at a time because |length| is always a
+     * multiple of 4 when |length| >= 4.
+     *
+     * TODO(rtoy): Figure out how to process 8 elements at a time
+     * using intrinsics or replace this with inline assembly.
+     */
+    do {
+      float32x4_t x = vld1q_f32(data);
+
+      length -= 4;
+      x = vmulq_n_f32(x, scale);
+      vst1q_f32(data, x);
+      data += 4;
+    } while (length > 0);
+  } else if (length == 2) {
+    float32x2_t x = vld1_f32(data);
+    x = vmul_n_f32(x, scale);
+    vst1_f32(data, x);
+  } else {
+    fftData[0] *= scale;
+  }
+}
+
+/**
+ * Function:  omxSP_FFTInv_CCSToR_F32_Sfs
+ *
+ * Description:
+ * These functions compute the inverse FFT for a conjugate-symmetric input 
+ * sequence.  Transform length is determined by the specification structure, 
+ * which must be initialized prior to calling the FFT function using 
+ * <FFTInit_R_F32>. For a transform of length M, the input sequence is 
+ * represented using a packed CCS vector of length M+2, and is organized 
+ * as follows: 
+ *
+ *   Index:   0  1  2    3    4    5    . . .  M-2       M-1      M      M+1 
+ *   Comp:  R[0] 0  R[1] I[1] R[2] I[2] . . .  R[M/2-1]  I[M/2-1] R[M/2] 0 
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary
+ * components for FFT bin n. Bins are numbered from 0 to M/2, where M
+ * is the FFT length.  Bin index 0 corresponds to the DC component,
+ * and bin index M/2 corresponds to the foldover frequency.
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input sequence represented
+ *          using CCS format, of length (2^order) + 2; must be aligned on a
+ *          32-byte boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized
+ *              specification structure
+ *
+ * Output Arguments:
+ *   pDst - pointer to the real-valued output sequence, of length
+ *          2^order ; must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+
+ *    OMX_Sts_BadArgErr - bad arguments if one or more of the
+ *      following is true:
+ *    -    pSrc, pDst, or pFFTSpec is NULL 
+ *    -    pSrc or pDst is not aligned on a 32-byte boundary 
+ *
+ */
+OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(
+    const OMX_F32* pSrc,
+    OMX_F32* pDst,
+    const OMXFFTSpec_R_F32* pFFTSpec) {
+  ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+  OMX_FC32* pComplexSrc;
+  OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersF32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  /*
+   * Preprocess the input before calling the complex inverse FFT. The
+   * result is actually stored in the second half of the temp buffer
+   * in pFFTSpec.
+   */
+  if (spec->N > 1)
+    armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
+        pSrc, spec->pTwiddle, spec->pBuf, spec->N);
+  
+  /*
+   * Do a complex inverse FFT of half size.
+   */
+  order = fastlog2(spec->N) - 1;
+
+  subFFTSize = 1;
+  subFFTNum = spec->N >> 1;
+  pTwiddle = spec->pTwiddle;
+  /*
+   * The pBuf is split in half. The first half is the temp buffer. The
+   * second half holds the source data that was placed there by
+   * armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe.
+   */
+  pOut = (OMX_FC32*) spec->pBuf;
+  pComplexSrc = pOut + (1 << order);
+
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * the very last FFT stage.
+     */
+    if ((order & 2) == 0) {
+      argDst = pOut;
+      pOut = pComplexDst;
+    } else {
+      argDst = pComplexDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+        pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 1) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Order = 0 */
+    *pComplexDst = *pComplexSrc;
+  }
+
+  ScaleRFFTData(pDst, spec->N);
+  return OMX_Sts_NoErr;
+}
+
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c
new file mode 100644
index 0000000..eec05e9
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c
@@ -0,0 +1,214 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+/*
+ * Scale FFT data by 1/|length|. |length| must be a power of two
+ */
+static inline ScaleFFTData(OMX_FC32* fftData, unsigned length) {
+  float32_t* data = (float32_t*)fftData;
+  float32_t scale = 1.0f / length;
+
+  /*
+   * Do two complex elements at a time because |length| is always
+   * greater than or equal to 2 (order >= 1)
+   */
+  do {
+    float32x4_t x = vld1q_f32(data);
+
+    length -= 2;
+    x = vmulq_n_f32(x, scale);
+    vst1q_f32(data, x);
+    data += 4;
+  } while (length > 0);
+}
+
+/**
+ * Function:  omxSP_FFTInv_CToC_FC32
+ *
+ * Description:
+ * These functions compute an inverse FFT for a complex signal of
+ * length of 2^order, where 0 <= order <= 15. Transform length is
+ * determined by the specification structure, which must be
+ * initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_FC32>. The relationship between the input
+ * and output sequences can be expressed in terms of the IDFT, i.e.:
+ *
+ *     x[n] = SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N)
+ *     n=0,1,2,...N-1
+ *     N=2^order.
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input signal, of length 2^order ;
+ *          must be aligned on a 32-byte boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized specification
+ *            structure
+ *
+ * Output Arguments:
+ *   order
+ *   pDst - pointer to the complex-valued output signal, of length 2^order;
+ *          must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ *              is true:
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or
+ *              pFFTSpec.
+ *    -   pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTInv_CToC_FC32_Sfs(const OMX_FC32* pSrc,
+                                     OMX_FC32* pDst,
+                                     const OMXFFTSpec_C_FC32* pFFTSpec) {
+  ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersFC32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  order = fastlog2(spec->N);
+
+  subFFTSize = 1;
+  subFFTNum = spec->N;
+  pTwiddle = spec->pTwiddle;
+  pOut = spec->pBuf;
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * the very last FFT stage.
+     */
+    if ((order & 2) == 0) {
+      argDst = pOut;
+      pOut = pDst;
+    } else {
+      argDst = pDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+        pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Order = 1 */
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  }
+
+  ScaleFFTData(pDst, spec->N);
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/test/support/float_fft_neon.c b/dl/sp/src/test/support/float_fft_neon.c
index 3f0cf16..a10d803 100644
--- a/dl/sp/src/test/support/float_fft_neon.c
+++ b/dl/sp/src/test/support/float_fft_neon.c
@@ -8,22 +8,37 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+
 #include "dl/sp/api/armSP.h"
 #include "dl/sp/api/omxSP.h"
 #include "dl/sp/src/test/test_util.h"
 
 static const char* message =
-    "Test forward and inverse floating-point FFT (NEON)\n";
+    "Test forward and inverse floating-point FFT"
+#if defined(__aarch64__)
+    " (ARM64)\n"
+#else
+    " (NEON)\n"
+#endif
+    ;
 
 const char* UsageMessage() {
   return message;
 }
 
+#if defined(__aarch64__)
+#define FINISHED_MESSAGE "ARM64 tests finished.\n"
+#else
+#define FINISHED_MESSAGE "NEON tests finished.\n"
+#endif
+
 void FinishedMessage() {
-  printf("NEON tests finished.\n");
+  printf(FINISHED_MESSAGE);
 }
 
 void SetThresholds(struct TestInfo* info) {
+#if defined(__arm__)
 #ifdef BIG_FFT_TABLE
   info->forward_threshold_ = 138.81;
   info->inverse_threshold_ = 137.81;
@@ -31,6 +46,15 @@
   info->forward_threshold_ = 138.81;
   info->inverse_threshold_ = 138.81;
 #endif
+#else
+#ifdef BIG_FFT_TABLE
+  info->forward_threshold_ = 138.96;
+  info->inverse_threshold_ = 138.96;
+#else
+  info->forward_threshold_ = 138.96;
+  info->inverse_threshold_ = 138.96;
+#endif
+#endif
 }
 
 OMXResult ForwardFFT(OMX_FC32* x,
diff --git a/dl/sp/src/test/support/float_rfft_thresholds.h b/dl/sp/src/test/support/float_rfft_thresholds.h
index 2d84eec..6bbc937 100644
--- a/dl/sp/src/test/support/float_rfft_thresholds.h
+++ b/dl/sp/src/test/support/float_rfft_thresholds.h
@@ -23,6 +23,14 @@
 #define FLOAT_RFFT_FORWARD_THRESHOLD_ARMV7 (134.95)
 #define FLOAT_RFFT_INVERSE_THRESHOLD_ARMV7 (142.25)
 #endif
+#elif defined(__aarch64__)
+#ifdef BIG_FFT_TABLE
+#define FLOAT_RFFT_FORWARD_THRESHOLD_NEON (136.55)
+#define FLOAT_RFFT_INVERSE_THRESHOLD_NEON (141.55)
+#else
+#define FLOAT_RFFT_FORWARD_THRESHOLD_NEON (136.55)
+#define FLOAT_RFFT_INVERSE_THRESHOLD_NEON (142.74)
+#endif
 #else
 #ifdef BIG_FFT_TABLE
 #define FLOAT_RFFT_FORWARD_THRESHOLD_X86 (135.97)
diff --git a/dl/sp/src/test/test_fft.gyp b/dl/sp/src/test/test_fft.gyp
index b1d88e3..3c739cf 100644
--- a/dl/sp/src/test/test_fft.gyp
+++ b/dl/sp/src/test/test_fft.gyp
@@ -128,6 +128,19 @@
         },
       ],
     }],
+    ['target_arch == "arm64"', {
+      'targets': [
+        {
+          # Test complex floating-point FFT
+          'target_name': 'test_float_fft',
+          'type': 'executable',
+          'sources': [
+            'test_float_fft.c',
+            'support/float_fft_neon.c',
+          ],
+        },
+      ],
+    }],
   ],
   'targets': [
     # Targets that should be supported by all architectures
@@ -155,7 +168,7 @@
         'support/float_rfft_thresholds.h',
       ],
       'conditions': [
-        ['target_arch == "arm"', {
+        ['target_arch == "arm" or target_arch == "arm64"', {
           'sources': [
             'support/float_rfft_neon.c',
           ],
@@ -175,9 +188,9 @@
         'test_fft_time.c',
       ],
       'conditions': [
-        ['target_arch == "ia32"', {
+        ['target_arch == "ia32" or target_arch == "arm64"', {
           'defines': [
-            # Timing test only for float FFTs on x86
+            # Timing test only for float FFTs on x86 and arm64.
             'FLOAT_ONLY',
           ],
         }],
@@ -206,6 +219,12 @@
             'test_float_rfft_detect',
           ],
         }],
+        ['target_arch == "arm64"', {
+          # Supported test programs for ARM64
+          'dependencies': [
+            'test_float_fft',
+           ],
+        }],
       ],
       'dependencies' : [
         # All architectures must support at least the float rfft test
diff --git a/dl/sp/src/test/test_fft_time.c b/dl/sp/src/test/test_fft_time.c
index db65134..6154228 100644
--- a/dl/sp/src/test/test_fft_time.c
+++ b/dl/sp/src/test/test_fft_time.c
@@ -51,7 +51,7 @@
   S32,
 } s16_s32;
 
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
 void TimeOneFloatFFT(int count, int fft_log_size, float signal_value,
                      int signal_type);
 void TimeFloatFFT(int count, float signal_value, int signal_type);
@@ -112,7 +112,7 @@
       "  -T          Run just one FFT timing test\n"
       "  -f          FFT type:\n"
       "              0 - Complex Float\n"
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
       "              1 - Real Float\n"
 #endif
 #ifdef ENABLE_FIXED_POINT_FFT_TESTS
@@ -217,7 +217,7 @@
     printf("Warning:  -f ignored when -T not specified\n");
 
   if (test_mode) {
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
     TimeFloatFFT(count, signal_value, signal_type);
 #endif
     TimeFloatRFFT(count, signal_value, signal_type);
@@ -229,7 +229,7 @@
 #endif
   } else {
     switch (fft_type) {
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
       case 0:
         TimeOneFloatFFT(count, fft_log_size, signal_value, signal_type);
         break;
@@ -323,7 +323,7 @@
   return count;
 }
 
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
 void TimeOneFloatFFT(int count, int fft_log_size, float signal_value,
                      int signal_type) {
   struct AlignedPtr* x_aligned;
@@ -1061,11 +1061,11 @@
 
     if(s16s32 == S32) {
       PrintResult("Forward RFFT16 (with S32)",
-		  fft_log_size, elapsed_time, count);
+                  fft_log_size, elapsed_time, count);
     }
     else {
       PrintResult("Forward RFFT16 (with S16)",
-		  fft_log_size, elapsed_time, count);
+                  fft_log_size, elapsed_time, count);
     }
   }
 
@@ -1124,11 +1124,11 @@
 
     if(s16s32 == S32) {
       PrintResult("Inverse RFFT16 (with S32)",
-		  fft_log_size, elapsed_time, count);
+                  fft_log_size, elapsed_time, count);
     }
     else {
       PrintResult("Inverse RFFT16 (with S16)",
-		  fft_log_size, elapsed_time, count);
+                  fft_log_size, elapsed_time, count);
     }
   }
 
diff --git a/dl/sp/src/test/test_float_fft.c b/dl/sp/src/test/test_float_fft.c
index 289b197..6aac3a4 100644
--- a/dl/sp/src/test/test_float_fft.c
+++ b/dl/sp/src/test/test_float_fft.c
@@ -154,7 +154,7 @@
 
   status = ForwardFFT(x, y, fft_fwd_spec);
 
-  if (status) {
+  if (status != OMX_Sts_NoErr) {
     fprintf(stderr, "Forward FFT failed: status = %d\n", status);
     exit(1);
   }
@@ -223,7 +223,7 @@
 
   status = InverseFFT(y, z, fft_inv_spec);
 
-  if (status) {
+  if (status != OMX_Sts_NoErr) {
     fprintf(stderr, "Inverse FFT failed: status = %d\n", status);
     exit(1);
   }
diff --git a/dl/sp/src/test/test_float_rfft.c b/dl/sp/src/test/test_float_rfft.c
index a976162..3b37ece 100644
--- a/dl/sp/src/test/test_float_rfft.c
+++ b/dl/sp/src/test/test_float_rfft.c
@@ -177,7 +177,7 @@
   }
 
   status = ForwardRFFT(x, (OMX_F32*) y, fft_fwd_spec);
-  if (status) {
+  if (status != OMX_Sts_NoErr) {
     fprintf(stderr, "Forward FFT failed: status = %d\n", status);
     exit(1);
   }
@@ -257,7 +257,7 @@
   }
 
   status = InverseRFFT((OMX_F32 *) yTrue, z, fft_inv_spec);
-  if (status) {
+  if (status != OMX_Sts_NoErr) {
     fprintf(stderr, "Inverse FFT failed: status = %d\n", status);
     exit(1);
   }