diff --git a/dl/api/arm/arm64COMM_s.h b/dl/api/arm/arm64COMM_s.h
new file mode 100644
index 0000000..e19ceee
--- /dev/null
+++ b/dl/api/arm/arm64COMM_s.h
@@ -0,0 +1,258 @@
+// -*- Mode: asm; -*-
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This file was originally licensed as follows. It has been
+//  relicensed with permission from the copyright holders.
+//
+	
+// 
+// File Name:  armCOMM_s.h
+// OpenMAX DL: v1.0.2
+// Last Modified Revision:   13871
+// Last Modified Date:       Fri, 09 May 2008
+// 
+// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+// 
+// 
+//
+// ARM optimized OpenMAX common header file
+//
+
+	.set	_SBytes, 0	// Number of scratch bytes on stack
+	.set	_Workspace, 0	// Stack offset of scratch workspace
+
+	.set	_RRegList, 0	// R saved register list (last register number)
+	.set	_DRegList, 0	// D saved register list (last register number)
+
+	// Work out list of D saved registers, like for R registers.
+	.macro	_M_GETDREGLIST dreg
+	.ifeqs "\dreg", ""
+	.set	_DRegList, 0
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d8"
+	.set	_DRegList, 8
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d9"
+	.set	_DRegList, 9
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d10"
+	.set	_DRegList, 10
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d11"
+	.set	_DRegList, 11
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d12"
+	.set	_DRegList, 12
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d13"
+	.set	_DRegList, 13
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d14"
+	.set	_DRegList, 14
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d15"
+	.set	_DRegList, 15
+	.exitm
+	.endif
+
+	.warning "Unrecognized saved d register limit: \rreg"
+	.endm
+
+//////////////////////////////////////////////////////////
+// Function header and footer macros
+//////////////////////////////////////////////////////////      
+	
+        // Function Header Macro    
+        // Generates the function prologue
+        // Note that functions should all be "stack-moves-once"
+        // The FNSTART and FNEND macros should be the only places
+        // where the stack moves.
+        //    
+        //  name  = function name
+        //  rreg  = ""   don't stack any registers
+        //          "lr" stack "lr" only
+        //          "rN" stack registers "r4-rN,lr"
+        //  dreg  = ""   don't stack any D registers
+        //          "dN" stack registers "d8-dN"
+        //
+        // Note: ARM Archicture procedure call standard AAPCS
+        // states that r4-r11, sp, d8-d15 must be preserved by
+        // a compliant function.
+	.macro	M_START name, rreg, dreg
+	.set	_Workspace, 0
+
+	// Define the function and make it external.
+	.global	\name
+	.func	\name
+	.section	.text.\name,"ax",%progbits
+	.align	4
+\name :		
+//.fnstart
+	// Save specified R registers
+	_M_PUSH_RREG
+
+	// Save specified D registers
+        _M_GETDREGLIST  \dreg
+	_M_PUSH_DREG
+
+	// Ensure size claimed on stack is 16-byte aligned for ARM64
+	.if (_SBytes & 15) != 0
+	.set	_SBytes, _SBytes + (16 - (_SBytes & 15))
+	.endif
+	.if _SBytes != 0
+		sub	sp, sp, #_SBytes
+	.endif	
+	.endm
+
+        // Function Footer Macro        
+        // Generates the function epilogue
+	.macro M_END
+	// Restore the stack pointer to its original value on function entry
+	.if _SBytes != 0
+		add	sp, sp, #_SBytes
+	.endif
+	// Restore any saved R or D registers.
+	_M_RET
+	//.fnend	
+	.endfunc
+        // Reset the global stack tracking variables back to their
+	// initial values.
+	.set _SBytes, 0
+	.endm
+
+	// Based on the value of _DRegList, push the specified set of registers 
+	// to the stack.
+	// The ARM64 ABI says only v8-v15 needs to be saved across calls and only 
+	// the lower 64 bits need to be saved.
+	.macro _M_PUSH_DREG
+	.if _DRegList >= 8
+		sub	sp, sp, (_DRegList - 7) * 16	// 16-byte alignment
+		str	q8, [sp]
+	.endif
+	
+	.if _DRegList >= 9
+		str	q9, [sp, #16]
+	.endif
+	
+	.if _DRegList >= 10
+		str	q10, [sp, #32]
+	.endif
+	
+	.if _DRegList >= 11
+		str	q11, [sp, #48]
+	.endif
+	
+	.if _DRegList >= 12
+		str	q12, [sp, #64]
+	.endif
+	
+	.if _DRegList >= 13
+		str	q13, [sp, #80]
+	.endif
+	
+	.if _DRegList >= 14
+		str	q14, [sp, #96]
+	.endif
+	
+	.if _DRegList >= 15
+		str	q15, [sp, #112]
+	.endif
+	
+	.exitm
+	.endm
+
+	// Based on the value of _RRegList, push the specified set of registers 
+	// to the stack.
+	// The ARM64 ABI says registers r19-r29 needs to be saved across calls.
+	// But for the FFT routines, we don't need to save anything, so just
+	// preserve the SP and LR.
+	.macro _M_PUSH_RREG
+	sub	sp, sp, #16
+	str	x30, [sp]
+	str	x29, [sp, #8]
+	.exitm
+	.endm
+
+	// The opposite of _M_PUSH_DREG
+	.macro  _M_POP_DREG
+	.if _DRegList >= 8
+		ldr	q8, [sp]
+	.endif
+	
+	.if _DRegList >= 9
+		ldr	q9, [sp, #16]
+	.endif
+	
+	.if _DRegList >= 10
+		ldr	q10, [sp, #32]
+	.endif
+	
+	.if _DRegList >= 11
+		ldr	q11, [sp, #48]
+	.endif
+	
+	.if _DRegList >= 12
+		ldr	q12, [sp, #64]
+	.endif
+	
+	.if _DRegList >= 13
+		ldr	q13, [sp, #80]
+	.endif
+	
+	.if _DRegList >= 14
+		ldr	q14, [sp, #96]
+	.endif
+	
+	.if _DRegList >= 15
+		ldr	q15, [sp, #112]
+	.endif
+
+	.if _DRegList >= 8
+		add	sp, sp, (_DRegList - 7) * 16	// 16-byte alignment
+	.endif
+	.exitm
+	.endm
+
+	// The opposite of _M_PUSH_RREG
+	.macro _M_POP_RREG cc
+	ldr	x29, [sp, #8]
+	ldr	x30, [sp]
+	add	sp, sp, #16
+	.exitm
+	.endm
+	
+        // Produce function return instructions
+	.macro	_M_RET cc
+	_M_POP_DREG \cc
+	_M_POP_RREG \cc
+	ret
+	.endm	
+	// rsb - reverse subtract
+	// compute dst = src2 - src1, useful when src2 is an immediate value
+	.macro	rsb	dst, src1, src2
+	sub	\dst, \src1, \src2
+	neg	\dst, \dst
+	.endm
diff --git a/dl/api/arm/omxtypes_s.h b/dl/api/arm/omxtypes_s.h
index d880d35..b27f72d 100644
--- a/dl/api/arm/omxtypes_s.h
+++ b/dl/api/arm/omxtypes_s.h
@@ -1,76 +1,76 @@
-@//
-@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
-@//
-@//  Use of this source code is governed by a BSD-style license
-@//  that can be found in the LICENSE file in the root of the source
-@//  tree. An additional intellectual property rights grant can be found
-@//  in the file PATENTS.  All contributing project authors may
-@//  be found in the AUTHORS file in the root of the source tree.
-@//
-@//  This file was originally licensed as follows. It has been
-@//  relicensed with permission from the copyright holders.
-@//
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This file was originally licensed as follows. It has been
+//  relicensed with permission from the copyright holders.
+//
 
-@// 
-@// File Name:  omxtypes_s.h
-@// OpenMAX DL: v1.0.2
-@// Last Modified Revision:   9622
-@// Last Modified Date:       Wed, 06 Feb 2008
-@// 
-@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
-@// 
-@//
+// 
+// File Name:  omxtypes_s.h
+// OpenMAX DL: v1.0.2
+// Last Modified Revision:   9622
+// Last Modified Date:       Wed, 06 Feb 2008
+// 
+// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+// 
+//
 
-@// Mandatory return codes - use cases are explicitly described for each function 
-	.equ	OMX_Sts_NoErr, 0    @// No error the function completed successfully 
-	.equ	OMX_Sts_Err, -2    @// Unknown/unspecified error     
-	.equ	OMX_Sts_InvalidBitstreamValErr, -182  @// Invalid value detected during bitstream processing     
-	.equ	OMX_Sts_MemAllocErr, -9    @// Not enough memory allocated for the operation 
-	.equ	OMX_StsACAAC_GainCtrErr, -159  @// AAC: Unsupported gain control data detected 
-	.equ	OMX_StsACAAC_PrgNumErr, -167  @// AAC: Invalid number of elements for one program   
-	.equ	OMX_StsACAAC_CoefValErr, -163  @// AAC: Invalid quantized coefficient value               
-	.equ	OMX_StsACAAC_MaxSfbErr, -162  @// AAC: Invalid maxSfb value in relation to numSwb     
-	.equ	OMX_StsACAAC_PlsDataErr, -160  @// AAC: pulse escape sequence data error 
+// Mandatory return codes - use cases are explicitly described for each function 
+        .equ    OMX_Sts_NoErr, 0    // No error the function completed successfully 
+        .equ    OMX_Sts_Err, -2    // Unknown/unspecified error     
+        .equ    OMX_Sts_InvalidBitstreamValErr, -182  // Invalid value detected during bitstream processing     
+        .equ    OMX_Sts_MemAllocErr, -9    // Not enough memory allocated for the operation 
+        .equ    OMX_StsACAAC_GainCtrErr, -159  // AAC: Unsupported gain control data detected 
+        .equ    OMX_StsACAAC_PrgNumErr, -167  // AAC: Invalid number of elements for one program   
+        .equ    OMX_StsACAAC_CoefValErr, -163  // AAC: Invalid quantized coefficient value               
+        .equ    OMX_StsACAAC_MaxSfbErr, -162  // AAC: Invalid maxSfb value in relation to numSwb     
+        .equ    OMX_StsACAAC_PlsDataErr, -160  // AAC: pulse escape sequence data error 
 
-@// Optional return codes - use cases are explicitly described for each function
-	.equ	OMX_Sts_BadArgErr, -5    @// Bad Arguments 
+// Optional return codes - use cases are explicitly described for each function
+        .equ    OMX_Sts_BadArgErr, -5    // Bad Arguments 
 
-	.equ	OMX_StsACAAC_TnsNumFiltErr, -157  @// AAC: Invalid number of TNS filters  
-	.equ	OMX_StsACAAC_TnsLenErr, -156  @// AAC: Invalid TNS region length     
-	.equ	OMX_StsACAAC_TnsOrderErr, -155  @// AAC: Invalid order of TNS filter                    
-	.equ	OMX_StsACAAC_TnsCoefResErr, -154  @// AAC: Invalid bit-resolution for TNS filter coefficients  
-	.equ	OMX_StsACAAC_TnsCoefErr, -153  @// AAC: Invalid TNS filter coefficients                    
-	.equ	OMX_StsACAAC_TnsDirectErr, -152  @// AAC: Invalid TNS filter direction    
-	.equ	OMX_StsICJP_JPEGMarkerErr, -183  @// JPEG marker encountered within an entropy-coded block; 
-                                            @// Huffman decoding operation terminated early.           
-	.equ	OMX_StsICJP_JPEGMarker, -181  @// JPEG marker encountered; Huffman decoding 
-                                            @// operation terminated early.                         
-	.equ	OMX_StsIPPP_ContextMatchErr, -17   @// Context parameter doesn't match to the operation 
+        .equ    OMX_StsACAAC_TnsNumFiltErr, -157  // AAC: Invalid number of TNS filters  
+        .equ    OMX_StsACAAC_TnsLenErr, -156  // AAC: Invalid TNS region length     
+        .equ    OMX_StsACAAC_TnsOrderErr, -155  // AAC: Invalid order of TNS filter                    
+        .equ    OMX_StsACAAC_TnsCoefResErr, -154  // AAC: Invalid bit-resolution for TNS filter coefficients  
+        .equ    OMX_StsACAAC_TnsCoefErr, -153  // AAC: Invalid TNS filter coefficients                    
+        .equ    OMX_StsACAAC_TnsDirectErr, -152  // AAC: Invalid TNS filter direction    
+        .equ    OMX_StsICJP_JPEGMarkerErr, -183  // JPEG marker encountered within an entropy-coded block; 
+                                            // Huffman decoding operation terminated early.           
+        .equ    OMX_StsICJP_JPEGMarker, -181  // JPEG marker encountered; Huffman decoding 
+                                            // operation terminated early.                         
+        .equ    OMX_StsIPPP_ContextMatchErr, -17   // Context parameter doesn't match to the operation 
 
-	.equ	OMX_StsSP_EvenMedianMaskSizeErr, -180  @// Even size of the Median Filter mask was replaced by the odd one 
+        .equ    OMX_StsSP_EvenMedianMaskSizeErr, -180  // Even size of the Median Filter mask was replaced by the odd one 
 
-	.equ	OMX_Sts_MaximumEnumeration, 0x7FFFFFFF
+        .equ    OMX_Sts_MaximumEnumeration, 0x7FFFFFFF
 
 
 
-	.equ	OMX_MIN_S8, (-128)
-	.equ	OMX_MIN_U8, 0
-	.equ	OMX_MIN_S16, (-32768)
-	.equ	OMX_MIN_U16, 0
+        .equ    OMX_MIN_S8, (-128)
+        .equ    OMX_MIN_U8, 0
+        .equ    OMX_MIN_S16, (-32768)
+        .equ    OMX_MIN_U16, 0
 
 
-	.equ	OMX_MIN_S32, (-2147483647-1)
-	.equ	OMX_MIN_U32, 0
+        .equ    OMX_MIN_S32, (-2147483647-1)
+        .equ    OMX_MIN_U32, 0
 
-	.equ	OMX_MAX_S8, (127)
-	.equ	OMX_MAX_U8, (255)
-	.equ	OMX_MAX_S16, (32767)
-	.equ	OMX_MAX_U16, (0xFFFF)
-	.equ	OMX_MAX_S32, (2147483647)
-	.equ	OMX_MAX_U32, (0xFFFFFFFF)
+        .equ    OMX_MAX_S8, (127)
+        .equ    OMX_MAX_U8, (255)
+        .equ    OMX_MAX_S16, (32767)
+        .equ    OMX_MAX_U16, (0xFFFF)
+        .equ    OMX_MAX_S32, (2147483647)
+        .equ    OMX_MAX_U32, (0xFFFFFFFF)
 
-	.equ	OMX_VC_UPPER, 0x1                 @// Used by the PredictIntra functions   
-	.equ	OMX_VC_LEFT, 0x2                 @// Used by the PredictIntra functions 
-	.equ	OMX_VC_UPPER_RIGHT, 0x40          @// Used by the PredictIntra functions   
+        .equ    OMX_VC_UPPER, 0x1                 // Used by the PredictIntra functions   
+        .equ    OMX_VC_LEFT, 0x2                 // Used by the PredictIntra functions 
+        .equ    OMX_VC_UPPER_RIGHT, 0x40          // Used by the PredictIntra functions   
 
-	.equ	NULL, 0
+        .equ    NULL, 0
diff --git a/dl/dl.gyp b/dl/dl.gyp
index d4c812b..8972d07 100644
--- a/dl/dl.gyp
+++ b/dl/dl.gyp
@@ -34,6 +34,21 @@
             'BIG_FFT_TABLE',
           ],
         }],
+        ['target_arch=="arm" or target_arch=="arm64"', {
+          'sources':[
+            # Common files that are used by both arm and arm64 code.
+            'api/arm/armOMX.h',
+            'api/arm/omxtypes_s.h',
+            'sp/api/armSP.h',
+            'sp/src/arm/armSP_FFT_S32TwiddleTable.c',
+            'sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c',
+            'sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c',
+            'sp/src/arm/omxSP_FFTGetBufSize_R_F32.c',
+            'sp/src/arm/omxSP_FFTGetBufSize_R_S32.c',
+            'sp/src/arm/omxSP_FFTInit_C_FC32.c',
+            'sp/src/arm/omxSP_FFTInit_R_F32.c',
+          ],
+        }],
         ['target_arch=="arm"', {
           'cflags!': [
             '-mfpu=vfpv3-d16',
@@ -49,21 +64,11 @@
           'sources': [
             # Common files that are used by both the NEON and non-NEON code.
             'api/armCOMM_s.h',
-            'api/armOMX.h',
-            'api/omxtypes_s.h',
-            'sp/api/armSP.h',
-            'sp/src/arm/armSP_FFT_S32TwiddleTable.c',
-            'sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c',
             'sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c',
-            'sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c',
-            'sp/src/arm/omxSP_FFTGetBufSize_R_F32.c',
             'sp/src/arm/omxSP_FFTGetBufSize_R_S16.c',
             'sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c',
-            'sp/src/arm/omxSP_FFTGetBufSize_R_S32.c',
-            'sp/src/arm/omxSP_FFTInit_C_FC32.c',
             'sp/src/arm/omxSP_FFTInit_C_SC16.c',
             'sp/src/arm/omxSP_FFTInit_C_SC32.c',
-            'sp/src/arm/omxSP_FFTInit_R_F32.c',
             'sp/src/arm/omxSP_FFTInit_R_S16.c',
             'sp/src/arm/omxSP_FFTInit_R_S16S32.c',
             'sp/src/arm/omxSP_FFTInit_R_S32.c',
@@ -153,6 +158,27 @@
             'sp/src/x86/x86SP_SSE_Math.h',
           ],
         }],
+        ['target_arch=="arm64"', {
+          'sources':[
+            'api/arm/arm64COMM_s.h',
+
+            # Complex floating-point FFT
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S',
+            'sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S',
+            'sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c',
+            'sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c',
+            # Real floating-point FFT
+            'sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S',
+            'sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c',
+            'sp/src/arm/arm64/ComplexToRealFixup.S',
+            'sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c',
+          ],
+        }],
       ],
     },
   ],
diff --git a/dl/sp/api/armSP.h b/dl/sp/api/armSP.h
index 4972f09..cf17ec5 100644
--- a/dl/sp/api/armSP.h
+++ b/dl/sp/api/armSP.h
@@ -29,6 +29,8 @@
 #ifndef _armSP_H_
 #define _armSP_H_
 
+#include <stdint.h>
+
 #include "dl/api/omxtypes.h"
 
 #ifdef __cplusplus
@@ -88,6 +90,42 @@
     OMX_FC32* pBuf;
 } ARMsFFTSpec_FC32;
 
+/*
+ * Compute log2(x), where x must be a power of 2.
+ */
+static inline OMX_U32 fastlog2(long x) {
+  OMX_U32 out;
+  asm("clz %0,%1\n\t"
+      "sub %0, %0, #63\n\t"
+      "neg %0, %0\n\t"
+      : "=r"(out)
+      : "r"(x)
+      :);
+  return out;
+}
+
+/*
+ *  Validate args. All pointers must be non-NULL; the source and
+ *  destination pointers must be aligned on a 32-byte boundary; the
+ *  FFT spec must have non-NULL pointers; and the FFT size must be
+ *  within range.
+ */
+static inline int validateParametersFC32(const void* pSrc,
+					 const void* pDst,
+					 const ARMsFFTSpec_FC32* pFFTSpec) {
+  return pSrc && pDst && pFFTSpec && !(((uintptr_t)pSrc) & 31) &&
+         !(((uintptr_t)pDst) & 31) && pFFTSpec->pTwiddle && pFFTSpec->pBuf &&
+         (pFFTSpec->N >= 2) && (pFFTSpec->N <= (1 << TWIDDLE_TABLE_ORDER));
+}
+
+static inline int validateParametersF32(const void* pSrc,
+					const void* pDst,
+					const ARMsFFTSpec_R_FC32* pFFTSpec) {
+  return pSrc && pDst && pFFTSpec && !(((uintptr_t)pSrc) & 31) &&
+         !(((uintptr_t)pDst) & 31) && pFFTSpec->pTwiddle && pFFTSpec->pBuf &&
+         (pFFTSpec->N >= 2) && (pFFTSpec->N <= (1 << TWIDDLE_TABLE_ORDER));
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/dl/sp/src/arm/arm64/ComplexToRealFixup.S b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
new file mode 100644
index 0000000..9b30093
--- /dev/null
+++ b/dl/sp/src/arm/arm64/ComplexToRealFixup.S
@@ -0,0 +1,261 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute FFT for a real signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+    // Guarding implementation by the processor name
+
+// Import symbols required from other files
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pOut		x3	
+#define	subFFTNum	x4
+
+// Output registers
+
+//Local Scratch Registers
+
+#define argTwiddle      x5
+#define argDst          x6
+#define subFFTSize      x7
+#define N               subFFTNum
+#define order           x14
+#define step            x8
+#define step1           pTwiddle
+#define twStep          x9
+#define zero            w10
+#define pTwiddleTmp     pOut
+
+// Neon registers
+
+#define dX0       v0.2s
+#define dX0s      v0.s
+#define dX0r      v2.2s
+#define dX0rs     v2.s
+#define dX0i      v3.2s
+#define dX0is     v3.s
+#define dX1r      v4.2s
+#define dX1i      v5.2s
+#define dT0       v6.2s
+#define dT1       v7.2s
+#define dT2       v8.2s
+#define dT3       v9.2s
+#define qT0       v10.2s
+#define qT1       v12.2s
+#define dW0r      v14.2s
+#define dW0r8b    v14.8b
+#define dW0i      v15.2s
+#define dW1r      v16.2s
+#define dW1r8b    v16.8b
+#define dW1i      v17.2s
+#define dY0r      v14.2s
+#define dY0i      v15.2s
+#define dY1r      v16.2s
+#define dY1i      v17.2s
+#define qT2       v18.2s
+#define qT3       v20.2s
+
+#define half      v0.2s
+#define dZip      v21.2s
+#define dZip8b    v21.8b
+        
+    // Allocate stack memory required by the function
+
+    // Write function header
+        M_START     ComplexToRealFixup,,d15
+
+        asr     N, N, #1
+
+        clz     order, subFFTNum                    // N = 2^order
+
+        RSB     order,order,#63
+        MOV     subFFTSize,subFFTNum            // subFFTSize = N/2
+        //MOV     subFFTNum,N
+        mov     argDst, pDst
+        mov     argTwiddle, pTwiddle
+
+        // F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+        // 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] - j [0+j2b]
+        // (a+b, 0)
+
+        // F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+        // 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] + j [0+j2b]
+        // (a-b, 0)
+
+        // F(0) and F(N/2)
+        ld2     {dX0rs,dX0is}[0],[pSrc], #8
+        MOV     zero,#0
+        mov    dX0rs[1],zero
+        lsl     step,subFFTSize, #3               // step = N/2 * 8 bytes
+        mov    dX0i[1],zero
+        // twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,subFFTSize,LSL #1
+
+        fadd    dY0r,dX0r,dX0i                    // F(0) = ((Z0.r+Z0.i) , 0)
+        lsl     step1,subFFTSize, #2              // step1 = N/2 * 4 bytes
+        fsub    dY0i,dX0r,dX0i                    // F(N/2) = ((Z0.r-Z0.i) , 0)
+        SUBS    subFFTSize,subFFTSize,#2
+
+        st1     {dY0r},[argDst],step
+        ADD     pTwiddleTmp,argTwiddle,#8         // W^2
+        st1     {dY0i},[argDst], #8
+        ADD     argTwiddle,argTwiddle,twStep      // W^1
+
+//        dup     dzero,zero
+        SUB     argDst,argDst,step
+
+        BLT     End
+        BEQ     lastElement
+        SUB     step,step,#24
+        SUB     step1,step1,#8                    // (N/4-1)*8 bytes
+
+        // F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
+        // Note: W^k is stored as negative values in the table
+        // Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
+        // since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+
+        fmov     half, #0.5
+
+evenOddButterflyLoop:
+
+
+        ld1     {dW0r},[argTwiddle],step1
+        ld1     {dW1r},[argTwiddle], #8
+
+        ld2     {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle,argTwiddle,step1
+        ld2     {dX1r,dX1i},[pSrc], #16
+
+
+
+        SUB     step1,step1,#8                    // (N/4-2)*8 bytes
+        ld1     {dW0i},[pTwiddleTmp],step1
+        ld1     {dW1i},[pTwiddleTmp], #8
+        SUB     pSrc,pSrc,step
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        rev64   dX1r,dX1r
+        rev64   dX1i,dX1i
+        SUBS    subFFTSize,subFFTSize,#4
+
+
+
+        fsub    dT2,dX0r,dX1r                     // a-c
+        SUB     step1,step1,#8
+        fadd    dT0,dX0r,dX1r                     // a+c
+        fsub    dT1,dX0i,dX1i                     // b-d
+        fadd    dT3,dX0i,dX1i                     // b+d
+        fmul   dT0,dT0,half[0]
+        fmul   dT1,dT1,half[0]
+        // VZIP    dW1r,dW1i
+        // VZIP    dW0r,dW0i
+        zip1    dZip, dW1r, dW1i
+        zip2    dW1i, dW1r, dW1i
+        mov     dW1r8b, dZip8b
+        zip1    dZip, dW0r, dW0i
+        zip2    dW0i, dW0r, dW0i
+        mov     dW0r8b, dZip8b
+
+        fmul   qT0,dW1r,dT2
+        fmul   qT1,dW1r,dT3
+        fmul   qT2,dW0r,dT2
+        fmul   qT3,dW0r,dT3
+
+        fmla   qT0,dW1i,dT3
+        fmls   qT1,dW1i,dT2
+
+        fmls   qT2,dW0i,dT3
+        fmla   qT3,dW0i,dT2
+
+
+        fmul  dX1r,qT0,half[0]
+        fmul  dX1i,qT1,half[0]
+
+        fsub    dY1r,dT0,dX1i                     // F(N/2 -1)
+        fadd    dY1i,dT1,dX1r
+        fneg    dY1i,dY1i
+
+        rev64   dY1r,dY1r
+        rev64   dY1i,dY1i
+
+
+        fmul  dX0r,qT2,half[0]
+        fmul  dX0i,qT3,half[0]
+
+        fsub    dY0r,dT0,dX0i                     // F(1)
+        fadd    dY0i,dT1,dX0r
+
+
+        st2     {dY0r,dY0i},[argDst],step
+        st2     {dY1r,dY1i},[argDst], #16
+        SUB     argDst,argDst,step
+        SUB     step,step,#32                     // (N/2-4)*8 bytes
+
+
+        BGT     evenOddButterflyLoop
+
+        // set both the ptrs to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     argDst,argDst,#8
+
+
+
+        // Last element can be expanded as follows
+        // 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+        // 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] + j (c+jd) [0+j2b]
+        // (a-bc, -bd)
+        // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement:
+        ld1     {dX0r},[pSrc]
+
+        st1     {dX0rs}[0],[argDst], #4
+        fneg    dX0r,dX0r
+        st1     {dX0rs}[1],[argDst], #4
+End:
+
+        // Write function tail
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
new file mode 100644
index 0000000..da68314
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_s.S
@@ -0,0 +1,280 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of
+//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
+//  instead of SC32.
+//
+
+//
+// Description:
+// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
+// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+      // Guarding implementation by the processor name
+
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pTwiddle        x1
+#define	pOut		x2	
+#define	subFFTNum	x3
+
+// Output registers
+
+//Local Scratch Registers
+
+#define argTwiddle      x5
+#define argDst          x6
+#define subFFTSize      x7
+#define N               subFFTNum
+
+#define pOut1           x13
+	
+#define size            x7
+#define step            x8
+#define step1           x9
+#define twStep          x10
+#define pTwiddleTmp     x11
+#define argTwiddle1     x12
+
+// Neon registers
+
+#define dX0     v0.2s
+#define dX0s    v0.s
+#define dShift  v1.2s
+#define dX1     v1.2s
+#define dX1s    v1.s
+#define dY0     v2.2s
+#define dY08b   v2.8b
+#define dY1     v3.2s
+#define dX0r    v0.2s
+#define dX0rs   v0.s
+#define dX0i    v1.2s
+#define dX1r    v2.2s
+#define dX1i    v3.2s
+#define dW0r    v4.2s
+#define dW0r8b  v4.8b
+#define dW0i    v5.2s
+#define dW1r    v6.2s
+#define dW1r8b  v6.8b
+#define dW1i    v7.2s
+#define dT0     v8.2s
+#define dT1     v9.2s
+#define dT2     v10.2s
+#define dT3     v11.2s
+#define qT0     v12.2s
+#define qT1     v14.2s
+#define qT2     v16.2s
+#define qT3     v18.2s
+#define dY0r    v4.2s
+#define dY0i    v5.2s
+#define dY1r    v6.2s
+#define dY1i    v7.2s
+
+#define dY2     v4.2s
+#define dY3     v5.2s
+#define dW0     v6.2s
+#define dW1     v7.2s
+#define dW0Tmp  v10.2s
+#define dW1Neg  v11.2s
+
+#define dZip    v19.2s
+#define dZip8b  v19.8b
+#define half    v13.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        fmov    half, 0.5
+
+        asr     size, subFFTNum, #1           // preserve the contents of N = subFFTNum
+        lsl     step, subFFTNum, #2           // step = N/2 * 8 bytes
+
+
+        // Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        // Note: W^(k) is stored as negated value and also need to
+        // conjugate the values from the table
+
+        // Z(0) : no need of twiddle multiply
+        // Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+
+        ld1     {dX0},[pSrc],step
+        ADD     pOut1,pOut,step               // pOut1 = pOut+ N/2*8 bytes
+
+        ld1     {dX1},[pSrc], #8
+        // twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,size,LSL #1
+
+        lsl     step1,size, #2                // step1 = N/4 * 8 = N/2*4 bytes
+        SUB     step1,step1,#8                // (N/4-1)*8 bytes
+
+        fadd    dY0,dX0,dX1                   // [b+d | a+c]
+        fsub    dY1,dX0,dX1                   // [b-d | a-c]
+        fmul    dY0, dY0, half[0]
+        fmul    dY1, dY1, half[0]
+
+        // dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+        // VZIP    dY0,dY1
+        zip1    dZip,dY0,dY1
+        zip2    dY1,dY0,dY1
+        mov     dY08b, dZip8b
+
+        fsub   dX0,dY0,dY1
+        SUBS   size,size,#2
+        fadd   dX1,dY0,dY1
+
+        SUB     pSrc,pSrc,step
+
+        st1     {dX0s}[0],[pOut1], #4
+        ADD     pTwiddleTmp,pTwiddle,#8       // W^2
+        st1     {dX1s}[1],[pOut1], #4
+        ADD     argTwiddle1,pTwiddle,twStep   // W^1
+
+
+        BLT     decrementScale\name
+        BEQ     lastElement\name
+
+
+        // Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
+        // Note: W^k is stored as negative values in the table and also
+        // need to conjugate the values from the table.
+        //
+        // Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+        // since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
+
+
+        SUB     step,step,#24
+evenOddButterflyLoop\name :
+
+
+        ld1     {dW0r},[argTwiddle1],step1
+        ld1     {dW1r},[argTwiddle1], #8
+
+        ld2     {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle1,argTwiddle1,step1
+        ld2     {dX1r,dX1i},[pSrc], #16
+
+        SUB     step1,step1,#8                // (N/4-2)*8 bytes
+        ld1     {dW0i},[pTwiddleTmp],step1
+        ld1     {dW1i},[pTwiddleTmp], #8
+        SUB     pSrc,pSrc,step
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        rev64   dX1r,dX1r
+        rev64   dX1i,dX1i
+        SUBS    size,size,#4
+
+
+        fsub    dT2,dX0r,dX1r                 // a-c
+        fadd    dT3,dX0i,dX1i                 // b+d
+        fadd    dT0,dX0r,dX1r                 // a+c
+        fsub    dT1,dX0i,dX1i                 // b-d
+        SUB     step1,step1,#8
+
+        fmul    dT2, dT2, half[0]
+        fmul    dT3, dT3, half[0]
+
+        fmul    dT0, dT0, half[0]
+        fmul    dT1, dT1, half[0]
+
+        // VZIP    dW1r,dW1i
+        // VZIP    dW0r,dW0i
+        zip1    dZip, dW1r,dW1i
+        zip2    dW1i,dW1r,dW1i
+        mov     dW1r8b, dZip8b
+        zip1    dZip,dW0r,dW0i
+        zip2    dW0i,dW0r,dW0i
+        mov     dW0r8b, dZip8b
+
+        fmul   dX1r,dW1r,dT2
+        fmul   dX1i,dW1r,dT3
+        fmul   dX0r,dW0r,dT2
+        fmul   dX0i,dW0r,dT3
+
+        fmls   dX1r,dW1i,dT3
+        fmla   dX1i,dW1i,dT2
+
+        fmla   dX0r,dW0i,dT3
+        fmls   dX0i,dW0i,dT2
+
+
+        fadd    dY1r,dT0,dX1i                 // F(N/2 -1)
+        fsub    dY1i,dX1r,dT1
+
+        rev64   dY1r,dY1r
+        rev64   dY1i,dY1i
+
+
+        fadd    dY0r,dT0,dX0i                 // F(1)
+        fsub    dY0i,dT1,dX0r
+
+
+        st2     {dY0r,dY0i},[pOut1],step
+        st2     {dY1r,dY1i},[pOut1], #16
+        SUB     pOut1,pOut1,step
+        SUB     step,step,#32                 // (N/2-4)*8 bytes
+
+
+        BGT     evenOddButterflyLoop\name
+
+
+        // set both the ptrs to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     pOut1,pOut1,#8
+
+        // Last element can be expanded as follows
+        // 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
+        // -ve)
+        // 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+        // 1/2[2a+j0] - j (c-jd) [0+j2b]
+        // (a+bc, -bd)
+        // Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name :
+        ld1     {dX0r},[pSrc]
+
+        st1     {dX0rs}[0],[pOut1], #4
+        fneg    dX0r,dX0r
+        st1     {dX0rs}[1],[pOut1]
+
+
+
+decrementScale\name :
+
+        .endm
+
+        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2,,d15
+            FFTSTAGE "FALSE","TRUE",Inv
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S
new file mode 100644
index 0000000..b22912d
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_fs_s.S
@@ -0,0 +1,136 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the first stage of a Radix 2 DIT in-order out-of-place FFT
+// stage for a N point complex signal.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define pointStep       x7
+#define outPointStep    x7
+#define grpSize         x8
+#define setCount        x8
+#define step            x9
+#define dstStep         x9
+
+// Neon Registers
+#define dX0     v0.2s
+#define dX1     v1.2s
+#define dY0     v2.2s
+#define dY1     v3.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
+
+
+        MOV        subFFTSize,#2
+        LSR        grpSize,subFFTNum,#1
+        MOV        subFFTNum,grpSize
+
+
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+        // Note: outPointStep = pointStep for firststage
+        // Note: setCount = grpSize/2 (reuse the updated grpSize for setCount)
+
+        lsl     pointStep, grpSize, #3
+        rsb     step, pointStep, #8
+
+        // Loop on the sets for grp zero
+
+grpZeroSetLoop\name :
+
+        LD1    {dX0},[pSrc],pointStep
+        LD1    {dX1},[pSrc],step                   // step = -pointStep + 8
+
+        SUBS    setCount,setCount,#1
+
+        fadd    dY0,dX0,dX1
+        fsub    dY1,dX0,dX1
+
+        ST1    {dY0},[pDst],outPointStep
+        // dstStep =  step = -pointStep + 8
+        ST1    {dY1},[pDst],dstStep
+
+        BGT     grpZeroSetLoop\name
+
+
+        // Save subFFTNum and subFFTSize for next stage
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S
new file mode 100644
index 0000000..e7de11e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_ls_s.S
@@ -0,0 +1,149 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT
+// stage for a N point complex signal.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define outPointStep    x8
+#define grpCount        x9
+#define dstStep         x10
+
+// Neon Registers
+
+#define dWr     v0.2s
+#define dWi     v1.2s
+#define dXr0    v2.2s
+#define dXi0    v3.2s
+#define dXr1    v4.2s
+#define dXi1    v5.2s
+#define dYr0    v6.2s
+#define dYi0    v7.2s
+#define dYr1    v8.2s
+#define dYi1    v9.2s
+#define qT0     v10.2s
+#define qT1     v12.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Move parameters into our work registers
+        ldr     subFFTSize, [pSubFFTSize]
+
+        lsl     outPointStep, subFFTSize, #3
+
+        // Update grpCount and grpSize rightaway
+
+        MOV     subFFTNum,#1                          //after the last stage
+        LSL     grpCount,subFFTSize,#1
+
+        // update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        rsb     dstStep,outPointStep,#16
+
+        // Loop on 2 grps at a time for the last stage
+
+radix2lsGrpLoop\name :
+        // dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
+        // dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
+        ld2     {dWr,dWi},[pTwiddle], #16
+
+        // dXr0 = [pSrc[0].Re, pSrc[2].Re]
+        // dXi0 = [pSrc[0].Im, pSrc[2].Im]
+        // dXr1 = [pSrc[1].Re, pSrc[3].Re]
+        // dXi1 = [pSrc[1].Im, pSrc[3].Im]
+        ld4     {dXr0,dXi0,dXr1,dXi1}, [pSrc], #32
+
+        SUBS    grpCount,grpCount,#4                  // grpCount is multiplied by 2
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   qT0,dWr,dXr1
+            fmla   qT0,dWi,dXi1                       // real part
+            fmul   qT1,dWr,dXi1
+            fmls   qT1,dWi,dXr1                       // imag part
+
+        .else
+
+            fmul   qT0,dWr,dXr1
+            fmls   qT0,dWi,dXi1                       // real part
+            fmul   qT1,dWr,dXi1
+            fmla   qT1,dWi,dXr1                       // imag part
+
+        .endif
+
+        fsub    dYr0,dXr0,qT0
+        fsub    dYi0,dXi0,qT1
+        fadd    dYr1,dXr0,qT0
+        fadd    dYi1,dXi0,qT1
+
+        st2     {dYr0,dYi0},[pDst],outPointStep
+        st2     {dYr1,dYi1},[pDst],dstStep            // dstStep =  step = -outPointStep + 16
+
+        BGT     radix2lsGrpLoop\name
+
+
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace,,d12
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace,,d12
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
new file mode 100644
index 0000000..530a815
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix2_s.S
@@ -0,0 +1,185 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+// Description:
+// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
+// complex signal.  This handles the general stage, not the first or last
+// stage.
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define outPointStep    x8
+#define pointStep       x9
+#define pointStep32     w9
+#define grpCount        x10
+#define grpCount32      w10
+#define setCount        x13
+#define step            x15
+#define dstStep         x11
+
+// Neon Registers
+
+#define dW      v0.2s
+#define dX0     v2.2s
+#define dX1     v3.2s
+#define dX2     v4.2s
+#define dX3     v5.2s
+#define dY0     v6.2s
+#define dY1     v7.2s
+#define dY2     v8.2s
+#define dY3     v9.2s
+#define qT0     v10.2s
+#define qT1     v11.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // Update grpCount and grpSize rightaway inorder to reuse pGrpCount
+        // and pGrpSize regs
+
+        LSR     subFFTNum,subFFTNum,#1                 //grpSize
+        LSL     grpCount,subFFTSize,#1
+
+
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+        lsl     pointStep, subFFTNum, #2
+
+        // update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        // pOut0+1 increments pOut0 by 8 bytes
+        // pOut0+outPointStep == increment of 8*outPointStep bytes =
+        //    4*size bytes
+        smull   outPointStep, grpCount32, pointStep32
+
+        LSL     pointStep,pointStep,#1
+
+
+        rsb      step,pointStep,#16
+        rsb      dstStep,outPointStep,#16
+
+        // Loop on the groups
+
+radix2GrpLoop\name :
+        lsr     setCount, pointStep, #3
+        LD1     {dW},[pTwiddle],pointStep              //[wi | wr]
+
+
+        // Loop on the sets
+
+
+radix2SetLoop\name :
+
+
+        // point0: dX0-real part dX1-img part
+        LD2    {dX0,dX1},[pSrc],pointStep
+        // point1: dX2-real part dX3-img part
+        LD2    {dX2,dX3},[pSrc],step
+
+        SUBS    setCount,setCount,#2
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   qT0,dX2,dW[0]
+            fmla   qT0,dX3,dW[1]                       // real part
+            fmul   qT1,dX3,dW[0]
+            fmls   qT1,dX2,dW[1]                       // imag part
+
+        .else
+
+            fmul   qT0,dX2,dW[0]
+            fmls   qT0,dX3,dW[1]                       // real part
+            fmul   qT1,dX3,dW[0]
+            fmla   qT1,dX2,dW[1]                       // imag part
+
+        .endif
+
+        fsub    dY0,dX0,qT0
+        fsub    dY1,dX1,qT1
+        fadd    dY2,dX0,qT0
+        fadd    dY3,dX1,qT1
+
+        st2    {dY0,dY1},[pDst],outPointStep
+        // dstStep = -outPointStep + 16
+        st2    {dY2,dY3},[pDst],dstStep
+
+        BGT     radix2SetLoop\name
+
+        SUBS    grpCount,grpCount,#2
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix2GrpLoop\name
+
+
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace,,d11
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace,,d11
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
new file mode 100644
index 0000000..624ef3e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_fs_s.S
@@ -0,0 +1,266 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define grpSize         x7
+// Reuse grpSize as setCount
+#define setCount        x7
+#define pointStep       x8
+#define outPointStep    x8
+#define setStep         x9
+#define step1           x10
+#define step3           x11
+
+// Neon Registers
+
+#define dXr0    v0.2s
+#define dXi0    v1.2s
+#define dXr1    v2.2s
+#define dXi1    v3.2s
+#define dXr2    v4.2s
+#define dXi2    v5.2s
+#define dXr3    v6.2s
+#define dXi3    v7.2s
+#define dYr0    v8.2s
+#define dYi0    v9.2s
+#define dYr1    v10.2s
+#define dYi1    v11.2s
+#define dYr2    v12.2s
+#define dYi2    v13.2s
+#define dYr3    v14.2s
+#define dYi3    v15.2s
+#define dZr0    v16.2s
+#define dZi0    v17.2s
+#define dZr1    v18.2s
+#define dZi1    v19.2s
+#define dZr2    v20.2s
+#define dZi2    v21.2s
+#define dZr3    v22.2s
+#define dZi3    v23.2s
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+        
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        // Note: outPointStep = pointStep for firststage
+
+        lsl     pointStep, subFFTNum, #1
+
+        // Update pSubFFTSize and pSubFFTNum regs
+        ld2     {dXr0,dXi0}, [pSrc], pointStep          // data[0]
+
+        // subFFTSize = 1 for the first stage
+        MOV     subFFTSize,#4
+
+        // Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2
+        ld2     {dXr1,dXi1}, [pSrc], pointStep          //  data[1]
+        MOV     subFFTNum,grpSize
+
+
+        // Calculate the step of input data for the next set
+        //MOV     setStep,pointStep,LSL #1
+        lsl     setStep, grpSize, #4
+        ld2     {dXr2,dXi2}, [pSrc], pointStep          //  data[2]
+
+        // setStep = 3*pointStep
+        ADD     setStep,setStep,pointStep
+        // setStep = - 3*pointStep+16
+
+        rsb     setStep,setStep,#16
+        //  data[3] & update pSrc for the next set
+        ld2     {dXr3,dXi3}, [pSrc], setStep
+
+        // step1 = 2*pointStep
+        lsl     step1, pointStep, #1
+
+        // fadd qY0, qX0, qX2
+        fadd    dYr0, dXr0, dXr2        
+        fadd    dYi0, dXi0, dXi2
+        // step3 = -pointStep
+        neg     step3, pointStep
+
+        // grp = 0 a special case since all the twiddle factors are 1
+        // Loop on the sets : 2 sets at a time
+
+radix4fsGrpZeroSetLoop\name :
+
+
+
+        // Decrement setcount
+        SUBS    setCount,setCount,#2
+
+
+        // finish first stage of 4 point FFT
+
+
+        // fsub qy2,qx0,qx2
+        fsub    dYr2, dXr0, dXr2
+        fsub    dYi2, dXi0, dXi2
+
+        ld2     {dXr0,dXi0}, [pSrc], step1              //  data[0]
+        // fadd qy1,qx1,qx3     
+        fadd    dYr1, dXr1, dXr3                    
+        fadd    dYi1, dXi1, dXi3
+        ld2     {dXr2,dXi2}, [pSrc], step3              //  data[2]
+        // fsub qy3,qx1,qx3     
+        fsub    dYr3, dXr1, dXr3                    
+        fsub    dYi3, dXi1, dXi3
+
+
+        // finish second stage of 4 point FFT
+
+        .ifeqs "\inverse", "TRUE"
+
+            ld2     {dXr1,dXi1}, [pSrc], step1          //  data[1]
+            // fadd  qz0,qy0,qy1
+            fadd    dZr0, dYr0, dYr1                    
+            fadd    dZi0, dYi0, dYi1
+
+            //  data[3] & update pSrc for the next set, but not if it's the
+            //  last iteration so that we don't read past the end of the 
+            //  input array.
+            BEQ     radix4SkipLastUpdateInv\name
+            ld2     {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateInv\name:
+            FSUB    dZr3,dYr2,dYi3
+
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            FADD    dZi3,dYi2,dYr3
+
+            // fsub qZ1,qY0,qY1 
+            FSUB    dZr1, dYr0, dYr1
+            FSUB    dZi1, dYi0, dYi1
+            st2    {dZr3,dZi3},[pDst],outPointStep
+
+            FADD    dZr2,dYr2,dYi3
+            st2    {dZr1,dZi1},[pDst],outPointStep
+            FSUB    dZi2,dYi2,dYr3
+
+            //  fadd qY0, qX0, qX2
+            FADD    dYr0, dXr0, dXr2                    // u0 for next iteration
+            FADD    dYi0, dXi0, dXi2
+            st2    {dZr2,dZi2},[pDst],setStep
+
+
+        .else
+
+            ld2     {dXr1,dXi1}, [pSrc], step1          //  data[1]
+            // fadd qZ0,qY0,qY1
+            fadd    dZr0, dYr0, dYr1
+            fadd    dZi0, dYi0, dYi1
+
+            //  data[3] & update pSrc for the next set, but not if it's the
+            //  last iteration so that we don't read past the end of the 
+            //  input array.
+            BEQ     radix4SkipLastUpdateFwd\name
+            ld2     {dXr3,dXi3}, [pSrc], setStep
+
+radix4SkipLastUpdateFwd\name:
+            FADD    dZr2,dYr2,dYi3
+
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            FSUB    dZi2,dYi2,dYr3
+
+            // fsub  qz1,qy0,qy1
+            fsub    dZr1, dYr0, dYr1
+            fsub    dZi1, dYi0, dYi1
+            st2    {dZr2,dZi2},[pDst],outPointStep
+
+            FSUB    dZr3,dYr2,dYi3
+            st2    {dZr1,dZi1},[pDst],outPointStep
+            FADD    dZi3,dYi2,dYr3
+
+            //  fadd  qy0,qx0,qx2
+            fadd    dYr0, dXr0, dXr2                    // u0 for next iteration
+            fadd    dYi0, dXi0, dXi2
+
+            st2    {dZr3,dZi3},[pDst],setStep
+
+        .endif
+
+        BGT     radix4fsGrpZeroSetLoop\name
+
+        // Save subFFTNum and subFFTSize for next stage
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+        
+        .endm
+
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace,,d15
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
new file mode 100644
index 0000000..2fc2e60
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_ls_s.S
@@ -0,0 +1,371 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+    //IMPORT  armAAC_constTable
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define outPointStep    x8
+#define grpCount        x9
+#define dstStep         x10
+#define grpTwStep       x13
+#define stepTwiddle     x14
+#define twStep          x15
+#define step16          x11
+#define step24          x12
+
+
+// Neon Registers
+
+#define dButterfly1Real02       v0.2s
+#define dButterfly1Real028b     v0.8b
+#define dButterfly1Imag02       v1.2s
+#define dButterfly1Imag028b     v1.8b
+#define dButterfly1Real13       v2.2s
+#define dButterfly1Real138b     v2.8b
+#define dButterfly1Imag13       v3.2s
+#define dButterfly1Imag138b     v3.8b
+#define dButterfly2Real02       v4.2s
+#define dButterfly2Imag02       v5.2s
+#define dButterfly2Real13       v6.2s
+#define dButterfly2Imag13       v7.2s
+#define dXr0                    v0.2s
+#define dXi0                    v1.2s
+#define dXr08b                  v0.8b
+#define dXi08b                  v1.8b
+#define dXr1                    v2.2s
+#define dXi1                    v3.2s
+#define dXr2                    v4.2s
+#define dXi2                    v5.2s
+#define dXr3                    v6.2s
+#define dXi3                    v7.2s
+
+#define dYr0                    v16.2s
+#define dYi0                    v17.2s
+#define dYr1                    v18.2s
+#define dYi1                    v19.2s
+#define dYr2                    v20.2s
+#define dYi2                    v21.2s
+#define dYr3                    v22.2s
+#define dYi3                    v23.2s
+
+#define dW1r                    v8.2s
+#define dW1i                    v9.2s
+#define dW2r                    v10.2s
+#define dW2r8b                  v10.8b
+#define dW2i                    v11.2s
+#define dW3r                    v12.2s
+#define dW3r8b                  v12.8b
+#define dW3i                    v13.2s
+
+#define dZr0                    v14.2s
+#define dZi0                    v15.2s
+#define dZr08b                  v14.8b
+#define dZi08b                  v15.8b
+#define dZr1                    v26.2s
+#define dZi1                    v27.2s
+#define dZr2                    v28.2s
+#define dZi2                    v29.2s
+#define dZr3                    v30.2s
+#define dZi3                    v31.2s
+
+#define dZip                    v24.2s
+#define dZip8b                  v24.8b
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // pOut0+1 increments pOut0 by 8 bytes
+        // pOut0+outPointStep == increment of 8*outPointStep bytes
+        lsl     outPointStep,subFFTSize, #3
+
+        // Update grpCount and grpSize rightaway
+
+        ld2    {dW1r,dW1i},[pTwiddle]             // [wi|wr]
+        MOV     step16,#16
+        LSL     grpCount,subFFTSize,#2
+
+        ld1    {dW2r},[pTwiddle]                  // [wi|wr]
+        MOV     subFFTNum,#1                      //after the last stage
+
+        ld1    {dW3r},[pTwiddle],step16           // [wi|wr]
+        MOV     stepTwiddle,#0
+
+        ld1    {dW2i},[pTwiddle],#8               // [wi|wr]
+        SUB     grpTwStep,stepTwiddle,#8          // grpTwStep = -8 to start with
+
+        // update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        ld1    {dW3i},[pTwiddle],grpTwStep        // [wi|wr]
+        lsl     dstStep,outPointStep, #1
+
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+        ADD     dstStep,dstStep,outPointStep      // dstStep = 3*outPointStep
+
+        rsb     dstStep,dstStep,#16               // dstStep = - 3*outPointStep+16
+        MOV     step24,#24
+
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+
+
+        // Process two groups at a time
+
+radix4lsGrpLoop\name :
+
+        // VZIP    dW2r,dW2i
+        zip1    dZip, dW2r, dW2i
+        zip2    dW2i, dW2r, dW2i
+        mov     dW2r8b, dZip8b
+
+        ADD     stepTwiddle,stepTwiddle,#16
+
+        // VZIP    dW3r,dW3i
+        zip1    dZip, dW3r,dW3i
+        zip2    dW3i, dW3r, dW3i
+        mov     dW3r8b, dZip8b
+        ADD     grpTwStep,stepTwiddle,#4
+
+        // VUZP     dButterfly1Real13, dButterfly2Real13      // B.r D.r
+        uzp1     dZip, dButterfly1Real13, dButterfly2Real13   // B.r D.r
+        uzp2     dButterfly2Real13, dButterfly1Real13, dButterfly2Real13   // B.r D.r
+        mov      dButterfly1Real138b, dZip8b
+
+        SUB     twStep,stepTwiddle,#16                        // -16+stepTwiddle
+
+        // VUZP     dButterfly1Imag13, dButterfly2Imag13      // B.i D.i
+        uzp1     dZip, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
+        uzp2     dButterfly2Imag13, dButterfly1Imag13, dButterfly2Imag13   // B.i D.i
+        mov      dButterfly1Imag138b, dZip8b
+        lsl     grpTwStep,grpTwStep,#1
+
+        // VUZP     dButterfly1Real02, dButterfly2Real02      // A.r C.r
+        uzp1     dZip, dButterfly1Real02, dButterfly2Real02   // A.r C.r
+        uzp2     dButterfly2Real02, dButterfly1Real02, dButterfly2Real02   // A.r C.r
+        mov      dButterfly1Real028b, dZip8b
+        rsb     grpTwStep,grpTwStep,#0                        // -8-2*stepTwiddle
+
+        // VUZP     dButterfly1Imag02, dButterfly2Imag02      // A.i C.i
+        uzp1     dZip, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
+        uzp2     dButterfly2Imag02, dButterfly1Imag02, dButterfly2Imag02   // A.i C.i
+        mov      dButterfly1Imag028b, dZip8b
+
+
+        // grpCount is multiplied by 4
+        SUBS    grpCount,grpCount,#8
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr1,dW1r,dXr1
+            fmla   dZr1,dW1i,dXi1                       // real part
+            fmul   dZi1,dW1r,dXi1
+            fmls   dZi1,dW1i,dXr1                       // imag part
+
+        .else
+
+            fmul   dZr1,dW1r,dXr1
+            fmls   dZr1,dW1i,dXi1                       // real part
+            fmul   dZi1,dW1r,dXi1
+            fmla   dZi1,dW1i,dXr1                       // imag part
+
+        .endif
+
+        ld2    {dW1r,dW1i},[pTwiddle],stepTwiddle       // [wi|wr]
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr2,dW2r,dXr2
+            fmla   dZr2,dW2i,dXi2                       // real part
+            fmul   dZi2,dW2r,dXi2
+            ld1   {dW2r},[pTwiddle],step16              // [wi|wr]
+            fmls   dZi2,dW2i,dXr2                       // imag part
+
+        .else
+
+            fmul   dZr2,dW2r,dXr2
+            fmls   dZr2,dW2i,dXi2                       // real part
+            fmul   dZi2,dW2r,dXi2
+            ld1    {dW2r},[pTwiddle],step16             // [wi|wr]
+            fmla   dZi2,dW2i,dXr2                       // imag part
+
+        .endif
+
+
+        ld1    {dW2i},[pTwiddle],twStep                 // [wi|wr]
+
+        // move qX0 so as to load for the next iteration
+        // MOV     qZ0,qX0
+        mov     dZr08b, dXr08b
+        mov     dZi08b, dXi08b
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr3,dW3r,dXr3
+            fmla   dZr3,dW3i,dXi3                       // real part
+            fmul   dZi3,dW3r,dXi3
+            ld1    {dW3r},[pTwiddle],step24
+            fmls   dZi3,dW3i,dXr3                       // imag part
+
+        .else
+
+            fmul   dZr3,dW3r,dXr3
+            fmls   dZr3,dW3i,dXi3                       // real part
+            fmul   dZi3,dW3r,dXi3
+            ld1    {dW3r},[pTwiddle],step24
+            fmla   dZi3,dW3i,dXr3                       // imag part
+
+        .endif
+
+        ld1    {dW3i},[pTwiddle],grpTwStep              // [wi|wr]
+
+        // Don't do the load on the last iteration so we don't read past the end
+        // of pSrc.
+        bne     skipIncrement\name
+        add     pSrc, pSrc, #64
+skipIncrement\name:     
+        beq     radix4lsSkipRead\name
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc], #32
+
+        // AC.r AC.i BD.r BD.i
+        ld4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc], #32
+radix4lsSkipRead\name:
+
+        // finish first stage of 4 point FFT
+
+        // fadd    qY0,qZ0,qZ2
+        fadd    dYr0,dZr0,dZr2
+        fadd    dYi0,dZi0,dZi2
+        // fsub    qY2,qZ0,qZ2
+        fsub    dYr2,dZr0,dZr2
+        fsub    dYi2,dZi0,dZi2
+        // fadd    qY1,qZ1,qZ3
+        fadd    dYr1,dZr1,dZr3
+        fadd    dYi1,dZi1,dZi3
+        // fsub    qY3,qZ1,qZ3
+        fsub    dYr3,dZr1,dZr3
+        fsub    dYi3,dZi1,dZi3
+
+
+        // finish second stage of 4 point FFT
+
+        .ifeqs  "\inverse", "TRUE"
+
+            // fsub    qZ0,qY2,qY1
+            fsub    dZr0,dYr2,dYr1
+            fsub    dZi0,dYi2,dYi1
+            fadd    dZr3,dYr0,dYi3
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2    {dZr3,dZi3},[pDst],outPointStep
+
+            fsub    dZr1,dYr0,dYi3
+            st2    {dZr2,dZi2},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            // dstStep = -outPointStep + 16
+            st2    {dZr1,dZi1},[pDst],dstStep
+
+
+        .else
+
+            // fsub    qZ0,qY2,qY1
+            fsub    dZr0,dYr2,dYr1
+            fsub    dZi0,dYi2,dYi1
+
+            fsub    dZr1,dYr0,dYi3
+            st2    {dZr0,dZi0},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2    {dZr1,dZi1},[pDst],outPointStep
+
+            fadd    dZr3,dYr0,dYi3
+            st2    {dZr2,dZi2},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            // dstStep = -outPointStep + 16
+            st2    {dZr3,dZi3},[pDst],dstStep
+
+
+        .endif
+
+        BGT     radix4lsGrpLoop\name
+
+        .endm
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace,,d15
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
new file mode 100644
index 0000000..830fd16
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix4_s.S
@@ -0,0 +1,339 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//
+//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a Radix 4 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+
+// Import symbols required from other files
+// (For example tables)
+
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define grpCount        x7
+#define grpCount32      w7
+#define pointStep       x8
+#define pointStep32     w8
+#define outPointStep    x9
+#define stepTwiddle     x10
+#define setCount        x11
+#define srcStep         x12
+#define setStep         x13
+#define dstStep         x14
+#define twStep          x15
+
+// Neon Registers
+
+#define dW1     v0.2s
+#define dW2     v1.2s
+#define dW3     v2.2s
+
+#define dXr0    v4.2s
+#define dXi0    v5.2s
+#define dXr1    v6.2s
+#define dXi1    v7.2s
+#define dXr2    v8.2s
+#define dXi2    v9.2s
+#define dXr3    v10.2s
+#define dXi3    v11.2s
+#define dYr0    v12.2s
+#define dYi0    v13.2s
+#define dYr1    v14.2s
+#define dYi1    v15.2s
+#define dYr2    v16.2s
+#define dYi2    v17.2s
+#define dYr3    v18.2s
+#define dYi3    v19.2s
+#define dZr0    v20.2s
+#define dZi0    v21.2s
+#define dZr1    v22.2s
+#define dZi1    v23.2s
+#define dZr2    v24.2s
+#define dZi2    v25.2s
+#define dZr3    v26.2s
+#define dZi3    v27.2s
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // Update grpCount and grpSize rightaway inorder to reuse
+        // pGrpCount and pGrpSize regs
+
+        LSL     grpCount,subFFTSize,#2
+        LSR     subFFTNum,subFFTNum,#2
+        MOV     subFFTSize,grpCount
+
+        ld1      {dW1},[pTwiddle]                    //[wi | wr]
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        lsl     pointStep,subFFTNum, #1
+
+        // pOut0+1 increments pOut0 by 8 bytes
+        // pOut0+outPointStep == increment of 8*outPointStep bytes
+        //   = 2*size bytes
+
+        MOV     stepTwiddle,#0
+        ld1      {dW2},[pTwiddle]                    //[wi | wr]
+        smull   outPointStep,grpCount32,pointStep32
+
+        LSL     pointStep,pointStep,#2             // 2*grpSize
+
+        ld1      {dW3},[pTwiddle]                  //[wi | wr]
+        lsl     srcStep,pointStep, #1              // srcStep = 2*pointStep
+
+        ADD     setStep,srcStep,pointStep          // setStep = 3*pointStep
+
+        rsb     setStep,setStep,#0                 // setStep = - 3*pointStep
+        SUB     srcStep,srcStep,#16                // srcStep = 2*pointStep-16
+
+        lsl     dstStep,outPointStep, #1
+
+        ADD     dstStep,dstStep,outPointStep       // dstStep = 3*outPointStep
+        // dstStep = - 3*outPointStep+16
+        rsb     dstStep,dstStep,#16
+
+
+radix4GrpLoop\name :
+
+        ld2     {dXr0,dXi0},[pSrc],pointStep       //  data[0]
+        ADD      stepTwiddle,stepTwiddle,pointStep
+        ld2     {dXr1,dXi1},[pSrc],pointStep       //  data[1]
+        // set pTwiddle to the first point
+        ADD      pTwiddle,pTwiddle,stepTwiddle
+        ld2     {dXr2,dXi2},[pSrc],pointStep       //  data[2]
+        lsl      twStep,stepTwiddle, #2
+
+        //  data[3] & update pSrc for the next set
+        ld2     {dXr3,dXi3},[pSrc],setStep
+        SUB      twStep,stepTwiddle,twStep         // twStep = -3*stepTwiddle
+
+        lsr      setCount,pointStep, #3
+
+        // set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,#16
+        // increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+
+
+        // Loop on the sets
+
+radix4SetLoop\name :
+
+
+
+        .ifeqs  "\inverse", "TRUE"
+            fmul   dZr1,dXr1,dW1[0]
+            fmul   dZi1,dXi1,dW1[0]
+            fmul   dZr2,dXr2,dW2[0]
+            fmul   dZi2,dXi2,dW2[0]
+            fmul   dZr3,dXr3,dW3[0]
+            fmul   dZi3,dXi3,dW3[0]
+
+            fmla   dZr1,dXi1,dW1[1]                // real part
+            fmls   dZi1,dXr1,dW1[1]                // imag part
+
+            //  data[1] for next iteration
+            ld2     {dXr1,dXi1},[pSrc],pointStep
+
+            fmla   dZr2,dXi2,dW2[1]                // real part
+            fmls   dZi2,dXr2,dW2[1]                // imag part
+
+            //  data[2] for next iteration
+            ld2     {dXr2,dXi2},[pSrc],pointStep
+
+            fmla   dZr3,dXi3,dW3[1]                // real part
+            fmls   dZi3,dXr3,dW3[1]                // imag part
+        .else
+            fmul   dZr1,dXr1,dW1[0]
+            fmul   dZi1,dXi1,dW1[0]
+            fmul   dZr2,dXr2,dW2[0]
+            fmul   dZi2,dXi2,dW2[0]
+            fmul   dZr3,dXr3,dW3[0]
+            fmul   dZi3,dXi3,dW3[0]
+
+            fmls   dZr1,dXi1,dW1[1]                // real part
+            fmla   dZi1,dXr1,dW1[1]                // imag part
+
+            //  data[1] for next iteration
+            ld2     {dXr1,dXi1},[pSrc],pointStep
+
+            fmls   dZr2,dXi2,dW2[1]                // real part
+            fmla   dZi2,dXr2,dW2[1]                // imag part
+
+            //  data[2] for next iteration
+            ld2     {dXr2,dXi2},[pSrc],pointStep
+
+            fmls   dZr3,dXi3,dW3[1]                // real part
+            fmla   dZi3,dXr3,dW3[1]                // imag part
+        .endif
+
+        //  data[3] & update pSrc to data[0]
+        // But don't read on the very last iteration because that reads past 
+        // the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
+        cmp     grpCount, #4
+        
+        b.ne    skipUpdate\name
+        cmp     setCount, #2
+        b.ne    skipUpdate\name
+        add     pSrc, pSrc, setStep
+        beq     radix4SkipRead\name
+skipUpdate\name:
+        ld2     {dXr3,dXi3},[pSrc],setStep
+radix4SkipRead\name:
+
+        SUBS    setCount,setCount,#2
+
+        // finish first stage of 4 point FFT
+        // fadd    qY0,qX0,qZ2
+        // fsub    qY2,qX0,qZ2
+        fadd    dYr0,dXr0,dZr2
+        fsub    dYr2,dXr0,dZr2
+        fadd    dYi0,dXi0,dZi2
+        fsub    dYi2,dXi0,dZi2
+
+        //  data[0] for next iteration
+        ld2     {dXr0,dXi0},[pSrc], #16
+        // fadd    qY1,qZ1,qZ3
+        // fsub    qY3,qZ1,qZ3
+        fadd    dYr1,dZr1,dZr3
+        fsub    dYr3,dZr1,dZr3
+        fadd    dYi1,dZi1,dZi3
+        fsub    dYi3,dZi1,dZi3
+
+        // finish second stage of 4 point FFT
+
+        // fsub    qZ0,qY2,qY1
+        fsub    dZr0,dYr2,dYr1
+        fsub    dZi0,dYi2,dYi1
+
+        .ifeqs  "\inverse", "TRUE"
+
+            fadd    dZr3,dYr0,dYi3
+            st2     {dZr0,dZi0},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2     {dZr3,dZi3},[pDst],outPointStep
+
+            fsub    dZr1,dYr0,dYi3
+            st2     {dZr2,dZi2},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            st2     {dZr1,dZi1},[pDst],dstStep
+
+
+        .else
+
+            fsub    dZr1,dYr0,dYi3
+            st2     {dZr0,dZi0},[pDst],outPointStep
+            fadd    dZi1,dYi0,dYr3
+
+            // fadd    qZ2,qY2,qY1
+            fadd    dZr2,dYr2,dYr1
+            fadd    dZi2,dYi2,dYi1
+
+            st2     {dZr1,dZi1},[pDst],outPointStep
+
+            fadd    dZr3,dYr0,dYi3
+            st2     {dZr2,dZi2},[pDst],outPointStep
+            fsub    dZi3,dYi0,dYr3
+
+            st2     {dZr3,dZi3},[pDst],dstStep
+
+
+        .endif
+
+        // increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix4SetLoop\name
+
+
+        ld1      {dW1},[pTwiddle],stepTwiddle    //[wi | wr]
+        // subtract 4 since grpCount multiplied by 4
+        SUBS    grpCount,grpCount,#4
+        ld1      {dW2},[pTwiddle],stepTwiddle    //[wi | wr]
+        // increment pSrc for the next grp
+        ADD     pSrc,pSrc,srcStep
+        ld1      {dW3},[pTwiddle],twStep         //[wi | wr]
+        BGT     radix4GrpLoop\name
+
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+
+        .endm
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace,,d15
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace,,d15
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
new file mode 100644
index 0000000..f348e6a
--- /dev/null
+++ b/dl/sp/src/arm/arm64/armSP_FFT_CToC_FC32_Radix8_fs_s.S
@@ -0,0 +1,473 @@
+//
+//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+//
+//  Use of this source code is governed by a BSD-style license
+//  that can be found in the LICENSE file in the root of the source
+//  tree. An additional intellectual property rights grant can be found
+//  in the file PATENTS.  All contributing project authors may
+//  be found in the AUTHORS file in the root of the source tree.
+//
+//  This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s
+//  to support float instead of SC32.
+//
+
+//
+// Description:
+// Compute a first stage Radix 8 FFT stage for a N point complex signal
+//
+//
+
+
+// Include standard headers
+
+#include "dl/api/arm/arm64COMM_s.h"
+#include "dl/api/arm/omxtypes_s.h"
+
+// Import symbols required from other files
+// (For example tables)
+
+
+// Set debugging level
+//DEBUG_ON    SETL {TRUE}
+
+
+
+// Guarding implementation by the processor name
+
+
+
+
+// Guarding implementation by the processor name
+
+//Input Registers
+
+#define pSrc            x0
+#define pDst            x1
+#define pTwiddle        x2
+#define	pSubFFTNum	x3
+#define pSubFFTSize	x4	
+
+
+//Output Registers
+
+
+//Local Scratch Registers
+
+#define subFFTNum       x5
+#define subFFTSize      x6
+#define grpSize         x7
+// Reuse grpSize as setCount
+#define setCount        x7
+#define pointStep       x8
+#define outPointStep    x8
+#define setStep         x9
+#define step1           x10
+#define step2           x11
+#define t0              w12
+
+
+// Neon Registers
+
+#define dXr0    v0.2s
+#define dXi0    v1.2s
+#define dXr1    v2.2s
+#define dXi1    v3.2s
+#define dXr2    v4.2s
+#define dXi2    v5.2s
+#define dXr3    v6.2s
+#define dXi3    v7.2s
+#define dXr4    v8.2s
+#define dXi4    v9.2s
+#define dXr5    v10.2s
+#define dXi5    v11.2s
+#define dXr6    v12.2s
+#define dXi6    v13.2s
+#define dXr7    v14.2s
+#define dXi7    v15.2s
+#define qX0     v0.4s
+#define qX1     v1.4s
+#define qX2     v2.4s
+#define qX3     v3.4s
+#define qX4     v4.4s
+#define qX5     v5.4s
+#define qX6     v6.4s
+#define qX7     v7.4s
+
+#define dUr0    v16.2s
+#define dUi0    v17.2s
+#define dUr2    v18.2s
+#define dUi2    v19.2s
+#define dUr4    v20.2s
+#define dUi4    v21.2s
+#define dUr6    v22.2s
+#define dUi6    v23.2s
+#define dUr1    v24.2s
+#define dUi1    v25.2s
+#define dUr3    v26.2s
+#define dUi3    v27.2s
+#define dUr5    v28.2s
+#define dUi5    v29.2s
+// reuse dXr7 and dXi7
+#define dUr7    v30.2s
+#define dUi7    v31.2s
+#define qU0     v8.4s
+#define qU1     v12.4s
+#define qU2     v9.4s
+#define qU3     v13.4s
+#define qU4     v10.4s
+#define qU5     v14.4s
+#define qU6     v11.4s
+#define qU7     v15.4s
+
+
+#define dVr0    v24.2s
+#define dVi0    v25.2s
+#define dVr2    v26.2s
+#define dVi2    v27.2s
+#define dVr4    v28.2s
+#define dVi4    v29.2s
+#define dVr6    v30.2s
+#define dVi6    v31.2s
+#define dVr1    v16.2s
+#define dVi1    v17.2s
+#define dVr3    v18.2s
+#define dVi3    v19.2s
+#define dVr5    v20.2s
+#define dVi5    v21.2s
+#define dVr7    v22.2s
+#define dVi7    v23.2s
+#define qV0     v12.4s
+#define qV1     v8.4s
+#define qV2     v13.4s
+#define qV3     v9.4s
+#define qV4     v14.4s
+#define qV5     v10.4s
+#define qV6     v15.4s
+#define qV7     v11.4s
+
+#define dYr0    v16.2s
+#define dYi0    v17.2s
+#define dYr2    v18.2s
+#define dYi2    v19.2s
+#define dYr4    v20.2s
+#define dYi4    v21.2s
+#define dYr6    v22.2s
+#define dYi6    v23.2s
+#define dYr1    v24.2s
+#define dYi1    v25.2s
+#define dYr3    v26.2s
+#define dYi3    v27.2s
+#define dYr5    v28.2s
+#define dYi5    v29.2s
+#define dYr7    v30.2s
+#define dYi7    v31.2s
+#define qY0     v8.4s
+#define qY1     v12.4s
+#define qY2     v9.4s
+#define qY3     v13.4s
+#define qY4     v10.4s
+#define qY5     v14.4s
+#define qY6     v11.4s
+#define qY7     v15.4s
+
+#define dT0     v14.2s
+#define dT0s    v14.s
+#define dT1     v15.2s
+
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        // Define stack arguments
+
+        // Move args values into our work registers
+        ldr     subFFTNum, [pSubFFTNum]
+        ldr     subFFTSize, [pSubFFTSize]
+
+        // Update pSubFFTSize and pSubFFTNum regs
+        // subFFTSize = 1 for the first stage
+
+        movz    t0, 0x3f35, lsl #16               // High half word of sqrt(1/2).
+        movk    t0, 0x04f3                        // Low half word of sqrt(1/2).
+        MOV     subFFTSize,#8
+
+        // Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#3
+        MOV     subFFTNum,grpSize
+
+
+        // pT0+1 increments pT0 by 8 bytes
+        // pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
+        // Note: outPointStep = pointStep for firststage
+
+        lsl     pointStep,grpSize, #3
+
+
+        // Calculate the step of input data for the next set
+        //MOV     step1,pointStep,LSL #1             // step1 = 2*pointStep
+        ld2     {dXr0,dXi0},[pSrc],pointStep         //  data[0]
+        lsl     step1,grpSize, #4
+        lsl     step2,pointStep, #3
+
+        ld2     {dXr1,dXi1},[pSrc],pointStep         //  data[1]
+        SUB     step2,step2,pointStep                // step2 = 7*pointStep
+        // setStep = - 7*pointStep+16
+        rsb     setStep,step2,#16
+
+        ld2     {dXr2,dXi2},[pSrc],pointStep         //  data[2]
+        ld2     {dXr3,dXi3},[pSrc],pointStep         //  data[3]
+        ld2     {dXr4,dXi4},[pSrc],pointStep         //  data[4]
+        ld2     {dXr5,dXi5},[pSrc],pointStep         //  data[5]
+        ld2     {dXr6,dXi6},[pSrc],pointStep         //  data[6]
+        //  data[7] & update pSrc for the next set
+        //  setStep = -7*pointStep + 16
+        ld2     {dXr7,dXi7},[pSrc],setStep
+        // grp = 0 a special case since all the twiddle factors are 1
+        // Loop on the sets
+
+radix8fsGrpZeroSetLoop\name :
+
+        // Decrement setcount
+        SUBS    setCount,setCount,#2
+
+
+        // finish first stage of 8 point FFT
+
+        // fadd    qU0,qX0,qX4
+        // fadd    qU2,qX1,qX5
+        // fadd    qU4,qX2,qX6
+        // fadd    qU6,qX3,qX7
+        fadd    dUr0,dXr0,dXr4
+        fadd    dUr2,dXr1,dXr5
+        fadd    dUr4,dXr2,dXr6
+        fadd    dUr6,dXr3,dXr7
+        fadd    dUi0,dXi0,dXi4
+        fadd    dUi2,dXi1,dXi5
+        fadd    dUi4,dXi2,dXi6
+        fadd    dUi6,dXi3,dXi7
+
+        // finish second stage of 8 point FFT
+
+        // fadd    qV0,qU0,qU4
+        // fsub    qV2,qU0,qU4
+        // fadd    qV4,qU2,qU6
+        // fsub    qV6,qU2,qU6
+        fadd    dVr0,dUr0,dUr4
+        fsub    dVr2,dUr0,dUr4
+        fadd    dVr4,dUr2,dUr6
+        fsub    dVr6,dUr2,dUr6
+        fadd    dVi0,dUi0,dUi4
+        fsub    dVi2,dUi0,dUi4
+        fadd    dVi4,dUi2,dUi6
+        fsub    dVi6,dUi2,dUi6
+
+        // finish third stage of 8 point FFT
+
+        // fadd    qY0,qV0,qV4
+        // fsub    qY4,qV0,qV4
+        fadd    dYr0,dVr0,dVr4
+        fsub    dYr4,dVr0,dVr4
+        fadd    dYi0,dVi0,dVi4
+        fsub    dYi4,dVi0,dVi4
+
+        st2     {dYr0,dYi0},[pDst],step1         // store y0
+
+        .ifeqs  "\inverse", "TRUE"
+
+            fsub    dYr2,dVr2,dVi6
+            fadd    dYi2,dVi2,dVr6
+
+            fadd    dYr6,dVr2,dVi6
+            st2     {dYr2,dYi2},[pDst],step1     // store y2
+            fsub    dYi6,dVi2,dVr6
+
+            // fsub    qU1,qX0,qX4
+            fsub    dUr1,dXr0,dXr4
+            fsub    dUi1,dXi0,dXi4
+
+            st2     {dYr4,dYi4},[pDst],step1     // store y4
+
+            // fsub    qU3,qX1,qX5
+            // fsub    qU5,qX2,qX6
+            fsub    dUr3,dXr1,dXr5
+            fsub    dUr5,dXr2,dXr6
+            fsub    dUi3,dXi1,dXi5
+            fsub    dUi5,dXi2,dXi6
+
+            st2     {dYr6,dYi6},[pDst],step1     // store y6
+
+        .ELSE
+
+            fadd    dYr6,dVr2,dVi6
+            fsub    dYi6,dVi2,dVr6
+
+            fsub    dYr2,dVr2,dVi6
+            st2     {dYr6,dYi6},[pDst],step1     // store y2
+            fadd    dYi2,dVi2,dVr6
+
+
+            // fsub    qU1,qX0,qX4
+            fsub    dUr1,dXr0,dXr4
+            fsub    dUi1,dXi0,dXi4
+
+            st2     {dYr4,dYi4},[pDst],step1     // store y4
+
+            // fsub    qU3,qX1,qX5
+            // fsub    qU5,qX2,qX6
+            fsub    dUr3,dXr1,dXr5
+            fsub    dUr5,dXr2,dXr6
+            fsub    dUi3,dXi1,dXi5
+            fsub    dUi5,dXi2,dXi6
+
+            st2     {dYr2,dYi2},[pDst],step1     // store y6
+
+
+        .ENDIF
+
+        // finish first stage of 8 point FFT
+
+        // fsub    qU7,qX3,qX7
+        fsub    dUr7,dXr3,dXr7
+        fsub    dUi7,dXi3,dXi7
+
+        mov     dT0s[0], t0
+
+        // finish second stage of 8 point FFT
+
+        fsub    dVr1,dUr1,dUi5
+        //  data[0] for next iteration
+        ld2     {dXr0,dXi0},[pSrc],pointStep
+        fadd    dVi1,dUi1,dUr5
+        fadd    dVr3,dUr1,dUi5
+        ld2     {dXr1,dXi1},[pSrc],pointStep     //  data[1]
+        fsub    dVi3,dUi1,dUr5
+
+        fsub    dVr5,dUr3,dUi7
+        ld2     {dXr2,dXi2},[pSrc],pointStep     //  data[2]
+        fadd    dVi5,dUi3,dUr7
+        fadd    dVr7,dUr3,dUi7
+        ld2     {dXr3,dXi3},[pSrc],pointStep     //  data[3]
+        fsub    dVi7,dUi3,dUr7
+
+        // finish third stage of 8 point FFT
+
+        .ifeqs  "\inverse", "TRUE"
+
+            // calculate a*v5
+            fmul    dT1,dVr5,dT0[0]              // use dVi0 for dT1
+
+            ld2     {dXr4,dXi4},[pSrc],pointStep //  data[4]
+            fmul    dVi5,dVi5,dT0[0]
+
+            ld2     {dXr5,dXi5},[pSrc],pointStep //  data[5]
+            fsub    dVr5,dT1,dVi5                // a * V5
+            fadd    dVi5,dT1,dVi5
+
+            ld2     {dXr6,dXi6},[pSrc],pointStep //  data[6]
+
+            // calculate  b*v7
+            fmul    dT1,dVr7,dT0[0]
+            fmul    dVi7,dVi7,dT0[0]
+
+            // fadd    qY1,qV1,qV5
+            // fsub    qY5,qV1,qV5
+            fadd    dYr1,dVr1,dVr5
+            fsub    dYr5,dVr1,dVr5
+            fadd    dYi1,dVi1,dVi5
+            fsub    dYi5,dVi1,dVi5
+
+            fadd    dVr7,dT1,dVi7                // b * V7
+            fsub    dVi7,dVi7,dT1
+            SUB     pDst, pDst, step2            // set pDst to y1
+
+            // On the last iteration,  this will read past the end of pSrc, 
+            // so skip this read.
+            BEQ     radix8SkipLastUpdateInv\name
+            ld2     {dXr7,dXi7},[pSrc],setStep   //  data[7]
+radix8SkipLastUpdateInv\name:
+
+            fsub    dYr3,dVr3,dVr7
+            fsub    dYi3,dVi3,dVi7
+            st2     {dYr1,dYi1},[pDst],step1     // store y1
+            fadd    dYr7,dVr3,dVr7
+            fadd    dYi7,dVi3,dVi7
+
+
+            st2     {dYr3,dYi3},[pDst],step1     // store y3
+            st2     {dYr5,dYi5},[pDst],step1     // store y5
+            st2     {dYr7,dYi7},[pDst]           // store y7
+            ADD pDst, pDst, #16
+
+        .ELSE
+
+            // calculate  b*v7
+            fmul    dT1,dVr7,dT0[0]
+            ld2     {dXr4,dXi4},[pSrc],pointStep //  data[4]
+            fmul    dVi7,dVi7,dT0[0]
+
+            ld2     {dXr5,dXi5},[pSrc],pointStep //  data[5]
+            fadd    dVr7,dT1,dVi7                     // b * V7
+            fsub    dVi7,dVi7,dT1
+
+            ld2     {dXr6,dXi6},[pSrc],pointStep //  data[6]
+
+            // calculate a*v5
+            fmul    dT1,dVr5,dT0[0]              // use dVi0 for dT1
+            fmul    dVi5,dVi5,dT0[0]
+
+            fadd    dYr7,dVr3,dVr7
+            fadd    dYi7,dVi3,dVi7
+            SUB     pDst, pDst, step2            // set pDst to y1
+
+            fsub    dVr5,dT1,dVi5                // a * V5
+            fadd    dVi5,dT1,dVi5
+
+            // On the last iteration,  this will read past the end of pSrc, 
+            // so skip this read.
+            BEQ     radix8SkipLastUpdateFwd\name
+            ld2     {dXr7,dXi7},[pSrc],setStep   //  data[7]
+radix8SkipLastUpdateFwd\name:
+
+            // fsub    qY5,qV1,qV5
+            fsub    dYr5,dVr1,dVr5
+            fsub    dYi5,dVi1,dVi5
+
+            fsub    dYr3,dVr3,dVr7
+            st2     {dYr7,dYi7},[pDst],step1     // store y1
+            fsub    dYi3,dVi3,dVi7
+
+            // fadd    qY1,qV1,qV5
+            fadd    dYr1,dVr1,dVr5
+            fadd    dYi1,dVi1,dVi5
+
+            st2     {dYr5,dYi5},[pDst],step1     // store y3
+            st2     {dYr3,dYi3},[pDst],step1     // store y5
+            st2     {dYr1,dYi1},[pDst],#16       // store y7
+
+        .ENDIF
+
+
+        // update pDst for the next set
+        SUB     pDst, pDst, step2
+        BGT     radix8fsGrpZeroSetLoop\name
+
+        // Save subFFTNum and subFFTSize for next stage
+        str     subFFTNum, [pSubFFTNum]
+        str     subFFTSize, [pSubFFTSize]
+        
+        .endm
+
+
+        // Allocate stack memory required by the function
+
+
+        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace,,d15
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+
+        .end
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c
new file mode 100644
index 0000000..f29796b
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_CToC_FC32.c
@@ -0,0 +1,190 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+/**
+ * Function:  omxSP_FFTFwd_CToC_FC32_Sfs   (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order, 
+ * where 0 <= order <= 15. 
+ * Transform length is determined by the specification structure, which 
+ * must be initialized prior to calling the FFT function using the appropriate 
+ * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship 
+ * between the input and output sequences can be expressed in terms of the 
+ * DFT, i.e., 
+ *
+ *      X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ *      k = 0,1,2,..., N-1
+ *      N = 2^order
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the input signal, a complex-valued vector of length
+ *            2^order; must be aligned on a 32 byte boundary. 
+ *   pFFTSpec - pointer to the preallocated and initialized specification 
+ *            structure 
+ *
+ * Output Arguments:
+ *   pDst - pointer to the complex-valued output vector, of length 2^order;
+ *            must be aligned on an 32-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions 
+ *              is true: 
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or 
+ *              pFFTSpec. 
+ *    -    pSrc or pDst is not 32-byte aligned 
+ *
+ */
+
+OMXResult omxSP_FFTFwd_CToC_FC32_Sfs(const OMX_FC32* pSrc,
+                                     OMX_FC32* pDst,
+                                     const OMXFFTSpec_C_FC32* pFFTSpec) {
+  ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersFC32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  order = fastlog2(spec->N);
+
+  subFFTSize = 1;
+  subFFTNum = spec->N;
+  pTwiddle = spec->pTwiddle;
+  pOut = spec->pBuf;
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * the very last FFT stage.
+     */
+    if ((order & 2) == 0) {
+      argDst = pOut;
+      pOut = pDst;
+    } else {
+      argDst = pDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+        pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Order = 1 */
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  }
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c
new file mode 100644
index 0000000..f1e503e
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTFwd_RToCCS_F32.c
@@ -0,0 +1,213 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void ComplexToRealFixup(OMX_FC32* pSrc,
+                               OMX_F32* pDst,
+                               const OMX_FC32* pTwiddle,
+                               OMX_F32* pBuf,
+                               long N);
+
+/**
+ * Function:  omxSP_FFTFwd_CToC_FC32_Sfs   (2.2.4.2.2)
+ *
+ * Description:
+ * Compute an FFT for a complex signal of length of 2^order,
+ * where 0 <= order <= 15.
+ * Transform length is determined by the specification structure, which
+ * must be initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_sc32> or <FFTInit_C_SC16>. The relationship
+ * between the input and output sequences can be expressed in terms of the
+ * DFT, i.e.,
+ *
+ *      X[k] = SUM[n=0...N-1]x[n].e^(-jnk.2.pi/N)
+ *      k = 0,1,2,..., N-1
+ *      N = 2^order
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the input signal, a complex-valued vector of length
+ *            2^order; must be aligned on a 32 byte boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized specification
+ *            structure
+ *
+ * Output Arguments:
+ *   pDst - pointer to the complex-valued output vector, of length 2^order;
+ *            must be aligned on an 32-byte boundary.
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ *              is true:
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or
+ *              pFFTSpec.
+ *    -    pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32* pSrc,
+                                      OMX_F32* pDst,
+                                      const OMXFFTSpec_R_F32* pFFTSpec) {
+  ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+  OMX_FC32* pComplexSrc = (OMX_FC32*) pSrc;
+  OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersF32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  /*
+   * Compute the RFFT using a complex FFT of one less order, so set
+   * order to be the order of the complex FFT.
+   */
+  order = fastlog2(spec->N) - 1;
+
+  subFFTSize = 1;
+  subFFTNum = spec->N >> 1;
+  pTwiddle = spec->pTwiddle;
+  pOut = (OMX_FC32*) spec->pBuf;
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+    OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * ComplexToRealFixup.
+     */
+    if ((order & 2) != 0) {
+      argDst = pOut;
+      pOut = pComplexDst;
+    } else {
+      argDst = pComplexDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace(
+        pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace(
+        pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 1) {
+    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Handle complex order 0 specially */
+    pOut->Re = pSrc[0];
+    pOut->Im = pSrc[1];
+  }
+
+  /*
+   * Complex FFT done. Fix up the complex result to give the correct
+   * RFFT.
+   */
+
+  ComplexToRealFixup(pOut, pDst, pTwiddle, spec->pBuf, spec->N);
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c
new file mode 100644
index 0000000..84de9cf
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CCSToR_F32.c
@@ -0,0 +1,259 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
+    const OMX_F32* pSrc,
+    const OMX_FC32* pTwiddle,
+    OMX_F32* pBuf,
+    long N);
+
+/*
+ * Scale FFT data by 1/|length|. |length| must be a power of two
+ */
+static inline ScaleRFFTData(OMX_F32* fftData, unsigned length) {
+  float32_t* data = (float32_t*)fftData;
+  float32_t scale = 2.0f / length;
+
+  if (length >= 4) {
+    /*
+     * Do 4 float elements at a time because |length| is always a
+     * multiple of 4 when |length| >= 4.
+     *
+     * TODO(rtoy): Figure out how to process 8 elements at a time
+     * using intrinsics or replace this with inline assembly.
+     */
+    do {
+      float32x4_t x = vld1q_f32(data);
+
+      length -= 4;
+      x = vmulq_n_f32(x, scale);
+      vst1q_f32(data, x);
+      data += 4;
+    } while (length > 0);
+  } else if (length == 2) {
+    float32x2_t x = vld1_f32(data);
+    x = vmul_n_f32(x, scale);
+    vst1_f32(data, x);
+  } else {
+    fftData[0] *= scale;
+  }
+}
+
+/**
+ * Function:  omxSP_FFTInv_CCSToR_F32_Sfs
+ *
+ * Description:
+ * These functions compute the inverse FFT for a conjugate-symmetric input 
+ * sequence.  Transform length is determined by the specification structure, 
+ * which must be initialized prior to calling the FFT function using 
+ * <FFTInit_R_F32>. For a transform of length M, the input sequence is 
+ * represented using a packed CCS vector of length M+2, and is organized 
+ * as follows: 
+ *
+ *   Index:   0  1  2    3    4    5    . . .  M-2       M-1      M      M+1 
+ *   Comp:  R[0] 0  R[1] I[1] R[2] I[2] . . .  R[M/2-1]  I[M/2-1] R[M/2] 0 
+ *
+ * where R[n] and I[n], respectively, denote the real and imaginary
+ * components for FFT bin n. Bins are numbered from 0 to M/2, where M
+ * is the FFT length.  Bin index 0 corresponds to the DC component,
+ * and bin index M/2 corresponds to the foldover frequency.
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input sequence represented
+ *          using CCS format, of length (2^order) + 2; must be aligned on a
+ *          32-byte boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized
+ *              specification structure
+ *
+ * Output Arguments:
+ *   pDst - pointer to the real-valued output sequence, of length
+ *          2^order ; must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+
+ *    OMX_Sts_BadArgErr - bad arguments if one or more of the
+ *      following is true:
+ *    -    pSrc, pDst, or pFFTSpec is NULL 
+ *    -    pSrc or pDst is not aligned on a 32-byte boundary 
+ *
+ */
+OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(
+    const OMX_F32* pSrc,
+    OMX_F32* pDst,
+    const OMXFFTSpec_R_F32* pFFTSpec) {
+  ARMsFFTSpec_R_FC32* spec = (ARMsFFTSpec_R_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+  OMX_FC32* pComplexSrc;
+  OMX_FC32* pComplexDst = (OMX_FC32*) pDst;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersF32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  /*
+   * Preprocess the input before calling the complex inverse FFT. The
+   * result is actually stored in the second half of the temp buffer
+   * in pFFTSpec.
+   */
+  if (spec->N > 1)
+    armSP_FFTInv_CCSToR_F32_preTwiddleRadix2(
+        pSrc, spec->pTwiddle, spec->pBuf, spec->N);
+  
+  /*
+   * Do a complex inverse FFT of half size.
+   */
+  order = fastlog2(spec->N) - 1;
+
+  subFFTSize = 1;
+  subFFTNum = spec->N >> 1;
+  pTwiddle = spec->pTwiddle;
+  /*
+   * The pBuf is split in half. The first half is the temp buffer. The
+   * second half holds the source data that was placed there by
+   * armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe.
+   */
+  pOut = (OMX_FC32*) spec->pBuf;
+  pComplexSrc = pOut + (1 << order);
+
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * the very last FFT stage.
+     */
+    if ((order & 2) == 0) {
+      argDst = pOut;
+      pOut = pComplexDst;
+    } else {
+      argDst = pComplexDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+          pComplexSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+        pComplexDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 1) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pComplexSrc, pComplexDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Order = 0 */
+    *pComplexDst = *pComplexSrc;
+  }
+
+  ScaleRFFTData(pDst, spec->N);
+  return OMX_Sts_NoErr;
+}
+
diff --git a/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c
new file mode 100644
index 0000000..eec05e9
--- /dev/null
+++ b/dl/sp/src/arm/arm64/omxSP_FFTInv_CToC_FC32.c
@@ -0,0 +1,214 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+extern void armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+    const OMX_FC32* pSrc,
+    OMX_FC32* pDst,
+    OMX_FC32* pTwiddle,
+    long* subFFTNum,
+    long* subFFTSize);
+
+/*
+ * Scale FFT data by 1/|length|. |length| must be a power of two
+ */
+static inline ScaleFFTData(OMX_FC32* fftData, unsigned length) {
+  float32_t* data = (float32_t*)fftData;
+  float32_t scale = 1.0f / length;
+
+  /*
+   * Do two complex elements at a time because |length| is always
+   * greater than or equal to 2 (order >= 1)
+   */
+  do {
+    float32x4_t x = vld1q_f32(data);
+
+    length -= 2;
+    x = vmulq_n_f32(x, scale);
+    vst1q_f32(data, x);
+    data += 4;
+  } while (length > 0);
+}
+
+/**
+ * Function:  omxSP_FFTInv_CToC_FC32
+ *
+ * Description:
+ * These functions compute an inverse FFT for a complex signal of
+ * length of 2^order, where 0 <= order <= 15. Transform length is
+ * determined by the specification structure, which must be
+ * initialized prior to calling the FFT function using the appropriate
+ * helper, i.e., <FFTInit_C_FC32>. The relationship between the input
+ * and output sequences can be expressed in terms of the IDFT, i.e.:
+ *
+ *     x[n] = SUM[k=0,...,N-1] X[k].e^(jnk.2.pi/N)
+ *     n=0,1,2,...N-1
+ *     N=2^order.
+ *
+ * Input Arguments:
+ *   pSrc - pointer to the complex-valued input signal, of length 2^order ;
+ *          must be aligned on a 32-byte boundary.
+ *   pFFTSpec - pointer to the preallocated and initialized specification
+ *            structure
+ *
+ * Output Arguments:
+ *   order
+ *   pDst - pointer to the complex-valued output signal, of length 2^order;
+ *          must be aligned on a 32-byte boundary.
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - returned if one or more of the following conditions
+ *              is true:
+ *    -   one or more of the following pointers is NULL: pSrc, pDst, or
+ *              pFFTSpec.
+ *    -   pSrc or pDst is not 32-byte aligned
+ *
+ */
+
+OMXResult omxSP_FFTInv_CToC_FC32_Sfs(const OMX_FC32* pSrc,
+                                     OMX_FC32* pDst,
+                                     const OMXFFTSpec_C_FC32* pFFTSpec) {
+  ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*)pFFTSpec;
+  int order;
+  long subFFTSize;
+  long subFFTNum;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pOut;
+
+  /*
+   * Check args are not NULL and the source and destination pointers
+   * are properly aligned.
+   */
+  if (!validateParametersFC32(pSrc, pDst, spec))
+    return OMX_Sts_BadArgErr;
+
+  order = fastlog2(spec->N);
+
+  subFFTSize = 1;
+  subFFTNum = spec->N;
+  pTwiddle = spec->pTwiddle;
+  pOut = spec->pBuf;
+
+  if (order > 3) {
+    OMX_FC32* argDst;
+
+    /*
+     * Set up argDst and pOut appropriately so that pOut = pDst for
+     * the very last FFT stage.
+     */
+    if ((order & 2) == 0) {
+      argDst = pOut;
+      pOut = pDst;
+    } else {
+      argDst = pDst;
+    }
+
+    /*
+     * Odd order uses a radix 8 first stage; even order, a radix 4
+     * first stage.
+     */
+    if (order & 1) {
+      armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    } else {
+      armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace(
+          pSrc, argDst, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+
+    /*
+     * Now use radix 4 stages to finish rest of the FFT
+     */
+    if (subFFTNum >= 4) {
+      while (subFFTNum > 4) {
+        OMX_FC32* tmp;
+
+        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace(
+            argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+        /*
+         * Swap argDst and pOut
+         */
+        tmp = pOut;
+        pOut = argDst;
+        argDst = tmp;
+      }
+
+      armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace(
+          argDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    }
+  } else if (order == 3) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace(
+        pDst, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else if (order == 2) {
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pOut, pTwiddle, &subFFTNum, &subFFTSize);
+    armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace(
+        pOut, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  } else {
+    /* Order = 1 */
+    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace(
+        pSrc, pDst, pTwiddle, &subFFTNum, &subFFTSize);
+  }
+
+  ScaleFFTData(pDst, spec->N);
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/test/support/float_fft_neon.c b/dl/sp/src/test/support/float_fft_neon.c
index 3f0cf16..a10d803 100644
--- a/dl/sp/src/test/support/float_fft_neon.c
+++ b/dl/sp/src/test/support/float_fft_neon.c
@@ -8,22 +8,37 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+
 #include "dl/sp/api/armSP.h"
 #include "dl/sp/api/omxSP.h"
 #include "dl/sp/src/test/test_util.h"
 
 static const char* message =
-    "Test forward and inverse floating-point FFT (NEON)\n";
+    "Test forward and inverse floating-point FFT"
+#if defined(__aarch64__)
+    " (ARM64)\n"
+#else
+    " (NEON)\n"
+#endif
+    ;
 
 const char* UsageMessage() {
   return message;
 }
 
+#if defined(__aarch64__)
+#define FINISHED_MESSAGE "ARM64 tests finished.\n"
+#else
+#define FINISHED_MESSAGE "NEON tests finished.\n"
+#endif
+
 void FinishedMessage() {
-  printf("NEON tests finished.\n");
+  printf(FINISHED_MESSAGE);
 }
 
 void SetThresholds(struct TestInfo* info) {
+#if defined(__arm__)
 #ifdef BIG_FFT_TABLE
   info->forward_threshold_ = 138.81;
   info->inverse_threshold_ = 137.81;
@@ -31,6 +46,15 @@
   info->forward_threshold_ = 138.81;
   info->inverse_threshold_ = 138.81;
 #endif
+#else
+#ifdef BIG_FFT_TABLE
+  info->forward_threshold_ = 138.96;
+  info->inverse_threshold_ = 138.96;
+#else
+  info->forward_threshold_ = 138.96;
+  info->inverse_threshold_ = 138.96;
+#endif
+#endif
 }
 
 OMXResult ForwardFFT(OMX_FC32* x,
diff --git a/dl/sp/src/test/support/float_rfft_thresholds.h b/dl/sp/src/test/support/float_rfft_thresholds.h
index 2d84eec..6bbc937 100644
--- a/dl/sp/src/test/support/float_rfft_thresholds.h
+++ b/dl/sp/src/test/support/float_rfft_thresholds.h
@@ -23,6 +23,14 @@
 #define FLOAT_RFFT_FORWARD_THRESHOLD_ARMV7 (134.95)
 #define FLOAT_RFFT_INVERSE_THRESHOLD_ARMV7 (142.25)
 #endif
+#elif defined(__aarch64__)
+#ifdef BIG_FFT_TABLE
+#define FLOAT_RFFT_FORWARD_THRESHOLD_NEON (136.55)
+#define FLOAT_RFFT_INVERSE_THRESHOLD_NEON (141.55)
+#else
+#define FLOAT_RFFT_FORWARD_THRESHOLD_NEON (136.55)
+#define FLOAT_RFFT_INVERSE_THRESHOLD_NEON (142.74)
+#endif
 #else
 #ifdef BIG_FFT_TABLE
 #define FLOAT_RFFT_FORWARD_THRESHOLD_X86 (135.97)
diff --git a/dl/sp/src/test/test_fft.gyp b/dl/sp/src/test/test_fft.gyp
index b1d88e3..3c739cf 100644
--- a/dl/sp/src/test/test_fft.gyp
+++ b/dl/sp/src/test/test_fft.gyp
@@ -128,6 +128,19 @@
         },
       ],
     }],
+    ['target_arch == "arm64"', {
+      'targets': [
+        {
+          # Test complex floating-point FFT
+          'target_name': 'test_float_fft',
+          'type': 'executable',
+          'sources': [
+            'test_float_fft.c',
+            'support/float_fft_neon.c',
+          ],
+        },
+      ],
+    }],
   ],
   'targets': [
     # Targets that should be supported by all architectures
@@ -155,7 +168,7 @@
         'support/float_rfft_thresholds.h',
       ],
       'conditions': [
-        ['target_arch == "arm"', {
+        ['target_arch == "arm" or target_arch == "arm64"', {
           'sources': [
             'support/float_rfft_neon.c',
           ],
@@ -175,9 +188,9 @@
         'test_fft_time.c',
       ],
       'conditions': [
-        ['target_arch == "ia32"', {
+        ['target_arch == "ia32" or target_arch == "arm64"', {
           'defines': [
-            # Timing test only for float FFTs on x86
+            # Timing test only for float FFTs on x86 and arm64.
             'FLOAT_ONLY',
           ],
         }],
@@ -206,6 +219,12 @@
             'test_float_rfft_detect',
           ],
         }],
+        ['target_arch == "arm64"', {
+          # Supported test programs for ARM64
+          'dependencies': [
+            'test_float_fft',
+           ],
+        }],
       ],
       'dependencies' : [
         # All architectures must support at least the float rfft test
diff --git a/dl/sp/src/test/test_fft_time.c b/dl/sp/src/test/test_fft_time.c
index db65134..6154228 100644
--- a/dl/sp/src/test/test_fft_time.c
+++ b/dl/sp/src/test/test_fft_time.c
@@ -51,7 +51,7 @@
   S32,
 } s16_s32;
 
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
 void TimeOneFloatFFT(int count, int fft_log_size, float signal_value,
                      int signal_type);
 void TimeFloatFFT(int count, float signal_value, int signal_type);
@@ -112,7 +112,7 @@
       "  -T          Run just one FFT timing test\n"
       "  -f          FFT type:\n"
       "              0 - Complex Float\n"
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
       "              1 - Real Float\n"
 #endif
 #ifdef ENABLE_FIXED_POINT_FFT_TESTS
@@ -217,7 +217,7 @@
     printf("Warning:  -f ignored when -T not specified\n");
 
   if (test_mode) {
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
     TimeFloatFFT(count, signal_value, signal_type);
 #endif
     TimeFloatRFFT(count, signal_value, signal_type);
@@ -229,7 +229,7 @@
 #endif
   } else {
     switch (fft_type) {
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
       case 0:
         TimeOneFloatFFT(count, fft_log_size, signal_value, signal_type);
         break;
@@ -323,7 +323,7 @@
   return count;
 }
 
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
 void TimeOneFloatFFT(int count, int fft_log_size, float signal_value,
                      int signal_type) {
   struct AlignedPtr* x_aligned;
@@ -1061,11 +1061,11 @@
 
     if(s16s32 == S32) {
       PrintResult("Forward RFFT16 (with S32)",
-		  fft_log_size, elapsed_time, count);
+                  fft_log_size, elapsed_time, count);
     }
     else {
       PrintResult("Forward RFFT16 (with S16)",
-		  fft_log_size, elapsed_time, count);
+                  fft_log_size, elapsed_time, count);
     }
   }
 
@@ -1124,11 +1124,11 @@
 
     if(s16s32 == S32) {
       PrintResult("Inverse RFFT16 (with S32)",
-		  fft_log_size, elapsed_time, count);
+                  fft_log_size, elapsed_time, count);
     }
     else {
       PrintResult("Inverse RFFT16 (with S16)",
-		  fft_log_size, elapsed_time, count);
+                  fft_log_size, elapsed_time, count);
     }
   }
 
diff --git a/dl/sp/src/test/test_float_fft.c b/dl/sp/src/test/test_float_fft.c
index 289b197..6aac3a4 100644
--- a/dl/sp/src/test/test_float_fft.c
+++ b/dl/sp/src/test/test_float_fft.c
@@ -154,7 +154,7 @@
 
   status = ForwardFFT(x, y, fft_fwd_spec);
 
-  if (status) {
+  if (status != OMX_Sts_NoErr) {
     fprintf(stderr, "Forward FFT failed: status = %d\n", status);
     exit(1);
   }
@@ -223,7 +223,7 @@
 
   status = InverseFFT(y, z, fft_inv_spec);
 
-  if (status) {
+  if (status != OMX_Sts_NoErr) {
     fprintf(stderr, "Inverse FFT failed: status = %d\n", status);
     exit(1);
   }
diff --git a/dl/sp/src/test/test_float_rfft.c b/dl/sp/src/test/test_float_rfft.c
index a976162..3b37ece 100644
--- a/dl/sp/src/test/test_float_rfft.c
+++ b/dl/sp/src/test/test_float_rfft.c
@@ -177,7 +177,7 @@
   }
 
   status = ForwardRFFT(x, (OMX_F32*) y, fft_fwd_spec);
-  if (status) {
+  if (status != OMX_Sts_NoErr) {
     fprintf(stderr, "Forward FFT failed: status = %d\n", status);
     exit(1);
   }
@@ -257,7 +257,7 @@
   }
 
   status = InverseRFFT((OMX_F32 *) yTrue, z, fft_inv_spec);
-  if (status) {
+  if (status != OMX_Sts_NoErr) {
     fprintf(stderr, "Inverse FFT failed: status = %d\n", status);
     exit(1);
   }
