Implement x86 real float fft for openmax_dl.
Optimized with SSE2 intrinsics.
Mainly targeted at Web Audio usage.

BUG=
R=aedla@chromium.org, andrew@webrtc.org, rtoy@google.com

Review URL: https://webrtc-codereview.appspot.com/2208004

git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@5109 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/dl.gyp b/dl/dl.gyp
index 2c5acd4..919eeb7 100644
--- a/dl/dl.gyp
+++ b/dl/dl.gyp
@@ -18,95 +18,10 @@
       'include_dirs': [
         '../',
       ],
-      'cflags!': [
-        '-mfpu=vfpv3-d16',
-      ],
-      'cflags': [
-        # We enable Neon instructions even with arm_neon==0, to support
-        # runtime detection.
-        '-mfpu=neon',
-      ],
-      'direct_dependent_settings': {
-        'include_dirs': [
-          '../',
-        ],
-      },
       'sources': [
-        'api/arm/armCOMM_s.h',
-        'api/arm/armOMX.h',
-        'api/arm/omxtypes_s.h',
         'api/omxtypes.h',
-        'sp/api/armSP.h',
         'sp/api/omxSP.h',
-        # Common C code that can be shared between different
-        # architectures.
         'sp/src/armSP_FFT_F32TwiddleTable.c',
-
-        # Common C code for NEON and non-NEON implementations.
-        'sp/src/arm/armSP_FFT_S32TwiddleTable.c',
-        'sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c',
-        'sp/src/arm/omxSP_FFTInit_C_SC32.c',
-        'sp/src/arm/omxSP_FFTGetBufSize_R_S32.c',
-        'sp/src/arm/omxSP_FFTInit_R_S32.c',
-        'sp/src/arm/omxSP_FFTInit_C_SC16.c',
-        'sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c',
-        'sp/src/arm/omxSP_FFTGetBufSize_R_S16.c',
-        'sp/src/arm/omxSP_FFTInit_R_S16.c',
-        'sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c',
-        'sp/src/arm/omxSP_FFTInit_R_S16S32.c',
-        'sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c',
-        'sp/src/arm/omxSP_FFTInit_C_FC32.c',
-        'sp/src/arm/omxSP_FFTGetBufSize_R_F32.c',
-        'sp/src/arm/omxSP_FFTInit_R_F32.c',
-
-        # NEON-specific implementation
-        # Complex 32-bit fixed-point FFT.
-        'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S',
-        'sp/src/arm/neon/omxSP_FFTInv_CToC_SC32_Sfs_s.S',
-        'sp/src/arm/neon/omxSP_FFTFwd_CToC_SC32_Sfs_s.S',
-        # Real 32-bit fixed-point FFT
-        'sp/src/arm/neon/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S',
-        'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S',
-        'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
-        # Complex 16-bit fixed-point FFT
-        'sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S',
-        'sp/src/arm/neon/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
-        'sp/src/arm/neon/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
-        # Real 16-bit fixed-point FFT
-        'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S',
-        'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S',
-        'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
-        'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S',
-        # Complex floating-point FFT
-        'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S',
-        'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S',
-        'sp/src/arm/neon/omxSP_FFTInv_CToC_FC32_Sfs_s.S',
-        'sp/src/arm/neon/omxSP_FFTFwd_CToC_FC32_Sfs_s.S',
-        # Real floating-point FFT
-        'sp/src/arm/neon/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S',
-        'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S',
-        'sp/src/arm/neon/omxSP_FFTInv_CCSToR_F32_Sfs_s.S',
       ],
       'conditions' : [
         ['big_float_fft == 1', {
@@ -114,6 +29,120 @@
             'BIG_FFT_TABLE',
           ],
         }],
+        ['target_arch=="arm"', {
+          'cflags!': [
+            '-mfpu=vfpv3-d16',
+          ],
+          'cflags': [
+            # We enable Neon instructions even with arm_neon==0, to support
+            # runtime detection.
+            '-mfpu=neon',
+          ],
+          'sources': [
+            'api/armCOMM_s.h',
+            'api/armOMX.h',
+            'api/omxtypes_s.h',
+            'sp/api/armSP.h',
+            # Complex 32-bit fixed-point FFT.
+            'sp/src/armSP_FFT_S32TwiddleTable.c',
+            'sp/src/omxSP_FFTGetBufSize_C_SC32.c',
+            'sp/src/omxSP_FFTInit_C_SC32.c',
+            'sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S',
+            'sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S',
+            'sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S',
+            # Real 32-bit fixed-point FFT
+            'sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S',
+            'sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S',
+            'sp/src/omxSP_FFTGetBufSize_R_S32.c',
+            'sp/src/omxSP_FFTInit_R_S32.c',
+            'sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
+            # Complex 16-bit fixed-point FFT
+            'sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S',
+            'sp/src/omxSP_FFTInit_C_SC16.c',
+            'sp/src/omxSP_FFTGetBufSize_C_SC16.c',
+            'sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S',
+            'sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
+            'sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
+            # Real 16-bit fixed-point FFT
+            'sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S',
+            'sp/src/omxSP_FFTGetBufSize_R_S16.c',
+            'sp/src/omxSP_FFTInit_R_S16.c',
+            'sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S',
+            'sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
+            'sp/src/omxSP_FFTGetBufSize_R_S16S32.c',
+            'sp/src/omxSP_FFTInit_R_S16S32.c',
+            'sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S',
+            # Complex floating-point FFT
+            'sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S',
+            'sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S',
+            'sp/src/omxSP_FFTGetBufSize_C_FC32.c',
+            'sp/src/omxSP_FFTInit_C_FC32.c',
+            'sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S',
+            'sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S',
+            # Real floating-point FFT
+            'sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S',
+            'sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S',
+            'sp/src/omxSP_FFTGetBufSize_R_F32.c',
+            'sp/src/omxSP_FFTInit_R_F32.c',
+            'sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S',
+          ],
+        }],
+        ['target_arch=="ia32" or target_arch=="x64"', {
+          'cflags': [
+            '-msse2',
+          ],
+          'sources': [
+            # Real 32-bit floating-point FFT.
+            'sp/api/x86SP.h',
+            'sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c',
+            'sp/src/x86/omxSP_FFTGetBufSize_R_F32.c',
+            'sp/src/x86/omxSP_FFTInit_R_F32.c',
+            'sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c',
+            'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c',
+            'sp/src/x86/x86SP_FFT_F32_radix2_kernel.c',
+            'sp/src/x86/x86SP_FFT_F32_radix4_kernel.c',
+            'sp/src/x86/x86SP_SSE_Math.h',
+          ],
+        }],
       ],
   }]
 }
diff --git a/dl/sp/api/x86SP.h b/dl/sp/api/x86SP.h
new file mode 100644
index 0000000..5312734
--- /dev/null
+++ b/dl/sp/api/x86SP.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2007-2008 ARM Limited. All Rights Reserved.
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  It has been relicensed with permission from the copyright holders.
+ */
+
+#ifndef _x86SP_H_
+#define _x86SP_H_
+
+#include "dl/api/omxtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern OMX_F32 armSP_FFT_F32TwiddleTable[];
+
+typedef struct X86FFTSpec_R_FC32_Tag  // State for the x86 real float FFT.
+{
+    OMX_U32 N;          // FFT length; omxSP_FFTInit_R_F32 stores 1 << order.
+    OMX_F32* pTwiddle;  // 2N floats, split format (see omxSP_FFTInit_R_F32).
+    // Ping-pong work buffers used for the N/2-point complex FFT.
+    OMX_F32* pBuf1;     // Aligned to 32 bytes by omxSP_FFTInit_R_F32.
+    OMX_F32* pBuf2;     // Set to pBuf1 + N + 4 floats by omxSP_FFTInit_R_F32.
+
+} X86FFTSpec_R_FC32;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c b/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c
new file mode 100644
index 0000000..b6d1c98
--- /dev/null
+++ b/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c
@@ -0,0 +1,228 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/api/x86SP.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+#include <stdbool.h>
+
+extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft);
+
+extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft);
+
+/**
+ * A two-for-one algorithm is used here to do the real fft:
+ *
+ * Input x[n], (n = 0, ..., N - 1)
+ * Output X[k] = DFT(N, k){x}
+ * a[n] = x[2n], (n = 0, ..., N/2 - 1)
+ * b[n] = x[2n + 1], (n = 0, ..., N/2 - 1)
+ * z[n] = a[n] + j * b[n]
+ * Z[k] = DFT(N/2, k){z}
+ * Z' is the complex conjugate of Z
+ * A[k] = (Z[k] + Z'[N/2 - k]) / 2
+ * B[k] = -j * (Z[k] - Z'[N/2 - k]) / 2
+ * X[k] = A[k] + B[k] * W[k], (W = exp(-j*2*PI*k/N); k = 0, ..., N/2 - 1)
+ * X[k] = A[k] - B[k], (k = N/2)
+ * X' is complex conjugate of X
+ * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1)
+ */
+
+/**
+ * Final step of the two-for-one forward FFT.  Converts the N/2-point complex
+ * spectrum Z (split format: Re at in[k], Im at in[k + n/2]) into interleaved
+ * output bins X[0..N/2] (Re at out[2k], Im at out[2k + 1]; n + 2 floats).
+ * The division by 2 is moved to the last step, so with
+ * A[k] = (Z[k] + Z'[N/2 - k]) and B[k] = -j * (Z[k] - Z'[N/2 - k]):
+ * X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1)
+ * X[N/2] = (A[0] - B[0]) / 2; bins k > N/2 (X[k] = X'[N - k]) are not written.
+ */
+static void RevbinPermuteFwd(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  OMX_INT i;  // Bin index k, walking up from 1.
+  OMX_INT j;  // Mirror bin N/2 - k, walking down from N/2 - 1.
+  OMX_INT n_by_2 = n >> 1;
+  OMX_INT n_by_4 = n >> 2;
+
+  OMX_FC32 big_a;
+  OMX_FC32 big_b;
+  OMX_FC32 temp;
+  const OMX_F32 *tw;
+
+  for (i = 1, j = n_by_2 - 1; i < n_by_4; i++, j--) {
+    // A[k] = (Z[k] + Z'[N/2 - k]); signs as written presumably pair with
+    big_a.Re = in[i] + in[j];  // the pre-negated twiddle table from Init.
+    big_a.Im = in[j + n_by_2] - in[i + n_by_2];
+
+    // B[k] = -j * (Z[k] - Z'[N/2 - k])
+    big_b.Re = in[j] - in[i];
+    big_b.Im = in[j + n_by_2] + in[i + n_by_2];
+
+    // W[k]: real part at tw[0], imaginary part at tw[n] (split table).
+    tw = twiddle + i;
+
+    // temp = B[k] * W[k]
+    temp.Re =  big_b.Re * tw[0] + big_b.Im * tw[n];
+    temp.Im =  big_b.Re * tw[n] - big_b.Im * tw[0];
+
+    // Convert split format to interleaved format.
+    // Bin k = i: X[k] = (A[k] + B[k] * W[k]) / 2.
+    out[i << 1] = 0.5f * (big_a.Re - temp.Im);
+    out[(i << 1) + 1] = 0.5f * (temp.Re - big_a.Im);
+    // Mirror bin j = N/2 - i, built from the same A/B/W terms.
+    out[j << 1] = 0.5f * (big_a.Re + temp.Im);
+    out[(j << 1) + 1] = 0.5f * (temp.Re + big_a.Im);
+  }
+
+  // Middle bin k = N/4 (interleaved index n_by_2): X[N/4] = Z'[N/4].
+  out[n_by_2] = in[n_by_4];
+  out[n_by_2 + 1] = -in[n_by_4 + n_by_2];
+
+  out[0] = in[0] + in[n_by_2];  // Bin 0: Re(Z[0]) + Im(Z[0]).
+  out[1] = 0;                   // Bin 0 imaginary part is written as zero.
+  out[n] = in[0] - in[n_by_2];  // Bin N/2: Re(Z[0]) - Im(Z[0]).
+  out[n + 1] = 0;               // Bin N/2 imaginary part is written as zero.
+}
+
+// SSE version of RevbinPermuteFwd; the caller selects it when n >= 8.
+static void RevbinPermuteFwdSse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  OMX_INT i;  // First bin of the ascending group of four.
+  OMX_INT j;  // First bin of the descending (mirror) group of four.
+  OMX_INT n_by_2 = n >> 1;
+  OMX_INT n_by_4 = n >> 2;
+
+  VC v_i;
+  VC v_j;
+  VC v_big_a;
+  VC v_big_b;
+  VC v_temp;
+  VC v_x0;
+  VC v_x1;
+  VC v_tw;
+
+  __m128 factor = _mm_set1_ps(0.5f);  // The deferred 1/2 scaling.
+
+  for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) {
+    VC_LOAD_SPLIT(&v_i, (in + i), n_by_2);  // VC_* come from x86SP_SSE_Math.h.
+
+    VC_LOADU_SPLIT(&v_j, (in + j), n_by_2);  // Unaligned: j is not 4-aligned.
+    VC_REVERSE(&v_j);  // Presumably reverses lanes so bin i+m meets N/2-(i+m).
+
+    // A[k] = (Z[k] + Z'[N/2 - k])
+    VC_ADD_SUB(&v_big_a, &v_j, &v_i);
+
+    // B[k] = -j * (Z[k] - Z'[N/2 - k])
+    VC_SUB_ADD(&v_big_b, &v_j, &v_i);
+
+    // W[k]: split table, second component at offset n.
+    VC_LOAD_SPLIT(&v_tw, (twiddle + i), n);
+
+    // temp = B[k] * W[k]
+    VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw);
+
+    VC_SUB_X(&v_x0, &v_big_a, &v_temp);  // Feeds the ascending bins.
+    VC_ADD_X(&v_x1, &v_big_a, &v_temp);  // Feeds the mirror bins.
+
+    VC_MUL_F(&v_x0, &v_x0, factor);
+    VC_MUL_F(&v_x1, &v_x1, factor);
+
+    // Ascending bins k = i .. i + 3: X[k] = (A[k] + B[k] * W[k]) / 2.
+    VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0);
+
+    // Mirror bins j .. j + 3, restored to ascending order before the store.
+    VC_REVERSE(&v_x1);
+    VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1);
+  }
+
+  out[n_by_2] = in[n_by_4];  // Bin N/4 is set here, after the vector loop.
+  out[n_by_2 + 1] = -in[n_by_4 + n_by_2];
+
+  out[0] = in[0] + in[n_by_2];  // Bin 0, replacing whatever the loop stored.
+  out[1] = 0;
+  out[n] = in[0] - in[n_by_2];  // Bin N/2.
+  out[n + 1] = 0;
+}
+
+OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst,
+                                      const OMXFFTSpec_R_F32 *pFFTSpec) {
+  // Both pointers must be non-NULL and 32-byte aligned (low 5 bits of the cast).
+  if (!pSrc || !pDst || (OMX_INT)pSrc & 31 || (OMX_INT)pDst & 31)
+    return OMX_Sts_BadArgErr;
+
+  OMX_INT n;
+  OMX_INT n_by_2;
+  OMX_INT n_by_4;
+  const OMX_F32 *twiddle;
+  OMX_F32 *buf;
+
+  const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;
+
+  n = pFFTStruct->N;
+
+  // order == 1 (n == 2): computed directly, without the kernels below.
+  if (n == 2) {
+    pDst[0] = (pSrc[0] + pSrc[1]);
+    pDst[1] = 0.0f;
+    pDst[2] = (pSrc[0] - pSrc[1]);
+    pDst[3] = 0.0f;
+    return OMX_Sts_NoErr;
+  }
+
+  n_by_2 = n >> 1;
+  n_by_4 = n >> 2;  // Not read again in this function.
+  buf = pFFTStruct->pBuf1;
+  twiddle = pFFTStruct->pTwiddle;
+
+  if(n_by_2 >= 16) {  // Complex FFT of n/2 points; radix-4 SSE kernel.
+    buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
+        pSrc,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        1);
+  } else {  // Fewer than 16 complex points: radix-2 kernel.
+    buf = x86SP_F32_radix2_kernel_OutOfPlace(
+        pSrc,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        1);
+  }
+
+  if(n >= 8)  // The kernel's returned pointer is the input to the last step.
+    RevbinPermuteFwdSse(buf, pDst, twiddle, n);
+  else
+    RevbinPermuteFwd(buf, pDst, twiddle, n);
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c b/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c
new file mode 100644
index 0000000..f686a7f
--- /dev/null
+++ b/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c
@@ -0,0 +1,60 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/x86SP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTGetBufSize_R_F32
+ *
+ * Description:
+ * Computes the size of the specification structure required for the length
+ * 2^order real FFT and IFFT functions.
+ *
+ * Remarks:
+ * This function is used in conjunction with the 32-bit functions
+ * <FFTFwd_RToCCS_F32_Sfs> and <FFTInv_CCSToR_F32_Sfs>.
+ *
+ * Parameters:
+ * [in]  order       base-2 logarithm of the length; valid in the range
+ *                    [1,12]. ([1,15] if BIG_FFT_TABLE is defined.)
+ * [out] pSize       pointer to the number of bytes required for the
+ *                    specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTGetBufSize_R_F32(OMX_INT order, OMX_INT *pSize) {
+  if (!pSize || (order < 1) || (order > TWIDDLE_TABLE_ORDER))
+    return OMX_Sts_BadArgErr;
+
+    OMX_INT n_by_2;  // The lines below are not part of the if above.
+    OMX_INT n;
+
+    n_by_2 = 1 << (order - 1);
+    n = n_by_2 << 1;  // n == 1 << order.
+
+    *pSize = sizeof(X86FFTSpec_R_FC32) +
+             // Twiddle factors: 2 * n floats.
+             sizeof(OMX_F32) * (n << 1) +
+             // Ping-pong buffers for the n/2-point complex FFT.
+             // pBuf1. NOTE(review): "+ 4" is bytes; Init uses N + 4 floats.
+             sizeof(OMX_F32) * n + 4 +
+             // pBuf2 (same size expression as pBuf1).
+             sizeof(OMX_F32) * n + 4 +
+             // Extra bytes for 32-byte alignment of pTwiddle and pBuf1.
+             62;
+
+    return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/x86/omxSP_FFTInit_R_F32.c b/dl/sp/src/x86/omxSP_FFTInit_R_F32.c
new file mode 100644
index 0000000..564f166
--- /dev/null
+++ b/dl/sp/src/x86/omxSP_FFTInit_R_F32.c
@@ -0,0 +1,126 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This is a modification of omxSP_FFTInit_R_S32.c to support float
+ *  instead of S32.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/api/x86SP.h"
+
+/**
+ * Function: omxSP_FFTInit_R_F32
+ *
+ * Description:
+ * Initialize the real forward-FFT specification information struct.
+ *
+ * Remarks:
+ * This function is used to initialize the specification structures
+ * for functions |omxSP_FFTFwd_RToCCS_F32_Sfs| and
+ * |omxSP_FFTInv_CCSToR_F32_Sfs|. Memory for *pFFTSpec must be
+ * allocated prior to calling this function. The number of bytes
+ * required for *pFFTSpec can be determined using
+ * |omxSP_FFTGetBufSize_R_F32|.
+ *
+ * Parameters:
+ * [in]  order       base-2 logarithm of the desired block length;
+ *                         valid in the range [1,12].  ([1,15] if
+ *                         BIG_FFT_TABLE is defined.)
+ * [out] pFFTSpec    pointer to the initialized specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTInit_R_F32(OMXFFTSpec_R_F32 *pFFTSpec, OMX_INT order)
+{
+  OMX_F32 *pTwiddle;
+  OMX_F32 *pBuf;
+  OMX_INT i;
+  OMX_INT j;
+  OMX_INT N;
+  OMX_INT NBy2;
+  OMX_INT NBy4;
+  OMX_INT diff;
+  OMX_U32 pTmp;
+  X86FFTSpec_R_FC32  *pFFTStruct = (X86FFTSpec_R_FC32 *) pFFTSpec;
+  OMX_F32 real;
+  OMX_F32 imag;
+
+  if (!pFFTSpec || (order < 1) || (order > TWIDDLE_TABLE_ORDER))
+    return OMX_Sts_BadArgErr;
+
+  N = 1 << order;
+  NBy2 = N >> 1;
+
+  pTwiddle = (OMX_F32*) (sizeof(X86FFTSpec_R_FC32) + (OMX_S8*) pFFTSpec);
+
+  // Align to 32 byte boundary. NOTE(review): pointer is cast to OMX_U32 here.
+  pTmp = ((OMX_U32)pTwiddle) & 31;
+  if (pTmp)
+    pTwiddle = (OMX_F32*) ((OMX_S8*)pTwiddle + (32 - pTmp));
+
+  pBuf = (OMX_F32*) (sizeof(OMX_F32) * (N << 1) + (OMX_S8*) pTwiddle);
+
+  // Align to 32 byte boundary (work buffers start after the 2N twiddle floats).
+  pTmp = ((OMX_U32)pBuf) & 31;
+  if (pTmp)
+    pBuf = (OMX_F32*) ((OMX_S8*)pBuf + (32 - pTmp));
+
+  // Calculating Twiddle Factors; diff is the step j takes through the table.
+  diff = 1 << (TWIDDLE_TABLE_ORDER - order + 1);
+
+  // For SSE optimization, using twiddle with split format by which the real and
+  // imag data are stored into first and last halves of the buffer separately
+  // The negatives are moved when generating pTwiddle table.
+  if (order > 1) {
+    NBy4 = N >> 2;
+    for (i = 0, j = 0; i <= NBy4 >> 1; ++i, j += diff) {
+      real = armSP_FFT_F32TwiddleTable[j];
+      imag = armSP_FFT_F32TwiddleTable[j + 1];
+
+      pTwiddle[i] = -real;  // Each (real, imag) pair fills eight mirrored slots.
+      pTwiddle[i + N] = -imag;
+
+      pTwiddle[NBy4 - i] = imag;
+      pTwiddle[NBy4 - i + N] = real;
+
+      pTwiddle[NBy4 + i] = -imag;
+      pTwiddle[NBy4 + i + N] = real;
+
+      pTwiddle[NBy2 - i] = real;
+      pTwiddle[NBy2 - i + N] = -imag;
+
+      pTwiddle[NBy2 + i] = real;
+      pTwiddle[NBy2 + i + N] = imag;
+
+      pTwiddle[NBy4 * 3 - i] = -imag;
+      pTwiddle[NBy4 * 3 - i + N] = -real;
+
+      pTwiddle[NBy4 * 3 + i] = imag;
+      pTwiddle[NBy4 * 3 + i + N] = -real;
+
+      pTwiddle[N - i - 1] = -real;
+      pTwiddle[(N << 1) - i - 1] = imag;
+    }
+  } else {  // order == 1: four entries taken from the first table pair.
+    pTwiddle[0] = armSP_FFT_F32TwiddleTable[0];
+    pTwiddle[2] = armSP_FFT_F32TwiddleTable[1];
+    pTwiddle[1] = -pTwiddle[0];
+    pTwiddle[3] = pTwiddle[2];
+  }
+  pFFTStruct->N = N;
+  pFFTStruct->pTwiddle = pTwiddle;
+  pFFTStruct->pBuf1 = pBuf;
+  pFFTStruct->pBuf2 = pBuf + N + 4;  // Offset is in floats: N plus 4 spare.
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c b/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c
new file mode 100644
index 0000000..1733d66
--- /dev/null
+++ b/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c
@@ -0,0 +1,252 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/api/x86SP.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+#include <stdbool.h>
+
+extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft);
+
+extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft);
+
+/**
+ * A two-for-one algorithm is used here to do the real ifft:
+ *
+ * Input X[k], (k = 0, ..., N - 1)
+ * Output x[n] = IDFT(N, k){X}
+ * X' is complex conjugate of X
+ * A[k] = (X[k] + X'[N/2 - k]) / 2
+ * B[k] = (X[k] - X'[N/2 - k]) / 2 * W[k], (W = exp(j*2*PI*k/N);
+ *                                          k = 0, ..., N/2 - 1)
+ * Z[k] = A[k] + j * B[k], (k = 0, ..., N/2 - 1)
+ * z[n] = IDFT(N/2, k){Z}
+ * x[2n] = Re(z[n]), (n = 0, ..., N/2 - 1)
+ * x[2n + 1] = Im(z[n]), (n = 0, ..., N/2 - 1)
+ */
+
+/**
+ * This function is the first permutation of two-for-one IFFT algorithm.
+ * We move the division by 2 to the last step in the implementation, so:
+ * A[k] = (X[k] + X'[N/2 - k])
+ * B[k] = (X[k] - X'[N/2 - k]) * W[k], (k = 0, ..., N/2 - 1)
+ * Z[k] = (A[k] + j * B[k]) / 2, (k = 0, ..., N/2 - 1)
+ */
+static void RevbinPermuteInv(const OMX_F32 *in,
+                             OMX_F32 *out,
+                             const OMX_F32 *twiddle,
+                             OMX_INT n) {
+  OMX_INT i;       // Interleaved index of bin k (Re at in[i], Im at in[i + 1]).
+  OMX_INT j;       // Interleaved index of the mirror bin N/2 - k.
+  OMX_INT i_by_2;  // Bin number k.
+  OMX_INT j_by_2;  // Bin number N/2 - k.
+  OMX_INT n_by_2 = n >> 1;
+  OMX_INT n_by_4 = n >> 2;
+
+  OMX_FC32 big_a;
+  OMX_FC32 big_b;
+  OMX_FC32 temp;
+  const OMX_F32 *tw;
+
+  for (i = 2, j = n - 2; i < n_by_2; i += 2, j -= 2) {
+    // A[k] = (X[k] + X'[N/2 - k])
+    big_a.Re = in[i] + in[j];
+    big_a.Im = in[i + 1] - in[j + 1];
+
+    // temp = (X[k] - X'[N/2 - k])
+    temp.Re = in[i] - in[j];
+    temp.Im = in[i + 1] + in[j + 1];
+
+    i_by_2 = i >> 1;
+    j_by_2 = j >> 1;
+
+    // W[k]: real part at tw[0], imaginary part at tw[n] (split table).
+    tw = twiddle + i_by_2;
+
+    // B[k] = (X[k] - X'[N/2 - k]) * W[k]
+    big_b.Re =  temp.Re * tw[0] + temp.Im * tw[n];
+    big_b.Im =  temp.Re * tw[n] - temp.Im * tw[0];
+
+    // Convert interleaved input to split output (Re at out[k], Im at
+    // out[k + n/2]).  Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1)
+    // The scaling of 1/2 is merged into the scaling done in
+    // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs.
+    out[i_by_2] = big_a.Re + big_b.Im;
+    out[i_by_2 + n_by_2] = big_b.Re + big_a.Im;
+    out[j_by_2] = big_a.Re - big_b.Im;
+    out[j_by_2 + n_by_2] = big_b.Re - big_a.Im;
+  }
+
+  // Bin N/4 (interleaved index n_by_2): Z[N/4] = 2 * X'[N/4].
+  out[n_by_4] = 2.0f * in[n_by_2];
+  out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1];
+
+  // Z[0], built from the real parts of X[0] (in[0]) and X[N/2] (in[n]).
+  out[0] = in[0] + in[n];
+  out[n_by_2] = in[0] - in[n];
+}
+
+// SSE version of RevbinPermuteInv; the caller selects it when n >= 8.
+static void RevbinPermuteInvSse(const OMX_F32 *in,
+                                OMX_F32 *out,
+                                const OMX_F32 *twiddle,
+                                OMX_INT n) {
+  OMX_INT i;  // First bin of the ascending group of four.
+  OMX_INT j;  // First bin of the descending (mirror) group of four.
+  OMX_INT n_by_2 = n >> 1;
+  OMX_INT n_by_4 = n >> 2;
+  const OMX_F32 *tw;
+  const OMX_F32 *pi;  // Interleaved input at bin i.
+  const OMX_F32 *pj;  // Interleaved input at bin j.
+
+  VC v_i;
+  VC v_j;
+  VC v_big_a;
+  VC v_big_b;
+  VC v_temp;
+  VC v_tw;
+
+  for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) {
+    pi = in + (i << 1);
+    pj = in + (j << 1);
+    VC_LOAD_INTERLEAVE(&v_i, pi);  // VC_* come from x86SP_SSE_Math.h.
+
+    v_j.real = _mm_set_ps(pj[0], pj[2], pj[4], pj[6]);  // Lane 0 = bin j + 3:
+    v_j.imag = _mm_set_ps(pj[1], pj[3], pj[5], pj[7]);  // lanes are reversed.
+
+    // A[k] = (X[k] + X'[N/2 - k])
+    VC_ADD_SUB(&v_big_a, &v_i, &v_j);
+
+    // temp = (X[k] - X'[N/2 - k])
+    VC_SUB_ADD(&v_temp, &v_i, &v_j);
+
+    // W[k]: split table, second component at offset n.
+    tw = twiddle + i;
+    VC_LOAD_SPLIT(&v_tw, tw, n);
+
+    // B[k] = (X[k] - X'[N/2 - k]) * W[k]
+    VC_CONJ_MUL(&v_big_b, &v_temp, &v_tw);
+
+    // Convert interleaved input to split output.
+    // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1)
+    // The scaling of 1/2 is merged into the scaling done in
+    // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs.
+    VC_ADD_X_STORE_SPLIT((out + i), &v_big_a, &v_big_b, n_by_2);
+
+    VC_SUB_X_INVERSE_STOREU_SPLIT((out + j), &v_big_a, &v_big_b, n_by_2);
+  }
+
+  // Bin N/4 (interleaved index n_by_2): Z[N/4] = 2 * X'[N/4].
+  out[n_by_4] = 2.0f * in[n_by_2];
+  out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1];
+
+  // Z[0], set after the loop from the real parts of X[0] and X[N/2].
+  out[0] = in[0] + in[n];
+  out[n_by_2] = in[0] - in[n];
+}
+
+OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst,
+                                      const OMXFFTSpec_R_F32 *pFFTSpec) {
+  // Both pointers must be non-NULL and 32-byte aligned (low 5 bits of the cast).
+  if (!pSrc || !pDst || (OMX_INT)pSrc & 31 || (OMX_INT)pDst & 31)
+    return OMX_Sts_BadArgErr;
+
+  OMX_INT n;
+  OMX_INT n_by_2;
+  OMX_INT n_by_4;
+  OMX_INT i;
+  const OMX_F32 *twiddle;
+  OMX_F32 *buf;
+  OMX_F32 *in = (OMX_F32*) pSrc;  // Cast drops const; in is only read here.
+
+  const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;
+
+  n = pFFTStruct->N;
+
+  // order == 1 (n == 2): uses only the real parts pSrc[0] and pSrc[2].
+  if (n == 2) {
+    pDst[0] = (pSrc[0] + pSrc[2]) / 2;
+    pDst[1] = (pSrc[0] - pSrc[2]) / 2;
+    return OMX_Sts_NoErr;
+  }
+
+  n_by_2 = n >> 1;
+  n_by_4 = n >> 2;  // Not read again in this function.
+  buf = pFFTStruct->pBuf1;
+
+  twiddle = pFFTStruct->pTwiddle;
+
+  if (n < 8)  // Pre-processing step writes split-format data into pBuf1.
+    RevbinPermuteInv(in, buf, twiddle, n);
+  else
+    RevbinPermuteInvSse(in, buf, twiddle, n);
+
+  if (n_by_2 < 16) {  // Inverse complex FFT of n/2 points; radix-2 kernel.
+    buf = x86SP_F32_radix2_kernel_OutOfPlace(
+        buf,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        0);
+  } else {  // 16 or more complex points: radix-4 SSE kernel.
+    buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
+        buf,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        0);
+  }
+
+  // Scale the result by 1/n.
+  // This includes the factor of 1/2 deferred from
+  // RevbinPermuteInv/RevbinPermuteInvSse.
+  OMX_F32 factor = 1.0f / n;
+
+  if (n < 8) {  // Scalar path: scale and convert split to interleaved.
+    for (i = 0; i < n_by_2; i++) {
+      pDst[i << 1] = buf[i] * factor;
+      pDst[(i << 1) + 1] = buf[i + n_by_2] * factor;
+    }
+  } else {
+    OMX_F32 *base;
+    OMX_F32 *dst;
+    VC temp0;
+    VC temp1;
+    __m128 mFactor = _mm_load1_ps(&factor);
+
+    // Two things are done in this loop, four complex values per pass:
+    // 1 Scale the result; 2 Change the format from split to interleaved.
+    for (i = 0; i < n_by_2; i += 4) {
+      base = buf + i;
+      dst = pDst + (i << 1);
+      VC_LOAD_SPLIT(&temp0, base, n_by_2);
+      VC_MUL_F(&temp1, &temp0, mFactor);
+      VC_STORE_INTERLEAVE(dst, &temp1);
+    }
+  }
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c
new file mode 100644
index 0000000..6fa21cf
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c
@@ -0,0 +1,36 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n) {
+  // Radix-2 "fs" butterfly: interleaved (re, im) pairs in, split planes out.
+  const OMX_INT half = n >> 1;
+  OMX_INT k;
+
+  for (k = 0; (k << 1) < n; ++k) {
+    const OMX_F32 *lo = in + (k << 1);
+    const OMX_F32 *hi = lo + n;
+    OMX_F32 *sum = out + k;
+    OMX_F32 *diff = sum + half;
+
+    // sum = lo + hi; imaginary parts are stored n floats after the reals.
+    sum[0] = lo[0] + hi[0];
+    sum[n] = lo[1] + hi[1];
+
+    // diff = lo - hi, written n / 2 floats after the sum.
+    diff[0] = lo[0] - hi[0];
+    diff[n] = lo[1] - hi[1];
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c
new file mode 100644
index 0000000..f4d991c
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  // Radix-2 "ls" butterfly (forward) on split data: imaginary parts are read
+  // n floats after the reals, twiddle imaginary parts 2 * n floats after.
+  const OMX_INT half = n >> 1;
+  OMX_INT k;
+
+  for (k = 0; (k << 1) < n; ++k) {
+    const OMX_INT tw_im = n << 1;
+    const OMX_F32 *w = twiddle + (k << 1);
+    const OMX_F32 *even = in + (k << 1);
+    const OMX_F32 *odd = even + 1;
+    OMX_F32 *sum = out + k;
+    OMX_F32 *diff = sum + half;
+    OMX_FC32 prod;
+
+    // prod = w * odd
+    prod.Re = w[0] * odd[0] - w[tw_im] * odd[n];
+    prod.Im = w[0] * odd[n] + w[tw_im] * odd[0];
+
+    // sum = even + prod, diff = even - prod
+    sum[0] = even[0] + prod.Re;
+    sum[n] = even[n] + prod.Im;
+    diff[0] = even[0] - prod.Re;
+    diff[n] = even[n] - prod.Im;
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c
new file mode 100644
index 0000000..a712d96
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c
@@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse(  // Radix-2 "ls" pass, SSE, forward.
+    const OMX_F32 *in,       // split data: imaginary parts are read at in + n
+    OMX_F32 *out,            // the second result leg starts n / 2 floats in
+    const OMX_F32 *twiddle,  // imaginary parts are read at twiddle + 2 * n
+    OMX_INT n) {
+  OMX_F32 *out0 = out;
+  OMX_INT i;
+
+  // The loop consumes 8 floats per plane on every pass, so it needs n >= 8.
+  assert(n >= 8);  // NOTE(review): no <assert.h> include is visible here.
+  if (n < 8) return;  // Keeps the guard when assert() is compiled out.
+
+  for (i = 0; i < n; i += 8) {
+    VC v_tw;    // four twiddles
+    VC v_t0;    // first operand of the add/sub stores
+    VC v_t1;    // operand that gets multiplied by the twiddles
+    VC v_temp;  // result of VC_MUL
+
+    // Gather four twiddles: every second float, imaginary parts 2n further on.
+    const OMX_F32 *tw = twiddle + i;
+    v_tw.real = _mm_set_ps(tw[6], tw[4], tw[2], tw[0]);
+    const OMX_F32 * twi = tw + (n << 1);
+    v_tw.imag = _mm_set_ps(twi[6], twi[4], twi[2], twi[0]);
+
+    // Load real part; the macro presumably de-interleaves 8 floats - confirm.
+    const OMX_F32 *t = in + i;
+    VC_LOAD_SHUFFLE(&(v_t0.real), &(v_t1.real), t);
+
+    // Load imag part from the plane n floats after the real one.
+    t = t + n;
+    VC_LOAD_SHUFFLE(&(v_t0.imag), &(v_t1.imag), t);
+
+    OMX_F32 *out1 = out0 + (n >> 1);
+    VC_MUL(&v_temp, &v_tw, &v_t1);
+
+    VC_SUB_STORE_SPLIT(out1, &v_t0, &v_temp, n);
+
+    VC_ADD_STORE_SPLIT(out0, &v_t0, &v_temp, n);
+
+    out0 += 4;  // four butterflies were produced in this pass
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c
new file mode 100644
index 0000000..3714877
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c
@@ -0,0 +1,50 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num) {
+  // Radix-2 "ms" butterfly (forward) on split data: sub_size groups of
+  // sub_num floats; a group shares one twiddle over sub_num / 2 butterflies.
+  const OMX_INT pairs = sub_num >> 1;
+  OMX_F32 *sum = out;
+  OMX_INT g;
+
+  for (g = 0; g < sub_size; ++g) {
+    const OMX_F32 *w = twiddle + g * sub_num;
+    OMX_INT s;
+    for (s = 0; s < pairs; ++s, ++sum) {
+      const OMX_INT tw_im = n << 1;
+      const OMX_F32 *lo = in + s + g * sub_num;
+      const OMX_F32 *hi = lo + pairs;
+      OMX_F32 *diff = sum + (n >> 1);
+      OMX_FC32 prod;
+
+      // prod = w * hi
+      prod.Re = w[0] * hi[0] - w[tw_im] * hi[n];
+      prod.Im = w[0] * hi[n] + w[tw_im] * hi[0];
+
+      // sum = lo + prod
+      sum[0] = lo[0] + prod.Re;
+      sum[n] = lo[n] + prod.Im;
+
+      // diff = lo - prod, stored n / 2 floats after sum
+      diff[0] = lo[0] - prod.Re;
+      diff[n] = lo[n] - prod.Im;
+    }
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c
new file mode 100644
index 0000000..36a40d8
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c
@@ -0,0 +1,72 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n) {
+  const OMX_INT quarter = n >> 2;
+  const OMX_F32 *src = in;
+  OMX_INT k;
+
+  // De-interleave: reals fill out[0..n), imaginary parts fill out[n..2n).
+  for (k = 0; k < n; ++k, src += 2) {
+    out[k] = src[0];
+    out[k + n] = src[1];
+  }
+
+  // Radix-4 "fs" butterfly, in place on the split copy now held in [out].
+  for (k = 0; k < quarter; ++k) {
+    OMX_F32 *q0 = out + k;
+    OMX_F32 *q1 = q0 + quarter;
+    OMX_F32 *q2 = q1 + quarter;
+    OMX_F32 *q3 = q2 + quarter;
+
+    OMX_FC32 sum02;
+    OMX_FC32 diff02;
+    OMX_FC32 sum13;
+    OMX_FC32 diff13;
+
+    // sum02 = q0 + q2
+    sum02.Re = q0[0] + q2[0];
+    sum02.Im = q0[n] + q2[n];
+
+    // diff02 = q0 - q2
+    diff02.Re = q0[0] - q2[0];
+    diff02.Im = q0[n] - q2[n];
+
+    // sum13 = q1 + q3
+    sum13.Re = q1[0] + q3[0];
+    sum13.Im = q1[n] + q3[n];
+
+    // diff13 = q1 - q3
+    diff13.Re = q1[0] - q3[0];
+    diff13.Im = q1[n] - q3[n];
+
+    // q0 = sum02 + sum13
+    q0[0] = sum02.Re + sum13.Re;
+    q0[n] = sum02.Im + sum13.Im;
+
+    // q2 = sum02 - sum13
+    q2[0] = sum02.Re - sum13.Re;
+    q2[n] = sum02.Im - sum13.Im;
+
+    // q1 = diff02 - j * diff13
+    q1[0] = diff02.Re + diff13.Im;
+    q1[n] = diff02.Im - diff13.Re;
+
+    // q3 = diff02 + j * diff13
+    q3[0] = diff02.Re - diff13.Im;
+    q3[n] = diff02.Im + diff13.Re;
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c
new file mode 100644
index 0000000..58908d3
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c
@@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse(  // Radix-4 "fs" butterfly, SSE.
+    const OMX_F32 *in,   // the four legs are read n / 2 floats apart
+    OMX_F32 *out,        // the four result legs are n / 4 floats apart
+    OMX_INT n) {
+  OMX_INT i;
+  OMX_INT n_by_2 = n >> 1;
+  OMX_INT n_by_4 = n >> 2;
+  OMX_F32 *out0 = out;
+
+  for (i = 0; i < n_by_2; i += 8) {  // 8 input floats per leg, per pass
+    VC v_t0;  // v_t0..v_t3: the four legs filled by VC_LOAD_SHUFFLE
+    VC v_t1;
+    VC v_t2;
+    VC v_t3;
+    VC v_t4;  // v_t4..v_t7: passed from RADIX4_BUTTERFLY_FS to the store macro
+    VC v_t5;
+    VC v_t6;
+    VC v_t7;
+
+    const OMX_F32 *in0 = in + i;
+    const OMX_F32 *in1 = in0 + n_by_2;
+    const OMX_F32 *in2 = in1 + n_by_2;
+    const OMX_F32 *in3 = in2 + n_by_2;
+
+    OMX_F32 *out1 = out0 + n_by_4;
+    OMX_F32 *out2 = out1 + n_by_4;
+    OMX_F32 *out3 = out2 + n_by_4;
+
+    VC_LOAD_SHUFFLE(&(v_t0.real), &(v_t0.imag), in0);  // presumably splits re/im - confirm
+    VC_LOAD_SHUFFLE(&(v_t1.real), &(v_t1.imag), in1);
+    VC_LOAD_SHUFFLE(&(v_t2.real), &(v_t2.imag), in2);
+    VC_LOAD_SHUFFLE(&(v_t3.real), &(v_t3.imag), in3);
+
+    RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
+                        &v_t0, &v_t1, &v_t2, &v_t3);
+
+    RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+                               &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+    out0 += 4;  // each leg advances by four output floats per pass
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c
new file mode 100644
index 0000000..08ab35b
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  // Radix-4 "ls" butterfly (forward) on split data. Imaginary parts are read
+  // n floats after the reals; twiddle imaginary parts 2 * n floats after.
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT half = n >> 1;
+  const OMX_INT tw_im = n << 1;
+  OMX_INT k;
+
+  for (k = 0; (k << 1) < half; ++k) {
+    const OMX_F32 *x0 = in + (k << 2);
+    const OMX_F32 *x1 = x0 + 1;
+    const OMX_F32 *x2 = x1 + 1;
+    const OMX_F32 *x3 = x2 + 1;
+    const OMX_F32 *w1 = twiddle + (k << 1);
+    const OMX_F32 *w2 = w1 + (k << 1);
+    const OMX_F32 *w3 = w2 + (k << 1);
+    OMX_F32 *y0 = out + k;
+    OMX_F32 *y1 = y0 + quarter;
+    OMX_F32 *y2 = y1 + quarter;
+    OMX_F32 *y3 = y2 + quarter;
+    OMX_FC32 p1;
+    OMX_FC32 p2;
+    OMX_FC32 p3;
+    OMX_FC32 sum02;
+    OMX_FC32 diff02;
+    OMX_FC32 sum13;
+    OMX_FC32 diff13;
+
+    // p1 = w1 * x1
+    p1.Re = w1[0] * x1[0] - w1[tw_im] * x1[n];
+    p1.Im = w1[0] * x1[n] + w1[tw_im] * x1[0];
+
+    // p2 = w2 * x2
+    p2.Re = w2[0] * x2[0] - w2[tw_im] * x2[n];
+    p2.Im = w2[0] * x2[n] + w2[tw_im] * x2[0];
+
+    // p3 = w3 * x3
+    p3.Re = w3[0] * x3[0] - w3[tw_im] * x3[n];
+    p3.Im = w3[0] * x3[n] + w3[tw_im] * x3[0];
+
+    // sum02 = x0 + p2
+    sum02.Re = x0[0] + p2.Re;
+    sum02.Im = x0[n] + p2.Im;
+
+    // diff02 = x0 - p2
+    diff02.Re = x0[0] - p2.Re;
+    diff02.Im = x0[n] - p2.Im;
+
+    // sum13 = p1 + p3
+    sum13.Re = p1.Re + p3.Re;
+    sum13.Im = p1.Im + p3.Im;
+
+    // diff13 = p1 - p3
+    diff13.Re = p1.Re - p3.Re;
+    diff13.Im = p1.Im - p3.Im;
+
+    // y0 = sum02 + sum13
+    y0[0] = sum02.Re + sum13.Re;
+    y0[n] = sum02.Im + sum13.Im;
+
+    // y2 = sum02 - sum13
+    y2[0] = sum02.Re - sum13.Re;
+    y2[n] = sum02.Im - sum13.Im;
+
+    // y1 = diff02 - j * diff13
+    y1[0] = diff02.Re + diff13.Im;
+    y1[n] = diff02.Im - diff13.Re;
+
+    // y3 = diff02 + j * diff13
+    y3[0] = diff02.Re - diff13.Im;
+    y3[n] = diff02.Im + diff13.Re;
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c
new file mode 100644
index 0000000..4fc3427
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c
@@ -0,0 +1,81 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse(  // Radix-4 "ls" pass, SSE, forward.
+    const OMX_F32 *in,       // 16 floats are addressed per pass (in0..in3)
+    OMX_F32 *out,            // the four result legs are n / 4 floats apart
+    const OMX_F32 *twiddle,  // imaginary parts are read 2 * n floats further on
+    OMX_INT n) {
+  OMX_INT n_by_2 = n >> 1;
+  OMX_INT n_by_4 = n >> 2;
+  OMX_INT n_mul_2 = n << 1;  // offset from a twiddle's real to its imaginary part
+  OMX_INT i;
+
+  OMX_F32 *out0 = out;
+
+  for (i = 0; i < n_by_2; i += 8) {
+    const OMX_F32 *tw1 = twiddle + i;  // tw1, tw2, tw3 start at i, 2i and 3i
+    const OMX_F32 *tw2 = tw1 + i;
+    const OMX_F32 *tw3 = tw2 + i;
+    const OMX_F32 *in0 = in + (i << 1);  // four consecutive rows of 4 floats
+    const OMX_F32 *in1 = in0 + 4;
+    const OMX_F32 *in2 = in1 + 4;
+    const OMX_F32 *in3 = in2 + 4;
+    OMX_F32 *out1 = out0 + n_by_4;
+    OMX_F32 *out2 = out1 + n_by_4;
+    OMX_F32 *out3 = out2 + n_by_4;
+
+    VC v_tw1;  // v_tw1..v_tw3: twiddles for the three multiplied legs
+    VC v_tw2;
+    VC v_tw3;
+    VC v_t0;  // v_t0..v_t3: inputs filled by VC_LOAD_MATRIX_TRANSPOSE
+    VC v_t1;
+    VC v_t2;
+    VC v_t3;
+    VC v_t4;  // v_t4..v_t7: passed from the butterfly to the store macro
+    VC v_t5;
+    VC v_t6;
+    VC v_t7;
+
+    v_tw1.real = _mm_set_ps(tw1[6], tw1[4], tw1[2], tw1[0]);  // lane j: tw1[2j]
+    v_tw1.imag = _mm_set_ps(
+        tw1[6 + n_mul_2],
+        tw1[4 + n_mul_2],
+        tw1[2 + n_mul_2],
+        tw1[n_mul_2]);
+    v_tw2.real = _mm_set_ps(tw2[12], tw2[8], tw2[4], tw2[0]);  // lane j: tw2[4j]
+    v_tw2.imag = _mm_set_ps(
+        tw2[12 + n_mul_2],
+        tw2[8 + n_mul_2],
+        tw2[4 + n_mul_2],
+        tw2[n_mul_2]);
+    v_tw3.real = _mm_set_ps(tw3[18], tw3[12], tw3[6], tw3[0]);  // lane j: tw3[6j]
+    v_tw3.imag = _mm_set_ps(
+        tw3[18 + n_mul_2],
+        tw3[12 + n_mul_2],
+        tw3[6 + n_mul_2],
+        tw3[n_mul_2]);
+
+    VC_LOAD_MATRIX_TRANSPOSE(&v_t0, &v_t1, &v_t2, &v_t3, in0, in1, in2, in3, n);  // presumably a 4x4 transpose - confirm
+
+    RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
+                         &v_tw1, &v_tw2, &v_tw3,
+                         &v_t0, &v_t1, &v_t2, &v_t3);
+
+    RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+                               &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+    out0 += 4;  // each leg advances by four output floats per pass
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c
new file mode 100644
index 0000000..de2a1be
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c
@@ -0,0 +1,149 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num) {
+  // Radix-4 "ms" butterfly (forward) on split data: sub_size groups of sub_num
+  // floats, each yielding sub_num / 4 butterflies; imaginary parts are at +n.
+  const OMX_INT quads = sub_num >> 2;
+  const OMX_INT tw_step = sub_num >> 1;
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT tw_im = n << 1;
+  OMX_F32 *y0 = out;
+  OMX_INT g;
+  OMX_INT s;
+
+  // Group 0 is handled without any twiddle multiplies.
+  for (s = 0; s < quads; ++s, ++y0) {
+    const OMX_F32 *x0 = in + s;
+    const OMX_F32 *x1 = x0 + quads;
+    const OMX_F32 *x2 = x1 + quads;
+    const OMX_F32 *x3 = x2 + quads;
+    OMX_F32 *y1 = y0 + quarter;
+    OMX_F32 *y2 = y1 + quarter;
+    OMX_F32 *y3 = y2 + quarter;
+
+    OMX_FC32 sum02;
+    OMX_FC32 diff02;
+    OMX_FC32 sum13;
+    OMX_FC32 diff13;
+
+    // sum02 = x0 + x2
+    sum02.Re = x0[0] + x2[0];
+    sum02.Im = x0[n] + x2[n];
+
+    // diff02 = x0 - x2
+    diff02.Re = x0[0] - x2[0];
+    diff02.Im = x0[n] - x2[n];
+
+    // sum13 = x1 + x3
+    sum13.Re = x1[0] + x3[0];
+    sum13.Im = x1[n] + x3[n];
+
+    // diff13 = x1 - x3
+    diff13.Re = x1[0] - x3[0];
+    diff13.Im = x1[n] - x3[n];
+
+    // y0 = sum02 + sum13
+    y0[0] = sum02.Re + sum13.Re;
+    y0[n] = sum02.Im + sum13.Im;
+
+    // y2 = sum02 - sum13
+    y2[0] = sum02.Re - sum13.Re;
+    y2[n] = sum02.Im - sum13.Im;
+
+    // y3 = diff02 + j * diff13
+    y3[0] = diff02.Re - diff13.Im;
+    y3[n] = diff02.Im + diff13.Re;
+
+    // y1 = diff02 - j * diff13
+    y1[0] = diff02.Re + diff13.Im;
+    y1[n] = diff02.Im - diff13.Re;
+  }
+
+  // Remaining groups: group g multiplies legs 1, 2 and 3 by the twiddles
+  // found g * tw_step, 2 * g * tw_step and 3 * g * tw_step floats into the
+  // table, then runs the same butterfly on the products.
+  for (g = 1; g < sub_size; ++g) {
+    const OMX_F32 *w1 = twiddle + g * tw_step;
+    const OMX_F32 *w2 = w1 + g * tw_step;
+    const OMX_F32 *w3 = w2 + g * tw_step;
+
+    for (s = 0; s < quads; ++s, ++y0) {
+      const OMX_F32 *x0 = in + s + g * sub_num;
+      const OMX_F32 *x1 = x0 + quads;
+      const OMX_F32 *x2 = x1 + quads;
+      const OMX_F32 *x3 = x2 + quads;
+      OMX_F32 *y1 = y0 + quarter;
+      OMX_F32 *y2 = y1 + quarter;
+      OMX_F32 *y3 = y2 + quarter;
+
+      OMX_FC32 p1;
+      OMX_FC32 p2;
+      OMX_FC32 p3;
+      OMX_FC32 sum02;
+      OMX_FC32 diff02;
+      OMX_FC32 sum13;
+      OMX_FC32 diff13;
+
+      // p1 = w1 * x1
+      p1.Re = w1[0] * x1[0] - w1[tw_im] * x1[n];
+      p1.Im = w1[0] * x1[n] + w1[tw_im] * x1[0];
+
+      // p2 = w2 * x2
+      p2.Re = w2[0] * x2[0] - w2[tw_im] * x2[n];
+      p2.Im = w2[0] * x2[n] + w2[tw_im] * x2[0];
+
+      // p3 = w3 * x3
+      p3.Re = w3[0] * x3[0] - w3[tw_im] * x3[n];
+      p3.Im = w3[0] * x3[n] + w3[tw_im] * x3[0];
+
+      // sum02 = x0 + p2
+      sum02.Re = x0[0] + p2.Re;
+      sum02.Im = x0[n] + p2.Im;
+
+      // diff02 = x0 - p2
+      diff02.Re = x0[0] - p2.Re;
+      diff02.Im = x0[n] - p2.Im;
+
+      // sum13 = p1 + p3
+      sum13.Re = p1.Re + p3.Re;
+      sum13.Im = p1.Im + p3.Im;
+
+      // diff13 = p1 - p3
+      diff13.Re = p1.Re - p3.Re;
+      diff13.Im = p1.Im - p3.Im;
+
+      // y0 = sum02 + sum13
+      y0[0] = sum02.Re + sum13.Re;
+      y0[n] = sum02.Im + sum13.Im;
+
+      // y2 = sum02 - sum13
+      y2[0] = sum02.Re - sum13.Re;
+      y2[n] = sum02.Im - sum13.Im;
+
+      // y1 = diff02 - j * diff13
+      y1[0] = diff02.Re + diff13.Im;
+      y1[n] = diff02.Im - diff13.Re;
+
+      // y3 = diff02 + j * diff13
+      y3[0] = diff02.Re - diff13.Im;
+      y3[n] = diff02.Im + diff13.Re;
+    }
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c
new file mode 100644
index 0000000..286f842
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c
@@ -0,0 +1,215 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+// Handles set_count == 2, where a group holds only two butterflies and so
+// cannot fill a 4-lane SSE vector; two adjacent groups are paired per pass.
+static void InternalUnroll2Fwd(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_F32 *twiddle,  // NOTE(review): only read here; could be const
+    OMX_INT n) {
+  OMX_INT i;
+  OMX_INT n_by_2 = n >> 1;
+  OMX_INT n_by_4 = n >> 2;
+  OMX_INT n_mul_2 = n << 1;  // offset from a twiddle's real to its imaginary part
+  OMX_F32 *out0 = out;
+
+  for (i = 0; i < n_by_2; i += 8) {
+    const OMX_F32 *tw1  = twiddle + i;  // twiddles for the first group
+    const OMX_F32 *tw2  = tw1 + i;
+    const OMX_F32 *tw3  = tw2 + i;
+    const OMX_F32 *tw1e = tw1 + 4;      // twiddles for the second group
+    const OMX_F32 *tw2e = tw2 + 8;
+    const OMX_F32 *tw3e = tw3 + 12;
+
+    VC v_tw1;
+    VC v_tw2;
+    VC v_tw3;
+    VC v_t0;
+    VC v_t1;
+    VC v_t2;
+    VC v_t3;
+    VC v_t4;
+    VC v_t5;
+    VC v_t6;
+    VC v_t7;
+
+    v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1),  // lanes: tw1, tw1, tw1e, tw1e
+                                _mm_load_ss(tw1e),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+    v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2),
+                                _mm_load_ss(tw1e + n_mul_2),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+    v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2),  // lanes: tw2, tw2, tw2e, tw2e
+                                _mm_load_ss(tw2e),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+    v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2),
+                                _mm_load_ss(tw2e + n_mul_2),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+    v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3),  // lanes: tw3, tw3, tw3e, tw3e
+                                _mm_load_ss(tw3e),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+    v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2),
+                                _mm_load_ss(tw3e + n_mul_2),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+
+    __m128 xmm0;
+    __m128 xmm1;
+    __m128 xmm2;
+    __m128 xmm3;
+    __m128 xmm4;
+    __m128 xmm5;
+    __m128 xmm6;
+    __m128 xmm7;
+
+    const OMX_F32 *in0 = in + (i << 1);  // 16 real floats are read per pass
+    xmm0 = _mm_load_ps(in0);  // _mm_load_ps needs 16-byte aligned addresses
+    xmm1 = _mm_load_ps(in0 + 4);
+    xmm2 = _mm_load_ps(in0 + 8);
+    xmm3 = _mm_load_ps(in0 + 12);
+    v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0));  // in0[0, 1, 8, 9]
+    v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2));  // in0[2, 3, 10, 11]
+    v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0));  // in0[4, 5, 12, 13]
+    v_t3.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(3, 2, 3, 2));  // in0[6, 7, 14, 15]
+
+    xmm4 = _mm_load_ps(in0 + n);  // same pattern on the plane n floats later
+    xmm5 = _mm_load_ps(in0 + n + 4);
+    xmm6 = _mm_load_ps(in0 + n + 8);
+    xmm7 = _mm_load_ps(in0 + n + 12);
+    v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
+    v_t1.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
+    v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
+    v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
+
+    OMX_F32 *out1 = out0 + n_by_4;
+    OMX_F32 *out2 = out1 + n_by_4;
+    OMX_F32 *out3 = out2 + n_by_4;
+
+    RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
+                         &v_tw1, &v_tw2,
+                         &v_tw3, &v_t0, &v_t1, &v_t2, &v_t3);
+
+    RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+                               &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+    out0 += 4;  // each leg advances by four output floats per pass
+  }
+}
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse(  // Radix-4 "ms" pass, SSE, forward.
+    const OMX_F32 *in,  // split data: VC_LOAD_SPLIT is given the offset n
+    OMX_F32 *out,       // the four result legs are n / 4 floats apart
+    OMX_F32 *twiddle,   // NOTE(review): only read here; could be const
+    OMX_INT n,
+    OMX_INT sub_size,   // number of groups
+    OMX_INT sub_num) {  // floats per group in each plane
+  OMX_INT set;
+  OMX_INT grp;
+  OMX_INT step = sub_num >> 1;       // twiddle stride between groups
+  OMX_INT set_count = sub_num >> 2;  // butterflies per group
+  OMX_INT n_by_4 = n >> 2;
+  OMX_INT n_mul_2 = n << 1;  // offset from a twiddle's real to its imaginary part
+
+  OMX_F32 *out0 = out;
+
+  if (set_count == 2) {  // too few butterflies per group for the 4-wide loops
+    InternalUnroll2Fwd(in, out, twiddle, n);
+    return;
+  }
+
+  // grp == 0: no twiddles are loaded and RADIX4_BUTTERFLY_FS is used instead.
+  for (set = 0; set < set_count; set += 4) {  // assumes set_count % 4 == 0 - TODO confirm
+    const OMX_F32 * in0 = in + set;
+    const OMX_F32 *in1 = in0 + set_count;
+    const OMX_F32 *in2 = in1 + set_count;
+    const OMX_F32 *in3 = in2 + set_count;
+
+    VC v_t0;
+    VC v_t1;
+    VC v_t2;
+    VC v_t3;
+    VC v_t4;
+    VC v_t5;
+    VC v_t6;
+    VC v_t7;
+
+    VC_LOAD_SPLIT(&v_t0, in0, n);
+    VC_LOAD_SPLIT(&v_t1, in1, n);
+    VC_LOAD_SPLIT(&v_t2, in2, n);
+    VC_LOAD_SPLIT(&v_t3, in3, n);
+
+    OMX_F32 *out1 = out0 + n_by_4;
+    OMX_F32 *out2 = out1 + n_by_4;
+    OMX_F32 *out3 = out2 + n_by_4;
+
+    RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
+                        &v_t0, &v_t1, &v_t2, &v_t3);
+
+    RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+                               &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+    out0 += 4;
+  }
+
+  for (grp = 1; grp < sub_size; ++grp) {  // groups that need twiddles
+    const OMX_F32 *tw1 = twiddle + grp * step;  // offsets: 1x, 2x, 3x grp * step
+    const OMX_F32 *tw2 = tw1 + grp * step;
+    const OMX_F32 *tw3 = tw2 + grp * step;
+
+    VC v_tw1;
+    VC v_tw2;
+    VC v_tw3;
+
+    v_tw1.real = _mm_load1_ps(tw1);  // one twiddle broadcast to all four lanes
+    v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2);
+    v_tw2.real = _mm_load1_ps(tw2);
+    v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2);
+    v_tw3.real = _mm_load1_ps(tw3);
+    v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2);
+
+    for (set = 0; set < set_count; set += 4) {
+      const OMX_F32 *in0 = in + set + grp * sub_num;  // start of group grp
+      const OMX_F32 *in1 = in0 + set_count;
+      const OMX_F32 *in2 = in1 + set_count;
+      const OMX_F32 *in3 = in2 + set_count;
+
+      VC v_t0;
+      VC v_t1;
+      VC v_t2;
+      VC v_t3;
+      VC v_t4;
+      VC v_t5;
+      VC v_t6;
+      VC v_t7;
+
+      VC_LOAD_SPLIT(&v_t0, in0, n);
+      VC_LOAD_SPLIT(&v_t1, in1, n);
+      VC_LOAD_SPLIT(&v_t2, in2, n);
+      VC_LOAD_SPLIT(&v_t3, in3, n);
+
+      OMX_F32 *out1 = out0 + n_by_4;
+      OMX_F32 *out2 = out1 + n_by_4;
+      OMX_F32 *out3 = out2 + n_by_4;
+
+      RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
+                           &v_tw1, &v_tw2, &v_tw3,
+                           &v_t0, &v_t1, &v_t2, &v_t3);
+
+      RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+                                 &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+      out0 += 4;  // output advances continuously across sets and groups
+    }
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c
new file mode 100644
index 0000000..9f17d61
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n) {
+  // Radix-2 "fs" butterfly (inverse): split-format input and output, with the
+  // imaginary parts n floats after the reals and the two legs n / 2 apart.
+  const OMX_INT half = n >> 1;
+  OMX_INT k;
+
+  for (k = 0; k < half; ++k) {
+    const OMX_F32 *lo = in + k;
+    const OMX_F32 *hi = lo + half;
+    OMX_F32 *sum = out + k;
+    OMX_F32 *diff = sum + half;
+
+    // sum = lo + hi
+    sum[0] = lo[0] + hi[0];
+    sum[n] = lo[n] + hi[n];
+
+    // diff = lo - hi
+    diff[0] = lo[0] - hi[0];
+    diff[n] = lo[n] - hi[n];
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c
new file mode 100644
index 0000000..ec545c5
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  // Radix-2 "ls" butterfly (inverse) on split data: same layout as the
+  // forward pass, but the twiddle is conjugated before the multiply.
+  const OMX_INT half = n >> 1;
+  OMX_INT k;
+
+  for (k = 0; (k << 1) < n; ++k) {
+    const OMX_INT tw_im = n << 1;
+    const OMX_F32 *w = twiddle + (k << 1);
+    const OMX_F32 *even = in + (k << 1);
+    const OMX_F32 *odd = even + 1;
+    OMX_F32 *sum = out + k;
+    OMX_F32 *diff = sum + half;
+    OMX_FC32 prod;
+
+    // prod = conj(w) * odd
+    prod.Re = w[0] * odd[0] + w[tw_im] * odd[n];
+    prod.Im = w[0] * odd[n] - w[tw_im] * odd[0];
+
+    // sum = even + prod, diff = even - prod
+    sum[0] = even[0] + prod.Re;
+    sum[n] = even[n] + prod.Im;
+    diff[0] = even[0] - prod.Re;
+    diff[n] = even[n] - prod.Im;
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c
new file mode 100644
index 0000000..abad0cc
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse(  // Radix-2 "ls" pass, SSE, inverse.
+    const OMX_F32 *in,       // split data: imaginary parts are read at in + n
+    OMX_F32 *out,            // the second result leg starts n / 2 floats in
+    const OMX_F32 *twiddle,  // imaginary parts are read at twiddle + 2 * n
+    OMX_INT n) {
+  OMX_F32 *out0 = out;
+  OMX_INT i;
+
+  // Each pass consumes 8 floats per plane; a smaller n would overrun them.
+  if (n < 8) return;
+
+  for (i = 0; i < n; i += 8) {
+    VC v_tw;    // four twiddles
+    VC v_t0;    // first operand of the add/sub stores
+    VC v_t1;    // operand multiplied by the conjugated twiddles
+    VC v_temp;  // result of VC_CONJ_MUL
+
+    // Gather four twiddles: every second float, imaginary parts 2n further on.
+    const OMX_F32 *tw = twiddle + i;
+    v_tw.real = _mm_set_ps(tw[6], tw[4], tw[2], tw[0]);
+    const OMX_F32 *twi = tw + (n << 1);
+    v_tw.imag = _mm_set_ps(twi[6], twi[4], twi[2], twi[0]);
+
+    // Load real part; the macro presumably de-interleaves 8 floats - confirm.
+    const OMX_F32 *t = in + i;
+    VC_LOAD_SHUFFLE(&(v_t0.real), &(v_t1.real), t);
+
+    // Load imag part from the plane n floats after the real one.
+    t = t + n;
+    VC_LOAD_SHUFFLE(&(v_t0.imag), &(v_t1.imag), t);
+
+    OMX_F32 *out1 = out0 + (n >> 1);
+    VC_CONJ_MUL(&v_temp, &v_tw, &v_t1);
+    VC_SUB_STORE_SPLIT(out1, &v_t0, &v_temp, n);
+    VC_ADD_STORE_SPLIT(out0, &v_t0, &v_temp, n);
+    out0 += 4;  // four butterflies were produced in this pass
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c
new file mode 100644
index 0000000..78bc9eb
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c
@@ -0,0 +1,50 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num) {
+  // Radix-2 "ms" butterfly (inverse) on split data: sub_size groups of
+  // sub_num floats; a group shares one conjugated twiddle.
+  const OMX_INT pairs = sub_num >> 1;
+  OMX_F32 *sum = out;
+  OMX_INT g;
+
+  for (g = 0; g < sub_size; ++g) {
+    const OMX_F32 *w = twiddle + g * sub_num;
+    OMX_INT s;
+    for (s = 0; s < pairs; ++s, ++sum) {
+      const OMX_INT tw_im = n << 1;
+      const OMX_F32 *lo = in + s + g * sub_num;
+      const OMX_F32 *hi = lo + pairs;
+      OMX_F32 *diff = sum + (n >> 1);
+      OMX_FC32 prod;
+
+      // prod = conj(w) * hi
+      prod.Re = w[0] * hi[0] + w[tw_im] * hi[n];
+      prod.Im = w[0] * hi[n] - w[tw_im] * hi[0];
+
+      // sum = lo + prod
+      sum[0] = lo[0] + prod.Re;
+      sum[n] = lo[n] + prod.Im;
+
+      // diff = lo - prod, stored n / 2 floats after sum
+      diff[0] = lo[0] - prod.Re;
+      diff[n] = lo[n] - prod.Im;
+    }
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c
new file mode 100644
index 0000000..bb80fa3
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c
@@ -0,0 +1,70 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// First stage of the inverse radix-4 FFT on split-format data: the real part
+// of element k is read and written at [k], its imaginary part at [k + n].
+// Every pass combines four inputs spaced a quarter of n apart; this stage
+// performs no twiddle multiply.
+void x86SP_FFT_CToC_FC32_Inv_Radix4_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n) {
+  const OMX_INT quarter = n >> 2;
+  OMX_INT k;
+
+  for (k = 0; k < quarter; ++k) {
+    // Input legs (x) and output legs (y) of this butterfly.
+    const OMX_F32 *x0 = in + k;
+    const OMX_F32 *x1 = x0 + quarter;
+    const OMX_F32 *x2 = x1 + quarter;
+    const OMX_F32 *x3 = x2 + quarter;
+
+    OMX_F32 *y0 = out + k;
+    OMX_F32 *y1 = y0 + quarter;
+    OMX_F32 *y2 = y1 + quarter;
+    OMX_F32 *y3 = y2 + quarter;
+
+    OMX_FC32 sum02;
+    OMX_FC32 diff02;
+    OMX_FC32 sum13;
+    OMX_FC32 diff13;
+
+    // Real parts of the pairwise sums and differences.
+    sum02.Re = x0[0] + x2[0];
+    diff02.Re = x0[0] - x2[0];
+    sum13.Re = x1[0] + x3[0];
+    diff13.Re = x1[0] - x3[0];
+
+    // Imaginary parts of the pairwise sums and differences.
+    sum02.Im = x0[n] + x2[n];
+    diff02.Im = x0[n] - x2[n];
+    sum13.Im = x1[n] + x3[n];
+    diff13.Im = x1[n] - x3[n];
+
+    // y0 = sum02 + sum13
+    y0[0] = sum02.Re + sum13.Re;
+    y0[n] = sum02.Im + sum13.Im;
+
+    // y2 = sum02 - sum13
+    y2[0] = sum02.Re - sum13.Re;
+    y2[n] = sum02.Im - sum13.Im;
+
+    // y1 = diff02 + j * diff13
+    y1[0] = diff02.Re - diff13.Im;
+    y1[n] = diff02.Im + diff13.Re;
+
+    // y3 = diff02 - j * diff13
+    y3[0] = diff02.Re + diff13.Im;
+    y3[n] = diff02.Im - diff13.Re;
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c
new file mode 100644
index 0000000..c3921bc
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(  // inverse radix-4 first stage, SSE
+    const OMX_F32 *in,   // stage input
+    OMX_F32 *out,        // stage output
+    OMX_INT n) {         // passed to the load/store helpers as their offset
+  OMX_INT i;
+  OMX_INT n_by_4 = n >> 2;  // distance between the four legs of a butterfly
+  OMX_F32 *out0 = out;      // first output leg; advances 4 floats per pass
+
+  for (i = 0; i < n_by_4; i += 4) {  // i advances in step with out0
+    VC v_t0;  // v_t0..v_t3 receive the four input legs
+    VC v_t1;
+    VC v_t2;
+    VC v_t3;
+    VC v_t4;  // v_t4..v_t7 carry the butterfly results to the store helper
+    VC v_t5;
+    VC v_t6;
+    VC v_t7;
+
+    const OMX_F32 *in0 = in + i;
+    const OMX_F32 *in1 = in0 + n_by_4;
+    const OMX_F32 *in2 = in1 + n_by_4;
+    const OMX_F32 *in3 = in2 + n_by_4;
+
+    OMX_F32 *out1 = out0 + n_by_4;
+    OMX_F32 *out2 = out1 + n_by_4;
+    OMX_F32 *out3 = out2 + n_by_4;
+
+    VC_LOAD_SPLIT(&v_t0, in0, n);  // helper defined elsewhere; presumably it
+    VC_LOAD_SPLIT(&v_t1, in1, n);  // loads 4 real parts at inK and 4 imaginary
+    VC_LOAD_SPLIT(&v_t2, in2, n);  // parts at inK + n -- TODO confirm
+    VC_LOAD_SPLIT(&v_t3, in3, n);
+
+    RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,  // no twiddle arguments
+                        &v_t0, &v_t1, &v_t2, &v_t3);
+
+    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,  // given all four legs
+                               &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+    out0 += 4;
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c
new file mode 100644
index 0000000..705d9cb
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// Last stage of the inverse radix-4 FFT on split-format data (real part at
+// [k], imaginary part at [k + n]; twiddle imaginary part at [k + 2 * n]).
+// Every pass reads four consecutive inputs, multiplies inputs 1..3 by the
+// conjugate of their twiddle, and writes four outputs a quarter of n apart.
+void x86SP_FFT_CToC_FC32_Inv_Radix4_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  const OMX_INT half = n >> 1;
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT tw_imag = n << 1;
+  OMX_F32 *y0 = out;
+  OMX_INT step;
+
+  for (step = 0; step < half; step += 2, ++y0) {
+    // The three twiddles sit at offsets step, 2 * step and 3 * step.
+    const OMX_F32 *w1 = twiddle + step;
+    const OMX_F32 *w2 = w1 + step;
+    const OMX_F32 *w3 = w2 + step;
+    const OMX_F32 *x0 = in + (step << 1);
+    const OMX_F32 *x1 = x0 + 1;
+    const OMX_F32 *x2 = x1 + 1;
+    const OMX_F32 *x3 = x2 + 1;
+    OMX_F32 *y1 = y0 + quarter;
+    OMX_F32 *y2 = y1 + quarter;
+    OMX_F32 *y3 = y2 + quarter;
+
+    OMX_FC32 r1;
+    OMX_FC32 r2;
+    OMX_FC32 r3;
+    OMX_FC32 sum02;
+    OMX_FC32 diff02;
+    OMX_FC32 sum13;
+    OMX_FC32 diff13;
+
+    // r1 = conj(w1) * x1
+    r1.Re = w1[0] * x1[0] + w1[tw_imag] * x1[n];
+    r1.Im = w1[0] * x1[n] - w1[tw_imag] * x1[0];
+
+    // r2 = conj(w2) * x2
+    r2.Re = w2[0] * x2[0] + w2[tw_imag] * x2[n];
+    r2.Im = w2[0] * x2[n] - w2[tw_imag] * x2[0];
+
+    // r3 = conj(w3) * x3
+    r3.Re = w3[0] * x3[0] + w3[tw_imag] * x3[n];
+    r3.Im = w3[0] * x3[n] - w3[tw_imag] * x3[0];
+
+    // sum02 = x0 + r2, diff02 = x0 - r2
+    sum02.Re = x0[0] + r2.Re;
+    sum02.Im = x0[n] + r2.Im;
+    diff02.Re = x0[0] - r2.Re;
+    diff02.Im = x0[n] - r2.Im;
+
+    // sum13 = r1 + r3, diff13 = r1 - r3
+    sum13.Re = r1.Re + r3.Re;
+    sum13.Im = r1.Im + r3.Im;
+    diff13.Re = r1.Re - r3.Re;
+    diff13.Im = r1.Im - r3.Im;
+
+    // y0 = sum02 + sum13
+    y0[0] = sum02.Re + sum13.Re;
+    y0[n] = sum02.Im + sum13.Im;
+
+    // y2 = sum02 - sum13
+    y2[0] = sum02.Re - sum13.Re;
+    y2[n] = sum02.Im - sum13.Im;
+
+    // y1 = diff02 + j * diff13
+    y1[0] = diff02.Re - diff13.Im;
+    y1[n] = diff02.Im + diff13.Re;
+
+    // y3 = diff02 - j * diff13
+    y3[0] = diff02.Re + diff13.Im;
+    y3[n] = diff02.Im - diff13.Re;
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c
new file mode 100644
index 0000000..2e245fa
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c
@@ -0,0 +1,81 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(  // inverse radix-4 last stage, SSE
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  OMX_INT n_by_2 = n >> 1;   // loop bound for i
+  OMX_INT n_by_4 = n >> 2;   // distance between the four output legs
+  OMX_INT n_mul_2 = n << 1;  // offset from a twiddle real to its imaginary part
+  OMX_INT i;
+
+  OMX_F32 *out0 = out;       // first output leg; advances 4 floats per pass
+
+  for (i = 0; i < n_by_2; i += 8) {
+    const OMX_F32 *tw1 = twiddle + i;  // tw1/tw2/tw3 start at offsets i, 2i, 3i
+    const OMX_F32 *tw2 = tw1 + i;
+    const OMX_F32 *tw3 = tw2 + i;
+    const OMX_F32 *in0 = in + (i << 1);  // in0..in3 are spaced 4 floats apart
+    const OMX_F32 *in1 = in0 + 4;
+    const OMX_F32 *in2 = in1 + 4;
+    const OMX_F32 *in3 = in2 + 4;
+    OMX_F32 *out1 = out0 + n_by_4;
+    OMX_F32 *out2 = out1 + n_by_4;
+    OMX_F32 *out3 = out2 + n_by_4;
+
+    VC v_tw1;  // twiddles for legs 1..3, one butterfly per lane
+    VC v_tw2;
+    VC v_tw3;
+    VC v_t0;   // v_t0..v_t3: inputs filled by the transpose-load helper
+    VC v_t1;
+    VC v_t2;
+    VC v_t3;
+    VC v_t4;   // v_t4..v_t7: butterfly results handed to the store helper
+    VC v_t5;
+    VC v_t6;
+    VC v_t7;
+
+    v_tw1.real = _mm_set_ps(tw1[6], tw1[4], tw1[2], tw1[0]);  // lane k = tw1[2k]
+    v_tw1.imag = _mm_set_ps(
+        tw1[6 + n_mul_2],
+        tw1[4 + n_mul_2],
+        tw1[2 + n_mul_2],
+        tw1[n_mul_2]);
+    v_tw2.real = _mm_set_ps(tw2[12], tw2[8], tw2[4], tw2[0]);  // lane k = tw2[4k]
+    v_tw2.imag = _mm_set_ps(
+        tw2[12 + n_mul_2],
+        tw2[8 + n_mul_2],
+        tw2[4 + n_mul_2],
+        tw2[n_mul_2]);
+    v_tw3.real = _mm_set_ps(tw3[18], tw3[12], tw3[6], tw3[0]);  // lane k = tw3[6k]
+    v_tw3.imag = _mm_set_ps(
+        tw3[18 + n_mul_2],
+        tw3[12 + n_mul_2],
+        tw3[6 + n_mul_2],
+        tw3[n_mul_2]);
+
+    VC_LOAD_MATRIX_TRANSPOSE(&v_t0, &v_t1, &v_t2, &v_t3, in0, in1, in2, in3, n);
+
+    RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,  // helper defined elsewhere
+                         &v_tw1, &v_tw2, &v_tw3,
+                         &v_t0, &v_t1, &v_t2, &v_t3);
+
+    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,  // given all four legs
+                               &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+    out0 += 4;
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c
new file mode 100644
index 0000000..499036b
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c
@@ -0,0 +1,149 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// One middle stage of the inverse radix-4 FFT on split-format data (real part
+// at [k], imaginary part at [k + n]; twiddle imaginary part at [k + 2 * n]).
+// The stage has sub_size groups of sub_num / 4 butterflies. Group 0 is done
+// without any twiddle multiply; later groups use one twiddle triple each.
+void x86SP_FFT_CToC_FC32_Inv_Radix4_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num) {
+  const OMX_INT tw_stride = sub_num >> 1;
+  const OMX_INT legs = sub_num >> 2;
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT tw_imag = n << 1;
+  OMX_F32 *y0 = out;
+  OMX_INT group;
+  OMX_INT k;
+
+  // Group 0: plain butterflies, no twiddle multiply.
+  for (k = 0; k < legs; ++k, ++y0) {
+    const OMX_F32 *x0 = in + k;
+    const OMX_F32 *x1 = x0 + legs;
+    const OMX_F32 *x2 = x1 + legs;
+    const OMX_F32 *x3 = x2 + legs;
+
+    OMX_F32 *y1 = y0 + quarter;
+    OMX_F32 *y2 = y1 + quarter;
+    OMX_F32 *y3 = y2 + quarter;
+    OMX_FC32 sum02;
+    OMX_FC32 diff02;
+    OMX_FC32 sum13;
+    OMX_FC32 diff13;
+
+    // sum02 = x0 + x2
+    sum02.Re = x0[0] + x2[0];
+    sum02.Im = x0[n] + x2[n];
+
+    // diff02 = x0 - x2
+    diff02.Re = x0[0] - x2[0];
+    diff02.Im = x0[n] - x2[n];
+
+    // sum13 = x1 + x3
+    sum13.Re = x1[0] + x3[0];
+    sum13.Im = x1[n] + x3[n];
+
+    // diff13 = x1 - x3
+    diff13.Re = x1[0] - x3[0];
+    diff13.Im = x1[n] - x3[n];
+
+    // y0 = sum02 + sum13
+    y0[0] = sum02.Re + sum13.Re;
+    y0[n] = sum02.Im + sum13.Im;
+
+    // y2 = sum02 - sum13
+    y2[0] = sum02.Re - sum13.Re;
+    y2[n] = sum02.Im - sum13.Im;
+
+    // y1 = diff02 + j * diff13
+    y1[0] = diff02.Re - diff13.Im;
+    y1[n] = diff02.Im + diff13.Re;
+
+    // y3 = diff02 - j * diff13
+    y3[0] = diff02.Re + diff13.Im;
+    y3[n] = diff02.Im - diff13.Re;
+  }
+
+  // Groups 1 .. sub_size - 1: legs 1..3 are first multiplied by conj(twiddle).
+  for (group = 1; group < sub_size; ++group) {
+    const OMX_F32 *w1 = twiddle + group * tw_stride;
+    const OMX_F32 *w2 = w1 + group * tw_stride;
+    const OMX_F32 *w3 = w2 + group * tw_stride;
+
+    for (k = 0; k < legs; ++k, ++y0) {
+      const OMX_F32 *x0 = in + k + group * sub_num;
+      const OMX_F32 *x1 = x0 + legs;
+      const OMX_F32 *x2 = x1 + legs;
+      const OMX_F32 *x3 = x2 + legs;
+
+      OMX_F32 *y1 = y0 + quarter;
+      OMX_F32 *y2 = y1 + quarter;
+      OMX_F32 *y3 = y2 + quarter;
+      OMX_FC32 r1;
+      OMX_FC32 r2;
+      OMX_FC32 r3;
+      OMX_FC32 sum02;
+      OMX_FC32 diff02;
+      OMX_FC32 sum13;
+      OMX_FC32 diff13;
+
+      // r1 = conj(w1) * x1
+      r1.Re = w1[0] * x1[0] + w1[tw_imag] * x1[n];
+      r1.Im = w1[0] * x1[n] - w1[tw_imag] * x1[0];
+
+      // r2 = conj(w2) * x2
+      r2.Re = w2[0] * x2[0] + w2[tw_imag] * x2[n];
+      r2.Im = w2[0] * x2[n] - w2[tw_imag] * x2[0];
+
+      // r3 = conj(w3) * x3
+      r3.Re = w3[0] * x3[0] + w3[tw_imag] * x3[n];
+      r3.Im = w3[0] * x3[n] - w3[tw_imag] * x3[0];
+
+      // sum02 = x0 + r2
+      sum02.Re = x0[0] + r2.Re;
+      sum02.Im = x0[n] + r2.Im;
+
+      // diff02 = x0 - r2
+      diff02.Re = x0[0] - r2.Re;
+      diff02.Im = x0[n] - r2.Im;
+
+      // sum13 = r1 + r3
+      sum13.Re = r1.Re + r3.Re;
+      sum13.Im = r1.Im + r3.Im;
+
+      // diff13 = r1 - r3
+      diff13.Re = r1.Re - r3.Re;
+      diff13.Im = r1.Im - r3.Im;
+
+      // y0 = sum02 + sum13
+      y0[0] = sum02.Re + sum13.Re;
+      y0[n] = sum02.Im + sum13.Im;
+
+      // y2 = sum02 - sum13
+      y2[0] = sum02.Re - sum13.Re;
+      y2[n] = sum02.Im - sum13.Im;
+
+      // y1 = diff02 + j * diff13
+      y1[0] = diff02.Re - diff13.Im;
+      y1[n] = diff02.Im + diff13.Re;
+
+      // y3 = diff02 - j * diff13
+      y3[0] = diff02.Re + diff13.Im;
+      y3[n] = diff02.Im - diff13.Re;
+    }
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c
new file mode 100644
index 0000000..703f316
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c
@@ -0,0 +1,215 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+// Handles the set_count == 2 case, where the set loop cannot be unrolled by 4
+// to fill an SSE register; each pass instead packs 2 sets from each of 2 groups.
+static void InternalUnroll2Inv(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  OMX_INT i;
+  OMX_INT n_by_2 = n >> 1;   // loop bound for i
+  OMX_INT n_by_4 = n >> 2;   // distance between the four output legs
+  OMX_INT n_mul_2 = n << 1;  // offset from a twiddle real to its imaginary part
+  OMX_F32 *out0 = out;       // first output leg; advances 4 floats per pass
+
+  for (i = 0; i < n_by_2; i += 8) {  // each pass reads 16 floats at in + 2 * i
+    const OMX_F32 *tw1  = twiddle + i;  // twiddles of the first group
+    const OMX_F32 *tw2  = tw1 + i;
+    const OMX_F32 *tw3  = tw2 + i;
+    const OMX_F32 *tw1e = tw1 + 4;      // twiddles of the second group
+    const OMX_F32 *tw2e = tw2 + 8;
+    const OMX_F32 *tw3e = tw3 + 12;
+
+    VC v_tw1;
+    VC v_tw2;
+    VC v_tw3;
+    VC v_t0;  // v_t0..v_t3: the four input legs
+    VC v_t1;
+    VC v_t2;
+    VC v_t3;
+    VC v_t4;  // v_t4..v_t7: butterfly results handed to the store helper
+    VC v_t5;
+    VC v_t6;
+    VC v_t7;
+
+    v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1),  // lanes: tw1, tw1, tw1e, tw1e
+                                _mm_load_ss(tw1e),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+    v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2),
+                                _mm_load_ss(tw1e + n_mul_2),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+    v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2),  // lanes: tw2, tw2, tw2e, tw2e
+                                _mm_load_ss(tw2e),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+    v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2),
+                                _mm_load_ss(tw2e + n_mul_2),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+    v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3),  // lanes: tw3, tw3, tw3e, tw3e
+                                _mm_load_ss(tw3e),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+    v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2),
+                                _mm_load_ss(tw3e + n_mul_2),
+                                _MM_SHUFFLE(0, 0, 0, 0));
+
+    __m128 xmm0;  // xmm0..xmm3: 16 consecutive real parts
+    __m128 xmm1;
+    __m128 xmm2;
+    __m128 xmm3;
+    __m128 xmm4;  // xmm4..xmm7: the matching imaginary parts, n floats later
+    __m128 xmm5;
+    __m128 xmm6;
+    __m128 xmm7;
+
+    const OMX_F32 *in0 = in + (i << 1);
+    xmm0 = _mm_load_ps(in0);  // _mm_load_ps requires 16-byte aligned addresses
+    xmm1 = _mm_load_ps(in0 + 4);
+    xmm2 = _mm_load_ps(in0 + 8);
+    xmm3 = _mm_load_ps(in0 + 12);
+    v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0));  // 0,1,8,9
+    v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2));  // 2,3,10,11
+    v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0));  // 4,5,12,13
+    v_t3.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(3, 2, 3, 2));  // 6,7,14,15
+
+    xmm4 = _mm_load_ps(in0 + n);  // same element pattern for the imaginary parts
+    xmm5 = _mm_load_ps(in0 + n + 4);
+    xmm6 = _mm_load_ps(in0 + n + 8);
+    xmm7 = _mm_load_ps(in0 + n + 12);
+    v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
+    v_t1.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
+    v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
+    v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
+
+    OMX_F32 *out1 = out0 + n_by_4;
+    OMX_F32 *out2 = out1 + n_by_4;
+    OMX_F32 *out3 = out2 + n_by_4;
+
+    RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,  // helper defined elsewhere
+                         &v_tw1, &v_tw2, &v_tw3,
+                         &v_t0, &v_t1, &v_t2, &v_t3);
+
+    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,  // given all four legs
+                               &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+    out0 += 4;
+  }
+}
+
+void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(  // inverse radix-4 middle stage, SSE
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,   // number of groups iterated by grp
+    OMX_INT sub_num) {  // input floats per group (in0 moves by this per grp)
+  OMX_INT set;
+  OMX_INT grp;
+  OMX_INT step = sub_num >> 1;       // twiddle stride per group
+  OMX_INT set_count = sub_num >> 2;  // butterflies per group
+  OMX_INT n_by_4 = n >> 2;           // distance between the four output legs
+  OMX_INT n_mul_2 = n << 1;          // offset from twiddle real to imaginary
+
+  OMX_F32 *out0 = out;               // first output leg; advances 4 per pass
+
+  if (set_count == 2) {  // too few sets to fill 4 lanes from a single group
+    InternalUnroll2Inv(in, out, twiddle, n);
+    return;
+  }
+
+  // grp == 0: RADIX4_BUTTERFLY_FS is given no twiddle arguments.
+  for (set = 0; set < set_count; set += 4) {  // NOTE(review): assumes 4 | set_count
+    const OMX_F32 * in0 = in + set;
+    const OMX_F32 *in1 = in0 + set_count;
+    const OMX_F32 *in2 = in1 + set_count;
+    const OMX_F32 *in3 = in2 + set_count;
+
+    VC v_t0;  // v_t0..v_t3: the four input legs
+    VC v_t1;
+    VC v_t2;
+    VC v_t3;
+    VC v_t4;  // v_t4..v_t7: butterfly results handed to the store helper
+    VC v_t5;
+    VC v_t6;
+    VC v_t7;
+
+    VC_LOAD_SPLIT(&v_t0, in0, n);  // helper defined elsewhere; presumably it
+    VC_LOAD_SPLIT(&v_t1, in1, n);  // loads 4 real parts at inK and 4 imaginary
+    VC_LOAD_SPLIT(&v_t2, in2, n);  // parts at inK + n -- TODO confirm
+    VC_LOAD_SPLIT(&v_t3, in3, n);
+
+    OMX_F32 *out1 = out0 + n_by_4;
+    OMX_F32 *out2 = out1 + n_by_4;
+    OMX_F32 *out3 = out2 + n_by_4;
+
+    RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
+                        &v_t0, &v_t1, &v_t2, &v_t3);
+
+    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
+                               &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+    out0 += 4;
+  }
+
+  for (grp = 1; grp < sub_size; ++grp) {  // remaining groups use twiddles
+    const OMX_F32 *tw1 = twiddle + grp * step;  // offsets grp*step, 2x, 3x
+    const OMX_F32 *tw2 = tw1 + grp * step;
+    const OMX_F32 *tw3 = tw2 + grp * step;
+
+    VC v_tw1;
+    VC v_tw2;
+    VC v_tw3;
+
+    v_tw1.real = _mm_load1_ps(tw1);  // one twiddle per group, copied to 4 lanes
+    v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2);
+    v_tw2.real = _mm_load1_ps(tw2);
+    v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2);
+    v_tw3.real = _mm_load1_ps(tw3);
+    v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2);
+
+    for (set = 0; set < set_count; set += 4) {
+      const OMX_F32 *in0 = in + set + grp * sub_num;
+      const OMX_F32 *in1 = in0 + set_count;
+      const OMX_F32 *in2 = in1 + set_count;
+      const OMX_F32 *in3 = in2 + set_count;
+
+      VC v_t0;
+      VC v_t1;
+      VC v_t2;
+      VC v_t3;
+      VC v_t4;
+      VC v_t5;
+      VC v_t6;
+      VC v_t7;
+
+      VC_LOAD_SPLIT(&v_t0, in0, n);
+      VC_LOAD_SPLIT(&v_t1, in1, n);
+      VC_LOAD_SPLIT(&v_t2, in2, n);
+      VC_LOAD_SPLIT(&v_t3, in3, n);
+
+      OMX_F32 *out1 = out0 + n_by_4;
+      OMX_F32 *out2 = out1 + n_by_4;
+      OMX_F32 *out3 = out2 + n_by_4;
+
+      RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,  // defined elsewhere
+                           &v_tw1, &v_tw2, &v_tw3,
+                           &v_t0, &v_t1, &v_t2, &v_t3);
+
+      RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
+                                 &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+      out0 += 4;
+    }
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c b/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c
new file mode 100644
index 0000000..0a3d816
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include <stdbool.h>
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix2_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+// Runs a complete out-of-place radix-2 FFT (forward or inverse). buf1 and
+// buf2 are ping-pong buffers; the one written by the last stage is returned.
+OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft) {
+  const OMX_INT half = n >> 1;
+  OMX_F32 *cur = buf1;    // holds the most recent stage output
+  OMX_F32 *next = buf2;   // destination of the following stage
+  OMX_INT groups;
+  OMX_INT span;
+
+  if (forward_fft) {
+    x86SP_FFT_CToC_FC32_Fwd_Radix2_fs(src, cur, n);
+  } else {
+    x86SP_FFT_CToC_FC32_Inv_Radix2_fs(src, cur, n);
+  }
+
+  // Middle stages: groups doubles and span halves after every stage.
+  for (groups = 2, span = half; groups < half; groups <<= 1, span >>= 1) {
+    OMX_F32 *swap = next;
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix2_ms(cur, next, twiddle, n, groups, span);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix2_ms(cur, next, twiddle, n, groups, span);
+    }
+
+    next = cur;
+    cur = swap;
+  }
+
+  // With span <= 1 there is no last stage left to run.
+  if (span <= 1) {
+    return cur;
+  }
+
+  if (forward_fft) {
+    x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(cur, next, twiddle, n);
+  } else {
+    x86SP_FFT_CToC_FC32_Inv_Radix2_ls(cur, next, twiddle, n);
+  }
+
+  return next;
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c b/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c
new file mode 100644
index 0000000..e7c7b89
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c
@@ -0,0 +1,225 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+#include "dl/api/omxtypes.h"
+#include <stdbool.h>
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+// Radix-2 last-stage routines, called below when the stage loop ends with
+// sub_num == 2. C99 has no implicit declarations, so declare them here.
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+// Runs a complete out-of-place radix-4 FFT with the scalar stage routines.
+// src is read by the first stage only; buf1 and buf2 then alternate as stage
+// input and stage output. The stage loop multiplies sub_size by 4 and divides
+// sub_num by 4 per pass; if it ends with sub_num == 2, a radix-2 last stage
+// is run instead of a radix-4 one. forward_fft selects the forward (true) or
+// inverse (false) routines. Returns the buffer written by the last stage.
+OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft) {
+  OMX_INT sub_size;
+  OMX_INT sub_num;
+  OMX_INT n_by_4 = n >> 2;
+  OMX_F32 *in = buf1;
+  OMX_F32 *out = buf2;
+
+  if (forward_fft)
+    x86SP_FFT_CToC_FC32_Fwd_Radix4_fs(src, in, n);
+  else
+    x86SP_FFT_CToC_FC32_Inv_Radix4_fs(src, in, n);
+
+  for (sub_size = 4, sub_num = n_by_4;
+       sub_size < n_by_4;
+       sub_size = sub_size << 2, sub_num = sub_num >> 2) {
+
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ms(in, out, twiddle,
+                                        n, sub_size, sub_num);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ms(in, out, twiddle,
+                                        n, sub_size, sub_num);
+    }
+
+    OMX_F32 *temp = out;
+    out = in;
+    in = temp;
+  }
+
+  // If n is not power of 4, sub_num == 2.
+  if (forward_fft) {
+    if (sub_num == 2)
+      x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(in, out, twiddle, n);
+    else
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ls(in, out, twiddle, n);
+  } else {
+    if (sub_num == 2)
+      x86SP_FFT_CToC_FC32_Inv_Radix2_ls(in, out, twiddle, n);
+    else
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ls(in, out, twiddle, n);
+  }
+
+  return out;
+}
+
+// SSE counterpart of x86SP_F32_radix4_kernel_OutOfPlace: same staging and
+// return value, but every stage is run by its *_sse routine.
+OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    // true for forward, false for inverse.
+    bool forward_fft) {
+  OMX_INT sub_size, sub_num;
+  OMX_INT n_by_4 = n >> 2;
+  OMX_F32 *in, *out;
+  in = buf1;
+  out = buf2;
+
+  if (forward_fft)
+    x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse(src, in, n);
+  else
+    x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(src, in, n);
+
+  for (sub_size = 4, sub_num = n_by_4;
+       sub_size < n_by_4;
+       sub_size = sub_size << 2, sub_num = sub_num >> 2) {
+
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse(in, out, twiddle,
+                                            n, sub_size, sub_num);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(in, out, twiddle,
+                                            n, sub_size, sub_num);
+    }
+
+    OMX_F32 *temp = out;
+    out = in;
+    in = temp;
+  }
+
+  // If n is not power of 4, sub_num == 2.
+  if (forward_fft) {
+    if (sub_num == 2)
+      x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse(in, out, twiddle, n);
+    else
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse(in, out, twiddle, n);
+  } else {
+    if (sub_num == 2)
+      x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse(in, out, twiddle, n);
+    else
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(in, out, twiddle, n);
+  }
+
+  return out;
+}
diff --git a/dl/sp/src/x86/x86SP_SSE_Math.h b/dl/sp/src/x86/x86SP_SSE_Math.h
new file mode 100644
index 0000000..d10a851
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_SSE_Math.h
@@ -0,0 +1,488 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include <emmintrin.h>
+#include <assert.h>
+
+/**
+ * Two data formats are used by the FFT routines, internally. The
+ * interface to the main external FFT routines uses interleaved complex
+ * values where the real part is followed by the imaginary part.
+ *
+ * The other is the split format, where a complex vector of real and
+ * imaginary values is split such that all of the real values are placed
+ * in the first half of the vector and the corresponding imaginary values
+ * are placed in the second half, in the same order. The conversion from
+ * interleaved complex values to split format and back is transparent to
+ * the external FFT interface.
+ *
+ * VComplex uses split format.
+ */
+
+/** VComplex holds 4 complex float elements, with the real parts stored
+ * in real and the corresponding imaginary parts in imag.
+ */
+typedef struct VComplex {
+  __m128 real;  /* real parts of the 4 elements */
+  __m128 imag;  /* imaginary parts of the 4 elements */
+} VC;
+
+/* out = a * b (per lane). out must not alias a or b: real is stored first. */
+static inline void VC_MUL(VC *out, VC *a, VC *b) {
+  out->real = _mm_sub_ps(_mm_mul_ps(a->real, b->real),
+      _mm_mul_ps(a->imag, b->imag));  /* a.re * b.re - a.im * b.im */
+  out->imag = _mm_add_ps(_mm_mul_ps(a->real, b->imag),
+      _mm_mul_ps(a->imag, b->real));  /* a.re * b.im + a.im * b.re */
+}
+
+/* out = conj(a) * b (per lane). Do not alias out with a or b (see VC_MUL). */
+static inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
+  out->real = _mm_add_ps(_mm_mul_ps(a->real, b->real),
+      _mm_mul_ps(a->imag, b->imag));  /* a.re * b.re + a.im * b.im */
+  out->imag = _mm_sub_ps(_mm_mul_ps(a->real, b->imag),
+      _mm_mul_ps(a->imag, b->real));  /* a.re * b.im - a.im * b.re */
+}
+
+/* out = factor * a: real and imag are each multiplied lane-wise by factor. */
+static inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
+  out->real = _mm_mul_ps(factor, a->real);
+  out->imag = _mm_mul_ps(factor, a->imag);
+}
+
+/* out = a + b (lane-wise, real with real and imag with imag) */
+static inline void VC_ADD(VC *out, VC *a, VC *b) {
+  out->real = _mm_add_ps(a->real, b->real);
+  out->imag = _mm_add_ps(a->imag, b->imag);
+}
+
+/**
+ * Add with the real and imaginary parts of b swapped:
+ * out.real = a.real + b.imag, out.imag = a.imag + b.real
+ */
+static inline void VC_ADD_X(VC *out, VC *a, VC *b) {
+  out->real = _mm_add_ps(a->real, b->imag);
+  out->imag = _mm_add_ps(b->real, a->imag);  /* reads b->real after out->real is written */
+}
+
+/* Store a + b in split format; out and out + offset must be 16 byte aligned. */
+static inline void VC_ADD_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  _mm_store_ps(out, _mm_add_ps(a->real, b->real));           /* real parts */
+  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->imag));  /* imag parts */
+}
+
+/* out = a - b (lane-wise, real with real and imag with imag) */
+static inline void VC_SUB(VC *out, VC *a, VC *b) {
+  out->real = _mm_sub_ps(a->real, b->real);
+  out->imag = _mm_sub_ps(a->imag, b->imag);
+}
+
+/**
+ * out.real = a.real - b.imag
+ * out.imag = b.real - a.imag
+ */
+static inline void VC_SUB_X(VC *out, VC *a, VC *b) {
+  out->real = _mm_sub_ps(a->real, b->imag);
+  out->imag = _mm_sub_ps(b->real, a->imag);  /* note the operand order: b first */
+}
+
+/* Store a - b in split format; out and out + offset must be 16 byte aligned. */
+static inline void VC_SUB_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  _mm_store_ps(out, _mm_sub_ps(a->real, b->real));           /* real parts */
+  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->imag));  /* imag parts */
+}
+
+/**
+ * out.real = a.real + b.real
+ * out.imag = a.imag - b.imag   (i.e. out = a + conj(b))
+ */
+static inline void VC_ADD_SUB(VC *out, VC *a, VC *b) {
+  out->real = _mm_add_ps(a->real, b->real);
+  out->imag = _mm_sub_ps(a->imag, b->imag);
+}
+
+/**
+ * out.real = a.real + b.imag
+ * out.imag = a.imag - b.real   (i.e. out = a - j * b)
+ */
+static inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) {
+  out->real = _mm_add_ps(a->real, b->imag);
+  out->imag = _mm_sub_ps(a->imag, b->real);
+}
+
+/* Compute VC_ADD_SUB_X(a, b) and store it in split format (aligned store). */
+static inline void VC_ADD_SUB_X_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));           /* real parts */
+  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->real));  /* imag parts */
+}
+
+/**
+ * out.real = a.real - b.real
+ * out.imag = a.imag + b.imag   (i.e. out = a - conj(b))
+ */
+static inline void VC_SUB_ADD(VC *out, VC *a, VC *b) {
+  out->real = _mm_sub_ps(a->real, b->real);
+  out->imag = _mm_add_ps(a->imag, b->imag);
+}
+
+/**
+ * out.real = a.real - b.imag
+ * out.imag = a.imag + b.real   (i.e. out = a + j * b)
+ */
+static inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) {
+  out->real = _mm_sub_ps(a->real, b->imag);
+  out->imag = _mm_add_ps(a->imag, b->real);
+}
+
+/* Compute VC_SUB_ADD_X(a, b) and store it in split format (aligned store). */
+static inline void VC_SUB_ADD_X_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a, VC *b,
+    OMX_INT offset) {
+  _mm_store_ps(out, _mm_sub_ps(a->real, b->imag));           /* real parts */
+  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->real));  /* imag parts */
+}
+
+/**
+ * Aligned split-format store of 4 complex values:
+ * out[0..3] = in.real, out[offset..offset+3] = in.imag
+ */
+static inline void VC_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *in,
+    OMX_INT offset) {
+  _mm_store_ps(out, in->real);
+  _mm_store_ps(out + offset, in->imag);
+}
+
+/**
+ * Aligned split-format load of 4 complex values:
+ * out.real = in[0..3], out.imag = in[offset..offset+3]
+ */
+static inline void VC_LOAD_SPLIT(
+    VC *out,
+    const OMX_F32 *in,
+    OMX_INT offset) {
+  out->real = _mm_load_ps(in);
+  out->imag = _mm_load_ps(in + offset);
+}
+
+/* Unpack from Split format to Interleaved format; out must not alias in. */
+static inline void VC_UNPACK(VC *out, VC *in) {
+    out->real = _mm_unpacklo_ps(in->real, in->imag);  /* re0, im0, re1, im1 */
+    out->imag = _mm_unpackhi_ps(in->real, in->imag);  /* re2, im2, re3, im3 */
+}
+
+/**
+ * Load 4 interleaved complex values (8 aligned floats) and split them.
+ * out.real = [in[0].real, in[1].real, in[2].real, in[3].real]
+ * out.imag = [in[0].imag, in[1].imag, in[2].imag, in[3].imag]
+ */
+static inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) {
+  const __m128 lo = _mm_load_ps(in);      /* re0, im0, re1, im1 */
+  const __m128 hi = _mm_load_ps(in + 4);  /* re2, im2, re3, im3 */
+  out->real = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 2, 0));
+  out->imag = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 1, 3, 1));
+}
+/**
+ * Vector Complex Load with Split format: same as VC_LOAD_SPLIT, but
+ * the input addresses need not be 16 byte aligned.
+ */
+static inline void VC_LOADU_SPLIT(
+    VC *out,
+    const OMX_F32 *in,
+    OMX_INT offset) {
+  out->real = _mm_loadu_ps(in);
+  out->imag = _mm_loadu_ps(in + offset);
+}
+
+/* Reverse the lane order of v in place: element 0 <-> 3 and 1 <-> 2. */
+static inline void VC_REVERSE(VC *v) {
+  v->real = _mm_shuffle_ps(v->real, v->real, _MM_SHUFFLE(0, 1, 2, 3));
+  v->imag = _mm_shuffle_ps(v->imag, v->imag, _MM_SHUFFLE(0, 1, 2, 3));
+}
+/*
+ * Vector Complex store to a 16 byte aligned interleaved complex array
+ * out[0] = in.real[0]
+ * out[1] = in.imag[0]
+ * out[2] = in.real[1]
+ * out[3] = in.imag[1]
+ * out[4] = in.real[2]
+ * out[5] = in.imag[2]
+ * out[6] = in.real[3]
+ * out[7] = in.imag[3]
+ */
+static inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) {
+  _mm_store_ps(out, _mm_unpacklo_ps(in->real, in->imag));
+  _mm_store_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
+}
+
+/**
+ * Vector Complex Store with Interleaved format, as VC_STORE_INTERLEAVE,
+ * but the address need not be 16 byte aligned.
+ */
+static inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) {
+  _mm_storeu_ps(out, _mm_unpacklo_ps(in->real, in->imag));
+  _mm_storeu_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
+}
+
+/* Compute VC_ADD_X(a, b) and store it in split format (aligned store). */
+static inline void VC_ADD_X_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a, VC *b,
+    OMX_INT offset) {
+  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));           /* real parts */
+  _mm_store_ps(out + offset, _mm_add_ps(b->real, a->imag));  /* imag parts */
+}
+
+/**
+ * Compute VC_SUB_X(a, b), reverse the lane order of each part and store
+ * the result in split format. Addresses need not be 16 byte aligned.
+ */
+static inline void VC_SUB_X_INVERSE_STOREU_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  const __m128 re = _mm_sub_ps(a->real, b->imag);
+  _mm_storeu_ps(out, _mm_shuffle_ps(re, re, _MM_SHUFFLE(0, 1, 2, 3)));
+
+  const __m128 im = _mm_sub_ps(b->real, a->imag);
+  _mm_storeu_ps(out + offset, _mm_shuffle_ps(im, im, _MM_SHUFFLE(0, 1, 2, 3)));
+}
+
+/**
+ * Vector Complex Load from Interleaved format to Split format.
+ * *out0 receives the 4 real parts and *out1 the 4 imaginary parts.
+ */
+static inline void VC_LOAD_SHUFFLE(
+    __m128 *out0,
+    __m128 *out1,
+    const OMX_F32 *in) {
+  VC temp;
+  VC_LOAD_INTERLEAVE(&temp, in);  /* de-interleaves in[0..7] */
+  *out0 = temp.real;
+  *out1 = temp.imag;
+}
+
+/* Finish the forward radix4 butterfly; store split outputs (imag at out + n). */
+static inline void RADIX4_FWD_BUTTERFLY_STORE(
+    OMX_F32 *out0,
+    OMX_F32 *out1,
+    OMX_F32 *out2,
+    OMX_F32 *out3,
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    OMX_INT n) {
+  /* out0 = t0 + t2 */
+  VC_ADD_STORE_SPLIT(out0, t0, t2, n);
+
+  /* out2 = t0 - t2 */
+  VC_SUB_STORE_SPLIT(out2, t0, t2, n);
+
+  /* out1 = t1 - j * t3 (real: t1.re + t3.im, imag: t1.im - t3.re) */
+  VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n);
+
+  /* out3 = t1 + j * t3 (real: t1.re - t3.im, imag: t1.im + t3.re) */
+  VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n);
+}
+
+/* Finish the inverse radix4 butterfly; store split outputs (imag at out + n). */
+static inline void RADIX4_INV_BUTTERFLY_STORE(
+    OMX_F32 *out0,
+    OMX_F32 *out1,
+    OMX_F32 *out2,
+    OMX_F32 *out3,
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    OMX_INT n) {
+  /* out0 = t0 + t2 */
+  VC_ADD_STORE_SPLIT(out0, t0, t2, n);
+
+  /* out2 = t0 - t2 */
+  VC_SUB_STORE_SPLIT(out2, t0, t2, n);
+
+  /* out1 = t1 + j * t3 (real: t1.re - t3.im, imag: t1.im + t3.re) */
+  VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n);
+
+  /* out3 = t1 - j * t3 (real: t1.re + t3.im, imag: t1.im - t3.re) */
+  VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n);
+}
+
+/* Forward radix4 butterfly, first half: twiddle multiply, then sums/diffs. */
+static inline void RADIX4_FWD_BUTTERFLY(
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    VC *Tw1,
+    VC *Tw2,
+    VC *Tw3,
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3) {
+  VC rot1, rot2, rot3;
+
+  /* rot1 = Tw1 * T1 */
+  VC_MUL(&rot1, Tw1, T1);
+
+  /* rot2 = Tw2 * T2 */
+  VC_MUL(&rot2, Tw2, T2);
+
+  /* rot3 = Tw3 * T3 */
+  VC_MUL(&rot3, Tw3, T3);
+
+  /* t0 = T0 + rot2 */
+  VC_ADD(t0, T0, &rot2);
+
+  /* t1 = T0 - rot2 */
+  VC_SUB(t1, T0, &rot2);
+
+  /* t2 = rot1 + rot3 */
+  VC_ADD(t2, &rot1, &rot3);
+
+  /* t3 = rot1 - rot3 */
+  VC_SUB(t3, &rot1, &rot3);
+}
+
+/* Inverse radix4 butterfly, first half: conjugate twiddle multiply, then sums/diffs. */
+static inline void RADIX4_INV_BUTTERFLY(
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    VC *Tw1,
+    VC *Tw2,
+    VC *Tw3,
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3) {
+  VC rot1, rot2, rot3;
+
+  /* rot1 = conj(Tw1) * T1 */
+  VC_CONJ_MUL(&rot1, Tw1, T1);
+
+  /* rot2 = conj(Tw2) * T2 */
+  VC_CONJ_MUL(&rot2, Tw2, T2);
+
+  /* rot3 = conj(Tw3) * T3 */
+  VC_CONJ_MUL(&rot3, Tw3, T3);
+
+  /* t0 = T0 + rot2 */
+  VC_ADD(t0, T0, &rot2);
+
+  /* t1 = T0 - rot2 */
+  VC_SUB(t1, T0, &rot2);
+
+  /* t2 = rot1 + rot3 */
+  VC_ADD(t2, &rot1, &rot3);
+
+  /* t3 = rot1 - rot3 */
+  VC_SUB(t3, &rot1, &rot3);
+}
+
+/* First-stage radix4 butterfly (forward and inverse): no twiddle multiply. */
+static inline void RADIX4_BUTTERFLY_FS(
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3) {
+  /* t0 = T0 + T2 */
+  VC_ADD(t0, T0, T2);
+
+  /* t1 = T0 - T2 */
+  VC_SUB(t1, T0, T2);
+
+  /* t2 = T1 + T3 */
+  VC_ADD(t2, T1, T3);
+
+  /* t3 = T1 - T3 */
+  VC_SUB(t3, T1, T3);
+}
+
+/**
+ * Load a 4 x 4 float matrix (a row from each of pT0..pT3) and transpose it:
+ * once for the real parts, once for the imaginary parts at offset n.
+ * 3,  2,  1,  0                  12, 8,  4,  0     -> T0
+ * 7,  6,  5,  4        =====>    13, 9,  5,  1     -> T1
+ * 11, 10, 9,  8                  14, 10, 6,  2     -> T2
+ * 15, 14, 13, 12                 15, 11, 7,  3     -> T3
+ */
+static inline void VC_LOAD_MATRIX_TRANSPOSE(
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3,
+    const OMX_F32 *pT0,
+    const OMX_F32 *pT1,
+    const OMX_F32 *pT2,
+    const OMX_F32 *pT3,
+    OMX_INT n) {
+  __m128 row0, row1, row2, row3;
+  __m128 lo01, hi01, lo23, hi23;
+
+  /* Real parts: one aligned row of 4 floats per input pointer. */
+  row0 = _mm_load_ps(pT0);
+  row1 = _mm_load_ps(pT1);
+  row2 = _mm_load_ps(pT2);
+  row3 = _mm_load_ps(pT3);
+
+  /* Interleave the row pairs ... */
+  lo01 = _mm_unpacklo_ps(row0, row1);
+  hi01 = _mm_unpackhi_ps(row0, row1);
+  lo23 = _mm_unpacklo_ps(row2, row3);
+  hi23 = _mm_unpackhi_ps(row2, row3);
+
+  /* ... then gather one column of the matrix per output. */
+  T0->real = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
+  T1->real = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
+  T2->real = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
+  T3->real = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
+
+  /* Imaginary parts: the same rows, n floats after the real parts. */
+  row0 = _mm_load_ps(pT0 + n);
+  row1 = _mm_load_ps(pT1 + n);
+  row2 = _mm_load_ps(pT2 + n);
+  row3 = _mm_load_ps(pT3 + n);
+
+  /* Interleave the row pairs ... */
+  lo01 = _mm_unpacklo_ps(row0, row1);
+  hi01 = _mm_unpackhi_ps(row0, row1);
+  lo23 = _mm_unpacklo_ps(row2, row3);
+  hi23 = _mm_unpackhi_ps(row2, row3);
+
+  /* ... then gather one column of the matrix per output. */
+  T0->imag = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
+  T1->imag = _mm_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
+  T2->imag = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
+  T3->imag = _mm_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
+}