Implement x86 real float fft for openmax_dl.
Optimized with SSE2 intrinsics.
Mainly targeted at Web Audio usage.
BUG=
R=aedla@chromium.org, andrew@webrtc.org, rtoy@google.com
Review URL: https://webrtc-codereview.appspot.com/2208004
git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@5109 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/dl.gyp b/dl/dl.gyp
index 2c5acd4..919eeb7 100644
--- a/dl/dl.gyp
+++ b/dl/dl.gyp
@@ -18,95 +18,10 @@
'include_dirs': [
'../',
],
- 'cflags!': [
- '-mfpu=vfpv3-d16',
- ],
- 'cflags': [
- # We enable Neon instructions even with arm_neon==0, to support
- # runtime detection.
- '-mfpu=neon',
- ],
- 'direct_dependent_settings': {
- 'include_dirs': [
- '../',
- ],
- },
'sources': [
- 'api/arm/armCOMM_s.h',
- 'api/arm/armOMX.h',
- 'api/arm/omxtypes_s.h',
'api/omxtypes.h',
- 'sp/api/armSP.h',
'sp/api/omxSP.h',
- # Common C code that can be shared between different
- # architectures.
'sp/src/armSP_FFT_F32TwiddleTable.c',
-
- # Common C code for NEON and non-NEON implementations.
- 'sp/src/arm/armSP_FFT_S32TwiddleTable.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_C_SC32.c',
- 'sp/src/arm/omxSP_FFTInit_C_SC32.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_R_S32.c',
- 'sp/src/arm/omxSP_FFTInit_R_S32.c',
- 'sp/src/arm/omxSP_FFTInit_C_SC16.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_C_SC16.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_R_S16.c',
- 'sp/src/arm/omxSP_FFTInit_R_S16.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_R_S16S32.c',
- 'sp/src/arm/omxSP_FFTInit_R_S16S32.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_C_FC32.c',
- 'sp/src/arm/omxSP_FFTInit_C_FC32.c',
- 'sp/src/arm/omxSP_FFTGetBufSize_R_F32.c',
- 'sp/src/arm/omxSP_FFTInit_R_F32.c',
-
- # NEON-specific implementation
- # Complex 32-bit fixed-point FFT.
- 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S',
- 'sp/src/arm/neon/omxSP_FFTInv_CToC_SC32_Sfs_s.S',
- 'sp/src/arm/neon/omxSP_FFTFwd_CToC_SC32_Sfs_s.S',
- # Real 32-bit fixed-point FFT
- 'sp/src/arm/neon/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S',
- 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S',
- 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
- # Complex 16-bit fixed-point FFT
- 'sp/src/arm/neon/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S',
- 'sp/src/arm/neon/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
- 'sp/src/arm/neon/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
- # Real 16-bit fixed-point FFT
- 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S',
- 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S16_Sfs_s.S',
- 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
- 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S',
- # Complex floating-point FFT
- 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S',
- 'sp/src/arm/neon/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S',
- 'sp/src/arm/neon/omxSP_FFTInv_CToC_FC32_Sfs_s.S',
- 'sp/src/arm/neon/omxSP_FFTFwd_CToC_FC32_Sfs_s.S',
- # Real floating-point FFT
- 'sp/src/arm/neon/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S',
- 'sp/src/arm/neon/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S',
- 'sp/src/arm/neon/omxSP_FFTInv_CCSToR_F32_Sfs_s.S',
],
'conditions' : [
['big_float_fft == 1', {
@@ -114,6 +29,120 @@
'BIG_FFT_TABLE',
],
}],
+ ['target_arch=="arm"', {
+ 'cflags!': [
+ '-mfpu=vfpv3-d16',
+ ],
+ 'cflags': [
+ # We enable Neon instructions even with arm_neon==0, to support
+ # runtime detection.
+ '-mfpu=neon',
+ ],
+ 'sources': [
+ 'api/armCOMM_s.h',
+ 'api/armOMX.h',
+ 'api/omxtypes_s.h',
+ 'sp/api/armSP.h',
+ # Complex 32-bit fixed-point FFT.
+ 'sp/src/armSP_FFT_S32TwiddleTable.c',
+ 'sp/src/omxSP_FFTGetBufSize_C_SC32.c',
+ 'sp/src/omxSP_FFTInit_C_SC32.c',
+ 'sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S',
+ 'sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S',
+ 'sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S',
+ # Real 32-bit fixed-point FFT
+ 'sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S',
+ 'sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S',
+ 'sp/src/omxSP_FFTGetBufSize_R_S32.c',
+ 'sp/src/omxSP_FFTInit_R_S32.c',
+ 'sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
+ # Complex 16-bit fixed-point FFT
+ 'sp/src/armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe_s.S',
+ 'sp/src/omxSP_FFTInit_C_SC16.c',
+ 'sp/src/omxSP_FFTGetBufSize_C_SC16.c',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S',
+ 'sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
+ 'sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
+ # Real 16-bit fixed-point FFT
+ 'sp/src/omxSP_FFTFwd_RToCCS_S16_Sfs_s.S',
+ 'sp/src/omxSP_FFTGetBufSize_R_S16.c',
+ 'sp/src/omxSP_FFTInit_R_S16.c',
+ 'sp/src/omxSP_FFTInv_CCSToR_S16_Sfs_s.S',
+ 'sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
+ 'sp/src/omxSP_FFTGetBufSize_R_S16S32.c',
+ 'sp/src/omxSP_FFTInit_R_S16S32.c',
+ 'sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S',
+ # Complex floating-point FFT
+ 'sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S',
+ 'sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S',
+ 'sp/src/omxSP_FFTGetBufSize_C_FC32.c',
+ 'sp/src/omxSP_FFTInit_C_FC32.c',
+ 'sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S',
+ 'sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S',
+ # Real floating-point FFT
+ 'sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S',
+ 'sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S',
+ 'sp/src/omxSP_FFTGetBufSize_R_F32.c',
+ 'sp/src/omxSP_FFTInit_R_F32.c',
+ 'sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S',
+ ],
+ }],
+ ['target_arch=="ia32" or target_arch=="x64"', {
+ 'cflags': [
+ '-msse2',
+ ],
+ 'sources': [
+ # Real 32-bit floating-point FFT.
+ 'sp/api/x86SP.h',
+ 'sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c',
+ 'sp/src/x86/omxSP_FFTGetBufSize_R_F32.c',
+ 'sp/src/x86/omxSP_FFTInit_R_F32.c',
+ 'sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c',
+ 'sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c',
+ 'sp/src/x86/x86SP_FFT_F32_radix2_kernel.c',
+ 'sp/src/x86/x86SP_FFT_F32_radix4_kernel.c',
+ 'sp/src/x86/x86SP_SSE_Math.h',
+ ],
+ }],
],
}]
}
diff --git a/dl/sp/api/x86SP.h b/dl/sp/api/x86SP.h
new file mode 100644
index 0000000..5312734
--- /dev/null
+++ b/dl/sp/api/x86SP.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2007-2008 ARM Limited. All Rights Reserved.
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * It has been relicensed with permission from the copyright holders.
+ */
+
+#ifndef _x86SP_H_
+#define _x86SP_H_
+
+#include "dl/api/omxtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern OMX_F32 armSP_FFT_F32TwiddleTable[];
+
+// x86 view of the opaque OMXFFTSpec_R_F32 used by the real float FFT.
+// omxSP_FFTInit_R_F32 fills it in and places the twiddle table and the two
+// work buffers in the same allocation, after this header.
+typedef struct X86FFTSpec_R_FC32_Tag
+{
+ // FFT length in real samples (1 << order).
+ OMX_U32 N;
+ // Twiddle factors in split format (real parts first, imaginary parts at
+ // offset N); 2 * N floats, aligned to 32 bytes by omxSP_FFTInit_R_F32.
+ OMX_F32* pTwiddle;
+ // Ping-pong buffers for doing the N/2 point complex FFT.
+ OMX_F32* pBuf1;
+ OMX_F32* pBuf2;
+
+} X86FFTSpec_R_FC32;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c b/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c
new file mode 100644
index 0000000..b6d1c98
--- /dev/null
+++ b/dl/sp/src/x86/omxSP_FFTFwd_RToCCS_F32_Sfs.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/api/x86SP.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+#include <stdbool.h>
+
+extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
+ const OMX_F32 *src,
+ OMX_F32 *buf1,
+ OMX_F32 *buf2,
+ const OMX_F32 *twiddle,
+ OMX_INT n,
+ bool forward_fft);
+
+extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
+ const OMX_F32 *src,
+ OMX_F32 *buf1,
+ OMX_F32 *buf2,
+ const OMX_F32 *twiddle,
+ OMX_INT n,
+ bool forward_fft);
+
+/**
+ * A two-for-one algorithm is used here to do the real fft:
+ *
+ * Input x[n], (n = 0, ..., N - 1)
+ * Output X[k] = DFT(N, k){x}
+ * a[n] = x[2n], (n = 0, ..., N/2 - 1)
+ * b[n] = x[2n + 1], (n = 0, ..., N/2 - 1)
+ * z[n] = a[n] + j * b[n]
+ * Z[k] = DFT(N/2, k){z}
+ * Z' is the complex conjugate of Z
+ * A[k] = (Z[k] + Z'[N/2 - k]) / 2
+ * B[k] = -j * (Z[k] - Z'[N/2 - k]) / 2
+ * X[k] = A[k] + B[k] * W[k], (W = exp(-j*2*PI*k/N); k = 0, ..., N/2 - 1)
+ * X[k] = A[k] - B[k], (k = N/2)
+ * X' is complex conjugate of X
+ * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1)
+ */
+
+/**
+ * This function is the last permutation of two-for-one FFT algorithm: it
+ * combines the N/2 point complex FFT result Z into bins 0 .. N/2 of the
+ * real FFT X.
+ * We move the division by 2 to the last step in the implementation, so:
+ * A[k] = (Z[k] + Z'[N/2 - k])
+ * B[k] = -j * (Z[k] - Z'[N/2 - k])
+ * X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1)
+ * X[k] = (A[k] - B[k]), (k = N/2)
+ * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1)
+ *
+ * in: Z in split format; Re(Z[k]) is in[k], Im(Z[k]) is in[k + n/2].
+ * out: X in interleaved format; X[k] is (out[2k], out[2k + 1]).
+ * Elements out[0] .. out[n + 1] are written.
+ * twiddle: split format table; twiddle[k] and twiddle[k + n] are read.
+ * n: length of the real FFT.
+ */
+static void RevbinPermuteFwd(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ const OMX_F32 *twiddle,
+ OMX_INT n) {
+ OMX_INT i;
+ OMX_INT j;
+ OMX_INT n_by_2 = n >> 1;
+ OMX_INT n_by_4 = n >> 2;
+
+ OMX_FC32 big_a;
+ OMX_FC32 big_b;
+ OMX_FC32 temp;
+ const OMX_F32 *tw;
+
+ // i walks k = 1 .. N/4 - 1 while j = N/2 - i walks the mirrored bin, so
+ // each iteration produces two output bins. Bins 0, N/4 and N/2 are
+ // handled after the loop.
+ for (i = 1, j = n_by_2 - 1; i < n_by_4; i++, j--) {
+ // A[k] = (Z[k] + Z'[N/2 - k])
+ big_a.Re = in[i] + in[j];
+ big_a.Im = in[j + n_by_2] - in[i + n_by_2];
+
+ // B[k] = -j * (Z[k] - Z'[N/2 - k])
+ big_b.Re = in[j] - in[i];
+ big_b.Im = in[j + n_by_2] + in[i + n_by_2];
+
+ // W[k]: real part at tw[0], imaginary part at tw[n] (split format).
+ tw = twiddle + i;
+
+ // temp = B[k] * W[k]
+ // NOTE(review): the signs used here and below presumably rely on the
+ // negations folded into the table by omxSP_FFTInit_R_F32 - confirm.
+ temp.Re = big_b.Re * tw[0] + big_b.Im * tw[n];
+ temp.Im = big_b.Re * tw[n] - big_b.Im * tw[0];
+
+ // Convert split format to interleaved format.
+ // X[k] = (A[k] + B[k] * W[k]) / 2, written to bin k = i.
+ out[i << 1] = 0.5f * (big_a.Re - temp.Im);
+ out[(i << 1) + 1] = 0.5f * (temp.Re - big_a.Im);
+ // Mirrored bin k = j = N/2 - i, built from the same A and B * W terms.
+ out[j << 1] = 0.5f * (big_a.Re + temp.Im);
+ out[(j << 1) + 1] = 0.5f * (temp.Re + big_a.Im);
+ }
+
+ // Bin k = N/4 (interleaved index n_by_2): X[N/4] = (Re Z[N/4], -Im Z[N/4]).
+ out[n_by_2] = in[n_by_4];
+ out[n_by_2 + 1] = -in[n_by_4 + n_by_2];
+
+ // Bins k = 0 and k = N/2 are purely real:
+ // X[0] = Re Z[0] + Im Z[0], X[N/2] = Re Z[0] - Im Z[0].
+ out[0] = in[0] + in[n_by_2];
+ out[1] = 0;
+ out[n] = in[0] - in[n_by_2];
+ out[n + 1] = 0;
+}
+
+// Sse version of RevbinPermuteFwd function: same inputs, outputs and
+// formats, but four bins (and their four mirrored bins) are processed per
+// loop iteration. The caller in this file only uses it when n >= 8.
+// VC and the VC_* helpers are not defined in this file; presumably they come
+// from x86SP_SSE_Math.h and operate on four complex values held as separate
+// real/imag __m128 vectors - confirm there.
+static void RevbinPermuteFwdSse(
+ const OMX_F32 *in,
+ OMX_F32 *out,
+ const OMX_F32 *twiddle,
+ OMX_INT n) {
+ OMX_INT i;
+ OMX_INT j;
+ OMX_INT n_by_2 = n >> 1;
+ OMX_INT n_by_4 = n >> 2;
+
+ VC v_i;
+ VC v_j;
+ VC v_big_a;
+ VC v_big_b;
+ VC v_temp;
+ VC v_x0;
+ VC v_x1;
+ VC v_tw;
+
+ // Scaling by 1/2 that the scalar version applies with 0.5f.
+ __m128 factor = _mm_set1_ps(0.5f);
+
+ // i is the first of four ascending bins; j is the lowest index of the four
+ // mirrored bins j .. j + 3, so j + 3 starts at n_by_2.
+ // NOTE(review): the first iteration therefore appears to touch in[n_by_2]
+ // and in[n] (one element past each half of the split data) and to compute
+ // lanes for k = 0 and k = N/2; those outputs are rewritten by the scalar
+ // stores after the loop. Confirm the work buffers provide that padding.
+ for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) {
+ VC_LOAD_SPLIT(&v_i, (in + i), n_by_2);
+
+ // Unaligned load of the mirrored bins, then reverse so lanes pair with v_i.
+ VC_LOADU_SPLIT(&v_j, (in + j), n_by_2);
+ VC_REVERSE(&v_j);
+
+ // A[k] = (Z[k] + Z'[N/2 - k])
+ VC_ADD_SUB(&v_big_a, &v_j, &v_i);
+
+ // B[k] = -j * (Z[k] - Z'[N/2 - k])
+ VC_SUB_ADD(&v_big_b, &v_j, &v_i);
+
+ // W[k]: real parts at twiddle + i, imaginary parts n floats further on.
+ VC_LOAD_SPLIT(&v_tw, (twiddle + i), n);
+
+ // temp = B[k] * W[k]
+ VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw);
+
+ // v_x0 feeds bins i .. i + 3, v_x1 feeds the mirrored bins j .. j + 3.
+ VC_SUB_X(&v_x0, &v_big_a, &v_temp);
+ VC_ADD_X(&v_x1, &v_big_a, &v_temp);
+
+ VC_MUL_F(&v_x0, &v_x0, factor);
+ VC_MUL_F(&v_x1, &v_x1, factor);
+
+ // X[k] = (A[k] + B[k] * W[k]) / 2, stored interleaved at bins i .. i + 3.
+ VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0);
+
+ // Mirrored bins: restore ascending order, then store unaligned at bin j.
+ VC_REVERSE(&v_x1);
+ VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1);
+ }
+
+ // Bin k = N/4 (interleaved index n_by_2): X[N/4] = (Re Z[N/4], -Im Z[N/4]).
+ out[n_by_2] = in[n_by_4];
+ out[n_by_2 + 1] = -in[n_by_4 + n_by_2];
+
+ // Bins k = 0 and k = N/2 are purely real.
+ out[0] = in[0] + in[n_by_2];
+ out[1] = 0;
+ out[n] = in[0] - in[n_by_2];
+ out[n + 1] = 0;
+}
+
+/**
+ * Function: omxSP_FFTFwd_RToCCS_F32_Sfs
+ *
+ * Description:
+ * Computes the forward FFT of a real float sequence of length N and writes
+ * bins 0 .. N/2 in interleaved (CCS) format, i.e. N + 2 floats.
+ *
+ * Parameters:
+ * [in]  pSrc      N real input samples; must be 32 byte aligned.
+ * [out] pDst      N + 2 output floats; must be 32 byte aligned.
+ * [in]  pFFTSpec  specification initialized by omxSP_FFTInit_R_F32; its
+ *                 internal work buffers are overwritten by this call.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr on success; OMX_Sts_BadArgErr if any pointer is NULL or if
+ * pSrc / pDst is not 32 byte aligned.
+ */
+OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst,
+                                      const OMXFFTSpec_R_F32 *pFFTSpec) {
+  // All pointers are required (pFFTSpec is dereferenced below); pSrc and
+  // pDst must be 32 byte aligned. Only the low five address bits are tested,
+  // so the narrowing pointer-to-OMX_INT cast does not affect the result.
+  if (!pSrc || !pDst || !pFFTSpec ||
+      ((OMX_INT)pSrc & 31) || ((OMX_INT)pDst & 31))
+    return OMX_Sts_BadArgErr;
+
+  OMX_INT n;
+  OMX_INT n_by_2;
+  const OMX_F32 *twiddle;
+  OMX_F32 *buf;
+
+  const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;
+
+  n = pFFTStruct->N;
+
+  // This is to handle the case of order == 1.
+  if (n == 2) {
+    pDst[0] = (pSrc[0] + pSrc[1]);
+    pDst[1] = 0.0f;
+    pDst[2] = (pSrc[0] - pSrc[1]);
+    pDst[3] = 0.0f;
+    return OMX_Sts_NoErr;
+  }
+
+  n_by_2 = n >> 1;
+  buf = pFFTStruct->pBuf1;
+  twiddle = pFFTStruct->pTwiddle;
+
+  // Step 1: N/2 point complex FFT of the input viewed as interleaved complex
+  // data. The kernels ping-pong between the two work buffers and return the
+  // one that holds the result.
+  if (n_by_2 >= 16) {
+    buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
+        pSrc,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        true);
+  } else {
+    buf = x86SP_F32_radix2_kernel_OutOfPlace(
+        pSrc,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        true);
+  }
+
+  // Step 2: two-for-one post-processing into the CCS output. The SSE variant
+  // handles four bins per iteration, so it is only used for n >= 8.
+  if (n >= 8)
+    RevbinPermuteFwdSse(buf, pDst, twiddle, n);
+  else
+    RevbinPermuteFwd(buf, pDst, twiddle, n);
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c b/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c
new file mode 100644
index 0000000..f686a7f
--- /dev/null
+++ b/dl/sp/src/x86/omxSP_FFTGetBufSize_R_F32.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/x86SP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTGetBufSize_R_F32
+ *
+ * Description:
+ * Reports how many bytes the caller must allocate for the specification
+ * structure of the length 2^order real FFT and IFFT functions.
+ *
+ * Remarks:
+ * Used together with <FFTFwd_RToCCS_F32_Sfs> and <FFTInv_CCSToR_F32_Sfs>.
+ *
+ * Parameters:
+ * [in]  order  base-2 logarithm of the length; valid in the range
+ *              [1,12]. ([1,15] if BIG_FFT_TABLE is defined.)
+ * [out] pSize  receives the number of bytes required for the
+ *              specification structure.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr on success; OMX_Sts_BadArgErr when pSize is NULL or order
+ * is out of range.
+ *
+ */
+
+OMXResult omxSP_FFTGetBufSize_R_F32(OMX_INT order, OMX_INT *pSize) {
+  if (pSize && (order >= 1) && (order <= TWIDDLE_TABLE_ORDER)) {
+    // FFT length in real samples.
+    const OMX_INT fft_len = 1 << order;
+
+    // Header, followed by the split-format twiddle table (2 * N floats).
+    const OMX_INT header_bytes = sizeof(X86FFTSpec_R_FC32);
+    const OMX_INT twiddle_bytes = sizeof(OMX_F32) * (fft_len << 1);
+
+    // One ping-pong buffer for the N/2 point complex FFT (pBuf1 or pBuf2).
+    const OMX_INT work_buf_bytes = sizeof(OMX_F32) * fft_len + 4;
+
+    // Extra bytes to get 32 byte alignment of pTwiddle and pBuf1.
+    const OMX_INT align_slack_bytes = 62;
+
+    *pSize = header_bytes + twiddle_bytes + work_buf_bytes + work_buf_bytes +
+             align_slack_bytes;
+
+    return OMX_Sts_NoErr;
+  }
+
+  return OMX_Sts_BadArgErr;
+}
diff --git a/dl/sp/src/x86/omxSP_FFTInit_R_F32.c b/dl/sp/src/x86/omxSP_FFTInit_R_F32.c
new file mode 100644
index 0000000..564f166
--- /dev/null
+++ b/dl/sp/src/x86/omxSP_FFTInit_R_F32.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * This is a modification of omxSP_FFTInit_R_S32.c to support float
+ * instead of S32.
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/api/x86SP.h"
+
+/**
+ * Function: omxSP_FFTInit_R_F32
+ *
+ * Description:
+ * Initialize the real forward-FFT specification information struct.
+ *
+ * Remarks:
+ * This function is used to initialize the specification structures
+ * for functions |omxSP_FFTFwd_RToCCS_F32_Sfs| and
+ * |omxSP_FFTInv_CCSToR_F32_Sfs|. Memory for *pFFTSpec must be
+ * allocated prior to calling this function. The number of bytes
+ * required for *pFFTSpec can be determined using
+ * |omxSP_FFTGetBufSize_R_F32|.
+ *
+ * Parameters:
+ * [out] pFFTSpec pointer to the specification structure to initialize.
+ * [in] order base-2 logarithm of the desired block length;
+ * valid in the range [1,12]. ([1,15] if
+ * BIG_FFT_TABLE is defined.)
+ *
+ * Return Value:
+ * OMX_Sts_NoErr on success; OMX_Sts_BadArgErr if pFFTSpec is NULL or
+ * order is outside [1, TWIDDLE_TABLE_ORDER].
+ *
+ */
+
+OMXResult omxSP_FFTInit_R_F32(OMXFFTSpec_R_F32 *pFFTSpec, OMX_INT order)
+{
+ OMX_F32 *pTwiddle;
+ OMX_F32 *pBuf;
+ OMX_INT i;
+ OMX_INT j;
+ OMX_INT N;
+ OMX_INT NBy2;
+ OMX_INT NBy4;
+ OMX_INT diff;
+ OMX_U32 pTmp;
+ X86FFTSpec_R_FC32 *pFFTStruct = (X86FFTSpec_R_FC32 *) pFFTSpec;
+ OMX_F32 real;
+ OMX_F32 imag;
+
+ if (!pFFTSpec || (order < 1) || (order > TWIDDLE_TABLE_ORDER))
+ return OMX_Sts_BadArgErr;
+
+ N = 1 << order;
+ NBy2 = N >> 1;
+
+ // The twiddle table starts right after the header struct.
+ pTwiddle = (OMX_F32*) (sizeof(X86FFTSpec_R_FC32) + (OMX_S8*) pFFTSpec);
+
+ // Align to 32 byte boundary.
+ // NOTE(review): the pointer is cast to a 32-bit integer; only its low five
+ // bits are used here, but the cast narrows the address on 64-bit targets.
+ pTmp = ((OMX_U32)pTwiddle) & 31;
+ if (pTmp)
+ pTwiddle = (OMX_F32*) ((OMX_S8*)pTwiddle + (32 - pTmp));
+
+ // The work buffers follow the 2 * N floats of the twiddle table.
+ pBuf = (OMX_F32*) (sizeof(OMX_F32) * (N << 1) + (OMX_S8*) pTwiddle);
+
+ // Align to 32 byte boundary.
+ pTmp = ((OMX_U32)pBuf) & 31;
+ if (pTmp)
+ pBuf = (OMX_F32*) ((OMX_S8*)pBuf + (32 - pTmp));
+
+ // Calculating Twiddle Factors.
+ // diff is the stride (in floats) used to step through
+ // armSP_FFT_F32TwiddleTable for this order.
+ diff = 1 << (TWIDDLE_TABLE_ORDER - order + 1);
+
+ // For SSE optimization, using twiddle with split format by which the real and
+ // imag data are stored into first and last halves of the buffer separately
+ // The negatives are moved when generating pTwiddle table.
+ if (order > 1) {
+ NBy4 = N >> 2;
+ // Each (real, imag) pair read from the source table is written, with the
+ // sign and real/imag swap shown, to eight positions of the split table:
+ // real half at pTwiddle[idx], imaginary half at pTwiddle[idx + N].
+ // Some positions are written more than once across the eight stores and
+ // across iterations, so the statement order below matters.
+ for (i = 0, j = 0; i <= NBy4 >> 1; ++i, j += diff) {
+ real = armSP_FFT_F32TwiddleTable[j];
+ imag = armSP_FFT_F32TwiddleTable[j + 1];
+
+ pTwiddle[i] = -real;
+ pTwiddle[i + N] = -imag;
+
+ pTwiddle[NBy4 - i] = imag;
+ pTwiddle[NBy4 - i + N] = real;
+
+ pTwiddle[NBy4 + i] = -imag;
+ pTwiddle[NBy4 + i + N] = real;
+
+ pTwiddle[NBy2 - i] = real;
+ pTwiddle[NBy2 - i + N] = -imag;
+
+ pTwiddle[NBy2 + i] = real;
+ pTwiddle[NBy2 + i + N] = imag;
+
+ pTwiddle[NBy4 * 3 - i] = -imag;
+ pTwiddle[NBy4 * 3 - i + N] = -real;
+
+ pTwiddle[NBy4 * 3 + i] = imag;
+ pTwiddle[NBy4 * 3 + i + N] = -real;
+
+ pTwiddle[N - i - 1] = -real;
+ pTwiddle[(N << 1) - i - 1] = imag;
+ }
+ } else {
+ // order == 1 (N == 2): the four table entries are filled in directly.
+ pTwiddle[0] = armSP_FFT_F32TwiddleTable[0];
+ pTwiddle[2] = armSP_FFT_F32TwiddleTable[1];
+ pTwiddle[1] = -pTwiddle[0];
+ pTwiddle[3] = pTwiddle[2];
+ }
+ pFFTStruct->N = N;
+ pFFTStruct->pTwiddle = pTwiddle;
+ pFFTStruct->pBuf1 = pBuf;
+ // pBuf2 starts N + 4 floats after pBuf1.
+ // NOTE(review): omxSP_FFTGetBufSize_R_F32 budgets "+ 4" bytes per buffer
+ // while the gap here is 4 floats; the total appears to be covered by the
+ // alignment slack, but confirm the two stay in sync.
+ pFFTStruct->pBuf2 = pBuf + N + 4;
+
+ return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c b/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c
new file mode 100644
index 0000000..1733d66
--- /dev/null
+++ b/dl/sp/src/x86/omxSP_FFTInv_CCSToR_F32_Sfs.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/omxSP.h"
+#include "dl/sp/api/x86SP.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+#include <stdbool.h>
+
+extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
+ const OMX_F32 *src,
+ OMX_F32 *buf1,
+ OMX_F32 *buf2,
+ const OMX_F32 *twiddle,
+ OMX_INT n,
+ bool forward_fft);
+
+extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
+ const OMX_F32 *src,
+ OMX_F32 *buf1,
+ OMX_F32 *buf2,
+ const OMX_F32 *twiddle,
+ OMX_INT n,
+ bool forward_fft);
+
+/**
+ * A two-for-one algorithm is used here to do the real ifft:
+ *
+ * Input X[k], (k = 0, ..., N - 1)
+ * Output x[n] = IDFT(N, k){X}
+ * X' is complex conjugate of X
+ * A[k] = (X[k] + X'[N/2 - k]) / 2
+ * B[k] = (X[k] - X'[N/2 - k]) / 2 * W[k], (W = exp(j*2*PI*k/N);
+ * k = 0, ..., N/2 - 1)
+ * Z[k] = A[k] + j * B[k], (k = 0, ..., N/2 - 1)
+ * z[n] = IDFT(N/2, k){Z}
+ * x[2n] = Re(z[n]), (n = 0, ..., N/2 - 1)
+ * x[2n + 1] = Im(z[n]), (n = 0, ..., N/2 - 1)
+ */
+
+/**
+ * This function is the first permutation of two-for-one IFFT algorithm: it
+ * turns the CCS input X (bins 0 .. N/2) into the N/2 point complex
+ * spectrum Z that is fed to the complex inverse FFT.
+ * We move the division by 2 to the last step in the implementation, so:
+ * A[k] = (X[k] + X'[N/2 - k])
+ * B[k] = (X[k] - X'[N/2 - k]) * W[k], (k = 0, ..., N/2 - 1)
+ * Z[k] = (A[k] + j * B[k]) / 2, (k = 0, ..., N/2 - 1)
+ *
+ * in: X in interleaved format; X[k] is (in[2k], in[2k + 1]). Elements up
+ * to in[n] are read.
+ * out: Z in split format; Re(Z[k]) is out[k], Im(Z[k]) is out[k + n/2].
+ * twiddle: split format table; twiddle[k] and twiddle[k + n] are read.
+ * n: length of the real sequence.
+ */
+static void RevbinPermuteInv(const OMX_F32 *in,
+ OMX_F32 *out,
+ const OMX_F32 *twiddle,
+ OMX_INT n) {
+ OMX_INT i;
+ OMX_INT j;
+ OMX_INT i_by_2;
+ OMX_INT j_by_2;
+ OMX_INT n_by_2 = n >> 1;
+ OMX_INT n_by_4 = n >> 2;
+
+ OMX_FC32 big_a;
+ OMX_FC32 big_b;
+ OMX_FC32 temp;
+ const OMX_F32 *tw;
+
+ // i and j are float offsets into the interleaved input: i addresses bin
+ // k = i / 2 (1 .. N/4 - 1) and j = n - i addresses the mirrored bin
+ // N/2 - k. Bins 0, N/4 and N/2 are handled after the loop.
+ for (i = 2, j = n - 2; i < n_by_2; i += 2, j -= 2) {
+ // A[k] = (X[k] + X'[N/2 - k])
+ big_a.Re = in[i] + in[j];
+ big_a.Im = in[i + 1] - in[j + 1];
+
+ // temp = (X[k] - X'[N/2 - k])
+ temp.Re = in[i] - in[j];
+ temp.Im = in[i + 1] + in[j + 1];
+
+ // Bin numbers of the two outputs produced by this iteration.
+ i_by_2 = i >> 1;
+ j_by_2 = j >> 1;
+
+ // W[k]: real part at tw[0], imaginary part at tw[n] (split format).
+ tw = twiddle + i_by_2;
+
+ // B[k] = (X[k] - X'[N/2 - k]) * W[k]
+ big_b.Re = temp.Re * tw[0] + temp.Im * tw[n];
+ big_b.Im = temp.Re * tw[n] - temp.Im * tw[0];
+
+ // Convert interleaved format to split format.
+ // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1)
+ // The scaling of 1/2 will be merged into to the scaling in
+ // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs.
+ out[i_by_2] = big_a.Re + big_b.Im;
+ out[i_by_2 + n_by_2] = big_b.Re + big_a.Im;
+ out[j_by_2] = big_a.Re - big_b.Im;
+ out[j_by_2 + n_by_2] = big_b.Re - big_a.Im;
+ }
+
+ // Bin k = N/4 (interleaved index n_by_2): Z[N/4] = 2 * (Re X, -Im X).
+ out[n_by_4] = 2.0f * in[n_by_2];
+ out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1];
+
+ // The first complex point, built from the real parts of X[0] and X[N/2].
+ out[0] = in[0] + in[n];
+ out[n_by_2] = in[0] - in[n];
+}
+
+// Sse version of RevbinPermuteInv function: same inputs, outputs and
+// formats, but four bins (and their four mirrored bins) are processed per
+// loop iteration. The caller in this file only uses it when n >= 8.
+// VC and the VC_* helpers are not defined in this file; presumably they come
+// from x86SP_SSE_Math.h - confirm their exact semantics there.
+static void RevbinPermuteInvSse(const OMX_F32 *in,
+ OMX_F32 *out,
+ const OMX_F32 *twiddle,
+ OMX_INT n) {
+ OMX_INT i;
+ OMX_INT j;
+ OMX_INT n_by_2 = n >> 1;
+ OMX_INT n_by_4 = n >> 2;
+ const OMX_F32 *tw;
+ const OMX_F32 *pi;
+ const OMX_F32 *pj;
+
+ VC v_i;
+ VC v_j;
+ VC v_big_a;
+ VC v_big_b;
+ VC v_temp;
+ VC v_tw;
+
+ // i is the first of four ascending bins; j is the lowest of the four
+ // mirrored bins j .. j + 3, so j + 3 starts at bin N/2.
+ for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) {
+ // Bins are interleaved in the input, hence the factor of two.
+ pi = in + (i << 1);
+ pj = in + (j << 1);
+ VC_LOAD_INTERLEAVE(&v_i, pi);
+
+ // _mm_set_ps lists lanes from highest to lowest, so lane 0 receives bin
+ // j + 3 and lane 3 receives bin j: the mirrored bins are de-interleaved
+ // and reversed in one step.
+ v_j.real = _mm_set_ps(pj[0], pj[2], pj[4], pj[6]);
+ v_j.imag = _mm_set_ps(pj[1], pj[3], pj[5], pj[7]);
+
+ // A[k] = (X[k] + X'[N/2 - k])
+ VC_ADD_SUB(&v_big_a, &v_i, &v_j);
+
+ // temp = (X[k] - X'[N/2 - k])
+ VC_SUB_ADD(&v_temp, &v_i, &v_j);
+
+ // W[k]: real parts at tw, imaginary parts n floats further on.
+ tw = twiddle + i;
+ VC_LOAD_SPLIT(&v_tw, tw, n);
+
+ // B[k] = (X[k] - X'[N/2 - k]) * W[k]
+ VC_CONJ_MUL(&v_big_b, &v_temp, &v_tw);
+
+ // Convert interleaved format to split format.
+ // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1)
+ // The scaling of 1/2 will be merged into to the scaling in
+ // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs.
+ VC_ADD_X_STORE_SPLIT((out + i), &v_big_a, &v_big_b, n_by_2);
+
+ // Mirrored bins j .. j + 3.
+ // NOTE(review): with j + 3 == n_by_2 on the first iteration this store
+ // presumably writes out[n_by_2] and out[n] as well; out[n_by_2] is
+ // rewritten below - confirm the work buffer has room for out[n].
+ VC_SUB_X_INVERSE_STOREU_SPLIT((out + j), &v_big_a, &v_big_b, n_by_2);
+ }
+
+ // Bin k = N/4 (interleaved index n_by_2): Z[N/4] = 2 * (Re X, -Im X).
+ out[n_by_4] = 2.0f * in[n_by_2];
+ out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1];
+
+ // The first complex point, built from the real parts of X[0] and X[N/2].
+ out[0] = in[0] + in[n];
+ out[n_by_2] = in[0] - in[n];
+}
+
+/**
+ * Function: omxSP_FFTInv_CCSToR_F32_Sfs
+ *
+ * Description:
+ * Computes the inverse FFT of a CCS-format spectrum (bins 0 .. N/2,
+ * interleaved, N + 2 floats) and writes N real samples scaled by 1/N.
+ *
+ * Parameters:
+ * [in]  pSrc      N + 2 input floats; must be 32 byte aligned.
+ * [out] pDst      N real output samples; must be 32 byte aligned.
+ * [in]  pFFTSpec  specification initialized by omxSP_FFTInit_R_F32; its
+ *                 internal work buffers are overwritten by this call.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr on success; OMX_Sts_BadArgErr if any pointer is NULL or if
+ * pSrc / pDst is not 32 byte aligned.
+ */
+OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst,
+                                      const OMXFFTSpec_R_F32 *pFFTSpec) {
+  // All pointers are required (pFFTSpec is dereferenced below); pSrc and
+  // pDst must be 32 byte aligned. Only the low five address bits are tested,
+  // so the narrowing pointer-to-OMX_INT cast does not affect the result.
+  if (!pSrc || !pDst || !pFFTSpec ||
+      ((OMX_INT)pSrc & 31) || ((OMX_INT)pDst & 31))
+    return OMX_Sts_BadArgErr;
+
+  OMX_INT n;
+  OMX_INT n_by_2;
+  OMX_INT i;
+  const OMX_F32 *twiddle;
+  OMX_F32 *buf;
+
+  const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;
+
+  n = pFFTStruct->N;
+
+  // This is to handle the case of order == 1.
+  if (n == 2) {
+    pDst[0] = (pSrc[0] + pSrc[2]) / 2;
+    pDst[1] = (pSrc[0] - pSrc[2]) / 2;
+    return OMX_Sts_NoErr;
+  }
+
+  n_by_2 = n >> 1;
+  buf = pFFTStruct->pBuf1;
+
+  twiddle = pFFTStruct->pTwiddle;
+
+  // Step 1: two-for-one pre-processing of the CCS input into an N/2 point
+  // complex spectrum in split format. The SSE variant handles four bins per
+  // iteration, so it is only used for n >= 8.
+  if (n < 8)
+    RevbinPermuteInv(pSrc, buf, twiddle, n);
+  else
+    RevbinPermuteInvSse(pSrc, buf, twiddle, n);
+
+  // Step 2: N/2 point complex inverse FFT. The kernels ping-pong between the
+  // two work buffers and return the one that holds the result.
+  if (n_by_2 < 16) {
+    buf = x86SP_F32_radix2_kernel_OutOfPlace(
+        buf,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        false);
+  } else {
+    buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
+        buf,
+        pFFTStruct->pBuf2,
+        buf,
+        twiddle,
+        n_by_2,
+        false);
+  }
+
+  // Scale the result by 1/n.
+  // It contains a scaling factor of 1/2 in
+  // RevbinPermuteInv/RevbinPermuteInvSse.
+  OMX_F32 factor = 1.0f / n;
+
+  // Step 3: scale and convert from split format (real parts, then imaginary
+  // parts at offset n_by_2) to the interleaved real output.
+  if (n < 8) {
+    for (i = 0; i < n_by_2; i++) {
+      pDst[i << 1] = buf[i] * factor;
+      pDst[(i << 1) + 1] = buf[i + n_by_2] * factor;
+    }
+  } else {
+    OMX_F32 *base;
+    OMX_F32 *dst;
+    VC temp0;
+    VC temp1;
+    __m128 mFactor = _mm_load1_ps(&factor);
+
+    // Two things are done in this loop:
+    // 1 Get the result scaled; 2 Change the format from split to interleaved.
+    for (i = 0; i < n_by_2; i += 4) {
+      base = buf + i;
+      dst = pDst + (i << 1);
+      VC_LOAD_SPLIT(&temp0, base, n_by_2);
+      VC_MUL_F(&temp1, &temp0, mFactor);
+      VC_STORE_INTERLEAVE(dst, &temp1);
+    }
+  }
+
+  return OMX_Sts_NoErr;
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c
new file mode 100644
index 0000000..6fa21cf
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_fs.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// First-stage radix-2 butterfly of the forward complex FFT.
+// Reads complex values stored as adjacent (re, im) pairs, pairing the value
+// at float offset s with the one n floats further on, and writes the sum
+// and difference in split format: real parts at out[d] / out[d + n/2],
+// imaginary parts n floats after the corresponding real part.
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n) {
+  const OMX_INT half = n >> 1;
+  OMX_INT s;  // float offset of the current input pair
+  OMX_INT d;  // index of the current output butterfly
+
+  for (s = 0, d = 0; s < n; s += 2, ++d) {
+    // Sum of the two inputs.
+    out[d] = in[s] + in[s + n];
+    out[d + n] = in[s + 1] + in[s + n + 1];
+
+    // Difference of the two inputs.
+    out[d + half] = in[s] - in[s + n];
+    out[d + half + n] = in[s + 1] - in[s + n + 1];
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c
new file mode 100644
index 0000000..f4d991c
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// Last-stage radix-2 butterfly of the forward complex FFT.
+// Input and output are in split format with the imaginary parts n floats
+// after the real parts. Each butterfly takes the neighbouring elements
+// in[s] and in[s + 1], multiplies the second by the twiddle factor
+// (twiddle[s], twiddle[s + 2n]) and writes sum and difference to out[d] and
+// out[d + n/2].
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  const OMX_INT half = n >> 1;
+  const OMX_INT tw_imag = n << 1;  // offset of the imaginary twiddle half
+  OMX_INT s;  // index of the first input of the butterfly
+  OMX_INT d;  // index of the current output butterfly
+
+  for (s = 0, d = 0; s < n; s += 2, ++d) {
+    const OMX_F32 w_re = twiddle[s];
+    const OMX_F32 w_im = twiddle[s + tw_imag];
+    const OMX_F32 x_re = in[s + 1];
+    const OMX_F32 x_im = in[s + 1 + n];
+
+    // Complex product of the twiddle factor and the second input.
+    const OMX_F32 prod_re = w_re * x_re - w_im * x_im;
+    const OMX_F32 prod_im = w_re * x_im + w_im * x_re;
+
+    // First input plus the product.
+    out[d] = in[s] + prod_re;
+    out[d + n] = in[s + n] + prod_im;
+
+    // First input minus the product.
+    out[d + half] = in[s] - prod_re;
+    out[d + half + n] = in[s + n] - prod_im;
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c
new file mode 100644
index 0000000..a712d96
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse(  // SSE forward radix-2 pass with twiddles: 8 input slots per loop pass.
+ const OMX_F32 *in,  // Split complex input: Re row at in, Im row at in + n.
+ OMX_F32 *out,  // Split complex output; n is handed to the store macros as the Re-to-Im offset.
+ const OMX_F32 *twiddle,  // Twiddle factors: Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n) {  // Loop bound and Re-to-Im offset; must be at least 8.
+ OMX_F32 *out0 = out;
+ OMX_INT i;
+
+ // This function is used when n >= 8
+ assert(n >= 8);
+ if (n < 8) return;  // Keeps the guard when assert() is compiled out; the loop indexes tw[6].
+
+ for (i = 0; i < n; i += 8) {
+ VC v_tw;
+ VC v_t0;
+ VC v_t1;
+ VC v_temp;
+
+ // Load twiddle: every second entry, real parts first, then the imaginary row
+ const OMX_F32 *tw = twiddle + i;
+ v_tw.real = _mm_set_ps(tw[6], tw[4], tw[2], tw[0]);  // Lane 0 = tw[0] ... lane 3 = tw[6].
+ const OMX_F32 * twi = tw + (n << 1);
+ v_tw.imag = _mm_set_ps(twi[6], twi[4], twi[2], twi[0]);
+
+ // Load real part
+ const OMX_F32 *t = in + i;
+ VC_LOAD_SHUFFLE(&(v_t0.real), &(v_t1.real), t);  // NOTE(review): presumably even slots -> v_t0, odd -> v_t1; confirm in x86SP_SSE_Math.h.
+
+ // Load imag part
+ t = t + n;
+ VC_LOAD_SHUFFLE(&(v_t0.imag), &(v_t1.imag), t);
+
+ OMX_F32 *out1 = out0 + (n >> 1);  // Differences are stored n / 2 slots after the sums.
+ VC_MUL(&v_temp, &v_tw, &v_t1);  // Presumably v_temp = v_tw * v_t1 (complex), mirroring the scalar CMUL.
+
+ VC_SUB_STORE_SPLIT(out1, &v_t0, &v_temp, n);  // Presumably out1 = v_t0 - v_temp.
+
+ VC_ADD_STORE_SPLIT(out0, &v_t0, &v_temp, n);  // Presumably out0 = v_t0 + v_temp.
+
+ out0 += 4;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c
new file mode 100644
index 0000000..3714877
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix2_ms.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ms(  // Forward radix-2 butterfly pass over groups sharing one twiddle (scalar).
+ const OMX_F32 *in,  // Split complex input: Re at in[k], Im at in[k + n].
+ OMX_F32 *out,  // Split complex output: Re at out[k], Im at out[k + n].
+ const OMX_F32 *twiddle,  // Twiddle factors: Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n,  // Re-to-Im offset; presumably sub_size * sub_num -- confirm with caller.
+ OMX_INT sub_size,  // Number of groups iterated by the outer loop.
+ OMX_INT sub_num) {  // Input values per group (stride between consecutive groups).
+ OMX_INT grp;
+ OMX_F32 *out0 = out;
+ OMX_INT set_count = sub_num >> 1;  // Butterflies per group.
+
+ for (grp = 0; grp < sub_size; ++grp) {
+ OMX_INT set;
+ const OMX_F32 *tw = twiddle + grp * sub_num;  // One twiddle per group, reused for every set.
+
+ for (set = 0; set < set_count; ++set) {
+ OMX_FC32 t;
+ const OMX_F32 *in0 = in + set + grp * sub_num;
+ const OMX_F32 *in1 = in0 + set_count;  // Partner is half a group further on.
+ OMX_F32 *out1 = out0 + (n >> 1);  // Differences are stored n / 2 slots after the sums.
+
+ // CMUL t, tw, in1  (t = tw * in1, complex product)
+ t.Re = tw[0] * in1[0] - tw[n << 1] * in1[n];
+ t.Im = tw[0] * in1[n] + tw[n << 1] * in1[0];
+
+ // CADD out0, in0, t  (out0 = in0 + t)
+ out0[0] = in0[0] + t.Re;
+ out0[n] = in0[n] + t.Im;
+
+ // CSUB out1, in0, t  (out1 = in0 - t)
+ out1[0] = in0[0] - t.Re;
+ out1[n] = in0[n] - t.Im;
+
+ out0 += 1;  // Output advances contiguously across sets and groups.
+ }
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c
new file mode 100644
index 0000000..36a40d8
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs(  // Forward radix-4 butterfly pass without twiddles (scalar).
+ const OMX_F32 *in,  // Interleaved complex input: Re at in[2 * k], Im at in[2 * k + 1].
+ OMX_F32 *out,  // Split complex output: Re at out[k], Im at out[k + n].
+ OMX_INT n) {  // Number of complex values; also the Re-to-Im offset in out.
+ OMX_INT i;
+ OMX_INT n_by_4 = n >> 2;
+
+ // Transform from interleaved format to split format.
+ for (i = 0; i < n; i++) {
+ out[i] = in[i << 1];
+ out[i + n] = in[(i << 1) + 1];
+ }
+
+ // As we have already moved data from [in] to [out],
+ // the butterflies below are computed in place on [out].
+ for (i = 0; i < n_by_4; i++) {
+ OMX_F32 *out0 = out + i;  // The four legs sit n / 4 slots apart.
+ OMX_F32 *out1 = out0 + n_by_4;
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ OMX_FC32 t0;
+ OMX_FC32 t1;
+ OMX_FC32 t2;
+ OMX_FC32 t3;
+
+ // CADD t0, out0, out2  (t0 = out0 + out2)
+ t0.Re = out0[0] + out2[0];
+ t0.Im = out0[n] + out2[n];
+
+ // CSUB t1, out0, out2  (t1 = out0 - out2)
+ t1.Re = out0[0] - out2[0];
+ t1.Im = out0[n] - out2[n];
+
+ // CADD t2, out1, out3  (t2 = out1 + out3)
+ t2.Re = out1[0] + out3[0];
+ t2.Im = out1[n] + out3[n];
+
+ // CSUB t3, out1, out3  (t3 = out1 - out3)
+ t3.Re = out1[0] - out3[0];
+ t3.Im = out1[n] - out3[n];
+
+ // CADD out0, t0, t2  (out0 = t0 + t2)
+ out0[0] = t0.Re + t2.Re;
+ out0[n] = t0.Im + t2.Im;
+
+ // CSUB out2, t0, t2  (out2 = t0 - t2)
+ out2[0] = t0.Re - t2.Re;
+ out2[n] = t0.Im - t2.Im;
+
+ // CADD_SUB_X out1, t1, t3  (out1 = t1 - j * t3)
+ out1[0] = t1.Re + t3.Im;
+ out1[n] = t1.Im - t3.Re;
+
+ // CSUB_ADD_X out3, t1, t3  (out3 = t1 + j * t3)
+ out3[0] = t1.Re - t3.Im;
+ out3[n] = t1.Im + t3.Re;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c
new file mode 100644
index 0000000..58908d3
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse(  // SSE forward radix-4 pass without twiddles: 8 floats per leg per loop pass.
+ const OMX_F32 *in,  // Input read 8 floats at a time; presumably interleaved Re/Im like the scalar _fs version -- confirm.
+ OMX_F32 *out,  // Output; n is handed to the store macro, presumably as the Re-to-Im offset.
+ OMX_INT n) {  // Sizes the leg spacing (n / 2 floats in, n / 4 slots out) and the loop bound.
+ OMX_INT i;
+ OMX_INT n_by_2 = n >> 1;
+ OMX_INT n_by_4 = n >> 2;
+ OMX_F32 *out0 = out;
+
+ for (i = 0; i < n_by_2; i += 8) {  // i is a float index into the input.
+ VC v_t0;
+ VC v_t1;
+ VC v_t2;
+ VC v_t3;
+ VC v_t4;
+ VC v_t5;
+ VC v_t6;
+ VC v_t7;
+
+ const OMX_F32 *in0 = in + i;  // The four input legs sit n / 2 floats apart.
+ const OMX_F32 *in1 = in0 + n_by_2;
+ const OMX_F32 *in2 = in1 + n_by_2;
+ const OMX_F32 *in3 = in2 + n_by_2;
+
+ OMX_F32 *out1 = out0 + n_by_4;  // The four output legs sit n / 4 slots apart.
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ VC_LOAD_SHUFFLE(&(v_t0.real), &(v_t0.imag), in0);  // NOTE(review): presumably de-interleaves 8 floats into Re/Im lanes; confirm in x86SP_SSE_Math.h.
+ VC_LOAD_SHUFFLE(&(v_t1.real), &(v_t1.imag), in1);
+ VC_LOAD_SHUFFLE(&(v_t2.real), &(v_t2.imag), in2);
+ VC_LOAD_SHUFFLE(&(v_t3.real), &(v_t3.imag), in3);
+
+ RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
+ &v_t0, &v_t1, &v_t2, &v_t3);  // Outputs first (v_t4..v_t7), then inputs (v_t0..v_t3).
+
+ RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+ &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+ out0 += 4;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c
new file mode 100644
index 0000000..08ab35b
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls(  // Forward radix-4 butterfly pass with twiddle multiplies (scalar).
+ const OMX_F32 *in,  // Split complex input: Re at in[k], Im at in[k + n].
+ OMX_F32 *out,  // Split complex output: Re at out[k], Im at out[k + n].
+ const OMX_F32 *twiddle,  // Twiddle factors: Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n) {  // Number of complex values processed; also the Re-to-Im offset.
+ OMX_INT n_by_2 = n >> 1;
+ OMX_INT n_by_4 = n >> 2;
+ OMX_INT n_mul_2 = n << 1;
+ OMX_INT i;
+ OMX_F32 *out0 = out;
+
+ for (i = 0; i < n_by_2; i += 2) {  // n / 4 passes, each consuming four adjacent inputs.
+ OMX_FC32 t0;
+ OMX_FC32 t1;
+ OMX_FC32 t2;
+ OMX_FC32 t3;
+ OMX_FC32 tt1;
+ OMX_FC32 tt2;
+ OMX_FC32 tt3;
+ const OMX_F32 *tw1 = twiddle + i;  // Twiddle indices are i, 2 * i and 3 * i.
+ const OMX_F32 *tw2 = tw1 + i;
+ const OMX_F32 *tw3 = tw2 + i;
+ const OMX_F32 *in0 = in + (i << 1);  // Four consecutive inputs starting at 2 * i.
+ const OMX_F32 *in1 = in0 + 1;
+ const OMX_F32 *in2 = in1 + 1;
+ const OMX_F32 *in3 = in2 + 1;
+ OMX_F32 *out1 = out0 + n_by_4;  // The four output legs sit n / 4 slots apart.
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ // CMUL tt1, tw1, in1  (tt1 = tw1 * in1)
+ tt1.Re = tw1[0] * in1[0] - tw1[n_mul_2] * in1[n];
+ tt1.Im = tw1[0] * in1[n] + tw1[n_mul_2] * in1[0];
+
+ // CMUL tt2, tw2, in2  (tt2 = tw2 * in2)
+ tt2.Re = tw2[0] * in2[0] - tw2[n_mul_2] * in2[n];
+ tt2.Im = tw2[0] * in2[n] + tw2[n_mul_2] * in2[0];
+
+ // CMUL tt3, tw3, in3  (tt3 = tw3 * in3)
+ tt3.Re = tw3[0] * in3[0] - tw3[n_mul_2] * in3[n];
+ tt3.Im = tw3[0] * in3[n] + tw3[n_mul_2] * in3[0];
+
+ // CADD t0, in0, tt2  (t0 = in0 + tt2)
+ t0.Re = in0[0] + tt2.Re;
+ t0.Im = in0[n] + tt2.Im;
+
+ // CSUB t1, in0, tt2  (t1 = in0 - tt2)
+ t1.Re = in0[0] - tt2.Re;
+ t1.Im = in0[n] - tt2.Im;
+
+ // CADD t2, tt1, tt3  (t2 = tt1 + tt3)
+ t2.Re = tt1.Re + tt3.Re;
+ t2.Im = tt1.Im + tt3.Im;
+
+ // CSUB t3, tt1, tt3  (t3 = tt1 - tt3)
+ t3.Re = tt1.Re - tt3.Re;
+ t3.Im = tt1.Im - tt3.Im;
+
+ // CADD out0, t0, t2  (out0 = t0 + t2)
+ out0[0] = t0.Re + t2.Re;
+ out0[n] = t0.Im + t2.Im;
+
+ // CSUB out2, t0, t2  (out2 = t0 - t2)
+ out2[0] = t0.Re - t2.Re;
+ out2[n] = t0.Im - t2.Im;
+
+ // CADD_SUB_X out1, t1, t3  (out1 = t1 - j * t3)
+ out1[0] = t1.Re + t3.Im;
+ out1[n] = t1.Im - t3.Re;
+
+ // CSUB_ADD_X out3, t1, t3  (out3 = t1 + j * t3)
+ out3[0] = t1.Re - t3.Im;
+ out3[n] = t1.Im + t3.Re;
+
+ out0 += 1;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c
new file mode 100644
index 0000000..4fc3427
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse(  // SSE forward radix-4 pass with twiddles: 16 input slots per loop pass.
+ const OMX_F32 *in,  // Input; n is handed to the load macro, presumably as the Re-to-Im offset.
+ OMX_F32 *out,  // Output; n is handed to the store macro, presumably as the Re-to-Im offset.
+ const OMX_F32 *twiddle,  // Twiddle factors: Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n) {  // Sizes the loop bound, the leg spacing and the Re/Im offsets.
+ OMX_INT n_by_2 = n >> 1;
+ OMX_INT n_by_4 = n >> 2;
+ OMX_INT n_mul_2 = n << 1;
+ OMX_INT i;
+
+ OMX_F32 *out0 = out;
+
+ for (i = 0; i < n_by_2; i += 8) {  // Same index scheme as the scalar _ls loop, four steps at a time.
+ const OMX_F32 *tw1 = twiddle + i;  // Base twiddle indices i, 2 * i and 3 * i.
+ const OMX_F32 *tw2 = tw1 + i;
+ const OMX_F32 *tw3 = tw2 + i;
+ const OMX_F32 *in0 = in + (i << 1);  // Four consecutive 4-float rows starting at 2 * i.
+ const OMX_F32 *in1 = in0 + 4;
+ const OMX_F32 *in2 = in1 + 4;
+ const OMX_F32 *in3 = in2 + 4;
+ OMX_F32 *out1 = out0 + n_by_4;  // The four output legs sit n / 4 slots apart.
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ VC v_tw1;
+ VC v_tw2;
+ VC v_tw3;
+ VC v_t0;
+ VC v_t1;
+ VC v_t2;
+ VC v_t3;
+ VC v_t4;
+ VC v_t5;
+ VC v_t6;
+ VC v_t7;
+
+ v_tw1.real = _mm_set_ps(tw1[6], tw1[4], tw1[2], tw1[0]);  // Stride 2; lane 0 = tw1[0].
+ v_tw1.imag = _mm_set_ps(
+ tw1[6 + n_mul_2],
+ tw1[4 + n_mul_2],
+ tw1[2 + n_mul_2],
+ tw1[n_mul_2]);
+ v_tw2.real = _mm_set_ps(tw2[12], tw2[8], tw2[4], tw2[0]);  // Stride 4; lane 0 = tw2[0].
+ v_tw2.imag = _mm_set_ps(
+ tw2[12 + n_mul_2],
+ tw2[8 + n_mul_2],
+ tw2[4 + n_mul_2],
+ tw2[n_mul_2]);
+ v_tw3.real = _mm_set_ps(tw3[18], tw3[12], tw3[6], tw3[0]);  // Stride 6; lane 0 = tw3[0].
+ v_tw3.imag = _mm_set_ps(
+ tw3[18 + n_mul_2],
+ tw3[12 + n_mul_2],
+ tw3[6 + n_mul_2],
+ tw3[n_mul_2]);
+
+ VC_LOAD_MATRIX_TRANSPOSE(&v_t0, &v_t1, &v_t2, &v_t3, in0, in1, in2, in3, n);  // NOTE(review): presumably a 4x4 transpose so each v_tK holds one butterfly leg; confirm in x86SP_SSE_Math.h.
+
+ RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
+ &v_tw1, &v_tw2, &v_tw3,
+ &v_t0, &v_t1, &v_t2, &v_t3);  // Outputs, then twiddles, then inputs.
+
+ RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+ &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+ out0 += 4;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c
new file mode 100644
index 0000000..de2a1be
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms(  // Forward radix-4 butterfly pass over groups sharing twiddles (scalar).
+ const OMX_F32 *in,  // Split complex input: Re at in[k], Im at in[k + n].
+ OMX_F32 *out,  // Split complex output: Re at out[k], Im at out[k + n].
+ const OMX_F32 *twiddle,  // Twiddle factors: Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n,  // Re-to-Im offset; presumably sub_size * sub_num -- confirm with caller.
+ OMX_INT sub_size,  // Number of groups (group 0 is handled separately below).
+ OMX_INT sub_num) {  // Input values per group (stride between consecutive groups).
+ OMX_INT set;
+ OMX_INT grp;
+ OMX_INT step = sub_num >> 1;  // Twiddle index increment per group for tw1.
+ OMX_INT set_count = sub_num >> 2;  // Butterflies per group; also the spacing of the four legs.
+ OMX_INT n_by_4 = n >> 2;
+ OMX_INT n_mul_2 = n << 1;
+ OMX_F32 *out0 = out;
+
+ // grp == 0: no twiddle multiply (presumably the group-0 twiddles are all 1 -- confirm)
+ for (set = 0; set < set_count; ++set) {
+ OMX_FC32 t0;
+ OMX_FC32 t1;
+ OMX_FC32 t2;
+ OMX_FC32 t3;
+
+ const OMX_F32 *in0 = in + set;
+ const OMX_F32 *in1 = in0 + set_count;
+ const OMX_F32 *in2 = in1 + set_count;
+ const OMX_F32 *in3 = in2 + set_count;
+ OMX_F32 *out1 = out0 + n_by_4;  // The four output legs sit n / 4 slots apart.
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ // CADD t0, in0, in2  (t0 = in0 + in2)
+ t0.Re = in0[0] + in2[0];
+ t0.Im = in0[n] + in2[n];
+
+ // CSUB t1, in0, in2  (t1 = in0 - in2)
+ t1.Re = in0[0] - in2[0];
+ t1.Im = in0[n] - in2[n];
+
+ // CADD t2, in1, in3  (t2 = in1 + in3)
+ t2.Re = in1[0] + in3[0];
+ t2.Im = in1[n] + in3[n];
+
+ // CSUB t3, in1, in3  (t3 = in1 - in3)
+ t3.Re = in1[0] - in3[0];
+ t3.Im = in1[n] - in3[n];
+
+ // CADD out0, t0, t2  (out0 = t0 + t2)
+ out0[0] = t0.Re + t2.Re;
+ out0[n] = t0.Im + t2.Im;
+
+ // CSUB out2, t0, t2  (out2 = t0 - t2)
+ out2[0] = t0.Re - t2.Re;
+ out2[n] = t0.Im - t2.Im;
+
+ // CSUB_ADD_X out3, t1, t3  (out3 = t1 + j * t3)
+ out3[0] = t1.Re - t3.Im;
+ out3[n] = t1.Im + t3.Re;
+
+ // CADD_SUB_X out1, t1, t3  (out1 = t1 - j * t3)
+ out1[0] = t1.Re + t3.Im;
+ out1[n] = t1.Im - t3.Re;
+
+ out0 += 1;
+ }
+
+ // grp > 0: same butterfly, with in1..in3 first multiplied by the group's twiddles
+ for (grp = 1; grp < sub_size; ++grp) {
+ const OMX_F32 *tw1 = twiddle + grp * step;  // Twiddle indices grp * step, 2 * grp * step, 3 * grp * step.
+ const OMX_F32 *tw2 = tw1 + grp * step;
+ const OMX_F32 *tw3 = tw2 + grp * step;
+
+ for (set = 0; set < set_count; ++set) {
+ OMX_FC32 t0;
+ OMX_FC32 t1;
+ OMX_FC32 t2;
+ OMX_FC32 t3;
+ OMX_FC32 tt1;
+ OMX_FC32 tt2;
+ OMX_FC32 tt3;
+
+ const OMX_F32 *in0 = in + set + grp * sub_num;
+ const OMX_F32 *in1 = in0 + set_count;
+ const OMX_F32 *in2 = in1 + set_count;
+ const OMX_F32 *in3 = in2 + set_count;
+ OMX_F32 *out1 = out0 + n_by_4;
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ // CMUL tt1, tw1, in1  (tt1 = tw1 * in1)
+ tt1.Re = tw1[0] * in1[0] - tw1[n_mul_2] * in1[n];
+ tt1.Im = tw1[0] * in1[n] + tw1[n_mul_2] * in1[0];
+
+ // CMUL tt2, tw2, in2  (tt2 = tw2 * in2)
+ tt2.Re = tw2[0] * in2[0] - tw2[n_mul_2] * in2[n];
+ tt2.Im = tw2[0] * in2[n] + tw2[n_mul_2] * in2[0];
+
+ // CMUL tt3, tw3, in3  (tt3 = tw3 * in3)
+ tt3.Re = tw3[0] * in3[0] - tw3[n_mul_2] * in3[n];
+ tt3.Im = tw3[0] * in3[n] + tw3[n_mul_2] * in3[0];
+
+ // CADD t0, in0, tt2  (t0 = in0 + tt2)
+ t0.Re = in0[0] + tt2.Re;
+ t0.Im = in0[n] + tt2.Im;
+
+ // CSUB t1, in0, tt2  (t1 = in0 - tt2)
+ t1.Re = in0[0] - tt2.Re;
+ t1.Im = in0[n] - tt2.Im;
+
+ // CADD t2, tt1, tt3  (t2 = tt1 + tt3)
+ t2.Re = tt1.Re + tt3.Re;
+ t2.Im = tt1.Im + tt3.Im;
+
+ // CSUB t3, tt1, tt3  (t3 = tt1 - tt3)
+ t3.Re = tt1.Re - tt3.Re;
+ t3.Im = tt1.Im - tt3.Im;
+
+ // CADD out0, t0, t2  (out0 = t0 + t2)
+ out0[0] = t0.Re + t2.Re;
+ out0[n] = t0.Im + t2.Im;
+
+ // CSUB out2, t0, t2  (out2 = t0 - t2)
+ out2[0] = t0.Re - t2.Re;
+ out2[n] = t0.Im - t2.Im;
+
+ // CADD_SUB_X out1, t1, t3  (out1 = t1 - j * t3)
+ out1[0] = t1.Re + t3.Im;
+ out1[n] = t1.Im - t3.Re;
+
+ // CSUB_ADD_X out3, t1, t3  (out3 = t1 + j * t3)
+ out3[0] = t1.Re - t3.Im;
+ out3[n] = t1.Im + t3.Re;
+
+ out0 += 1;  // Output advances contiguously across sets and groups.
+ }
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c
new file mode 100644
index 0000000..286f842
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+// Handles the set_count == 2 case, where the set loop cannot be unrolled by 4
+// to fill an SSE register (4 floats); two adjacent groups are processed per pass instead.
+static void InternalUnroll2Fwd(
+ const OMX_F32 *in,  // Split complex input: Re row at in, Im row at in + n.
+ OMX_F32 *out,  // Output; n is handed to the store macro, presumably as the Re-to-Im offset.
+ OMX_F32 *twiddle,  // Twiddle factors (only read): Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n) {  // Sizes the loop bound, the leg spacing and the Re/Im offsets.
+ OMX_INT i;
+ OMX_INT n_by_2 = n >> 1;
+ OMX_INT n_by_4 = n >> 2;
+ OMX_INT n_mul_2 = n << 1;
+ OMX_F32 *out0 = out;
+
+ for (i = 0; i < n_by_2; i += 8) {  // Each pass covers 16 input slots: two groups of 8.
+ const OMX_F32 *tw1 = twiddle + i;  // Twiddles of the first group: indices i, 2 * i, 3 * i.
+ const OMX_F32 *tw2 = tw1 + i;
+ const OMX_F32 *tw3 = tw2 + i;
+ const OMX_F32 *tw1e = tw1 + 4;  // Twiddles of the second group: one step (4, 8, 12) further.
+ const OMX_F32 *tw2e = tw2 + 8;
+ const OMX_F32 *tw3e = tw3 + 12;
+
+ VC v_tw1;
+ VC v_tw2;
+ VC v_tw3;
+ VC v_t0;
+ VC v_t1;
+ VC v_t2;
+ VC v_t3;
+ VC v_t4;
+ VC v_t5;
+ VC v_t6;
+ VC v_t7;
+
+ v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1),
+ _mm_load_ss(tw1e),
+ _MM_SHUFFLE(0, 0, 0, 0));  // Lanes {tw1, tw1, tw1e, tw1e}: one twiddle per group, two sets each.
+ v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2),
+ _mm_load_ss(tw1e + n_mul_2),
+ _MM_SHUFFLE(0, 0, 0, 0));
+ v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2),
+ _mm_load_ss(tw2e),
+ _MM_SHUFFLE(0, 0, 0, 0));
+ v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2),
+ _mm_load_ss(tw2e + n_mul_2),
+ _MM_SHUFFLE(0, 0, 0, 0));
+ v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3),
+ _mm_load_ss(tw3e),
+ _MM_SHUFFLE(0, 0, 0, 0));
+ v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2),
+ _mm_load_ss(tw3e + n_mul_2),
+ _MM_SHUFFLE(0, 0, 0, 0));
+
+ __m128 xmm0;
+ __m128 xmm1;
+ __m128 xmm2;
+ __m128 xmm3;
+ __m128 xmm4;
+ __m128 xmm5;
+ __m128 xmm6;
+ __m128 xmm7;
+
+ const OMX_F32 *in0 = in + (i << 1);
+ xmm0 = _mm_load_ps(in0);  // _mm_load_ps is an aligned load: in0 must be 16-byte aligned.
+ xmm1 = _mm_load_ps(in0 + 4);
+ xmm2 = _mm_load_ps(in0 + 8);
+ xmm3 = _mm_load_ps(in0 + 12);
+ v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0));  // Slots 0, 1, 8, 9: leg 0 of both groups.
+ v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2));  // Slots 2, 3, 10, 11: leg 1.
+ v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0));  // Slots 4, 5, 12, 13: leg 2.
+ v_t3.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(3, 2, 3, 2));  // Slots 6, 7, 14, 15: leg 3.
+
+ xmm4 = _mm_load_ps(in0 + n);  // Same pattern on the imaginary row, n floats further on.
+ xmm5 = _mm_load_ps(in0 + n + 4);
+ xmm6 = _mm_load_ps(in0 + n + 8);
+ xmm7 = _mm_load_ps(in0 + n + 12);
+ v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
+ v_t1.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
+ v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
+ v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
+
+ OMX_F32 *out1 = out0 + n_by_4;  // The four output legs sit n / 4 slots apart.
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
+ &v_tw1, &v_tw2,
+ &v_tw3, &v_t0, &v_t1, &v_t2, &v_t3);  // Outputs, then twiddles, then inputs.
+
+ RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+ &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+ out0 += 4;
+ }
+}
+
+void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse(  // SSE forward radix-4 pass over groups sharing twiddles: 4 sets per loop pass.
+ const OMX_F32 *in,  // Input; n is handed to VC_LOAD_SPLIT, presumably as the Re-to-Im offset.
+ OMX_F32 *out,  // Output; n is handed to the store macro, presumably as the Re-to-Im offset.
+ OMX_F32 *twiddle,  // Twiddle factors (only read): Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n,  // Sizes the leg spacing (n / 4) and the Re/Im offsets.
+ OMX_INT sub_size,  // Number of groups (group 0 is handled separately below).
+ OMX_INT sub_num) {  // Input values per group (stride between consecutive groups).
+ OMX_INT set;
+ OMX_INT grp;
+ OMX_INT step = sub_num >> 1;  // Twiddle index increment per group for tw1.
+ OMX_INT set_count = sub_num >> 2;  // Butterflies per group; also the spacing of the four legs.
+ OMX_INT n_by_4 = n >> 2;
+ OMX_INT n_mul_2 = n << 1;
+
+ OMX_F32 *out0 = out;
+
+ if (set_count == 2) {  // Too few sets to fill four lanes; use the two-groups-per-pass variant.
+ InternalUnroll2Fwd(in, out, twiddle, n);
+ return;
+ }
+
+ // grp == 0: butterfly without twiddle multiply, four sets per pass
+ for (set = 0; set < set_count; set += 4) {
+ const OMX_F32 * in0 = in + set;
+ const OMX_F32 *in1 = in0 + set_count;
+ const OMX_F32 *in2 = in1 + set_count;
+ const OMX_F32 *in3 = in2 + set_count;
+
+ VC v_t0;
+ VC v_t1;
+ VC v_t2;
+ VC v_t3;
+ VC v_t4;
+ VC v_t5;
+ VC v_t6;
+ VC v_t7;
+
+ VC_LOAD_SPLIT(&v_t0, in0, n);  // NOTE(review): presumably loads 4 Re from ptr and 4 Im from ptr + n; confirm in x86SP_SSE_Math.h.
+ VC_LOAD_SPLIT(&v_t1, in1, n);
+ VC_LOAD_SPLIT(&v_t2, in2, n);
+ VC_LOAD_SPLIT(&v_t3, in3, n);
+
+ OMX_F32 *out1 = out0 + n_by_4;  // The four output legs sit n / 4 slots apart.
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
+ &v_t0, &v_t1, &v_t2, &v_t3);
+
+ RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+ &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+ out0 += 4;
+ }
+
+ for (grp = 1; grp < sub_size; ++grp) {  // grp > 0: twiddled butterflies.
+ const OMX_F32 *tw1 = twiddle + grp * step;  // Twiddle indices grp * step, 2 * grp * step, 3 * grp * step.
+ const OMX_F32 *tw2 = tw1 + grp * step;
+ const OMX_F32 *tw3 = tw2 + grp * step;
+
+ VC v_tw1;
+ VC v_tw2;
+ VC v_tw3;
+
+ v_tw1.real = _mm_load1_ps(tw1);  // Broadcast: all four sets of a group share one twiddle.
+ v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2);
+ v_tw2.real = _mm_load1_ps(tw2);
+ v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2);
+ v_tw3.real = _mm_load1_ps(tw3);
+ v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2);
+
+ for (set = 0; set < set_count; set += 4) {
+ const OMX_F32 *in0 = in + set + grp * sub_num;
+ const OMX_F32 *in1 = in0 + set_count;
+ const OMX_F32 *in2 = in1 + set_count;
+ const OMX_F32 *in3 = in2 + set_count;
+
+ VC v_t0;
+ VC v_t1;
+ VC v_t2;
+ VC v_t3;
+ VC v_t4;
+ VC v_t5;
+ VC v_t6;
+ VC v_t7;
+
+ VC_LOAD_SPLIT(&v_t0, in0, n);
+ VC_LOAD_SPLIT(&v_t1, in1, n);
+ VC_LOAD_SPLIT(&v_t2, in2, n);
+ VC_LOAD_SPLIT(&v_t3, in3, n);
+
+ OMX_F32 *out1 = out0 + n_by_4;
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ RADIX4_FWD_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
+ &v_tw1, &v_tw2, &v_tw3,
+ &v_t0, &v_t1, &v_t2, &v_t3);  // Outputs, then twiddles, then inputs.
+
+ RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3,
+ &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+ out0 += 4;  // Output advances contiguously across sets and groups.
+ }
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c
new file mode 100644
index 0000000..9f17d61
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_fs.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_fs(  // Inverse radix-2 butterfly pass without twiddles: split input and output.
+ const OMX_F32 *in,  // Split complex input: Re at in[k], Im at in[k + n].
+ OMX_F32 *out,  // Split complex output: Re at out[k], Im at out[k + n].
+ OMX_INT n) {  // Number of complex values processed; also the Re-to-Im offset.
+ OMX_INT i;
+ OMX_INT n_by_2 = n >> 1;
+ OMX_F32 *out0 = out;
+
+ for (i = 0; i < n_by_2; i++) {
+ const OMX_F32 *in0 = in + i;
+ const OMX_F32 *in1 = in0 + n_by_2;  // Partner value n / 2 slots further on.
+ OMX_F32 *out1 = out0 + n_by_2;  // Differences are stored n / 2 slots after the sums.
+
+ // CADD out0, in0, in1  (out0 = in0 + in1)
+ out0[0] = in0[0] + in1[0];
+ out0[n] = in0[n] + in1[n];
+
+ // CSUB out1, in0, in1  (out1 = in0 - in1)
+ out1[0] = in0[0] - in1[0];
+ out1[n] = in0[n] - in1[n];
+
+ out0 += 1;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c
new file mode 100644
index 0000000..ec545c5
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_ls(  // Inverse radix-2 butterfly pass with conjugated twiddle multiply (scalar).
+ const OMX_F32 *in,  // Split complex input: Re at in[k], Im at in[k + n].
+ OMX_F32 *out,  // Split complex output: Re at out[k], Im at out[k + n].
+ const OMX_F32 *twiddle,  // Twiddle factors: Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n) {  // Number of complex values processed; also the Re-to-Im offset.
+ OMX_INT i;
+ OMX_F32 *out0 = out;
+
+ for (i = 0; i < n; i += 2) {  // Each pass pairs the adjacent values in[i] and in[i + 1].
+ OMX_FC32 t;
+ const OMX_F32 *tw = twiddle + i;
+ const OMX_F32 *in0 = in + i;
+ const OMX_F32 *in1 = in0 + 1;
+ OMX_F32 *out1 = out0 + (n >> 1);  // Differences are stored n / 2 slots after the sums.
+
+ // CMUL t, tw, in1  (here t = conj(tw) * in1: note the flipped signs vs. the forward pass)
+ t.Re = tw[0] * in1[0] + tw[n << 1] * in1[n];
+ t.Im = tw[0] * in1[n] - tw[n << 1] * in1[0];
+
+ // CADD out0, in0, t  (out0 = in0 + t)
+ out0[0] = in0[0] + t.Re;
+ out0[n] = in0[n] + t.Im;
+
+ // CSUB out1, in0, t  (out1 = in0 - t)
+ out1[0] = in0[0] - t.Re;
+ out1[n] = in0[n] - t.Im;
+
+ out0 += 1;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c
new file mode 100644
index 0000000..abad0cc
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse(  // SSE inverse radix-2 pass with conjugated twiddles: 8 input slots per loop pass.
+ const OMX_F32 *in,  // Split complex input: Re row at in, Im row at in + n.
+ OMX_F32 *out,  // Split complex output; n is handed to the store macros as the Re-to-Im offset.
+ const OMX_F32 *twiddle,  // Twiddle factors: Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n) {  // Loop bound and Re-to-Im offset. NOTE(review): no n >= 8 guard here, unlike the Fwd twin.
+ OMX_F32 *out0 =out;
+ OMX_INT i;
+
+ for (i = 0; i < n; i += 8) {
+ VC v_tw;
+ VC v_t0;
+ VC v_t1;
+ VC v_temp;
+
+ // Load twiddle: every second entry, real parts first, then the imaginary row
+ const OMX_F32 *tw = twiddle + i;
+ v_tw.real = _mm_set_ps(tw[6], tw[4], tw[2], tw[0]);  // Lane 0 = tw[0] ... lane 3 = tw[6].
+ const OMX_F32 * twi = tw + (n << 1);
+ v_tw.imag = _mm_set_ps(twi[6], twi[4], twi[2], twi[0]);
+
+ // Load real part
+ const OMX_F32 *t = in + i;
+ VC_LOAD_SHUFFLE(&(v_t0.real), &(v_t1.real), t);  // NOTE(review): presumably even slots -> v_t0, odd -> v_t1; confirm in x86SP_SSE_Math.h.
+
+ // Load imag part
+ t = t + n;
+ VC_LOAD_SHUFFLE(&(v_t0.imag), &(v_t1.imag), t);
+
+ OMX_F32 *out1 = out0 + (n >> 1);  // Differences are stored n / 2 slots after the sums.
+ VC_CONJ_MUL(&v_temp, &v_tw, &v_t1);  // Presumably v_temp = conj(v_tw) * v_t1, mirroring the scalar inverse CMUL.
+
+ VC_SUB_STORE_SPLIT(out1, &v_t0, &v_temp, n);  // Presumably out1 = v_t0 - v_temp.
+
+ VC_ADD_STORE_SPLIT(out0, &v_t0, &v_temp, n);  // Presumably out0 = v_t0 + v_temp.
+
+ out0 += 4;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c
new file mode 100644
index 0000000..78bc9eb
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix2_ms.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_ms(  // Inverse radix-2 butterfly pass over groups sharing one twiddle (scalar).
+ const OMX_F32 *in,  // Split complex input: Re at in[k], Im at in[k + n].
+ OMX_F32 *out,  // Split complex output: Re at out[k], Im at out[k + n].
+ const OMX_F32 *twiddle,  // Twiddle factors: Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n,  // Re-to-Im offset; presumably sub_size * sub_num -- confirm with caller.
+ OMX_INT sub_size,  // Number of groups iterated by the outer loop.
+ OMX_INT sub_num) {  // Input values per group (stride between consecutive groups).
+ OMX_INT grp;
+ OMX_F32 *out0 = out;
+ OMX_INT set_count = sub_num >> 1;  // Butterflies per group.
+
+ for (grp = 0; grp < sub_size; ++grp) {
+ OMX_INT set;
+ const OMX_F32 *tw = twiddle + grp * sub_num;  // One twiddle per group, reused for every set.
+
+ for (set = 0; set < set_count; ++set) {
+ OMX_FC32 t;
+ const OMX_F32 *in0 = in + set + grp * sub_num;
+ const OMX_F32 *in1 = in0 + set_count;  // Partner is half a group further on.
+ OMX_F32 *out1 = out0 + (n >> 1);  // Differences are stored n / 2 slots after the sums.
+
+ // CMUL t, tw, in1  (here t = conj(tw) * in1: note the flipped signs vs. the forward pass)
+ t.Re = tw[0] * in1[0] + tw[n << 1] * in1[n];
+ t.Im = tw[0] * in1[n] - tw[n << 1] * in1[0];
+
+ // CADD out0, in0, t  (out0 = in0 + t)
+ out0[0] = in0[0] + t.Re;
+ out0[n] = in0[n] + t.Im;
+
+ // CSUB out1, in0, t  (out1 = in0 - t)
+ out1[0] = in0[0] - t.Re;
+ out1[n] = in0[n] - t.Im;
+
+ out0 += 1;  // Output advances contiguously across sets and groups.
+ }
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c
new file mode 100644
index 0000000..bb80fa3
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix4_fs(  // Inverse radix-4 butterfly pass without twiddles (scalar).
+ const OMX_F32 *in,  // Split complex input: Re at in[k], Im at in[k + n].
+ OMX_F32 *out,  // Split complex output: Re at out[k], Im at out[k + n].
+ OMX_INT n) {  // Number of complex values processed; also the Re-to-Im offset.
+ OMX_INT i;
+ OMX_INT n_by_4 = n >> 2;
+ OMX_F32 *out0 = out;
+
+ for (i = 0; i < n_by_4; i++) {
+ const OMX_F32 *in0 = in + i;  // The four legs sit n / 4 slots apart, in and out.
+ const OMX_F32 *in1 = in0 + n_by_4;
+ const OMX_F32 *in2 = in1 + n_by_4;
+ const OMX_F32 *in3 = in2 + n_by_4;
+ OMX_F32 *out1 = out0 + n_by_4;
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ OMX_FC32 t0;
+ OMX_FC32 t1;
+ OMX_FC32 t2;
+ OMX_FC32 t3;
+
+ // CADD t0, in0, in2  (t0 = in0 + in2)
+ t0.Re = in0[0] + in2[0];
+ t0.Im = in0[n] + in2[n];
+
+ // CSUB t1, in0, in2  (t1 = in0 - in2)
+ t1.Re = in0[0] - in2[0];
+ t1.Im = in0[n] - in2[n];
+
+ // CADD t2, in1, in3  (t2 = in1 + in3)
+ t2.Re = in1[0] + in3[0];
+ t2.Im = in1[n] + in3[n];
+
+ // CSUB t3, in1, in3  (t3 = in1 - in3)
+ t3.Re = in1[0] - in3[0];
+ t3.Im = in1[n] - in3[n];
+
+ // CADD out0, t0, t2  (out0 = t0 + t2)
+ out0[0] = t0.Re + t2.Re;
+ out0[n] = t0.Im + t2.Im;
+
+ // CSUB out2, t0, t2  (out2 = t0 - t2)
+ out2[0] = t0.Re - t2.Re;
+ out2[n] = t0.Im - t2.Im;
+
+ // CSUB_ADD_X out1, t1, t3  (out1 = t1 + j * t3; sign of j is opposite to the forward pass)
+ out1[0] = t1.Re - t3.Im;
+ out1[n] = t1.Im + t3.Re;
+
+ // CADD_SUB_X out3, t1, t3  (out3 = t1 - j * t3)
+ out3[0] = t1.Re + t3.Im;
+ out3[n] = t1.Im - t3.Re;
+
+ out0 += 1;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c
new file mode 100644
index 0000000..c3921bc
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(  // SSE inverse radix-4 pass without twiddles: 4 slots per leg per loop pass.
+ const OMX_F32 *in,  // Input; n is handed to VC_LOAD_SPLIT, presumably as the Re-to-Im offset.
+ OMX_F32 *out,  // Output; n is handed to the store macro, presumably as the Re-to-Im offset.
+ OMX_INT n) {  // Sizes the leg spacing (n / 4), the loop bound and the Re/Im offsets.
+ OMX_INT i;
+ OMX_INT n_by_4 = n >> 2;
+ OMX_F32 *out0 = out;
+
+ for (i = 0; i < n_by_4; i += 4) {
+ VC v_t0;
+ VC v_t1;
+ VC v_t2;
+ VC v_t3;
+ VC v_t4;
+ VC v_t5;
+ VC v_t6;
+ VC v_t7;
+
+ const OMX_F32 *in0 = in + i;  // The four legs sit n / 4 slots apart, in and out.
+ const OMX_F32 *in1 = in0 + n_by_4;
+ const OMX_F32 *in2 = in1 + n_by_4;
+ const OMX_F32 *in3 = in2 + n_by_4;
+
+ OMX_F32 *out1 = out0 + n_by_4;
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ VC_LOAD_SPLIT(&v_t0, in0, n);  // NOTE(review): presumably loads 4 Re from ptr and 4 Im from ptr + n; confirm in x86SP_SSE_Math.h.
+ VC_LOAD_SPLIT(&v_t1, in1, n);
+ VC_LOAD_SPLIT(&v_t2, in2, n);
+ VC_LOAD_SPLIT(&v_t3, in3, n);
+
+ RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
+ &v_t0, &v_t1, &v_t2, &v_t3);  // Outputs first (v_t4..v_t7), then inputs (v_t0..v_t3).
+
+ RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
+ &v_t4, &v_t5, &v_t6, &v_t7, n);
+
+ out0 += 4;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c
new file mode 100644
index 0000000..705d9cb
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+void x86SP_FFT_CToC_FC32_Inv_Radix4_ls(  // Inverse radix-4 butterfly pass with conjugated twiddle multiplies (scalar).
+ const OMX_F32 *in,  // Split complex input: Re at in[k], Im at in[k + n].
+ OMX_F32 *out,  // Split complex output: Re at out[k], Im at out[k + n].
+ const OMX_F32 *twiddle,  // Twiddle factors: Re at twiddle[k], Im at twiddle[k + 2 * n].
+ OMX_INT n) {  // Number of complex values processed; also the Re-to-Im offset.
+ OMX_INT n_by_2 = n >> 1;
+ OMX_INT n_by_4 = n >> 2;
+ OMX_INT n_mul_2 = n << 1;
+ OMX_INT i;
+ OMX_F32 *out0 = out;
+
+ for (i = 0; i < n_by_2; i += 2) {  // n / 4 passes, each consuming four adjacent inputs.
+ OMX_FC32 t0;
+ OMX_FC32 t1;
+ OMX_FC32 t2;
+ OMX_FC32 t3;
+ OMX_FC32 tt1;
+ OMX_FC32 tt2;
+ OMX_FC32 tt3;
+ const OMX_F32 *tw1 = twiddle + i;  // Twiddle indices are i, 2 * i and 3 * i.
+ const OMX_F32 *tw2 = tw1 + i;
+ const OMX_F32 *tw3 = tw2 + i;
+ const OMX_F32 *in0 = in + (i << 1);  // Four consecutive inputs starting at 2 * i.
+ const OMX_F32 *in1 = in0 + 1;
+ const OMX_F32 *in2 = in1 + 1;
+ const OMX_F32 *in3 = in2 + 1;
+ OMX_F32 *out1 = out0 + n_by_4;  // The four output legs sit n / 4 slots apart.
+ OMX_F32 *out2 = out1 + n_by_4;
+ OMX_F32 *out3 = out2 + n_by_4;
+
+ // CMUL tt1, tw1, in1  (here tt1 = conj(tw1) * in1)
+ tt1.Re = tw1[0] * in1[0] + tw1[n_mul_2] * in1[n];
+ tt1.Im = tw1[0] * in1[n] - tw1[n_mul_2] * in1[0];
+
+ // CMUL tt2, tw2, in2  (here tt2 = conj(tw2) * in2)
+ tt2.Re = tw2[0] * in2[0] + tw2[n_mul_2] * in2[n];
+ tt2.Im = tw2[0] * in2[n] - tw2[n_mul_2] * in2[0];
+
+ // CMUL tt3, tw3, in3  (here tt3 = conj(tw3) * in3)
+ tt3.Re = tw3[0] * in3[0] + tw3[n_mul_2] * in3[n];
+ tt3.Im = tw3[0] * in3[n] - tw3[n_mul_2] * in3[0];
+
+ // CADD t0, in0, tt2  (t0 = in0 + tt2)
+ t0.Re = in0[0] + tt2.Re;
+ t0.Im = in0[n] + tt2.Im;
+
+ // CSUB t1, in0, tt2  (t1 = in0 - tt2)
+ t1.Re = in0[0] - tt2.Re;
+ t1.Im = in0[n] - tt2.Im;
+
+ // CADD t2, tt1, tt3  (t2 = tt1 + tt3)
+ t2.Re = tt1.Re + tt3.Re;
+ t2.Im = tt1.Im + tt3.Im;
+
+ // CSUB t3, tt1, tt3  (t3 = tt1 - tt3)
+ t3.Re = tt1.Re - tt3.Re;
+ t3.Im = tt1.Im - tt3.Im;
+
+ // CADD out0, t0, t2  (out0 = t0 + t2)
+ out0[0] = t0.Re + t2.Re;
+ out0[n] = t0.Im + t2.Im;
+
+ // CSUB out2, t0, t2  (out2 = t0 - t2)
+ out2[0] = t0.Re - t2.Re;
+ out2[n] = t0.Im - t2.Im;
+
+ // CSUB_ADD_X out1, t1, t3  (out1 = t1 + j * t3; sign of j is opposite to the forward pass)
+ out1[0] = t1.Re - t3.Im;
+ out1[n] = t1.Im + t3.Re;
+
+ // CADD_SUB_X out3, t1, t3  (out3 = t1 - j * t3)
+ out3[0] = t1.Re + t3.Im;
+ out3[n] = t1.Im - t3.Re;
+
+ out0 += 1;
+ }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c
new file mode 100644
index 0000000..2e245fa
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+// SSE last stage of the inverse radix-4 complex FFT on split-format data.
+// Each iteration handles four butterflies (16 consecutive input elements)
+// and writes four consecutive results into each of the four output quarters.
+//   in      - stage input; imaginary parts are n floats after the real parts.
+//             Read with aligned loads.
+//   out     - stage output, same split layout. Written with aligned stores.
+//   twiddle - twiddle table; imaginary parts are 2 * n floats after the real
+//             parts.
+//   n       - offset between real and imaginary halves.
+void x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  const OMX_INT half = n >> 1;
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT tw_im = n << 1;
+  OMX_F32 *dst = out;
+  OMX_INT i;
+
+  for (i = 0; i < half; i += 8, dst += 4) {
+    // Real and imaginary twiddle bases for inputs 1, 2 and 3.
+    const OMX_F32 *w1_re = twiddle + i;
+    const OMX_F32 *w2_re = w1_re + i;
+    const OMX_F32 *w3_re = w2_re + i;
+    const OMX_F32 *w1_im = w1_re + tw_im;
+    const OMX_F32 *w2_im = w2_re + tw_im;
+    const OMX_F32 *w3_im = w3_re + tw_im;
+    // Four rows of four floats; one row per butterfly.
+    const OMX_F32 *row0 = in + (i << 1);
+
+    VC w1;
+    VC w2;
+    VC w3;
+    VC x0;
+    VC x1;
+    VC x2;
+    VC x3;
+    VC y0;
+    VC y1;
+    VC y2;
+    VC y3;
+
+    // The four butterflies use twiddle strides of 2, 4 and 6 floats.
+    w1.real = _mm_set_ps(w1_re[6], w1_re[4], w1_re[2], w1_re[0]);
+    w1.imag = _mm_set_ps(w1_im[6], w1_im[4], w1_im[2], w1_im[0]);
+    w2.real = _mm_set_ps(w2_re[12], w2_re[8], w2_re[4], w2_re[0]);
+    w2.imag = _mm_set_ps(w2_im[12], w2_im[8], w2_im[4], w2_im[0]);
+    w3.real = _mm_set_ps(w3_re[18], w3_re[12], w3_re[6], w3_re[0]);
+    w3.imag = _mm_set_ps(w3_im[18], w3_im[12], w3_im[6], w3_im[0]);
+
+    // Transpose so that x_k holds input k of all four butterflies.
+    VC_LOAD_MATRIX_TRANSPOSE(&x0, &x1, &x2, &x3,
+                             row0, row0 + 4, row0 + 8, row0 + 12, n);
+
+    RADIX4_INV_BUTTERFLY(&y0, &y1, &y2, &y3,
+                         &w1, &w2, &w3,
+                         &x0, &x1, &x2, &x3);
+
+    RADIX4_INV_BUTTERFLY_STORE(dst,
+                               dst + quarter,
+                               dst + 2 * quarter,
+                               dst + 3 * quarter,
+                               &y0, &y1, &y2, &y3, n);
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c
new file mode 100644
index 0000000..499036b
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+
+// Scalar middle stage of the inverse radix-4 complex FFT on split-format
+// data. The input is processed as sub_size groups of sub_num elements; each
+// group contains sub_num / 4 butterflies whose four inputs are sub_num / 4
+// elements apart. Results are written sequentially into the four output
+// quarters.
+//   in       - stage input; imaginary parts are n floats after the real parts.
+//   out      - stage output, same split layout.
+//   twiddle  - twiddle table; imaginary parts are 2 * n floats after the real
+//              parts.
+//   n        - offset between real and imaginary halves.
+//   sub_size - number of groups.
+//   sub_num  - number of elements per group.
+void x86SP_FFT_CToC_FC32_Inv_Radix4_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num) {
+  const OMX_INT tw_stride = sub_num >> 1;
+  const OMX_INT sets = sub_num >> 2;
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT tw_im = n << 1;
+  OMX_F32 *dst = out;
+  OMX_INT g;
+  OMX_INT s;
+
+  // Group 0 is handled without any twiddle multiplication (presumably
+  // because its twiddle factors are all 1).
+  for (s = 0; s < sets; ++s, ++dst) {
+    const OMX_F32 *x0 = in + s;
+    const OMX_F32 *x1 = x0 + sets;
+    const OMX_F32 *x2 = x1 + sets;
+    const OMX_F32 *x3 = x2 + sets;
+    OMX_F32 *dst1 = dst + quarter;
+    OMX_F32 *dst2 = dst1 + quarter;
+    OMX_F32 *dst3 = dst2 + quarter;
+
+    // Even pair: x0 +/- x2.
+    const OMX_F32 sum02_re = x0[0] + x2[0];
+    const OMX_F32 sum02_im = x0[n] + x2[n];
+    const OMX_F32 dif02_re = x0[0] - x2[0];
+    const OMX_F32 dif02_im = x0[n] - x2[n];
+
+    // Odd pair: x1 +/- x3.
+    const OMX_F32 sum13_re = x1[0] + x3[0];
+    const OMX_F32 sum13_im = x1[n] + x3[n];
+    const OMX_F32 dif13_re = x1[0] - x3[0];
+    const OMX_F32 dif13_im = x1[n] - x3[n];
+
+    dst[0] = sum02_re + sum13_re;
+    dst[n] = sum02_im + sum13_im;
+
+    dst2[0] = sum02_re - sum13_re;
+    dst2[n] = sum02_im - sum13_im;
+
+    dst1[0] = dif02_re - dif13_im;
+    dst1[n] = dif02_im + dif13_re;
+
+    dst3[0] = dif02_re + dif13_im;
+    dst3[n] = dif02_im - dif13_re;
+  }
+
+  // Remaining groups: inputs 1..3 are multiplied by conjugated twiddles.
+  for (g = 1; g < sub_size; ++g) {
+    const OMX_INT tw_off = g * tw_stride;
+    const OMX_F32 *w1 = twiddle + tw_off;
+    const OMX_F32 *w2 = w1 + tw_off;
+    const OMX_F32 *w3 = w2 + tw_off;
+    const OMX_F32 *grp_in = in + g * sub_num;
+
+    for (s = 0; s < sets; ++s, ++dst) {
+      const OMX_F32 *x0 = grp_in + s;
+      const OMX_F32 *x1 = x0 + sets;
+      const OMX_F32 *x2 = x1 + sets;
+      const OMX_F32 *x3 = x2 + sets;
+      OMX_F32 *dst1 = dst + quarter;
+      OMX_F32 *dst2 = dst1 + quarter;
+      OMX_F32 *dst3 = dst2 + quarter;
+
+      // p_k = conj(w_k) * x_k for k = 1..3.
+      const OMX_F32 p1_re = w1[0] * x1[0] + w1[tw_im] * x1[n];
+      const OMX_F32 p1_im = w1[0] * x1[n] - w1[tw_im] * x1[0];
+      const OMX_F32 p2_re = w2[0] * x2[0] + w2[tw_im] * x2[n];
+      const OMX_F32 p2_im = w2[0] * x2[n] - w2[tw_im] * x2[0];
+      const OMX_F32 p3_re = w3[0] * x3[0] + w3[tw_im] * x3[n];
+      const OMX_F32 p3_im = w3[0] * x3[n] - w3[tw_im] * x3[0];
+
+      // Even pair: x0 +/- p2.
+      const OMX_F32 sum02_re = x0[0] + p2_re;
+      const OMX_F32 sum02_im = x0[n] + p2_im;
+      const OMX_F32 dif02_re = x0[0] - p2_re;
+      const OMX_F32 dif02_im = x0[n] - p2_im;
+
+      // Odd pair: p1 +/- p3.
+      const OMX_F32 sum13_re = p1_re + p3_re;
+      const OMX_F32 sum13_im = p1_im + p3_im;
+      const OMX_F32 dif13_re = p1_re - p3_re;
+      const OMX_F32 dif13_im = p1_im - p3_im;
+
+      dst[0] = sum02_re + sum13_re;
+      dst[n] = sum02_im + sum13_im;
+
+      dst2[0] = sum02_re - sum13_re;
+      dst2[n] = sum02_im - sum13_im;
+
+      dst1[0] = dif02_re - dif13_im;
+      dst1[n] = dif02_im + dif13_re;
+
+      dst3[0] = dif02_re + dif13_im;
+      dst3[n] = dif02_im - dif13_re;
+    }
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c
new file mode 100644
index 0000000..703f316
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include "dl/sp/src/x86/x86SP_SSE_Math.h"
+
+// Returns the vector {lo[0], lo[0], hi[0], hi[0]} (lowest lane first).
+static inline __m128 SplatTwiddlePair(const OMX_F32 *lo, const OMX_F32 *hi) {
+  return _mm_shuffle_ps(_mm_load_ss(lo),
+                        _mm_load_ss(hi),
+                        _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+// Inverse radix-4 middle stage for set_count == 2. With only two sets per
+// group the set loop cannot be unrolled by 4, so each iteration instead
+// combines two adjacent groups (2 x 8 input elements) into one SSE butterfly.
+//   in      - stage input in split format (imaginary parts n floats after
+//             the real parts). Read with aligned loads.
+//   out     - stage output in split format. Written with aligned stores.
+//   twiddle - twiddle table; imaginary parts are 2 * n floats after the real
+//             parts.
+//   n       - offset between real and imaginary halves.
+static void InternalUnroll2Inv(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n) {
+  const OMX_INT half = n >> 1;
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT tw_im = n << 1;
+  OMX_F32 *dst = out;
+  OMX_INT i;
+
+  for (i = 0; i < half; i += 8, dst += 4) {
+    // Twiddle bases of the first group; the second group's entries are
+    // 4, 8 and 12 floats further on.
+    const OMX_F32 *w1 = twiddle + i;
+    const OMX_F32 *w2 = w1 + i;
+    const OMX_F32 *w3 = w2 + i;
+    const OMX_F32 *x_re = in + (i << 1);
+    const OMX_F32 *x_im = x_re + n;
+
+    VC tw1;
+    VC tw2;
+    VC tw3;
+    VC x0;
+    VC x1;
+    VC x2;
+    VC x3;
+    VC y0;
+    VC y1;
+    VC y2;
+    VC y3;
+
+    // Lanes 0-1 carry the first group's twiddle, lanes 2-3 the second's.
+    tw1.real = SplatTwiddlePair(w1, w1 + 4);
+    tw1.imag = SplatTwiddlePair(w1 + tw_im, w1 + 4 + tw_im);
+    tw2.real = SplatTwiddlePair(w2, w2 + 8);
+    tw2.imag = SplatTwiddlePair(w2 + tw_im, w2 + 8 + tw_im);
+    tw3.real = SplatTwiddlePair(w3, w3 + 12);
+    tw3.imag = SplatTwiddlePair(w3 + tw_im, w3 + 12 + tw_im);
+
+    // Real parts: x_k gathers elements {2k, 2k + 1} of both groups.
+    {
+      const __m128 r0 = _mm_load_ps(x_re);
+      const __m128 r1 = _mm_load_ps(x_re + 4);
+      const __m128 r2 = _mm_load_ps(x_re + 8);
+      const __m128 r3 = _mm_load_ps(x_re + 12);
+      x0.real = _mm_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0));
+      x1.real = _mm_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2));
+      x2.real = _mm_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0));
+      x3.real = _mm_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2));
+    }
+
+    // Imaginary parts, same gathering pattern.
+    {
+      const __m128 r0 = _mm_load_ps(x_im);
+      const __m128 r1 = _mm_load_ps(x_im + 4);
+      const __m128 r2 = _mm_load_ps(x_im + 8);
+      const __m128 r3 = _mm_load_ps(x_im + 12);
+      x0.imag = _mm_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0));
+      x1.imag = _mm_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2));
+      x2.imag = _mm_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0));
+      x3.imag = _mm_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2));
+    }
+
+    RADIX4_INV_BUTTERFLY(&y0, &y1, &y2, &y3,
+                         &tw1, &tw2, &tw3,
+                         &x0, &x1, &x2, &x3);
+
+    RADIX4_INV_BUTTERFLY_STORE(dst,
+                               dst + quarter,
+                               dst + 2 * quarter,
+                               dst + 3 * quarter,
+                               &y0, &y1, &y2, &y3, n);
+  }
+}
+
+// SSE middle stage of the inverse radix-4 complex FFT on split-format data.
+// The input is processed as sub_size groups of sub_num elements; within a
+// group, four butterflies (sets) are computed per iteration. When a group
+// has exactly two sets the work is delegated to InternalUnroll2Inv.
+//   in       - stage input; imaginary parts are n floats after the real
+//              parts. Read with aligned loads.
+//   out      - stage output, same split layout. Written with aligned stores.
+//   twiddle  - twiddle table; imaginary parts are 2 * n floats after the real
+//              parts.
+//   n        - offset between real and imaginary halves.
+//   sub_size - number of groups.
+//   sub_num  - number of elements per group.
+void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num) {
+  const OMX_INT tw_stride = sub_num >> 1;
+  const OMX_INT sets = sub_num >> 2;
+  const OMX_INT quarter = n >> 2;
+  const OMX_INT tw_im = n << 1;
+  OMX_F32 *dst = out;
+  OMX_INT g;
+  OMX_INT s;
+
+  if (sets == 2) {
+    InternalUnroll2Inv(in, out, twiddle, n);
+    return;
+  }
+
+  // Group 0 is handled without any twiddle multiplication.
+  for (s = 0; s < sets; s += 4, dst += 4) {
+    const OMX_F32 *src = in + s;
+    VC x0;
+    VC x1;
+    VC x2;
+    VC x3;
+    VC y0;
+    VC y1;
+    VC y2;
+    VC y3;
+
+    VC_LOAD_SPLIT(&x0, src, n);
+    VC_LOAD_SPLIT(&x1, src + sets, n);
+    VC_LOAD_SPLIT(&x2, src + 2 * sets, n);
+    VC_LOAD_SPLIT(&x3, src + 3 * sets, n);
+
+    RADIX4_BUTTERFLY_FS(&y0, &y1, &y2, &y3,
+                        &x0, &x1, &x2, &x3);
+
+    RADIX4_INV_BUTTERFLY_STORE(dst,
+                               dst + quarter,
+                               dst + 2 * quarter,
+                               dst + 3 * quarter,
+                               &y0, &y1, &y2, &y3, n);
+  }
+
+  // Remaining groups: one twiddle triple per group, broadcast to all lanes.
+  for (g = 1; g < sub_size; ++g) {
+    const OMX_INT tw_off = g * tw_stride;
+    const OMX_F32 *w1 = twiddle + tw_off;
+    const OMX_F32 *w2 = w1 + tw_off;
+    const OMX_F32 *w3 = w2 + tw_off;
+    const OMX_F32 *grp_in = in + g * sub_num;
+
+    VC tw1;
+    VC tw2;
+    VC tw3;
+
+    tw1.real = _mm_load1_ps(w1);
+    tw1.imag = _mm_load1_ps(w1 + tw_im);
+    tw2.real = _mm_load1_ps(w2);
+    tw2.imag = _mm_load1_ps(w2 + tw_im);
+    tw3.real = _mm_load1_ps(w3);
+    tw3.imag = _mm_load1_ps(w3 + tw_im);
+
+    for (s = 0; s < sets; s += 4, dst += 4) {
+      const OMX_F32 *src = grp_in + s;
+      VC x0;
+      VC x1;
+      VC x2;
+      VC x3;
+      VC y0;
+      VC y1;
+      VC y2;
+      VC y3;
+
+      VC_LOAD_SPLIT(&x0, src, n);
+      VC_LOAD_SPLIT(&x1, src + sets, n);
+      VC_LOAD_SPLIT(&x2, src + 2 * sets, n);
+      VC_LOAD_SPLIT(&x3, src + 3 * sets, n);
+
+      RADIX4_INV_BUTTERFLY(&y0, &y1, &y2, &y3,
+                           &tw1, &tw2, &tw3,
+                           &x0, &x1, &x2, &x3);
+
+      RADIX4_INV_BUTTERFLY_STORE(dst,
+                                 dst + quarter,
+                                 dst + 2 * quarter,
+                                 dst + 3 * quarter,
+                                 &y0, &y1, &y2, &y3, n);
+    }
+  }
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c b/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c
new file mode 100644
index 0000000..0a3d816
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_F32_radix2_kernel.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/omxtypes.h"
+#include <stdbool.h>
+
+// Prototypes of the radix-2 stage kernels used by the driver below. They are
+// declared here and not defined in this file.
+
+// First stage (takes no twiddle table), forward and inverse.
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_fs(const OMX_F32 *in,
+                                       OMX_F32 *out,
+                                       OMX_INT n);
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_fs(const OMX_F32 *in,
+                                       OMX_F32 *out,
+                                       OMX_INT n);
+
+// Middle stages, forward and inverse.
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ms(const OMX_F32 *in,
+                                       OMX_F32 *out,
+                                       const OMX_F32 *twiddle,
+                                       OMX_INT n,
+                                       OMX_INT sub_size,
+                                       OMX_INT sub_num);
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_ms(const OMX_F32 *in,
+                                       OMX_F32 *out,
+                                       const OMX_F32 *twiddle,
+                                       OMX_INT n,
+                                       OMX_INT sub_size,
+                                       OMX_INT sub_num);
+
+// Last stage, forward and inverse.
+void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(const OMX_F32 *in,
+                                       OMX_F32 *out,
+                                       const OMX_F32 *twiddle,
+                                       OMX_INT n);
+
+void x86SP_FFT_CToC_FC32_Inv_Radix2_ls(const OMX_F32 *in,
+                                       OMX_F32 *out,
+                                       const OMX_F32 *twiddle,
+                                       OMX_INT n);
+
+// Runs a complete out-of-place radix-2 complex FFT by chaining the first,
+// middle and last stage kernels, ping-ponging between buf1 and buf2.
+//   src         - input passed to the first stage.
+//   buf1, buf2  - the two work buffers; the first stage writes to buf1.
+//   twiddle     - twiddle table forwarded to the middle and last stages.
+//   n           - size argument forwarded to every stage kernel.
+//   forward_fft - true selects the Fwd kernels, false the Inv kernels.
+// Returns whichever of buf1 / buf2 holds the output of the final stage.
+OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
+    const OMX_F32 *src,
+    // Two Ping Pong buffers for out of place kernel.
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft) {
+  // Pick the direction-specific kernels once.
+  void (*first_stage)(const OMX_F32 *, OMX_F32 *, OMX_INT) =
+      forward_fft ? x86SP_FFT_CToC_FC32_Fwd_Radix2_fs
+                  : x86SP_FFT_CToC_FC32_Inv_Radix2_fs;
+  void (*middle_stage)(const OMX_F32 *, OMX_F32 *, const OMX_F32 *,
+                       OMX_INT, OMX_INT, OMX_INT) =
+      forward_fft ? x86SP_FFT_CToC_FC32_Fwd_Radix2_ms
+                  : x86SP_FFT_CToC_FC32_Inv_Radix2_ms;
+  void (*last_stage)(const OMX_F32 *, OMX_F32 *, const OMX_F32 *, OMX_INT) =
+      forward_fft ? x86SP_FFT_CToC_FC32_Fwd_Radix2_ls
+                  : x86SP_FFT_CToC_FC32_Inv_Radix2_ls;
+
+  const OMX_INT half = n >> 1;
+  OMX_F32 *cur = buf1;
+  OMX_F32 *next = buf2;
+  OMX_INT sub_size = 2;
+  OMX_INT sub_num = half;
+
+  first_stage(src, cur, n);
+
+  // Each middle stage doubles the group count and halves the group length.
+  while (sub_size < half) {
+    OMX_F32 *swap;
+
+    middle_stage(cur, next, twiddle, n, sub_size, sub_num);
+
+    swap = next;
+    next = cur;
+    cur = swap;
+
+    sub_size <<= 1;
+    sub_num >>= 1;
+  }
+
+  // With sub_num <= 1 there is no last stage to run; the most recent
+  // output is already the result.
+  if (sub_num <= 1) {
+    return cur;
+  }
+
+  last_stage(cur, next, twiddle, n);
+  return next;
+}
diff --git a/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c b/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c
new file mode 100644
index 0000000..e7c7b89
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_FFT_F32_radix4_kernel.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+#include "dl/api/omxtypes.h"
+#include <stdbool.h>
+
+// Prototypes of the stage kernels used by the radix-4 drivers below. They
+// are declared here and not defined in this file.
+
+// Radix-4 first stage (takes no twiddle table), scalar and SSE.
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_fs(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    OMX_INT n);
+
+// Radix-4 middle stages, scalar and SSE.
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ms(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    OMX_INT sub_size,
+    OMX_INT sub_num);
+
+// Radix-4 last stage, scalar and SSE.
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+// Radix-2 last stage, scalar and SSE. The drivers below call these when the
+// stage loop finishes with sub_num == 2, so they must be declared here too;
+// calling them without a prototype is an implicit function declaration.
+// NOTE(review): the _sse parameter lists mirror the scalar variants and the
+// call sites in this file - confirm against the definitions.
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ls(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+extern void x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse(
+    const OMX_F32 *in,
+    OMX_F32 *out,
+    const OMX_F32 *twiddle,
+    OMX_INT n);
+
+// Runs a complete out-of-place complex FFT with the scalar radix-4 kernels,
+// ping-ponging between buf1 and buf2. The last stage is radix-2 when the
+// stage loop ends with sub_num == 2 and radix-4 otherwise.
+//   src         - input passed to the first stage.
+//   buf1, buf2  - the two work buffers; the first stage writes to buf1.
+//   twiddle     - twiddle table forwarded to the middle and last stages.
+//   n           - size argument forwarded to every stage kernel.
+//   forward_fft - true selects the Fwd kernels, false the Inv kernels.
+// Returns whichever of buf1 / buf2 the last stage wrote to.
+OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    bool forward_fft) {
+  const OMX_INT quarter = n >> 2;
+  OMX_F32 *cur = buf1;
+  OMX_F32 *next = buf2;
+  OMX_INT sub_size = 4;
+  OMX_INT sub_num = quarter;
+
+  if (forward_fft) {
+    x86SP_FFT_CToC_FC32_Fwd_Radix4_fs(src, cur, n);
+  } else {
+    x86SP_FFT_CToC_FC32_Inv_Radix4_fs(src, cur, n);
+  }
+
+  // Each middle stage quadruples the group count and quarters the group
+  // length.
+  while (sub_size < quarter) {
+    OMX_F32 *swap;
+
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ms(cur, next, twiddle,
+                                        n, sub_size, sub_num);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ms(cur, next, twiddle,
+                                        n, sub_size, sub_num);
+    }
+
+    swap = next;
+    next = cur;
+    cur = swap;
+
+    sub_size <<= 2;
+    sub_num >>= 2;
+  }
+
+  if (sub_num == 2) {
+    // A group length of 2 remains: finish with a radix-2 stage.
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix2_ls(cur, next, twiddle, n);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix2_ls(cur, next, twiddle, n);
+    }
+  } else {
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ls(cur, next, twiddle, n);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ls(cur, next, twiddle, n);
+    }
+  }
+
+  return next;
+}
+
+// SSE counterpart of x86SP_F32_radix4_kernel_OutOfPlace: runs a complete
+// out-of-place complex FFT with the *_sse stage kernels, ping-ponging
+// between buf1 and buf2. The last stage is radix-2 when the stage loop ends
+// with sub_num == 2 and radix-4 otherwise.
+//   src         - input passed to the first stage.
+//   buf1, buf2  - the two work buffers; the first stage writes to buf1.
+//   twiddle     - twiddle table forwarded to the middle and last stages.
+//   n           - size argument forwarded to every stage kernel.
+// Returns whichever of buf1 / buf2 the last stage wrote to.
+OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
+    const OMX_F32 *src,
+    OMX_F32 *buf1,
+    OMX_F32 *buf2,
+    const OMX_F32 *twiddle,
+    OMX_INT n,
+    // true for forward, false for inverse.
+    bool forward_fft) {
+  const OMX_INT quarter = n >> 2;
+  OMX_F32 *cur = buf1;
+  OMX_F32 *next = buf2;
+  OMX_INT sub_size = 4;
+  OMX_INT sub_num = quarter;
+
+  if (forward_fft) {
+    x86SP_FFT_CToC_FC32_Fwd_Radix4_fs_sse(src, cur, n);
+  } else {
+    x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(src, cur, n);
+  }
+
+  // Each middle stage quadruples the group count and quarters the group
+  // length.
+  while (sub_size < quarter) {
+    OMX_F32 *swap;
+
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ms_sse(cur, next, twiddle,
+                                            n, sub_size, sub_num);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(cur, next, twiddle,
+                                            n, sub_size, sub_num);
+    }
+
+    swap = next;
+    next = cur;
+    cur = swap;
+
+    sub_size <<= 2;
+    sub_num >>= 2;
+  }
+
+  // If n is not power of 4, sub_num == 2.
+  if (sub_num == 2) {
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix2_ls_sse(cur, next, twiddle, n);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix2_ls_sse(cur, next, twiddle, n);
+    }
+  } else {
+    if (forward_fft) {
+      x86SP_FFT_CToC_FC32_Fwd_Radix4_ls_sse(cur, next, twiddle, n);
+    } else {
+      x86SP_FFT_CToC_FC32_Inv_Radix4_ls_sse(cur, next, twiddle, n);
+    }
+  }
+
+  return next;
+}
diff --git a/dl/sp/src/x86/x86SP_SSE_Math.h b/dl/sp/src/x86/x86SP_SSE_Math.h
new file mode 100644
index 0000000..d10a851
--- /dev/null
+++ b/dl/sp/src/x86/x86SP_SSE_Math.h
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include <emmintrin.h>
+#include <assert.h>
+
+/**
+ * Two data formats are used by the FFT routines, internally: interleaved
+ * and split.
+ *
+ * The interface to the main external FFT routines uses interleaved complex
+ * values, where the real part is followed by the imaginary part.
+ *
+ * The other is the split format, where a complex vector of real and
+ * imaginary values is split such that all of the real values are placed in
+ * the first half of the vector and the corresponding imaginary values are
+ * placed in the second half, in the same order. The conversion from
+ * interleaved complex values to split format and back is transparent to
+ * the external FFT interface.
+ *
+ * VComplex uses split format.
+ */
+
+/**
+ * VComplex holds 4 complex float elements: lane i of `real` and lane i of
+ * `imag` together form complex element i.
+ */
+struct VComplex {
+  __m128 real;
+  __m128 imag;
+};
+typedef struct VComplex VC;
+
+/**
+ * Lane-wise complex product: out = a * b.
+ * out->real is stored before the imaginary part re-reads a->real and
+ * b->real, so the result is not a true product if out aliases a or b.
+ */
+static inline void VC_MUL(VC *out, VC *a, VC *b) {
+  const __m128 re_re = _mm_mul_ps(a->real, b->real);
+  const __m128 im_im = _mm_mul_ps(a->imag, b->imag);
+  out->real = _mm_sub_ps(re_re, im_im);
+
+  const __m128 re_im = _mm_mul_ps(a->real, b->imag);
+  const __m128 im_re = _mm_mul_ps(a->imag, b->real);
+  out->imag = _mm_add_ps(re_im, im_re);
+}
+
+/**
+ * Lane-wise complex product with the first operand conjugated:
+ * out = conj(a) * b.
+ * out->real is stored before the imaginary part re-reads a->real and
+ * b->real, so the result is not a true product if out aliases a or b.
+ */
+static inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
+  const __m128 re_re = _mm_mul_ps(a->real, b->real);
+  const __m128 im_im = _mm_mul_ps(a->imag, b->imag);
+  out->real = _mm_add_ps(re_re, im_im);
+
+  const __m128 re_im = _mm_mul_ps(a->real, b->imag);
+  const __m128 im_re = _mm_mul_ps(a->imag, b->real);
+  out->imag = _mm_sub_ps(re_im, im_re);
+}
+
+/* out = factor * a, with both parts scaled lane-wise by a real factor. */
+static inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
+  __m128 scaled = _mm_mul_ps(factor, a->real);
+  out->real = scaled;
+  scaled = _mm_mul_ps(factor, a->imag);
+  out->imag = scaled;
+}
+
+/* Lane-wise complex sum: out = a + b. */
+static inline void VC_ADD(VC *out, VC *a, VC *b) {
+  VC sum;
+  sum.real = _mm_add_ps(a->real, b->real);
+  sum.imag = _mm_add_ps(a->imag, b->imag);
+  *out = sum;
+}
+
+/**
+ * Cross addition:
+ *   out.real = a.real + b.imag
+ *   out.imag = b.real + a.imag
+ * out->real is stored before b->real is read, so out must not alias b if
+ * the original b.real is wanted.
+ */
+static inline void VC_ADD_X(VC *out, VC *a, VC *b) {
+  const __m128 cross_re = _mm_add_ps(a->real, b->imag);
+  out->real = cross_re;
+
+  const __m128 cross_im = _mm_add_ps(b->real, a->imag);
+  out->imag = cross_im;
+}
+
+/**
+ * Computes a + b and stores it in split format:
+ *   out[0..3]                   = a.real + b.real
+ *   out[offset..offset + 3]     = a.imag + b.imag
+ * Uses aligned stores, so both destinations must be 16 byte aligned.
+ */
+static inline void VC_ADD_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  OMX_F32 *out_imag = out + offset;
+
+  const __m128 sum_re = _mm_add_ps(a->real, b->real);
+  _mm_store_ps(out, sum_re);
+
+  const __m128 sum_im = _mm_add_ps(a->imag, b->imag);
+  _mm_store_ps(out_imag, sum_im);
+}
+
+/* Lane-wise complex difference: out = a - b. */
+static inline void VC_SUB(VC *out, VC *a, VC *b) {
+  VC diff;
+  diff.real = _mm_sub_ps(a->real, b->real);
+  diff.imag = _mm_sub_ps(a->imag, b->imag);
+  *out = diff;
+}
+
+/**
+ * Cross subtraction:
+ *   out.real = a.real - b.imag
+ *   out.imag = b.real - a.imag
+ * Note the operand order of the imaginary part: it is b.real - a.imag, the
+ * same as in VC_SUB_X_INVERSE_STOREU_SPLIT. Use VC_ADD_SUB_X when
+ * a.imag - b.real is wanted.
+ * out->real is stored before b->real is read, so out must not alias b if
+ * the original b.real is wanted.
+ */
+static inline void VC_SUB_X(VC *out, VC *a, VC *b) {
+  out->real = _mm_sub_ps(a->real, b->imag);
+  out->imag = _mm_sub_ps(b->real, a->imag);
+}
+
+/**
+ * Computes a - b and stores it in split format:
+ *   out[0..3]                   = a.real - b.real
+ *   out[offset..offset + 3]     = a.imag - b.imag
+ * Uses aligned stores, so both destinations must be 16 byte aligned.
+ */
+static inline void VC_SUB_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  OMX_F32 *out_imag = out + offset;
+
+  const __m128 diff_re = _mm_sub_ps(a->real, b->real);
+  _mm_store_ps(out, diff_re);
+
+  const __m128 diff_im = _mm_sub_ps(a->imag, b->imag);
+  _mm_store_ps(out_imag, diff_im);
+}
+
+/**
+ * Adds the real parts and subtracts the imaginary parts:
+ *   out.real = a.real + b.real
+ *   out.imag = a.imag - b.imag
+ */
+static inline void VC_ADD_SUB(VC *out, VC *a, VC *b) {
+  VC result;
+  result.real = _mm_add_ps(a->real, b->real);
+  result.imag = _mm_sub_ps(a->imag, b->imag);
+  *out = result;
+}
+
+/**
+ * Cross add / subtract:
+ *   out.real = a.real + b.imag
+ *   out.imag = a.imag - b.real
+ * out->real is stored before b->real is read, so out must not alias b if
+ * the original b.real is wanted.
+ */
+static inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) {
+  const __m128 cross_re = _mm_add_ps(a->real, b->imag);
+  out->real = cross_re;
+
+  const __m128 cross_im = _mm_sub_ps(a->imag, b->real);
+  out->imag = cross_im;
+}
+
+/**
+ * Cross add / subtract stored in split format:
+ *   out[0..3]                   = a.real + b.imag
+ *   out[offset..offset + 3]     = a.imag - b.real
+ * Uses aligned stores, so both destinations must be 16 byte aligned.
+ */
+static inline void VC_ADD_SUB_X_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  OMX_F32 *out_imag = out + offset;
+
+  const __m128 cross_re = _mm_add_ps(a->real, b->imag);
+  _mm_store_ps(out, cross_re);
+
+  const __m128 cross_im = _mm_sub_ps(a->imag, b->real);
+  _mm_store_ps(out_imag, cross_im);
+}
+
+/**
+ * Subtracts the real parts and adds the imaginary parts:
+ *   out.real = a.real - b.real
+ *   out.imag = a.imag + b.imag
+ */
+static inline void VC_SUB_ADD(VC *out, VC *a, VC *b) {
+  VC result;
+  result.real = _mm_sub_ps(a->real, b->real);
+  result.imag = _mm_add_ps(a->imag, b->imag);
+  *out = result;
+}
+
+/**
+ * Cross subtract / add:
+ *   out.real = a.real - b.imag
+ *   out.imag = a.imag + b.real
+ * out->real is stored before b->real is read, so out must not alias b if
+ * the original b.real is wanted.
+ */
+static inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) {
+  const __m128 cross_re = _mm_sub_ps(a->real, b->imag);
+  out->real = cross_re;
+
+  const __m128 cross_im = _mm_add_ps(a->imag, b->real);
+  out->imag = cross_im;
+}
+
+/**
+ * Cross subtract / add stored in split format:
+ *   out[0..3]                   = a.real - b.imag
+ *   out[offset..offset + 3]     = a.imag + b.real
+ * Uses aligned stores, so both destinations must be 16 byte aligned.
+ */
+static inline void VC_SUB_ADD_X_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a, VC *b,
+    OMX_INT offset) {
+  OMX_F32 *out_imag = out + offset;
+
+  const __m128 cross_re = _mm_sub_ps(a->real, b->imag);
+  _mm_store_ps(out, cross_re);
+
+  const __m128 cross_im = _mm_add_ps(a->imag, b->real);
+  _mm_store_ps(out_imag, cross_im);
+}
+
+/**
+ * Stores a complex vector in split format:
+ *   out[0..3]                   = in.real
+ *   out[offset..offset + 3]     = in.imag
+ * Uses aligned stores, so both destinations must be 16 byte aligned.
+ */
+static inline void VC_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *in,
+    OMX_INT offset) {
+  OMX_F32 *out_real = out;
+  OMX_F32 *out_imag = &out[offset];
+
+  _mm_store_ps(out_real, in->real);
+  _mm_store_ps(out_imag, in->imag);
+}
+
+/**
+ * Loads a complex vector from split format:
+ *   out.real = in[0..3]
+ *   out.imag = in[offset..offset + 3]
+ * Uses aligned loads, so both sources must be 16 byte aligned.
+ */
+static inline void VC_LOAD_SPLIT(
+    VC *out,
+    const OMX_F32 *in,
+    OMX_INT offset) {
+  const OMX_F32 *in_real = in;
+  const OMX_F32 *in_imag = &in[offset];
+
+  out->real = _mm_load_ps(in_real);
+  out->imag = _mm_load_ps(in_imag);
+}
+
+/**
+ * Unpacks a split-format vector into interleaved order:
+ *   out.real = [in.real[0], in.imag[0], in.real[1], in.imag[1]]
+ *   out.imag = [in.real[2], in.imag[2], in.real[3], in.imag[3]]
+ * out->real is stored before in->real is read again, so out must not
+ * alias in.
+ */
+static inline void VC_UNPACK(VC *out, VC *in) {
+  const __m128 low_pairs = _mm_unpacklo_ps(in->real, in->imag);
+  out->real = low_pairs;
+
+  const __m128 high_pairs = _mm_unpackhi_ps(in->real, in->imag);
+  out->imag = high_pairs;
+}
+
+/**
+ * Loads four complex values from an interleaved array (8 floats) and
+ * de-interleaves them:
+ *   out.real = [in[0], in[2], in[4], in[6]]
+ *   out.imag = [in[1], in[3], in[5], in[7]]
+ * Uses aligned loads, so in must be 16 byte aligned.
+ */
+static inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) {
+  const __m128 first_two = _mm_load_ps(&in[0]);
+  const __m128 last_two = _mm_load_ps(&in[4]);
+
+  /* Even lanes are the real parts, odd lanes the imaginary parts. */
+  out->real = _mm_shuffle_ps(first_two, last_two, _MM_SHUFFLE(2, 0, 2, 0));
+  out->imag = _mm_shuffle_ps(first_two, last_two, _MM_SHUFFLE(3, 1, 3, 1));
+}
+/**
+ * Loads a complex vector from split format using unaligned loads:
+ *   out.real = in[0..3]
+ *   out.imag = in[offset..offset + 3]
+ * The input addresses do not need to be 16 byte aligned.
+ */
+static inline void VC_LOADU_SPLIT(
+    VC *out,
+    const OMX_F32 *in,
+    OMX_INT offset) {
+  const OMX_F32 *in_real = in;
+  const OMX_F32 *in_imag = &in[offset];
+
+  out->real = _mm_loadu_ps(in_real);
+  out->imag = _mm_loadu_ps(in_imag);
+}
+
+/**
+ * Reverses the lane order of a complex vector in place, so element 3
+ * becomes element 0 and so on, for both the real and imaginary parts.
+ */
+static inline void VC_REVERSE(VC *v) {
+  const __m128 re = v->real;
+  v->real = _mm_shuffle_ps(re, re, _MM_SHUFFLE(0, 1, 2, 3));
+
+  const __m128 im = v->imag;
+  v->imag = _mm_shuffle_ps(im, im, _MM_SHUFFLE(0, 1, 2, 3));
+}
+/**
+ * Stores a complex vector to an interleaved complex array (8 floats):
+ *   out[2 * i]     = in.real[i]
+ *   out[2 * i + 1] = in.imag[i]      for i = 0..3
+ * Uses aligned stores, so out must be 16 byte aligned.
+ */
+static inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) {
+  const __m128 first_two = _mm_unpacklo_ps(in->real, in->imag);
+  _mm_store_ps(&out[0], first_two);
+
+  const __m128 last_two = _mm_unpackhi_ps(in->real, in->imag);
+  _mm_store_ps(&out[4], last_two);
+}
+
+/**
+ * Stores a complex vector to an interleaved complex array (8 floats) using
+ * unaligned stores:
+ *   out[2 * i]     = in.real[i]
+ *   out[2 * i + 1] = in.imag[i]      for i = 0..3
+ * The output address does not need to be 16 byte aligned.
+ */
+static inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) {
+  const __m128 first_two = _mm_unpacklo_ps(in->real, in->imag);
+  _mm_storeu_ps(&out[0], first_two);
+
+  const __m128 last_two = _mm_unpackhi_ps(in->real, in->imag);
+  _mm_storeu_ps(&out[4], last_two);
+}
+
+/**
+ * Cross addition stored in split format:
+ *   out[0..3]                   = a.real + b.imag
+ *   out[offset..offset + 3]     = b.real + a.imag
+ * Uses aligned stores, so both destinations must be 16 byte aligned.
+ */
+static inline void VC_ADD_X_STORE_SPLIT(
+    OMX_F32 *out,
+    VC *a, VC *b,
+    OMX_INT offset) {
+  OMX_F32 *out_imag = out + offset;
+
+  const __m128 cross_re = _mm_add_ps(a->real, b->imag);
+  _mm_store_ps(out, cross_re);
+
+  const __m128 cross_im = _mm_add_ps(b->real, a->imag);
+  _mm_store_ps(out_imag, cross_im);
+}
+
+/**
+ * Cross subtraction stored in split format with the lane order reversed:
+ *   out[0..3]                   = reverse(a.real - b.imag)
+ *   out[offset..offset + 3]     = reverse(b.real - a.imag)
+ * Uses unaligned stores; the addresses need not be 16 byte aligned.
+ */
+static inline void VC_SUB_X_INVERSE_STOREU_SPLIT(
+    OMX_F32 *out,
+    VC *a,
+    VC *b,
+    OMX_INT offset) {
+  OMX_F32 *out_imag = out + offset;
+
+  const __m128 cross_re = _mm_sub_ps(a->real, b->imag);
+  _mm_storeu_ps(out,
+                _mm_shuffle_ps(cross_re, cross_re, _MM_SHUFFLE(0, 1, 2, 3)));
+
+  const __m128 cross_im = _mm_sub_ps(b->real, a->imag);
+  _mm_storeu_ps(out_imag,
+                _mm_shuffle_ps(cross_im, cross_im, _MM_SHUFFLE(0, 1, 2, 3)));
+}
+
+/**
+ * Loads four interleaved complex values (8 floats) and splits them into
+ * two registers:
+ *   *out0 = [in[0], in[2], in[4], in[6]]   (real parts)
+ *   *out1 = [in[1], in[3], in[5], in[7]]   (imaginary parts)
+ * Uses aligned loads, so in must be 16 byte aligned.
+ */
+static inline void VC_LOAD_SHUFFLE(
+    __m128 *out0,
+    __m128 *out1,
+    const OMX_F32 *in) {
+  const __m128 first_two = _mm_load_ps(in);
+  const __m128 last_two = _mm_load_ps(in + 4);
+
+  *out0 = _mm_shuffle_ps(first_two, last_two, _MM_SHUFFLE(2, 0, 2, 0));
+  *out1 = _mm_shuffle_ps(first_two, last_two, _MM_SHUFFLE(3, 1, 3, 1));
+}
+
+/**
+ * Finishes the forward radix-4 butterfly and stores the four outputs in
+ * split format (imaginary parts n floats after the real parts):
+ *   out0 = t0 + t2
+ *   out2 = t0 - t2
+ *   out1 = (t1.real + t3.imag, t1.imag - t3.real)
+ *   out3 = (t1.real - t3.imag, t1.imag + t3.real)
+ * Uses aligned stores, so every destination must be 16 byte aligned.
+ */
+static inline void RADIX4_FWD_BUTTERFLY_STORE(
+    OMX_F32 *out0,
+    OMX_F32 *out1,
+    OMX_F32 *out2,
+    OMX_F32 *out3,
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    OMX_INT n) {
+  /* out0 = t0 + t2 */
+  _mm_store_ps(out0, _mm_add_ps(t0->real, t2->real));
+  _mm_store_ps(out0 + n, _mm_add_ps(t0->imag, t2->imag));
+
+  /* out2 = t0 - t2 */
+  _mm_store_ps(out2, _mm_sub_ps(t0->real, t2->real));
+  _mm_store_ps(out2 + n, _mm_sub_ps(t0->imag, t2->imag));
+
+  /* out1: cross add / subtract of t1 and t3 */
+  _mm_store_ps(out1, _mm_add_ps(t1->real, t3->imag));
+  _mm_store_ps(out1 + n, _mm_sub_ps(t1->imag, t3->real));
+
+  /* out3: cross subtract / add of t1 and t3 */
+  _mm_store_ps(out3, _mm_sub_ps(t1->real, t3->imag));
+  _mm_store_ps(out3 + n, _mm_add_ps(t1->imag, t3->real));
+}
+
+/**
+ * Finishes the inverse radix-4 butterfly and stores the four outputs in
+ * split format (imaginary parts n floats after the real parts):
+ *   out0 = t0 + t2
+ *   out2 = t0 - t2
+ *   out1 = (t1.real - t3.imag, t1.imag + t3.real)
+ *   out3 = (t1.real + t3.imag, t1.imag - t3.real)
+ * Uses aligned stores, so every destination must be 16 byte aligned.
+ */
+static inline void RADIX4_INV_BUTTERFLY_STORE(
+    OMX_F32 *out0,
+    OMX_F32 *out1,
+    OMX_F32 *out2,
+    OMX_F32 *out3,
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    OMX_INT n) {
+  /* out0 = t0 + t2 */
+  _mm_store_ps(out0, _mm_add_ps(t0->real, t2->real));
+  _mm_store_ps(out0 + n, _mm_add_ps(t0->imag, t2->imag));
+
+  /* out2 = t0 - t2 */
+  _mm_store_ps(out2, _mm_sub_ps(t0->real, t2->real));
+  _mm_store_ps(out2 + n, _mm_sub_ps(t0->imag, t2->imag));
+
+  /* out1: cross subtract / add of t1 and t3 */
+  _mm_store_ps(out1, _mm_sub_ps(t1->real, t3->imag));
+  _mm_store_ps(out1 + n, _mm_add_ps(t1->imag, t3->real));
+
+  /* out3: cross add / subtract of t1 and t3 */
+  _mm_store_ps(out3, _mm_add_ps(t1->real, t3->imag));
+  _mm_store_ps(out3 + n, _mm_sub_ps(t1->imag, t3->real));
+}
+
+/**
+ * First half of the forward radix-4 butterfly. Inputs T1..T3 are multiplied
+ * by the twiddles Tw1..Tw3, then combined:
+ *   t0 = T0 + Tw2 * T2
+ *   t1 = T0 - Tw2 * T2
+ *   t2 = Tw1 * T1 + Tw3 * T3
+ *   t3 = Tw1 * T1 - Tw3 * T3
+ * The outputs are written in the order t0, t1, t2, t3.
+ */
+static inline void RADIX4_FWD_BUTTERFLY(
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    VC *Tw1,
+    VC *Tw2,
+    VC *Tw3,
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3) {
+  VC even;   /* Tw2 * T2 */
+  VC odd_a;  /* Tw1 * T1 */
+  VC odd_b;  /* Tw3 * T3 */
+
+  VC_MUL(&even, Tw2, T2);
+  VC_MUL(&odd_a, Tw1, T1);
+  VC_MUL(&odd_b, Tw3, T3);
+
+  /* t0 = T0 + even */
+  t0->real = _mm_add_ps(T0->real, even.real);
+  t0->imag = _mm_add_ps(T0->imag, even.imag);
+
+  /* t1 = T0 - even */
+  t1->real = _mm_sub_ps(T0->real, even.real);
+  t1->imag = _mm_sub_ps(T0->imag, even.imag);
+
+  /* t2 = odd_a + odd_b */
+  t2->real = _mm_add_ps(odd_a.real, odd_b.real);
+  t2->imag = _mm_add_ps(odd_a.imag, odd_b.imag);
+
+  /* t3 = odd_a - odd_b */
+  t3->real = _mm_sub_ps(odd_a.real, odd_b.real);
+  t3->imag = _mm_sub_ps(odd_a.imag, odd_b.imag);
+}
+
+/**
+ * First half of the inverse radix-4 butterfly. Inputs T1..T3 are multiplied
+ * by the conjugated twiddles, then combined:
+ *   t0 = T0 + conj(Tw2) * T2
+ *   t1 = T0 - conj(Tw2) * T2
+ *   t2 = conj(Tw1) * T1 + conj(Tw3) * T3
+ *   t3 = conj(Tw1) * T1 - conj(Tw3) * T3
+ * The outputs are written in the order t0, t1, t2, t3.
+ */
+static inline void RADIX4_INV_BUTTERFLY(
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    VC *Tw1,
+    VC *Tw2,
+    VC *Tw3,
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3) {
+  VC even;   /* conj(Tw2) * T2 */
+  VC odd_a;  /* conj(Tw1) * T1 */
+  VC odd_b;  /* conj(Tw3) * T3 */
+
+  VC_CONJ_MUL(&even, Tw2, T2);
+  VC_CONJ_MUL(&odd_a, Tw1, T1);
+  VC_CONJ_MUL(&odd_b, Tw3, T3);
+
+  /* t0 = T0 + even */
+  t0->real = _mm_add_ps(T0->real, even.real);
+  t0->imag = _mm_add_ps(T0->imag, even.imag);
+
+  /* t1 = T0 - even */
+  t1->real = _mm_sub_ps(T0->real, even.real);
+  t1->imag = _mm_sub_ps(T0->imag, even.imag);
+
+  /* t2 = odd_a + odd_b */
+  t2->real = _mm_add_ps(odd_a.real, odd_b.real);
+  t2->imag = _mm_add_ps(odd_a.imag, odd_b.imag);
+
+  /* t3 = odd_a - odd_b */
+  t3->real = _mm_sub_ps(odd_a.real, odd_b.real);
+  t3->imag = _mm_sub_ps(odd_a.imag, odd_b.imag);
+}
+
+/**
+ * First half of the radix-4 butterfly without twiddle multiplication,
+ * shared by the forward and inverse transforms:
+ *   t0 = T0 + T2
+ *   t1 = T0 - T2
+ *   t2 = T1 + T3
+ *   t3 = T1 - T3
+ * The outputs are written in the order t0, t1, t2, t3.
+ */
+static inline void RADIX4_BUTTERFLY_FS(
+    VC *t0,
+    VC *t1,
+    VC *t2,
+    VC *t3,
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3) {
+  /* t0 = T0 + T2 */
+  t0->real = _mm_add_ps(T0->real, T2->real);
+  t0->imag = _mm_add_ps(T0->imag, T2->imag);
+
+  /* t1 = T0 - T2 */
+  t1->real = _mm_sub_ps(T0->real, T2->real);
+  t1->imag = _mm_sub_ps(T0->imag, T2->imag);
+
+  /* t2 = T1 + T3 */
+  t2->real = _mm_add_ps(T1->real, T3->real);
+  t2->imag = _mm_add_ps(T1->imag, T3->imag);
+
+  /* t3 = T1 - T3 */
+  t3->real = _mm_sub_ps(T1->real, T3->real);
+  t3->imag = _mm_sub_ps(T1->imag, T3->imag);
+}
+
+/**
+ * Loads a 4 x 4 block of complex values in split format and transposes it.
+ * Row k is the four floats at pTk (real parts) and at pTk + n (imaginary
+ * parts); after the call Tk holds column k of that block:
+ *    3,  2,  1,  0               12,  8,  4,  0
+ *    7,  6,  5,  4     =====>    13,  9,  5,  1
+ *   11, 10,  9,  8               14, 10,  6,  2
+ *   15, 14, 13, 12               15, 11,  7,  3
+ * Uses aligned loads, so every source address must be 16 byte aligned.
+ */
+static inline void VC_LOAD_MATRIX_TRANSPOSE(
+    VC *T0,
+    VC *T1,
+    VC *T2,
+    VC *T3,
+    const OMX_F32 *pT0,
+    const OMX_F32 *pT1,
+    const OMX_F32 *pT2,
+    const OMX_F32 *pT3,
+    OMX_INT n) {
+  __m128 row0;
+  __m128 row1;
+  __m128 row2;
+  __m128 row3;
+
+  /* Real parts. */
+  row0 = _mm_load_ps(pT0);
+  row1 = _mm_load_ps(pT1);
+  row2 = _mm_load_ps(pT2);
+  row3 = _mm_load_ps(pT3);
+  _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
+  T0->real = row0;
+  T1->real = row1;
+  T2->real = row2;
+  T3->real = row3;
+
+  /* Imaginary parts, n floats after the real parts. */
+  row0 = _mm_load_ps(pT0 + n);
+  row1 = _mm_load_ps(pT1 + n);
+  row2 = _mm_load_ps(pT2 + n);
+  row3 = _mm_load_ps(pT3 + n);
+  _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
+  T0->imag = row0;
+  T1->imag = row1;
+  T2->imag = row2;
+  T3->imag = row3;
+}