Optimize scaleFFTData for float FFTs

BUG=

Speed up scaleFFTData by about 30% by doing the scaling on 4 complex (8 float) elements at a time.

Some timing measurements using perf, measuring time_fft_time -T -F -f 1 -n 11 -g 2 -c 1000000

Before optimization:

samples pcnt function DSO
_______ _____ ______________________________________ _____________
2364.00 25.9% evenOddButterflyLoopInv [vectors]
1957.00 21.4% radix4SetLoopINV [vectors]
1197.00 13.1% radix4SkipReadINV [vectors]
1009.00 11.0% scaleFFTData [vectors]

After optimization:

samples pcnt function DSO
_______ _____ ______________________________________ _____________
3806.00 25.9% evenOddButterflyLoopInv [vectors]
3523.00 23.9% radix4SetLoopINV [vectors]
2103.00 14.3% radix4SkipReadINV [vectors]
1471.00 10.0% radix4lsGrpLoopinv [vectors]
1134.00 7.7% scaleFFTData [vectors]

The time spent in scaleFFTData has gone down from 11% to 7.7%.

R=aedla@chromium.org, andrew@webrtc.org, kma@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/1574005

git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@4148 4adac7df-926f-26a2-2b94-8c16560cd09d

commit: f2abf62750419d4ce70b53b1b4fef405046d29f9 [log] [tgz]
author: rtoy@google.com <rtoy@google.com@4adac7df-926f-26a2-2b94-8c16560cd09d> Fri May 31 17:08:30 2013
committer: rtoy@google.com <rtoy@google.com@4adac7df-926f-26a2-2b94-8c16560cd09d> Fri May 31 17:08:30 2013
tree: ce3dda73cb96f2aae2cdd142ba0a2fe632cec053
parent: 6a4eb762f075e42ea1b0a1828fe0a58478d4cffa [diff]
diff --git a/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S b/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
index 2616506..5deaf89 100644
--- a/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
+++ b/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S

@@ -134,6 +134,8 @@
 #define dScale  D2.F32
 #define one     S4.F32
 
+#define qX0     Q2.F32
+#define qX1     Q3.F32
 
     @// Allocate stack memory required by the function
         M_ALLOC4        complexFFTSize, 4
@@ -262,15 +264,25 @@
         VDIV    one, one, fN            @ one = dScale[0] = 1 / fftSize
 
 
-        @// N = subFFTSize  ; dataptr = pDst
+        @// subFFTSize = N = complexFFTSize, which is always even and
+        @// greater than 0.
+        CMP     subFFTSize, #4
+        BLT     scaleFFTData1
 scaleFFTData:
-        VLD1    {dX0},[pSrc]            @// pSrc contains pDst pointer
-        SUBS    subFFTSize,subFFTSize,#1
-        VMUL    dX0, dX0, dScale[0]
-        VST1    {dX0},[pSrc]!
+        @// Scale 4 complex (8 float) elements at a time 
+        VLD1    {qX0, qX1}, [pSrc :256]            @// pSrc contains pDst pointer
+        SUBS    subFFTSize, subFFTSize, #4
+        VMUL    qX0, qX0, dScale[0]
+        VMUL    qX1, qX1, dScale[0]
+        VST1    {qX0, qX1}, [pSrc :256]!
 
         BGT     scaleFFTData
-
+scaleFFTData1:
+        CMP     subFFTSize, #2
+        BLT     End
+        VLD1    {qX0}, [pSrc]
+        VMUL    qX0, qX0, dScale[0]
+        VST1    {qX0}, [pSrc]!  
 End:
         @// Set return value
         MOV     result, #OMX_Sts_NoErr
commit	f2abf62750419d4ce70b53b1b4fef405046d29f9	[log] [tgz]
author	rtoy@google.com <rtoy@google.com@4adac7df-926f-26a2-2b94-8c16560cd09d>	Fri May 31 17:08:30 2013
committer	rtoy@google.com <rtoy@google.com@4adac7df-926f-26a2-2b94-8c16560cd09d>	Fri May 31 17:08:30 2013
tree	ce3dda73cb96f2aae2cdd142ba0a2fe632cec053
parent	6a4eb762f075e42ea1b0a1828fe0a58478d4cffa [diff]