Minimized the table sizes for MIPS implementation.

Size of the Twiddle and Offset table is reduced.
Minor changes in algorithm due to the reduced table sizes.
Minor speed improvement observed some FFT sizes.

R=andrew@webrtc.org, rtoy@google.com

Review URL: https://webrtc-codereview.appspot.com/19959004

Patch from Zeljko Lukac <Zeljko.Lukac@imgtec.com>.

git-svn-id: http://webrtc.googlecode.com/svn/deps/third_party/openmax@6807 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/dl/sp/api/mipsSP.h b/dl/sp/api/mipsSP.h
index 6402e92..d4bcc8c 100644
--- a/dl/sp/api/mipsSP.h
+++ b/dl/sp/api/mipsSP.h
@@ -34,7 +34,7 @@
   OMX_U16* pBitRev;
   OMX_U16* pBitRevInv;
   const OMX_U16* pOffset;
-  OMX_F32* pTwiddle;
+  const OMX_F32* pTwiddle;
   OMX_F32* pBuf;
 } MIPSFFTSpec_R_FC32;
 
diff --git a/dl/sp/src/mips/mips_FFTFwd_RToCCS_F32_complex.c b/dl/sp/src/mips/mips_FFTFwd_RToCCS_F32_complex.c
index f5a8a19..a43946d 100644
--- a/dl/sp/src/mips/mips_FFTFwd_RToCCS_F32_complex.c
+++ b/dl/sp/src/mips/mips_FFTFwd_RToCCS_F32_complex.c
@@ -22,8 +22,8 @@
                                          OMX_F32* pDst,
                                          const MIPSFFTSpec_R_FC32* pFFTSpec) {
   OMX_U32 n1_4, num_transforms, step;
-  OMX_F32* w_re_ptr;
-  OMX_F32* w_im_ptr;
+  const OMX_F32* w_re_ptr;
+  const OMX_F32* w_im_ptr;
   OMX_U32 fft_size = 1 << pFFTSpec->order;
   OMX_FC32* p_dst = (OMX_FC32*)pDst;
   OMX_FC32* p_buf = (OMX_FC32*)pFFTSpec->pBuf;
@@ -127,7 +127,7 @@
     p_tmp[3].Im = p_tmp[3].Im - tmp5;
   }
 
-  step = 1 << (TWIDDLE_TABLE_ORDER - 4);
+  step = 1 << (pFFTSpec->order - 4);
   n1_4 = 4; /* Quarter of the sub-transform size. */
   /* Outer loop that loops over FFT stages. */
   for (uint32_t fft_stage = 4; fft_stage <= pFFTSpec->order - 1; ++fft_stage) {
@@ -159,7 +159,7 @@
       /* Twiddle table is initialized for the maximal FFT size. */
       w_re_ptr = pFFTSpec->pTwiddle + step;
       w_im_ptr =
-          pFFTSpec->pTwiddle + (OMX_U32)(1 << TWIDDLE_TABLE_ORDER - 2) - step;
+          pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step;
 
       /*
        * Loop performing split-radix butterfly operations for
@@ -198,8 +198,7 @@
 
   /* Additional computation to get the output for full FFT size. */
   w_re_ptr = pFFTSpec->pTwiddle + step;
-  w_im_ptr =
-      pFFTSpec->pTwiddle + (OMX_U32)(1 << TWIDDLE_TABLE_ORDER - 2) - step;
+  w_im_ptr = pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step;
 
   for (uint32_t i = 1; i < fft_size / 8; ++i) {
     tmp1 = p_buf[i].Re;
diff --git a/dl/sp/src/mips/mips_FFTFwd_RToCCS_F32_real.c b/dl/sp/src/mips/mips_FFTFwd_RToCCS_F32_real.c
index 9a17567..a33b90e 100644
--- a/dl/sp/src/mips/mips_FFTFwd_RToCCS_F32_real.c
+++ b/dl/sp/src/mips/mips_FFTFwd_RToCCS_F32_real.c
@@ -17,12 +17,12 @@
 OMXResult mips_FFTFwd_RToCCS_F32_real(const OMX_F32* pSrc,
                                       OMX_F32* pDst,
                                       const MIPSFFTSpec_R_FC32* pFFTSpec) {
-  OMX_U32 num_transforms, step;
+  OMX_U32 num_transforms;
   OMX_FC32* p_dst = (OMX_FC32*)pDst;
   OMX_FC32* p_buf = (OMX_FC32*)pFFTSpec->pBuf;
   OMX_F32 tmp1, tmp2, tmp3, tmp4;
-  OMX_F32* w_re_ptr;
-  OMX_F32* w_im_ptr;
+  const OMX_F32* w_re_ptr;
+  const OMX_F32* w_im_ptr;
 
   /* Transform for order = 2. */
   /* TODO: hard-code the offsets for p_src. */
@@ -142,7 +142,6 @@
     p_tmp[3].Im = p_tmp[3].Im - tmp1;
   }
 
-  step = 1 << (TWIDDLE_TABLE_ORDER - 4);
   /*
    * Last FFT stage,  performing sub-transforms of size 16. Place the output
    * into the destination buffer and avoid unnecessary computations.
@@ -159,9 +158,8 @@
   p_dst[4].Re = p_buf[4].Re + tmp4;
   p_dst[4].Im = p_buf[4].Im - tmp2;
 
-  w_re_ptr = pFFTSpec->pTwiddle + step;
-  w_im_ptr =
-      pFFTSpec->pTwiddle + (OMX_U32)(1 << TWIDDLE_TABLE_ORDER - 2) - step;
+  w_re_ptr = pFFTSpec->pTwiddle + 1;
+  w_im_ptr = pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - 1;
 
   /* Loop performing split-radix butterfly operations. */
   for (uint32_t n = 1; n < 4; ++n) {
@@ -184,8 +182,8 @@
     p_dst[4 + n].Re = p_buf[4 + n].Re + tmp2;
     p_dst[4 + n].Im = p_buf[4 + n].Im - tmp1;
 
-    w_re_ptr += step;
-    w_im_ptr -= step;
+    ++w_re_ptr;
+    --w_im_ptr;
   }
   return OMX_Sts_NoErr;
 }
diff --git a/dl/sp/src/mips/mips_FFTInv_CCSToR_F32_complex.c b/dl/sp/src/mips/mips_FFTInv_CCSToR_F32_complex.c
index 88f93cc..f47a937 100644
--- a/dl/sp/src/mips/mips_FFTInv_CCSToR_F32_complex.c
+++ b/dl/sp/src/mips/mips_FFTInv_CCSToR_F32_complex.c
@@ -20,8 +20,8 @@
   OMX_U32 num_transforms, step;
   /* Quarter, half and three-quarters of transform size. */
   OMX_U32 n1_4, n1_2, n3_4;
-  OMX_F32* w_re_ptr;
-  OMX_F32* w_im_ptr;
+  const OMX_F32* w_re_ptr;
+  const OMX_F32* w_im_ptr;
   OMX_U32 fft_size = 1 << pFFTSpec->order;
   OMX_FC32* p_buf = (OMX_FC32*)pFFTSpec->pBuf;
   const OMX_FC32* p_src = (const OMX_FC32*)pSrc;
@@ -30,10 +30,8 @@
   OMX_F32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, factor;
   OMX_F32 w_re, w_im;
 
-  step = 1 << (TWIDDLE_TABLE_ORDER - pFFTSpec->order);
-  w_re_ptr = pFFTSpec->pTwiddle + step;
-  w_im_ptr =
-      pFFTSpec->pTwiddle + (OMX_U32)(1 << (TWIDDLE_TABLE_ORDER - 2)) - step;
+  w_re_ptr = pFFTSpec->pTwiddle + 1;
+  w_im_ptr = pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - 1;
 
   /*
    * Preliminary loop performing input adaptation due to computing real FFT
@@ -79,8 +77,8 @@
     p_buf[p_bitrev[fft_size / 4 - n]].Im =
         0.5f * (-tmp8 - w_im * tmp6 - w_re * tmp7);
 
-    w_re_ptr += step;
-    w_im_ptr -= step;
+    ++w_re_ptr;
+    --w_im_ptr;
   }
   tmp1 = p_src[fft_size / 8].Re;
   tmp2 = p_src[fft_size / 8].Im;
@@ -188,7 +186,7 @@
     p_tmp[3].Im = p_tmp[3].Im + tmp3;
   }
 
-  step = 1 << (TWIDDLE_TABLE_ORDER - 4);
+  step = 1 << (pFFTSpec->order - 4);
   n1_4 = 4;
   /* Outer loop that loops over FFT stages. */
   for (uint32_t fft_stage = 4; fft_stage <= pFFTSpec->order - 2; ++fft_stage) {
@@ -215,7 +213,7 @@
 
       w_re_ptr = pFFTSpec->pTwiddle + step;
       w_im_ptr =
-          pFFTSpec->pTwiddle + (OMX_U32)(1 << TWIDDLE_TABLE_ORDER - 2) - step;
+          pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step;
 
       /*
        * Loop performing split-radix butterfly operations for one sub-transform.
@@ -273,8 +271,7 @@
   p_dst[n1_4].Im = factor * (p_buf[n1_4].Im + tmp2);
 
   w_re_ptr = pFFTSpec->pTwiddle + step;
-  w_im_ptr =
-      pFFTSpec->pTwiddle + (OMX_U32)(1 << TWIDDLE_TABLE_ORDER - 2) - step;
+  w_im_ptr = pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step;
 
   for (uint32_t i = 1; i < n1_4; ++i) {
     w_re = *w_re_ptr;
diff --git a/dl/sp/src/mips/mips_FFTInv_CCSToR_F32_real.c b/dl/sp/src/mips/mips_FFTInv_CCSToR_F32_real.c
index ff7896e..e2f36fe 100644
--- a/dl/sp/src/mips/mips_FFTInv_CCSToR_F32_real.c
+++ b/dl/sp/src/mips/mips_FFTInv_CCSToR_F32_real.c
@@ -22,8 +22,8 @@
   const OMX_FC32* p_src = (const OMX_FC32*)pSrc;
   OMX_U32 fft_size = 1 << pFFTSpec->order;
   OMX_F32 factor, tmp1, tmp2;
-  OMX_F32* w_re_ptr;
-  OMX_F32* w_im_ptr;
+  const OMX_F32* w_re_ptr;
+  const OMX_F32* w_im_ptr;
 
   /* Copy the input into the auxiliary buffer. */
   for (uint32_t n = 1; n < fft_size / 2; ++n) {
@@ -179,7 +179,6 @@
     p_tmp[3].Im = p_tmp[3].Im + tmp3;
   }
 
-  step = 1 << (TWIDDLE_TABLE_ORDER - 4);
   /*
    * Last FFT stage, doing sub-transform of size 16. Avoid unnecessary
    * calculations and place the output directly into the destination buffer.
@@ -192,9 +191,8 @@
   pDst[12] = factor * (p_buf[4].Re + tmp2);
   pDst[4] = factor * (p_buf[4].Re - tmp2);
 
-  w_re_ptr = pFFTSpec->pTwiddle + step;
-  w_im_ptr =
-      pFFTSpec->pTwiddle + (OMX_U32)(1 << TWIDDLE_TABLE_ORDER - 2) - step;
+  w_re_ptr = pFFTSpec->pTwiddle + 1;
+  w_im_ptr = pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - 1;
 
   /* Loop performing split-radix butterfly operations. */
   for (uint32_t n = 1; n < 4; ++n) {
@@ -215,8 +213,8 @@
     pDst[12 + n] = factor * (p_buf[4 + n].Re + tmp2);
     pDst[4 + n] = factor * (p_buf[4 + n].Re - tmp2);
 
-    w_re_ptr += step;
-    w_im_ptr -= step;
+    ++w_re_ptr;
+    --w_im_ptr;
   }
   return OMX_Sts_NoErr;
 }
diff --git a/dl/sp/src/mips/omxSP_FFTGetBufSize_R_F32.c b/dl/sp/src/mips/omxSP_FFTGetBufSize_R_F32.c
index db575d1..9862385 100644
--- a/dl/sp/src/mips/omxSP_FFTGetBufSize_R_F32.c
+++ b/dl/sp/src/mips/omxSP_FFTGetBufSize_R_F32.c
@@ -14,16 +14,19 @@
 #include "dl/sp/api/omxSP.h"
 
 OMXResult omxSP_FFTGetBufSize_R_F32(OMX_INT order, OMX_INT* pSize) {
-  OMX_INT fft_size;
+  OMX_INT fft_size, offset_lut_size;
 
   if (!pSize || (order < 1) || (order > TWIDDLE_TABLE_ORDER))
     return OMX_Sts_BadArgErr;
 
   /* For order larger than 4, compute Real FFT as Complex FFT of (order - 1). */
-  if (order > 4)
+  if (order > 4) {
     fft_size = 1 << (order - 1);
-  else
+    offset_lut_size = (SUBTRANSFORM_CONST >> (16 - order)) | 1;
+  } else {
     fft_size = 1 << order;
+    offset_lut_size = (SUBTRANSFORM_CONST >> (17 - order)) | 1;
+  }
 
   *pSize = sizeof(MIPSFFTSpec_R_FC32) +
            /* BitRev Table. */
@@ -31,15 +34,16 @@
            /* BitRevInv Table. */
            sizeof(OMX_U16) * fft_size +
            /* Offsets table. */
-           sizeof(OMX_U16) *
-               ((SUBTRANSFORM_CONST >> (16 - TWIDDLE_TABLE_ORDER)) | 1) +
+           sizeof(OMX_U16) * offset_lut_size +
+           /* Twiddle table */
+           sizeof(OMX_F32) * (1 << (order - 2)) +
            /* pBuf. */
            sizeof(OMX_F32) * (fft_size << 1) +
            /*
             * Extra bytes to get 32 byte alignment of
-            * pBitRev, pBitRevInv, pOffset and pBuf.
+            * pBitRev, pBitRevInv, pOffset, pTwiddle and pBuf.
             */
-           124;
+           155;
 
   return OMX_Sts_NoErr;
 }
diff --git a/dl/sp/src/mips/omxSP_FFTInit_R_F32.c b/dl/sp/src/mips/omxSP_FFTInit_R_F32.c
index 15749e0..9230b5b 100644
--- a/dl/sp/src/mips/omxSP_FFTInit_R_F32.c
+++ b/dl/sp/src/mips/omxSP_FFTInit_R_F32.c
@@ -63,8 +63,6 @@
   else
     fft_size = 1 << order;
 
-  p_twiddle = mipsSP_FFT_F32TwiddleTable;
-
   p_bit_rev = (OMX_U16*)((OMX_S8*)pFFTSpec + sizeof(MIPSFFTSpec_R_FC32));
   /* Align to 32 byte boundary. */
   tmp = ((uintptr_t)p_bit_rev) & 31;
@@ -83,9 +81,26 @@
   if (tmp)
     p_offset = (OMX_U16*)((OMX_S8*)p_offset + (32 - tmp));
 
-  p_buf = (OMX_F32*)((OMX_S8*)p_offset +
-                     ((SUBTRANSFORM_CONST >> (16 - TWIDDLE_TABLE_ORDER)) | 1) *
-                         sizeof(OMX_U16));
+  if (order < 5) {
+    p_twiddle = (OMX_F32*)((OMX_S8*)p_offset +
+                           ((SUBTRANSFORM_CONST >> (16 - order)) | 1) *
+                               sizeof(OMX_U16));
+  } else {
+    p_twiddle = (OMX_F32*)((OMX_S8*)p_offset +
+                           ((SUBTRANSFORM_CONST >> (17 - order)) | 1) *
+                               sizeof(OMX_U16));
+  }
+
+  /* Align to 32 byte boundary. */
+  tmp = ((uintptr_t)p_twiddle) & 31;
+  if (tmp)
+    p_twiddle = (OMX_F32*)((OMX_S8*)p_twiddle + (32 - tmp));
+
+  if (order > 3)
+    p_buf =
+        (OMX_F32*)((OMX_S8*)p_twiddle + (1 << (order - 2)) * sizeof(OMX_F32));
+  else
+    p_buf = (OMX_F32*)((OMX_S8*)p_twiddle + sizeof(OMX_F32));
 
   /* Align to 32 byte boundary. */
   tmp = ((uintptr_t)p_buf) & 31;
@@ -102,7 +117,18 @@
 
   /* Calculate Offsets. */
   n = 0;
-  InitFFTOffsetsLUT(p_offset, 0, 1 << TWIDDLE_TABLE_ORDER, &n);
+  if (order < 5)
+    InitFFTOffsetsLUT(p_offset, 0, 1 << order, &n);
+  else
+    InitFFTOffsetsLUT(p_offset, 0, 1 << (order - 1), &n);
+
+  /* Fill the twiddle table */
+  if (order > 3) {
+    tmp = 1 << (TWIDDLE_TABLE_ORDER - order);
+    for (n = 0; n < (1 << (order - 2)); ++n) {
+      p_twiddle[n] = mipsSP_FFT_F32TwiddleTable[n * tmp];
+    }
+  }
 
   /*
    * Double-check if the offset tables are initialized correctly.
@@ -111,49 +137,55 @@
    * probabaly redundant. However, keeping this just to make sure the offsets
    * will not exceed the buffer boundaries.
    */
-  if (order == 2) {
-    /* Only check the offsets for the p_bit_rev_inv table. */
-    for (uint32_t i = 0; i < fft_size; ++i) {
-      if (p_bit_rev_inv[i] >= fft_size)
-        return OMX_Sts_BadArgErr;
-    }
-  } else if (order < 5) {
-    /* Check for p_offset table. */
-    int shift = 2;
-    int over = 4;
-    int num_transforms = (SUBTRANSFORM_CONST >> (16 - order)) | 1;
-    for (uint32_t i = 2; i < order; ++i) {
-      for (uint32_t j = 0; j < num_transforms; ++j) {
-        if (((p_offset[j] << shift) + over - 1) >= fft_size)
+  if (order > 1) {
+    /*
+     * In the case of order = 1 there is no table accesses, so there is no need
+     * for any boundary checks.
+     */
+    if (order == 2) {
+      /* Only check the offsets for the p_bit_rev_inv table. */
+      for (uint32_t i = 0; i < fft_size; ++i) {
+        if (p_bit_rev_inv[i] >= fft_size)
           return OMX_Sts_BadArgErr;
       }
-      shift++;
-      over <<= 1;
-      num_transforms = (num_transforms >> 1) | 1;
-    }
-    /* Check for bit-reverse tables. */
-    for (uint32_t i = 0; i < fft_size; ++i) {
-      if ((p_bit_rev[i] >= fft_size) || (p_bit_rev_inv[i] >= fft_size))
-        return OMX_Sts_BadArgErr;
-    }
-  } else {
-    /* Check for p_offset table. */
-    int shift = 2;
-    int over = 4;
-    int num_transforms = (SUBTRANSFORM_CONST >> (17 - order)) | 1;
-    for (uint32_t i = 2; i < order; ++i) {
-      for (uint32_t j = 0; j < num_transforms; ++j) {
-        if (((p_offset[j] << shift) + over - 1) >= fft_size)
+    } else if (order < 5) {
+      /* Check for p_offset table. */
+      int shift = 2;
+      int over = 4;
+      int num_transforms = (SUBTRANSFORM_CONST >> (16 - order)) | 1;
+      for (uint32_t i = 2; i < order; ++i) {
+        for (uint32_t j = 0; j < num_transforms; ++j) {
+          if (((p_offset[j] << shift) + over - 1) >= fft_size)
+            return OMX_Sts_BadArgErr;
+        }
+        shift++;
+        over <<= 1;
+        num_transforms = (num_transforms >> 1) | 1;
+      }
+      /* Check for bit-reverse tables. */
+      for (uint32_t i = 0; i < fft_size; ++i) {
+        if ((p_bit_rev[i] >= fft_size) || (p_bit_rev_inv[i] >= fft_size))
           return OMX_Sts_BadArgErr;
       }
-      shift++;
-      over <<= 1;
-      num_transforms = (num_transforms >> 1) | 1;
-    }
-    /* Check for bit-reverse tables. */
-    for (uint32_t i = 0; i < fft_size; ++i) {
-      if ((p_bit_rev[i] >= fft_size) || (p_bit_rev_inv[i] >= fft_size))
-        return OMX_Sts_BadArgErr;
+    } else {
+      /* Check for p_offset table. */
+      int shift = 2;
+      int over = 4;
+      int num_transforms = (SUBTRANSFORM_CONST >> (17 - order)) | 1;
+      for (uint32_t i = 2; i < order; ++i) {
+        for (uint32_t j = 0; j < num_transforms; ++j) {
+          if (((p_offset[j] << shift) + over - 1) >= fft_size)
+            return OMX_Sts_BadArgErr;
+        }
+        shift++;
+        over <<= 1;
+        num_transforms = (num_transforms >> 1) | 1;
+      }
+      /* Check for bit-reverse tables. */
+      for (uint32_t i = 0; i < fft_size; ++i) {
+        if ((p_bit_rev[i] >= fft_size) || (p_bit_rev_inv[i] >= fft_size))
+          return OMX_Sts_BadArgErr;
+      }
     }
   }
 
@@ -161,7 +193,7 @@
   pFFTStruct->pBitRev = p_bit_rev;
   pFFTStruct->pBitRevInv = p_bit_rev_inv;
   pFFTStruct->pOffset = (const OMX_U16*)p_offset;
-  pFFTStruct->pTwiddle = p_twiddle;
+  pFFTStruct->pTwiddle = (const OMX_F32*)p_twiddle;
   pFFTStruct->pBuf = p_buf;
 
   return OMX_Sts_NoErr;