MIPS optimizations for NS audio processing module

R=andrew@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/4139006

Patch from Ljubomir Papuga <lpapuga@mips.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@5393 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/webrtc/modules/audio_processing/audio_processing.gypi b/webrtc/modules/audio_processing/audio_processing.gypi
index 6512f03..1f3f705 100644
--- a/webrtc/modules/audio_processing/audio_processing.gypi
+++ b/webrtc/modules/audio_processing/audio_processing.gypi
@@ -101,6 +101,17 @@
             'ns/nsx_core.h',
             'ns/nsx_defines.h',
           ],
+          'conditions': [
+            ['target_arch=="mipsel"', {
+              'sources': [
+                'ns/nsx_core_mips.c',
+              ],
+            }, {
+              'sources': [
+                'ns/nsx_core_c.c',
+              ],
+            }],
+          ],
         }, {
           'defines': ['WEBRTC_NS_FLOAT'],
           'sources': [
diff --git a/webrtc/modules/audio_processing/ns/nsx_core.c b/webrtc/modules/audio_processing/ns/nsx_core.c
index c722957..e627a2e 100644
--- a/webrtc/modules/audio_processing/ns/nsx_core.c
+++ b/webrtc/modules/audio_processing/ns/nsx_core.c
@@ -70,11 +70,6 @@
 // Skip first frequency bins during estimation. (0 <= value < 64)
 static const int kStartBand = 5;
 
-static const int16_t kIndicatorTable[17] = {
-  0, 2017, 3809, 5227, 6258, 6963, 7424, 7718,
-  7901, 8014, 8084, 8126, 8152, 8168, 8177, 8183, 8187
-};
-
 // hybrib Hanning & flat window
 static const int16_t kBlocks80w128x[128] = {
   0,    536,   1072,   1606,   2139,   2669,   3196,   3720,   4240,   4756,   5266,
@@ -481,7 +476,7 @@
 }
 
 // Denormalize the real-valued signal |in|, the output from inverse FFT.
-static __inline void Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
+static void DenormalizeC(NsxInst_t* inst, int16_t* in, int factor) {
   int i = 0;
   int32_t tmp32 = 0;
   for (i = 0; i < inst->anaLen; i += 1) {
@@ -546,9 +541,9 @@
 }
 
 // Normalize the real-valued signal |in|, the input to forward FFT.
-static __inline void NormalizeRealBuffer(NsxInst_t* inst,
-                                         const int16_t* in,
-                                         int16_t* out) {
+static void NormalizeRealBufferC(NsxInst_t* inst,
+                                 const int16_t* in,
+                                 int16_t* out) {
   int i = 0;
   for (i = 0; i < inst->anaLen; ++i) {
     out[i] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
@@ -560,6 +555,8 @@
 PrepareSpectrum WebRtcNsx_PrepareSpectrum;
 SynthesisUpdate WebRtcNsx_SynthesisUpdate;
 AnalysisUpdate WebRtcNsx_AnalysisUpdate;
+Denormalize WebRtcNsx_Denormalize;
+NormalizeRealBuffer WebRtcNsx_NormalizeRealBuffer;
 
 #if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
 // Initialize function pointers for ARM Neon platform.
@@ -571,6 +568,19 @@
 }
 #endif
 
+#if defined(MIPS32_LE)
+// Initialize function pointers for MIPS platform.
+static void WebRtcNsx_InitMips(void) {
+  WebRtcNsx_PrepareSpectrum = WebRtcNsx_PrepareSpectrum_mips;
+  WebRtcNsx_SynthesisUpdate = WebRtcNsx_SynthesisUpdate_mips;
+  WebRtcNsx_AnalysisUpdate = WebRtcNsx_AnalysisUpdate_mips;
+  WebRtcNsx_NormalizeRealBuffer = WebRtcNsx_NormalizeRealBuffer_mips;
+#if defined(MIPS_DSP_R1_LE)
+  WebRtcNsx_Denormalize = WebRtcNsx_Denormalize_mips;
+#endif
+}
+#endif
+
 void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst,
                                            int16_t pink_noise_exp_avg,
                                            int32_t pink_noise_num_avg,
@@ -758,6 +768,8 @@
   WebRtcNsx_PrepareSpectrum = PrepareSpectrumC;
   WebRtcNsx_SynthesisUpdate = SynthesisUpdateC;
   WebRtcNsx_AnalysisUpdate = AnalysisUpdateC;
+  WebRtcNsx_Denormalize = DenormalizeC;
+  WebRtcNsx_NormalizeRealBuffer = NormalizeRealBufferC;
 
 #ifdef WEBRTC_DETECT_ARM_NEON
   uint64_t features = WebRtc_GetCPUFeaturesARM();
@@ -768,6 +780,10 @@
   WebRtcNsx_InitNeon();
 #endif
 
+#if defined(MIPS32_LE)
+  WebRtcNsx_InitMips();
+#endif
+
   inst->initFlag = 1;
 
   return 0;
@@ -1169,239 +1185,6 @@
   }
 }
 
-// Compute speech/noise probability
-// speech/noise probability is returned in: probSpeechFinal
-//snrLocPrior is the prior SNR for each frequency (in Q11)
-//snrLocPost is the post SNR for each frequency (in Q11)
-void WebRtcNsx_SpeechNoiseProb(NsxInst_t* inst, uint16_t* nonSpeechProbFinal,
-                               uint32_t* priorLocSnr, uint32_t* postLocSnr) {
-  uint32_t zeros, num, den, tmpU32no1, tmpU32no2, tmpU32no3;
-
-  int32_t invLrtFX, indPriorFX, tmp32, tmp32no1, tmp32no2, besselTmpFX32;
-  int32_t frac32, logTmp;
-  int32_t logLrtTimeAvgKsumFX;
-
-  int16_t indPriorFX16;
-  int16_t tmp16, tmp16no1, tmp16no2, tmpIndFX, tableIndex, frac, intPart;
-
-  int i, normTmp, normTmp2, nShifts;
-
-  // compute feature based on average LR factor
-  // this is the average over all frequencies of the smooth log LRT
-  logLrtTimeAvgKsumFX = 0;
-  for (i = 0; i < inst->magnLen; i++) {
-    besselTmpFX32 = (int32_t)postLocSnr[i]; // Q11
-    normTmp = WebRtcSpl_NormU32(postLocSnr[i]);
-    num = WEBRTC_SPL_LSHIFT_U32(postLocSnr[i], normTmp); // Q(11+normTmp)
-    if (normTmp > 10) {
-      den = WEBRTC_SPL_LSHIFT_U32(priorLocSnr[i], normTmp - 11); // Q(normTmp)
-    } else {
-      den = WEBRTC_SPL_RSHIFT_U32(priorLocSnr[i], 11 - normTmp); // Q(normTmp)
-    }
-    if (den > 0) {
-      besselTmpFX32 -= WEBRTC_SPL_UDIV(num, den); // Q11
-    } else {
-      besselTmpFX32 -= num; // Q11
-    }
-
-    // inst->logLrtTimeAvg[i] += LRT_TAVG * (besselTmp - log(snrLocPrior) - inst->logLrtTimeAvg[i]);
-    // Here, LRT_TAVG = 0.5
-    zeros = WebRtcSpl_NormU32(priorLocSnr[i]);
-    frac32 = (int32_t)(((priorLocSnr[i] << zeros) & 0x7FFFFFFF) >> 19);
-    tmp32 = WEBRTC_SPL_MUL(frac32, frac32);
-    tmp32 = WEBRTC_SPL_RSHIFT_W32(WEBRTC_SPL_MUL(tmp32, -43), 19);
-    tmp32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t)frac32, 5412, 12);
-    frac32 = tmp32 + 37;
-    // tmp32 = log2(priorLocSnr[i])
-    tmp32 = (int32_t)(((31 - zeros) << 12) + frac32) - (11 << 12); // Q12
-    logTmp = WEBRTC_SPL_RSHIFT_W32(WEBRTC_SPL_MUL_32_16(tmp32, 178), 8); // log2(priorLocSnr[i])*log(2)
-    tmp32no1 = WEBRTC_SPL_RSHIFT_W32(logTmp + inst->logLrtTimeAvgW32[i], 1); // Q12
-    inst->logLrtTimeAvgW32[i] += (besselTmpFX32 - tmp32no1); // Q12
-
-    logLrtTimeAvgKsumFX += inst->logLrtTimeAvgW32[i]; // Q12
-  }
-  inst->featureLogLrt = WEBRTC_SPL_RSHIFT_W32(logLrtTimeAvgKsumFX * 5, inst->stages + 10); // 5 = BIN_SIZE_LRT / 2
-  // done with computation of LR factor
-
-  //
-  //compute the indicator functions
-  //
-
-  // average LRT feature
-  // FLOAT code
-  // indicator0 = 0.5 * (tanh(widthPrior * (logLrtTimeAvgKsum - threshPrior0)) + 1.0);
-  tmpIndFX = 16384; // Q14(1.0)
-  tmp32no1 = logLrtTimeAvgKsumFX - inst->thresholdLogLrt; // Q12
-  nShifts = 7 - inst->stages; // WIDTH_PR_MAP_SHIFT - inst->stages + 5;
-  //use larger width in tanh map for pause regions
-  if (tmp32no1 < 0) {
-    tmpIndFX = 0;
-    tmp32no1 = -tmp32no1;
-    //widthPrior = widthPrior * 2.0;
-    nShifts++;
-  }
-  tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, nShifts); // Q14
-  // compute indicator function: sigmoid map
-  tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_W32(tmp32no1, 14);
-  if ((tableIndex < 16) && (tableIndex >= 0)) {
-    tmp16no2 = kIndicatorTable[tableIndex];
-    tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex];
-    frac = (int16_t)(tmp32no1 & 0x00003fff); // Q14
-    tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, frac, 14);
-    if (tmpIndFX == 0) {
-      tmpIndFX = 8192 - tmp16no2; // Q14
-    } else {
-      tmpIndFX = 8192 + tmp16no2; // Q14
-    }
-  }
-  indPriorFX = WEBRTC_SPL_MUL_16_16(inst->weightLogLrt, tmpIndFX); // 6*Q14
-
-  //spectral flatness feature
-  if (inst->weightSpecFlat) {
-    tmpU32no1 = WEBRTC_SPL_UMUL(inst->featureSpecFlat, 400); // Q10
-    tmpIndFX = 16384; // Q14(1.0)
-    //use larger width in tanh map for pause regions
-    tmpU32no2 = inst->thresholdSpecFlat - tmpU32no1; //Q10
-    nShifts = 4;
-    if (inst->thresholdSpecFlat < tmpU32no1) {
-      tmpIndFX = 0;
-      tmpU32no2 = tmpU32no1 - inst->thresholdSpecFlat;
-      //widthPrior = widthPrior * 2.0;
-      nShifts++;
-    }
-    tmp32no1 = (int32_t)WebRtcSpl_DivU32U16(WEBRTC_SPL_LSHIFT_U32(tmpU32no2,
-                                                                        nShifts), 25); //Q14
-    tmpU32no1 = WebRtcSpl_DivU32U16(WEBRTC_SPL_LSHIFT_U32(tmpU32no2, nShifts), 25); //Q14
-    // compute indicator function: sigmoid map
-    // FLOAT code
-    // indicator1 = 0.5 * (tanh(sgnMap * widthPrior * (threshPrior1 - tmpFloat1)) + 1.0);
-    tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_U32(tmpU32no1, 14);
-    if (tableIndex < 16) {
-      tmp16no2 = kIndicatorTable[tableIndex];
-      tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex];
-      frac = (int16_t)(tmpU32no1 & 0x00003fff); // Q14
-      tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, frac, 14);
-      if (tmpIndFX) {
-        tmpIndFX = 8192 + tmp16no2; // Q14
-      } else {
-        tmpIndFX = 8192 - tmp16no2; // Q14
-      }
-    }
-    indPriorFX += WEBRTC_SPL_MUL_16_16(inst->weightSpecFlat, tmpIndFX); // 6*Q14
-  }
-
-  //for template spectral-difference
-  if (inst->weightSpecDiff) {
-    tmpU32no1 = 0;
-    if (inst->featureSpecDiff) {
-      normTmp = WEBRTC_SPL_MIN(20 - inst->stages,
-                               WebRtcSpl_NormU32(inst->featureSpecDiff));
-      tmpU32no1 = WEBRTC_SPL_LSHIFT_U32(inst->featureSpecDiff, normTmp); // Q(normTmp-2*stages)
-      tmpU32no2 = WEBRTC_SPL_RSHIFT_U32(inst->timeAvgMagnEnergy, 20 - inst->stages
-                                        - normTmp);
-      if (tmpU32no2 > 0) {
-        // Q(20 - inst->stages)
-        tmpU32no1 = WEBRTC_SPL_UDIV(tmpU32no1, tmpU32no2);
-      } else {
-        tmpU32no1 = (uint32_t)(0x7fffffff);
-      }
-    }
-    tmpU32no3 = WEBRTC_SPL_UDIV(WEBRTC_SPL_LSHIFT_U32(inst->thresholdSpecDiff, 17), 25);
-    tmpU32no2 = tmpU32no1 - tmpU32no3;
-    nShifts = 1;
-    tmpIndFX = 16384; // Q14(1.0)
-    //use larger width in tanh map for pause regions
-    if (tmpU32no2 & 0x80000000) {
-      tmpIndFX = 0;
-      tmpU32no2 = tmpU32no3 - tmpU32no1;
-      //widthPrior = widthPrior * 2.0;
-      nShifts--;
-    }
-    tmpU32no1 = WEBRTC_SPL_RSHIFT_U32(tmpU32no2, nShifts);
-    // compute indicator function: sigmoid map
-    /* FLOAT code
-     indicator2 = 0.5 * (tanh(widthPrior * (tmpFloat1 - threshPrior2)) + 1.0);
-     */
-    tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_U32(tmpU32no1, 14);
-    if (tableIndex < 16) {
-      tmp16no2 = kIndicatorTable[tableIndex];
-      tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex];
-      frac = (int16_t)(tmpU32no1 & 0x00003fff); // Q14
-      tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
-                    tmp16no1, frac, 14);
-      if (tmpIndFX) {
-        tmpIndFX = 8192 + tmp16no2;
-      } else {
-        tmpIndFX = 8192 - tmp16no2;
-      }
-    }
-    indPriorFX += WEBRTC_SPL_MUL_16_16(inst->weightSpecDiff, tmpIndFX); // 6*Q14
-  }
-
-  //combine the indicator function with the feature weights
-  // FLOAT code
-  // indPrior = 1 - (weightIndPrior0 * indicator0 + weightIndPrior1 * indicator1 + weightIndPrior2 * indicator2);
-  indPriorFX16 = WebRtcSpl_DivW32W16ResW16(98307 - indPriorFX, 6); // Q14
-  // done with computing indicator function
-
-  //compute the prior probability
-  // FLOAT code
-  // inst->priorNonSpeechProb += PRIOR_UPDATE * (indPriorNonSpeech - inst->priorNonSpeechProb);
-  tmp16 = indPriorFX16 - inst->priorNonSpeechProb; // Q14
-  inst->priorNonSpeechProb += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(
-                                PRIOR_UPDATE_Q14, tmp16, 14); // Q14
-
-  //final speech probability: combine prior model with LR factor:
-
-  memset(nonSpeechProbFinal, 0, sizeof(uint16_t) * inst->magnLen);
-
-  if (inst->priorNonSpeechProb > 0) {
-    for (i = 0; i < inst->magnLen; i++) {
-      // FLOAT code
-      // invLrt = exp(inst->logLrtTimeAvg[i]);
-      // invLrt = inst->priorSpeechProb * invLrt;
-      // nonSpeechProbFinal[i] = (1.0 - inst->priorSpeechProb) / (1.0 - inst->priorSpeechProb + invLrt);
-      // invLrt = (1.0 - inst->priorNonSpeechProb) * invLrt;
-      // nonSpeechProbFinal[i] = inst->priorNonSpeechProb / (inst->priorNonSpeechProb + invLrt);
-      if (inst->logLrtTimeAvgW32[i] < 65300) {
-        tmp32no1 = WEBRTC_SPL_RSHIFT_W32(WEBRTC_SPL_MUL(inst->logLrtTimeAvgW32[i], 23637),
-                                         14); // Q12
-        intPart = (int16_t)WEBRTC_SPL_RSHIFT_W32(tmp32no1, 12);
-        if (intPart < -8) {
-          intPart = -8;
-        }
-        frac = (int16_t)(tmp32no1 & 0x00000fff); // Q12
-
-        // Quadratic approximation of 2^frac
-        tmp32no2 = WEBRTC_SPL_RSHIFT_W32(frac * frac * 44, 19); // Q12
-        tmp32no2 += WEBRTC_SPL_MUL_16_16_RSFT(frac, 84, 7); // Q12
-        invLrtFX = WEBRTC_SPL_LSHIFT_W32(1, 8 + intPart)
-                   + WEBRTC_SPL_SHIFT_W32(tmp32no2, intPart - 4); // Q8
-
-        normTmp = WebRtcSpl_NormW32(invLrtFX);
-        normTmp2 = WebRtcSpl_NormW16((16384 - inst->priorNonSpeechProb));
-        if (normTmp + normTmp2 >= 7) {
-          if (normTmp + normTmp2 < 15) {
-            invLrtFX = WEBRTC_SPL_RSHIFT_W32(invLrtFX, 15 - normTmp2 - normTmp);
-            // Q(normTmp+normTmp2-7)
-            tmp32no1 = WEBRTC_SPL_MUL_32_16(invLrtFX, (16384 - inst->priorNonSpeechProb));
-            // Q(normTmp+normTmp2+7)
-            invLrtFX = WEBRTC_SPL_SHIFT_W32(tmp32no1, 7 - normTmp - normTmp2); // Q14
-          } else {
-            tmp32no1 = WEBRTC_SPL_MUL_32_16(invLrtFX, (16384 - inst->priorNonSpeechProb)); // Q22
-            invLrtFX = WEBRTC_SPL_RSHIFT_W32(tmp32no1, 8); // Q14
-          }
-
-          tmp32no1 = WEBRTC_SPL_LSHIFT_W32((int32_t)inst->priorNonSpeechProb, 8); // Q22
-
-          nonSpeechProbFinal[i] = (uint16_t)WEBRTC_SPL_DIV(tmp32no1,
-              (int32_t)inst->priorNonSpeechProb + invLrtFX); // Q8
-        }
-      }
-    }
-  }
-}
-
 // Transform input (speechFrame) to frequency domain magnitude (magnU16)
 void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, uint16_t* magnU16) {
 
@@ -1461,7 +1244,7 @@
   right_shifts_in_magnU16 = WEBRTC_SPL_MAX(right_shifts_in_magnU16, 0);
 
   // create realImag as winData interleaved with zeros (= imag. part), normalize it
-  NormalizeRealBuffer(inst, winData, realImag);
+  WebRtcNsx_NormalizeRealBuffer(inst, winData, realImag);
 
   // FFT output will be in winData[].
   WebRtcSpl_RealForwardFFT(inst->real_fft, realImag, winData);
@@ -1693,7 +1476,7 @@
   // Inverse FFT output will be in rfft_out[].
   outCIFFT = WebRtcSpl_RealInverseFFT(inst->real_fft, realImag, rfft_out);
 
-  Denormalize(inst, rfft_out, outCIFFT);
+  WebRtcNsx_Denormalize(inst, rfft_out, outCIFFT);
 
   //scale factor: only do it after END_STARTUP_LONG time
   gainFactor = 8192; // 8192 = Q13(1.0)
diff --git a/webrtc/modules/audio_processing/ns/nsx_core.h b/webrtc/modules/audio_processing/ns/nsx_core.h
index 1ad369f..5b3c5e7 100644
--- a/webrtc/modules/audio_processing/ns/nsx_core.h
+++ b/webrtc/modules/audio_processing/ns/nsx_core.h
@@ -201,6 +201,23 @@
                                int16_t* new_speech);
 extern AnalysisUpdate WebRtcNsx_AnalysisUpdate;
 
+// Denormalize the real-valued signal |in|, the output from inverse FFT.
+typedef void (*Denormalize) (NsxInst_t* inst, int16_t* in, int factor);
+extern Denormalize WebRtcNsx_Denormalize;
+
+// Normalize the real-valued signal |in|, the input to forward FFT.
+typedef void (*NormalizeRealBuffer) (NsxInst_t* inst,
+                                     const int16_t* in,
+                                     int16_t* out);
+extern NormalizeRealBuffer WebRtcNsx_NormalizeRealBuffer;
+
+// Compute speech/noise probability.
+// Intended to be private.
+void WebRtcNsx_SpeechNoiseProb(NsxInst_t* inst,
+                               uint16_t* nonSpeechProbFinal,
+                               uint32_t* priorLocSnr,
+                               uint32_t* postLocSnr);
+
 #if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON)
 // For the above function pointers, functions for generic platforms are declared
 // and defined as static in file nsx_core.c, while those for ARM Neon platforms
@@ -218,6 +235,26 @@
 void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buff);
 #endif
 
+#if defined(MIPS32_LE)
+// For the above function pointers, functions for generic platforms are declared
+// and defined as static in file nsx_core.c, while those for MIPS platforms
+// are declared below and defined in file nsx_core_mips.c.
+void WebRtcNsx_SynthesisUpdate_mips(NsxInst_t* inst,
+                                    int16_t* out_frame,
+                                    int16_t gain_factor);
+void WebRtcNsx_AnalysisUpdate_mips(NsxInst_t* inst,
+                                   int16_t* out,
+                                   int16_t* new_speech);
+void WebRtcNsx_PrepareSpectrum_mips(NsxInst_t* inst, int16_t* freq_buff);
+void WebRtcNsx_NormalizeRealBuffer_mips(NsxInst_t* inst,
+                                        const int16_t* in,
+                                        int16_t* out);
+#if defined(MIPS_DSP_R1_LE)
+void WebRtcNsx_Denormalize_mips(NsxInst_t* inst, int16_t* in, int factor);
+#endif
+
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/webrtc/modules/audio_processing/ns/nsx_core_c.c b/webrtc/modules/audio_processing/ns/nsx_core_c.c
new file mode 100644
index 0000000..452b96e
--- /dev/null
+++ b/webrtc/modules/audio_processing/ns/nsx_core_c.c
@@ -0,0 +1,273 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/ns/include/noise_suppression_x.h"
+#include "webrtc/modules/audio_processing/ns/nsx_core.h"
+
+static const int16_t kIndicatorTable[17] = {
+  0, 2017, 3809, 5227, 6258, 6963, 7424, 7718,
+  7901, 8014, 8084, 8126, 8152, 8168, 8177, 8183, 8187
+};
+
+// Compute speech/noise probability
+// speech/noise probability is returned in: probSpeechFinal
+//snrLocPrior is the prior SNR for each frequency (in Q11)
+//snrLocPost is the post SNR for each frequency (in Q11)
+void WebRtcNsx_SpeechNoiseProb(NsxInst_t* inst,
+                               uint16_t* nonSpeechProbFinal,
+                               uint32_t* priorLocSnr,
+                               uint32_t* postLocSnr) {
+
+  uint32_t zeros, num, den, tmpU32no1, tmpU32no2, tmpU32no3;
+  int32_t invLrtFX, indPriorFX, tmp32, tmp32no1, tmp32no2, besselTmpFX32;
+  int32_t frac32, logTmp;
+  int32_t logLrtTimeAvgKsumFX;
+  int16_t indPriorFX16;
+  int16_t tmp16, tmp16no1, tmp16no2, tmpIndFX, tableIndex, frac, intPart;
+  int i, normTmp, normTmp2, nShifts;
+
+  // compute feature based on average LR factor
+  // this is the average over all frequencies of the smooth log LRT
+  logLrtTimeAvgKsumFX = 0;
+  for (i = 0; i < inst->magnLen; i++) {
+    besselTmpFX32 = (int32_t)postLocSnr[i]; // Q11
+    normTmp = WebRtcSpl_NormU32(postLocSnr[i]);
+    num = WEBRTC_SPL_LSHIFT_U32(postLocSnr[i], normTmp); // Q(11+normTmp)
+    if (normTmp > 10) {
+      den = WEBRTC_SPL_LSHIFT_U32(priorLocSnr[i], normTmp - 11); // Q(normTmp)
+    } else {
+      den = WEBRTC_SPL_RSHIFT_U32(priorLocSnr[i], 11 - normTmp); // Q(normTmp)
+    }
+    if (den > 0) {
+      besselTmpFX32 -= WEBRTC_SPL_UDIV(num, den); // Q11
+    } else {
+      besselTmpFX32 -= num; // Q11
+    }
+
+    // inst->logLrtTimeAvg[i] += LRT_TAVG * (besselTmp - log(snrLocPrior)
+    //                                       - inst->logLrtTimeAvg[i]);
+    // Here, LRT_TAVG = 0.5
+    zeros = WebRtcSpl_NormU32(priorLocSnr[i]);
+    frac32 = (int32_t)(((priorLocSnr[i] << zeros) & 0x7FFFFFFF) >> 19);
+    tmp32 = WEBRTC_SPL_MUL(frac32, frac32);
+    tmp32 = WEBRTC_SPL_RSHIFT_W32(WEBRTC_SPL_MUL(tmp32, -43), 19);
+    tmp32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t)frac32, 5412, 12);
+    frac32 = tmp32 + 37;
+    // tmp32 = log2(priorLocSnr[i])
+    tmp32 = (int32_t)(((31 - zeros) << 12) + frac32) - (11 << 12); // Q12
+    logTmp = WEBRTC_SPL_RSHIFT_W32(WEBRTC_SPL_MUL_32_16(tmp32, 178), 8);
+                                                  // log2(priorLocSnr[i])*log(2)
+    tmp32no1 = WEBRTC_SPL_RSHIFT_W32(logTmp + inst->logLrtTimeAvgW32[i], 1);
+                                                  // Q12
+    inst->logLrtTimeAvgW32[i] += (besselTmpFX32 - tmp32no1); // Q12
+
+    logLrtTimeAvgKsumFX += inst->logLrtTimeAvgW32[i]; // Q12
+  }
+  inst->featureLogLrt = WEBRTC_SPL_RSHIFT_W32(logLrtTimeAvgKsumFX * 5,
+                                              inst->stages + 10);
+                                                  // 5 = BIN_SIZE_LRT / 2
+  // done with computation of LR factor
+
+  //
+  //compute the indicator functions
+  //
+
+  // average LRT feature
+  // FLOAT code
+  // indicator0 = 0.5 * (tanh(widthPrior *
+  //                      (logLrtTimeAvgKsum - threshPrior0)) + 1.0);
+  tmpIndFX = 16384; // Q14(1.0)
+  tmp32no1 = logLrtTimeAvgKsumFX - inst->thresholdLogLrt; // Q12
+  nShifts = 7 - inst->stages; // WIDTH_PR_MAP_SHIFT - inst->stages + 5;
+  //use larger width in tanh map for pause regions
+  if (tmp32no1 < 0) {
+    tmpIndFX = 0;
+    tmp32no1 = -tmp32no1;
+    //widthPrior = widthPrior * 2.0;
+    nShifts++;
+  }
+  tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, nShifts); // Q14
+  // compute indicator function: sigmoid map
+  tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_W32(tmp32no1, 14);
+  if ((tableIndex < 16) && (tableIndex >= 0)) {
+    tmp16no2 = kIndicatorTable[tableIndex];
+    tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex];
+    frac = (int16_t)(tmp32no1 & 0x00003fff); // Q14
+    tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, frac, 14);
+    if (tmpIndFX == 0) {
+      tmpIndFX = 8192 - tmp16no2; // Q14
+    } else {
+      tmpIndFX = 8192 + tmp16no2; // Q14
+    }
+  }
+  indPriorFX = WEBRTC_SPL_MUL_16_16(inst->weightLogLrt, tmpIndFX); // 6*Q14
+
+  //spectral flatness feature
+  if (inst->weightSpecFlat) {
+    tmpU32no1 = WEBRTC_SPL_UMUL(inst->featureSpecFlat, 400); // Q10
+    tmpIndFX = 16384; // Q14(1.0)
+    //use larger width in tanh map for pause regions
+    tmpU32no2 = inst->thresholdSpecFlat - tmpU32no1; //Q10
+    nShifts = 4;
+    if (inst->thresholdSpecFlat < tmpU32no1) {
+      tmpIndFX = 0;
+      tmpU32no2 = tmpU32no1 - inst->thresholdSpecFlat;
+      //widthPrior = widthPrior * 2.0;
+      nShifts++;
+    }
+    tmp32no1 = (int32_t)WebRtcSpl_DivU32U16(WEBRTC_SPL_LSHIFT_U32(tmpU32no2,
+                                                                  nShifts), 25);
+                                                     //Q14
+    tmpU32no1 = WebRtcSpl_DivU32U16(WEBRTC_SPL_LSHIFT_U32(tmpU32no2, nShifts),
+                                    25); //Q14
+    // compute indicator function: sigmoid map
+    // FLOAT code
+    // indicator1 = 0.5 * (tanh(sgnMap * widthPrior *
+    //                          (threshPrior1 - tmpFloat1)) + 1.0);
+    tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_U32(tmpU32no1, 14);
+    if (tableIndex < 16) {
+      tmp16no2 = kIndicatorTable[tableIndex];
+      tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex];
+      frac = (int16_t)(tmpU32no1 & 0x00003fff); // Q14
+      tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, frac, 14);
+      if (tmpIndFX) {
+        tmpIndFX = 8192 + tmp16no2; // Q14
+      } else {
+        tmpIndFX = 8192 - tmp16no2; // Q14
+      }
+    }
+    indPriorFX += WEBRTC_SPL_MUL_16_16(inst->weightSpecFlat, tmpIndFX); // 6*Q14
+  }
+
+  //for template spectral-difference
+  if (inst->weightSpecDiff) {
+    tmpU32no1 = 0;
+    if (inst->featureSpecDiff) {
+      normTmp = WEBRTC_SPL_MIN(20 - inst->stages,
+                               WebRtcSpl_NormU32(inst->featureSpecDiff));
+      tmpU32no1 = WEBRTC_SPL_LSHIFT_U32(inst->featureSpecDiff, normTmp);
+                                                         // Q(normTmp-2*stages)
+      tmpU32no2 = WEBRTC_SPL_RSHIFT_U32(inst->timeAvgMagnEnergy,
+                                        20 - inst->stages - normTmp);
+      if (tmpU32no2 > 0) {
+        // Q(20 - inst->stages)
+        tmpU32no1 = WEBRTC_SPL_UDIV(tmpU32no1, tmpU32no2);
+      } else {
+        tmpU32no1 = (uint32_t)(0x7fffffff);
+      }
+    }
+    tmpU32no3 = WEBRTC_SPL_UDIV(WEBRTC_SPL_LSHIFT_U32(inst->thresholdSpecDiff,
+                                                      17),
+                                25);
+    tmpU32no2 = tmpU32no1 - tmpU32no3;
+    nShifts = 1;
+    tmpIndFX = 16384; // Q14(1.0)
+    //use larger width in tanh map for pause regions
+    if (tmpU32no2 & 0x80000000) {
+      tmpIndFX = 0;
+      tmpU32no2 = tmpU32no3 - tmpU32no1;
+      //widthPrior = widthPrior * 2.0;
+      nShifts--;
+    }
+    tmpU32no1 = WEBRTC_SPL_RSHIFT_U32(tmpU32no2, nShifts);
+    // compute indicator function: sigmoid map
+    /* FLOAT code
+     indicator2 = 0.5 * (tanh(widthPrior * (tmpFloat1 - threshPrior2)) + 1.0);
+     */
+    tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_U32(tmpU32no1, 14);
+    if (tableIndex < 16) {
+      tmp16no2 = kIndicatorTable[tableIndex];
+      tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex];
+      frac = (int16_t)(tmpU32no1 & 0x00003fff); // Q14
+      tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
+                    tmp16no1, frac, 14);
+      if (tmpIndFX) {
+        tmpIndFX = 8192 + tmp16no2;
+      } else {
+        tmpIndFX = 8192 - tmp16no2;
+      }
+    }
+    indPriorFX += WEBRTC_SPL_MUL_16_16(inst->weightSpecDiff, tmpIndFX); // 6*Q14
+  }
+
+  //combine the indicator function with the feature weights
+  // FLOAT code
+  // indPrior = 1 - (weightIndPrior0 * indicator0 + weightIndPrior1 *
+  //                 indicator1 + weightIndPrior2 * indicator2);
+  indPriorFX16 = WebRtcSpl_DivW32W16ResW16(98307 - indPriorFX, 6); // Q14
+  // done with computing indicator function
+
+  //compute the prior probability
+  // FLOAT code
+  // inst->priorNonSpeechProb += PRIOR_UPDATE *
+  //                             (indPriorNonSpeech - inst->priorNonSpeechProb);
+  tmp16 = indPriorFX16 - inst->priorNonSpeechProb; // Q14
+  inst->priorNonSpeechProb += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(
+                                PRIOR_UPDATE_Q14, tmp16, 14); // Q14
+
+  //final speech probability: combine prior model with LR factor:
+
+  memset(nonSpeechProbFinal, 0, sizeof(uint16_t) * inst->magnLen);
+
+  if (inst->priorNonSpeechProb > 0) {
+    for (i = 0; i < inst->magnLen; i++) {
+      // FLOAT code
+      // invLrt = exp(inst->logLrtTimeAvg[i]);
+      // invLrt = inst->priorSpeechProb * invLrt;
+      // nonSpeechProbFinal[i] = (1.0 - inst->priorSpeechProb) /
+      //                         (1.0 - inst->priorSpeechProb + invLrt);
+      // invLrt = (1.0 - inst->priorNonSpeechProb) * invLrt;
+      // nonSpeechProbFinal[i] = inst->priorNonSpeechProb /
+      //                         (inst->priorNonSpeechProb + invLrt);
+      if (inst->logLrtTimeAvgW32[i] < 65300) {
+        tmp32no1 = WEBRTC_SPL_RSHIFT_W32(WEBRTC_SPL_MUL(
+                                           inst->logLrtTimeAvgW32[i], 23637),
+                                         14); // Q12
+        intPart = (int16_t)WEBRTC_SPL_RSHIFT_W32(tmp32no1, 12);
+        if (intPart < -8) {
+          intPart = -8;
+        }
+        frac = (int16_t)(tmp32no1 & 0x00000fff); // Q12
+
+        // Quadratic approximation of 2^frac
+        tmp32no2 = WEBRTC_SPL_RSHIFT_W32(frac * frac * 44, 19); // Q12
+        tmp32no2 += WEBRTC_SPL_MUL_16_16_RSFT(frac, 84, 7); // Q12
+        invLrtFX = WEBRTC_SPL_LSHIFT_W32(1, 8 + intPart)
+                   + WEBRTC_SPL_SHIFT_W32(tmp32no2, intPart - 4); // Q8
+
+        normTmp = WebRtcSpl_NormW32(invLrtFX);
+        normTmp2 = WebRtcSpl_NormW16((16384 - inst->priorNonSpeechProb));
+        if (normTmp + normTmp2 >= 7) {
+          if (normTmp + normTmp2 < 15) {
+            invLrtFX = WEBRTC_SPL_RSHIFT_W32(invLrtFX, 15 - normTmp2 - normTmp);
+            // Q(normTmp+normTmp2-7)
+            tmp32no1 = WEBRTC_SPL_MUL_32_16(invLrtFX,
+                                            (16384 - inst->priorNonSpeechProb));
+            // Q(normTmp+normTmp2+7)
+            invLrtFX = WEBRTC_SPL_SHIFT_W32(tmp32no1, 7 - normTmp - normTmp2);
+                                                                  // Q14
+          } else {
+            tmp32no1 = WEBRTC_SPL_MUL_32_16(invLrtFX,
+                                            (16384 - inst->priorNonSpeechProb));
+                                                                  // Q22
+            invLrtFX = WEBRTC_SPL_RSHIFT_W32(tmp32no1, 8); // Q14
+          }
+
+          tmp32no1 = WEBRTC_SPL_LSHIFT_W32((int32_t)inst->priorNonSpeechProb,
+                                           8); // Q22
+
+          nonSpeechProbFinal[i] = (uint16_t)WEBRTC_SPL_DIV(tmp32no1,
+              (int32_t)inst->priorNonSpeechProb + invLrtFX); // Q8
+        }
+      }
+    }
+  }
+}
+
diff --git a/webrtc/modules/audio_processing/ns/nsx_core_mips.c b/webrtc/modules/audio_processing/ns/nsx_core_mips.c
new file mode 100644
index 0000000..ccb0c37
--- /dev/null
+++ b/webrtc/modules/audio_processing/ns/nsx_core_mips.c
@@ -0,0 +1,1008 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/ns/include/noise_suppression_x.h"
+#include "webrtc/modules/audio_processing/ns/nsx_core.h"
+
+static const int16_t kIndicatorTable[17] = {
+  0, 2017, 3809, 5227, 6258, 6963, 7424, 7718,
+  7901, 8014, 8084, 8126, 8152, 8168, 8177, 8183, 8187
+};
+
+// Compute speech/noise probability
+// speech/noise probability is returned in: probSpeechFinal
+//snrLocPrior is the prior SNR for each frequency (in Q11)
+//snrLocPost is the post SNR for each frequency (in Q11)
+void WebRtcNsx_SpeechNoiseProb(NsxInst_t* inst,
+                               uint16_t* nonSpeechProbFinal,
+                               uint32_t* priorLocSnr,
+                               uint32_t* postLocSnr) {
+
+  uint32_t tmpU32no1, tmpU32no2, tmpU32no3;
+  int32_t indPriorFX, tmp32no1;
+  int32_t logLrtTimeAvgKsumFX;
+  int16_t indPriorFX16;
+  int16_t tmp16, tmp16no1, tmp16no2, tmpIndFX, tableIndex, frac;
+  int i, normTmp, nShifts;
+
+  int32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
+  int32_t const_max = 0x7fffffff;
+  int32_t const_neg43 = -43;
+  int32_t const_5412 = 5412;
+  int32_t const_11rsh12 = (11 << 12);
+  int32_t const_178 = 178;
+
+
+  // compute feature based on average LR factor
+  // this is the average over all frequencies of the smooth log LRT
+  logLrtTimeAvgKsumFX = 0;
+  for (i = 0; i < inst->magnLen; i++) {
+    r0 = postLocSnr[i]; // Q11
+    r1 = priorLocSnr[i];
+    r2 = inst->logLrtTimeAvgW32[i];
+
+    __asm __volatile(
+      ".set       push                                    \n\t"
+      ".set       noreorder                               \n\t"
+      "clz        %[r3],    %[r0]                         \n\t"
+      "clz        %[r5],    %[r1]                         \n\t"
+      "slti       %[r4],    %[r3],    32                  \n\t"
+      "slti       %[r6],    %[r5],    32                  \n\t"
+      "movz       %[r3],    $0,       %[r4]               \n\t"
+      "movz       %[r5],    $0,       %[r6]               \n\t"
+      "slti       %[r4],    %[r3],    11                  \n\t"
+      "addiu      %[r6],    %[r3],    -11                 \n\t"
+      "neg        %[r7],    %[r6]                         \n\t"
+      "sllv       %[r6],    %[r1],    %[r6]               \n\t"
+      "srav       %[r7],    %[r1],    %[r7]               \n\t"
+      "movn       %[r6],    %[r7],    %[r4]               \n\t"
+      "sllv       %[r1],    %[r1],    %[r5]               \n\t"
+      "and        %[r1],    %[r1],    %[const_max]        \n\t"
+      "sra        %[r1],    %[r1],    19                  \n\t"
+      "mul        %[r7],    %[r1],    %[r1]               \n\t"
+      "sllv       %[r3],    %[r0],    %[r3]               \n\t"
+      "divu       %[r8],    %[r3],    %[r6]               \n\t"
+      "slti       %[r6],    %[r6],    1                   \n\t"
+      "mul        %[r7],    %[r7],    %[const_neg43]      \n\t"
+      "sra        %[r7],    %[r7],    19                  \n\t"
+      "movz       %[r3],    %[r8],    %[r6]               \n\t"
+      "subu       %[r0],    %[r0],    %[r3]               \n\t"
+      "mul        %[r1],    %[r1],    %[const_5412]       \n\t"
+      "sra        %[r1],    %[r1],    12                  \n\t"
+      "addu       %[r7],    %[r7],    %[r1]               \n\t"
+      "addiu      %[r1],    %[r7],    37                  \n\t"
+      "addiu      %[r5],    %[r5],    -31                 \n\t"
+      "neg        %[r5],    %[r5]                         \n\t"
+      "sll        %[r5],    %[r5],    12                  \n\t"
+      "addu       %[r5],    %[r5],    %[r1]               \n\t"
+      "subu       %[r7],    %[r5],    %[const_11rsh12]    \n\t"
+      "mul        %[r7],    %[r7],    %[const_178]        \n\t"
+      "sra        %[r7],    %[r7],    8                   \n\t"
+      "addu       %[r7],    %[r7],    %[r2]               \n\t"
+      "sra        %[r7],    %[r7],    1                   \n\t"
+      "subu       %[r2],    %[r2],    %[r7]               \n\t"
+      "addu       %[r2],    %[r2],    %[r0]               \n\t"
+      ".set       pop                                     \n\t"
+      : [r0] "+r" (r0), [r1] "+r" (r1), [r2] "+r" (r2),
+        [r3] "=&r" (r3), [r4] "=&r" (r4), [r5] "=&r" (r5),
+        [r6] "=&r" (r6), [r7] "=&r" (r7), [r8] "=&r" (r8)
+      : [const_max] "r" (const_max), [const_neg43] "r" (const_neg43),
+        [const_5412] "r" (const_5412), [const_11rsh12] "r" (const_11rsh12),
+        [const_178] "r" (const_178)
+      : "hi", "lo"
+    );
+    inst->logLrtTimeAvgW32[i] = r2;
+    logLrtTimeAvgKsumFX += r2;
+  }
+
+  inst->featureLogLrt = WEBRTC_SPL_RSHIFT_W32(logLrtTimeAvgKsumFX * 5,
+                                              inst->stages + 10);
+                                                  // 5 = BIN_SIZE_LRT / 2
+  // done with computation of LR factor
+
+  //
+  // compute the indicator functions
+  //
+
+  // average LRT feature
+  // FLOAT code
+  // indicator0 = 0.5 * (tanh(widthPrior *
+  //                      (logLrtTimeAvgKsum - threshPrior0)) + 1.0);
+  tmpIndFX = 16384; // Q14(1.0)
+  tmp32no1 = logLrtTimeAvgKsumFX - inst->thresholdLogLrt; // Q12
+  nShifts = 7 - inst->stages; // WIDTH_PR_MAP_SHIFT - inst->stages + 5;
+  //use larger width in tanh map for pause regions
+  if (tmp32no1 < 0) {
+    tmpIndFX = 0;
+    tmp32no1 = -tmp32no1;
+    //widthPrior = widthPrior * 2.0;
+    nShifts++;
+  }
+  tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, nShifts); // Q14
+  // compute indicator function: sigmoid map
+  tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_W32(tmp32no1, 14);
+  if ((tableIndex < 16) && (tableIndex >= 0)) {
+    tmp16no2 = kIndicatorTable[tableIndex];
+    tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex];
+    frac = (int16_t)(tmp32no1 & 0x00003fff); // Q14
+    tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, frac, 14);
+    if (tmpIndFX == 0) {
+      tmpIndFX = 8192 - tmp16no2; // Q14
+    } else {
+      tmpIndFX = 8192 + tmp16no2; // Q14
+    }
+  }
+  indPriorFX = WEBRTC_SPL_MUL_16_16(inst->weightLogLrt, tmpIndFX); // 6*Q14
+
+  //spectral flatness feature
+  if (inst->weightSpecFlat) {
+    tmpU32no1 = WEBRTC_SPL_UMUL(inst->featureSpecFlat, 400); // Q10
+    tmpIndFX = 16384; // Q14(1.0)
+    //use larger width in tanh map for pause regions
+    tmpU32no2 = inst->thresholdSpecFlat - tmpU32no1; //Q10
+    nShifts = 4;
+    if (inst->thresholdSpecFlat < tmpU32no1) {
+      tmpIndFX = 0;
+      tmpU32no2 = tmpU32no1 - inst->thresholdSpecFlat;
+      //widthPrior = widthPrior * 2.0;
+      nShifts++;
+    }
+    tmp32no1 = (int32_t)WebRtcSpl_DivU32U16(WEBRTC_SPL_LSHIFT_U32(tmpU32no2,
+                                                                  nShifts), 25);
+                                                     //Q14
+    tmpU32no1 = WebRtcSpl_DivU32U16(WEBRTC_SPL_LSHIFT_U32(tmpU32no2, nShifts),
+                                    25); //Q14
+    // compute indicator function: sigmoid map
+    // FLOAT code
+    // indicator1 = 0.5 * (tanh(sgnMap * widthPrior *
+    //                          (threshPrior1 - tmpFloat1)) + 1.0);
+    tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_U32(tmpU32no1, 14);
+    if (tableIndex < 16) {
+      tmp16no2 = kIndicatorTable[tableIndex];
+      tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex];
+      frac = (int16_t)(tmpU32no1 & 0x00003fff); // Q14
+      tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, frac, 14);
+      if (tmpIndFX) {
+        tmpIndFX = 8192 + tmp16no2; // Q14
+      } else {
+        tmpIndFX = 8192 - tmp16no2; // Q14
+      }
+    }
+    indPriorFX += WEBRTC_SPL_MUL_16_16(inst->weightSpecFlat, tmpIndFX); // 6*Q14
+  }
+
+  //for template spectral-difference
+  if (inst->weightSpecDiff) {
+    tmpU32no1 = 0;
+    if (inst->featureSpecDiff) {
+      normTmp = WEBRTC_SPL_MIN(20 - inst->stages,
+                               WebRtcSpl_NormU32(inst->featureSpecDiff));
+      tmpU32no1 = WEBRTC_SPL_LSHIFT_U32(inst->featureSpecDiff, normTmp);
+                                                         // Q(normTmp-2*stages)
+      tmpU32no2 = WEBRTC_SPL_RSHIFT_U32(inst->timeAvgMagnEnergy,
+                                        20 - inst->stages - normTmp);
+      if (tmpU32no2 > 0) {
+        // Q(20 - inst->stages)
+        tmpU32no1 = WEBRTC_SPL_UDIV(tmpU32no1, tmpU32no2);
+      } else {
+        tmpU32no1 = (uint32_t)(0x7fffffff);
+      }
+    }
+    tmpU32no3 = WEBRTC_SPL_UDIV(WEBRTC_SPL_LSHIFT_U32(inst->thresholdSpecDiff,
+                                                      17),
+                                25);
+    tmpU32no2 = tmpU32no1 - tmpU32no3;
+    nShifts = 1;
+    tmpIndFX = 16384; // Q14(1.0)
+    //use larger width in tanh map for pause regions
+    if (tmpU32no2 & 0x80000000) {
+      tmpIndFX = 0;
+      tmpU32no2 = tmpU32no3 - tmpU32no1;
+      //widthPrior = widthPrior * 2.0;
+      nShifts--;
+    }
+    tmpU32no1 = WEBRTC_SPL_RSHIFT_U32(tmpU32no2, nShifts);
+    // compute indicator function: sigmoid map
+    /* FLOAT code
+     indicator2 = 0.5 * (tanh(widthPrior * (tmpFloat1 - threshPrior2)) + 1.0);
+     */
+    tableIndex = (int16_t)WEBRTC_SPL_RSHIFT_U32(tmpU32no1, 14);
+    if (tableIndex < 16) {
+      tmp16no2 = kIndicatorTable[tableIndex];
+      tmp16no1 = kIndicatorTable[tableIndex + 1] - kIndicatorTable[tableIndex];
+      frac = (int16_t)(tmpU32no1 & 0x00003fff); // Q14
+      tmp16no2 += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
+                    tmp16no1, frac, 14);
+      if (tmpIndFX) {
+        tmpIndFX = 8192 + tmp16no2;
+      } else {
+        tmpIndFX = 8192 - tmp16no2;
+      }
+    }
+    indPriorFX += WEBRTC_SPL_MUL_16_16(inst->weightSpecDiff, tmpIndFX); // 6*Q14
+  }
+
+  //combine the indicator function with the feature weights
+  // FLOAT code
+  // indPrior = 1 - (weightIndPrior0 * indicator0 + weightIndPrior1 *
+  //                 indicator1 + weightIndPrior2 * indicator2);
+  indPriorFX16 = WebRtcSpl_DivW32W16ResW16(98307 - indPriorFX, 6); // Q14
+  // done with computing indicator function
+
+  //compute the prior probability
+  // FLOAT code
+  // inst->priorNonSpeechProb += PRIOR_UPDATE *
+  //                             (indPriorNonSpeech - inst->priorNonSpeechProb);
+  tmp16 = indPriorFX16 - inst->priorNonSpeechProb; // Q14
+  inst->priorNonSpeechProb += (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(
+                                PRIOR_UPDATE_Q14, tmp16, 14); // Q14
+
+  //final speech probability: combine prior model with LR factor:
+
+  memset(nonSpeechProbFinal, 0, sizeof(uint16_t) * inst->magnLen);
+
+  if (inst->priorNonSpeechProb > 0) {
+    r0 = inst->priorNonSpeechProb;
+    r1 = 16384 - r0;
+    int32_t const_23637 = 23637;
+    int32_t const_44 = 44;
+    int32_t const_84 = 84;
+    int32_t const_1 = 1;
+    int32_t const_neg8 = -8;
+    for (i = 0; i < inst->magnLen; i++) {
+      r2 = inst->logLrtTimeAvgW32[i];
+      if (r2 < 65300) {
+        __asm __volatile(
+          ".set         push                                      \n\t"
+          ".set         noreorder                                 \n\t"
+          "mul          %[r2],    %[r2],          %[const_23637]  \n\t"
+          "sll          %[r6],    %[r1],          16              \n\t"
+          "clz          %[r7],    %[r6]                           \n\t"
+          "clo          %[r8],    %[r6]                           \n\t"
+          "slt          %[r9],    %[r6],          $0              \n\t"
+          "movn         %[r7],    %[r8],          %[r9]           \n\t"
+          "sra          %[r2],    %[r2],          14              \n\t"
+          "andi         %[r3],    %[r2],          0xfff           \n\t"
+          "mul          %[r4],    %[r3],          %[r3]           \n\t"
+          "mul          %[r3],    %[r3],          %[const_84]     \n\t"
+          "sra          %[r2],    %[r2],          12              \n\t"
+          "slt          %[r5],    %[r2],          %[const_neg8]   \n\t"
+          "movn         %[r2],    %[const_neg8],  %[r5]           \n\t"
+          "mul          %[r4],    %[r4],          %[const_44]     \n\t"
+          "sra          %[r3],    %[r3],          7               \n\t"
+          "addiu        %[r7],    %[r7],          -1              \n\t"
+          "slti         %[r9],    %[r7],          31              \n\t"
+          "movz         %[r7],    $0,             %[r9]           \n\t"
+          "sra          %[r4],    %[r4],          19              \n\t"
+          "addu         %[r4],    %[r4],          %[r3]           \n\t"
+          "addiu        %[r3],    %[r2],          8               \n\t"
+          "addiu        %[r2],    %[r2],          -4              \n\t"
+          "neg          %[r5],    %[r2]                           \n\t"
+          "sllv         %[r6],    %[r4],          %[r2]           \n\t"
+          "srav         %[r5],    %[r4],          %[r5]           \n\t"
+          "slt          %[r2],    %[r2],          $0              \n\t"
+          "movn         %[r6],    %[r5],          %[r2]           \n\t"
+          "sllv         %[r3],    %[const_1],     %[r3]           \n\t"
+          "addu         %[r2],    %[r3],          %[r6]           \n\t"
+          "clz          %[r4],    %[r2]                           \n\t"
+          "clo          %[r5],    %[r2]                           \n\t"
+          "slt          %[r8],    %[r2],          $0              \n\t"
+          "movn         %[r4],    %[r5],          %[r8]           \n\t"
+          "addiu        %[r4],    %[r4],          -1              \n\t"
+          "slt          %[r5],    $0,             %[r2]           \n\t"
+          "or           %[r5],    %[r5],          %[r7]           \n\t"
+          "movz         %[r4],    $0,             %[r5]           \n\t"
+          "addiu        %[r6],    %[r7],          -7              \n\t"
+          "addu         %[r6],    %[r6],          %[r4]           \n\t"
+          "bltz         %[r6],    1f                              \n\t"
+          " nop                                                   \n\t"
+          "addiu        %[r4],    %[r6],          -8              \n\t"
+          "neg          %[r3],    %[r4]                           \n\t"
+          "srav         %[r5],    %[r2],          %[r3]           \n\t"
+          "mul          %[r5],    %[r5],          %[r1]           \n\t"
+          "mul          %[r2],    %[r2],          %[r1]           \n\t"
+          "slt          %[r4],    %[r4],          $0              \n\t"
+          "srav         %[r5],    %[r5],          %[r6]           \n\t"
+          "sra          %[r2],    %[r2],          8               \n\t"
+          "movn         %[r2],    %[r5],          %[r4]           \n\t"
+          "sll          %[r3],    %[r0],          8               \n\t"
+          "addu         %[r2],    %[r0],          %[r2]           \n\t"
+          "divu         %[r3],    %[r3],          %[r2]           \n\t"
+         "1:                                                      \n\t"
+          ".set         pop                                       \n\t"
+          : [r2] "+r" (r2), [r3] "=&r" (r3), [r4] "=&r" (r4),
+            [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
+            [r8] "=&r" (r8), [r9] "=&r" (r9)
+          : [r0] "r" (r0), [r1] "r" (r1), [const_23637] "r" (const_23637),
+            [const_neg8] "r" (const_neg8), [const_84] "r" (const_84),
+            [const_1] "r" (const_1), [const_44] "r" (const_44)
+          : "hi", "lo"
+        );
+        nonSpeechProbFinal[i] = r3;
+      }
+    }
+  }
+}
+
+// Update analysis buffer for lower band, and window data before FFT.
+void WebRtcNsx_AnalysisUpdate_mips(NsxInst_t* inst,
+                                   int16_t* out,
+                                   int16_t* new_speech) {
+
+  int iters, after;
+  int anaLen = inst->anaLen;
+  int *window = (int*)inst->window;
+  int *anaBuf = (int*)inst->analysisBuffer;
+  int *outBuf = (int*)out;
+  int r0, r1, r2, r3, r4, r5, r6, r7;
+#if defined(MIPS_DSP_R1_LE)
+  int r8;
+#endif
+
+  // For lower band update analysis buffer.
+  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
+                        inst->analysisBuffer + inst->blockLen10ms,
+                        inst->anaLen - inst->blockLen10ms);
+  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
+      + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
+
+  // Window data before FFT.
+#if defined(MIPS_DSP_R1_LE)
+  __asm __volatile(
+    ".set              push                                \n\t"
+    ".set              noreorder                           \n\t"
+    "sra               %[iters],   %[anaLen],    3         \n\t"
+   "1:                                                     \n\t"
+    "blez              %[iters],   2f                      \n\t"
+    " nop                                                  \n\t"
+    "lw                %[r0],      0(%[window])            \n\t"
+    "lw                %[r1],      0(%[anaBuf])            \n\t"
+    "lw                %[r2],      4(%[window])            \n\t"
+    "lw                %[r3],      4(%[anaBuf])            \n\t"
+    "lw                %[r4],      8(%[window])            \n\t"
+    "lw                %[r5],      8(%[anaBuf])            \n\t"
+    "lw                %[r6],      12(%[window])           \n\t"
+    "lw                %[r7],      12(%[anaBuf])           \n\t"
+    "muleq_s.w.phl     %[r8],      %[r0],        %[r1]     \n\t"
+    "muleq_s.w.phr     %[r0],      %[r0],        %[r1]     \n\t"
+    "muleq_s.w.phl     %[r1],      %[r2],        %[r3]     \n\t"
+    "muleq_s.w.phr     %[r2],      %[r2],        %[r3]     \n\t"
+    "muleq_s.w.phl     %[r3],      %[r4],        %[r5]     \n\t"
+    "muleq_s.w.phr     %[r4],      %[r4],        %[r5]     \n\t"
+    "muleq_s.w.phl     %[r5],      %[r6],        %[r7]     \n\t"
+    "muleq_s.w.phr     %[r6],      %[r6],        %[r7]     \n\t"
+#if defined(MIPS_DSP_R2_LE)
+    "precr_sra_r.ph.w  %[r8],      %[r0],        15        \n\t"
+    "precr_sra_r.ph.w  %[r1],      %[r2],        15        \n\t"
+    "precr_sra_r.ph.w  %[r3],      %[r4],        15        \n\t"
+    "precr_sra_r.ph.w  %[r5],      %[r6],        15        \n\t"
+    "sw                %[r8],      0(%[outBuf])            \n\t"
+    "sw                %[r1],      4(%[outBuf])            \n\t"
+    "sw                %[r3],      8(%[outBuf])            \n\t"
+    "sw                %[r5],      12(%[outBuf])           \n\t"
+#else
+    "shra_r.w          %[r8],      %[r8],        15        \n\t"
+    "shra_r.w          %[r0],      %[r0],        15        \n\t"
+    "shra_r.w          %[r1],      %[r1],        15        \n\t"
+    "shra_r.w          %[r2],      %[r2],        15        \n\t"
+    "shra_r.w          %[r3],      %[r3],        15        \n\t"
+    "shra_r.w          %[r4],      %[r4],        15        \n\t"
+    "shra_r.w          %[r5],      %[r5],        15        \n\t"
+    "shra_r.w          %[r6],      %[r6],        15        \n\t"
+    "sll               %[r0],      %[r0],        16        \n\t"
+    "sll               %[r2],      %[r2],        16        \n\t"
+    "sll               %[r4],      %[r4],        16        \n\t"
+    "sll               %[r6],      %[r6],        16        \n\t"
+    "packrl.ph         %[r0],      %[r8],        %[r0]     \n\t"
+    "packrl.ph         %[r2],      %[r1],        %[r2]     \n\t"
+    "packrl.ph         %[r4],      %[r3],        %[r4]     \n\t"
+    "packrl.ph         %[r6],      %[r5],        %[r6]     \n\t"
+    "sw                %[r0],      0(%[outBuf])            \n\t"
+    "sw                %[r2],      4(%[outBuf])            \n\t"
+    "sw                %[r4],      8(%[outBuf])            \n\t"
+    "sw                %[r6],      12(%[outBuf])           \n\t"
+#endif
+    "addiu             %[window],  %[window],    16        \n\t"
+    "addiu             %[anaBuf],  %[anaBuf],    16        \n\t"
+    "addiu             %[outBuf],  %[outBuf],    16        \n\t"
+    "b                 1b                                  \n\t"
+    " addiu            %[iters],   %[iters],     -1        \n\t"
+   "2:                                                     \n\t"
+    "andi              %[after],   %[anaLen],    7         \n\t"
+   "3:                                                     \n\t"
+    "blez              %[after],   4f                      \n\t"
+    " nop                                                  \n\t"
+    "lh                %[r0],      0(%[window])            \n\t"
+    "lh                %[r1],      0(%[anaBuf])            \n\t"
+    "mul               %[r0],      %[r0],        %[r1]     \n\t"
+    "addiu             %[window],  %[window],    2         \n\t"
+    "addiu             %[anaBuf],  %[anaBuf],    2         \n\t"
+    "addiu             %[outBuf],  %[outBuf],    2         \n\t"
+    "shra_r.w          %[r0],      %[r0],        14        \n\t"
+    "sh                %[r0],      -2(%[outBuf])           \n\t"
+    "b                 3b                                  \n\t"
+    " addiu            %[after],   %[after],     -1        \n\t"
+   "4:                                                     \n\t"
+    ".set              pop                                 \n\t"
+    : [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2),
+      [r3] "=&r" (r3), [r4] "=&r" (r4), [r5] "=&r" (r5),
+      [r6] "=&r" (r6), [r7] "=&r" (r7), [r8] "=&r" (r8),
+      [iters] "=&r" (iters), [after] "=&r" (after),
+      [window] "+r" (window),[anaBuf] "+r" (anaBuf),
+      [outBuf] "+r" (outBuf)
+    : [anaLen] "r" (anaLen)
+    : "memory", "hi", "lo"
+  );
+#else
+  __asm  __volatile(
+    ".set           push                                    \n\t"
+    ".set           noreorder                               \n\t"
+    "sra            %[iters],   %[anaLen],      2           \n\t"
+   "1:                                                      \n\t"
+    "blez           %[iters],   2f                          \n\t"
+    " nop                                                   \n\t"
+    "lh             %[r0],      0(%[window])                \n\t"
+    "lh             %[r1],      0(%[anaBuf])                \n\t"
+    "lh             %[r2],      2(%[window])                \n\t"
+    "lh             %[r3],      2(%[anaBuf])                \n\t"
+    "lh             %[r4],      4(%[window])                \n\t"
+    "lh             %[r5],      4(%[anaBuf])                \n\t"
+    "lh             %[r6],      6(%[window])                \n\t"
+    "lh             %[r7],      6(%[anaBuf])                \n\t"
+    "mul            %[r0],      %[r0],          %[r1]       \n\t"
+    "mul            %[r2],      %[r2],          %[r3]       \n\t"
+    "mul            %[r4],      %[r4],          %[r5]       \n\t"
+    "mul            %[r6],      %[r6],          %[r7]       \n\t"
+    "addiu          %[window],  %[window],      8           \n\t"
+    "addiu          %[anaBuf],  %[anaBuf],      8           \n\t"
+    "addiu          %[r0],      %[r0],          0x2000      \n\t"
+    "addiu          %[r2],      %[r2],          0x2000      \n\t"
+    "addiu          %[r4],      %[r4],          0x2000      \n\t"
+    "addiu          %[r6],      %[r6],          0x2000      \n\t"
+    "sra            %[r0],      %[r0],          14          \n\t"
+    "sra            %[r2],      %[r2],          14          \n\t"
+    "sra            %[r4],      %[r4],          14          \n\t"
+    "sra            %[r6],      %[r6],          14          \n\t"
+    "sh             %[r0],      0(%[outBuf])                \n\t"
+    "sh             %[r2],      2(%[outBuf])                \n\t"
+    "sh             %[r4],      4(%[outBuf])                \n\t"
+    "sh             %[r6],      6(%[outBuf])                \n\t"
+    "addiu          %[outBuf],  %[outBuf],      8           \n\t"
+    "b              1b                                      \n\t"
+    " addiu         %[iters],   %[iters],       -1          \n\t"
+   "2:                                                      \n\t"
+    "andi           %[after],   %[anaLen],      3           \n\t"
+   "3:                                                      \n\t"
+    "blez           %[after],   4f                          \n\t"
+    " nop                                                   \n\t"
+    "lh             %[r0],      0(%[window])                \n\t"
+    "lh             %[r1],      0(%[anaBuf])                \n\t"
+    "mul            %[r0],      %[r0],          %[r1]       \n\t"
+    "addiu          %[window],  %[window],      2           \n\t"
+    "addiu          %[anaBuf],  %[anaBuf],      2           \n\t"
+    "addiu          %[outBuf],  %[outBuf],      2           \n\t"
+    "addiu          %[r0],      %[r0],          0x2000      \n\t"
+    "sra            %[r0],      %[r0],          14          \n\t"
+    "sh             %[r0],      -2(%[outBuf])               \n\t"
+    "b              3b                                      \n\t"
+    " addiu         %[after],   %[after],       -1          \n\t"
+   "4:                                                      \n\t"
+    ".set           pop                                     \n\t"
+    : [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2),
+      [r3] "=&r" (r3), [r4] "=&r" (r4), [r5] "=&r" (r5),
+      [r6] "=&r" (r6), [r7] "=&r" (r7), [iters] "=&r" (iters),
+      [after] "=&r" (after), [window] "+r" (window),
+      [anaBuf] "+r" (anaBuf), [outBuf] "+r" (outBuf)
+    : [anaLen] "r" (anaLen)
+    : "memory", "hi", "lo"
+  );
+#endif
+}
+
+// For the noise supression process, synthesis, read out fully processed
+// segment, and update synthesis buffer.
+void WebRtcNsx_SynthesisUpdate_mips(NsxInst_t* inst,
+                                    int16_t* out_frame,
+                                    int16_t gain_factor) {
+
+  int iters = inst->blockLen10ms >> 2;
+  int after = inst->blockLen10ms & 3;
+  int r0, r1, r2, r3, r4, r5, r6, r7;
+  int16_t *window = (int16_t*)inst->window;
+  int16_t *real = inst->real;
+  int16_t *synthBuf = inst->synthesisBuffer;
+  int16_t *out = out_frame;
+  int sat_pos = 0x7fff;
+  int sat_neg = 0xffff8000;
+  int block10 = (int)inst->blockLen10ms;
+  int anaLen = (int)inst->anaLen;
+
+  __asm __volatile(
+    ".set       push                                        \n\t"
+    ".set       noreorder                                   \n\t"
+   "1:                                                      \n\t"
+    "blez       %[iters],   2f                              \n\t"
+    " nop                                                   \n\t"
+    "lh         %[r0],      0(%[window])                    \n\t"
+    "lh         %[r1],      0(%[real])                      \n\t"
+    "lh         %[r2],      2(%[window])                    \n\t"
+    "lh         %[r3],      2(%[real])                      \n\t"
+    "lh         %[r4],      4(%[window])                    \n\t"
+    "lh         %[r5],      4(%[real])                      \n\t"
+    "lh         %[r6],      6(%[window])                    \n\t"
+    "lh         %[r7],      6(%[real])                      \n\t"
+    "mul        %[r0],      %[r0],          %[r1]           \n\t"
+    "mul        %[r2],      %[r2],          %[r3]           \n\t"
+    "mul        %[r4],      %[r4],          %[r5]           \n\t"
+    "mul        %[r6],      %[r6],          %[r7]           \n\t"
+    "addiu      %[r0],      %[r0],          0x2000          \n\t"
+    "addiu      %[r2],      %[r2],          0x2000          \n\t"
+    "addiu      %[r4],      %[r4],          0x2000          \n\t"
+    "addiu      %[r6],      %[r6],          0x2000          \n\t"
+    "sra        %[r0],      %[r0],          14              \n\t"
+    "sra        %[r2],      %[r2],          14              \n\t"
+    "sra        %[r4],      %[r4],          14              \n\t"
+    "sra        %[r6],      %[r6],          14              \n\t"
+    "mul        %[r0],      %[r0],          %[gain_factor]  \n\t"
+    "mul        %[r2],      %[r2],          %[gain_factor]  \n\t"
+    "mul        %[r4],      %[r4],          %[gain_factor]  \n\t"
+    "mul        %[r6],      %[r6],          %[gain_factor]  \n\t"
+    "addiu      %[r0],      %[r0],          0x1000          \n\t"
+    "addiu      %[r2],      %[r2],          0x1000          \n\t"
+    "addiu      %[r4],      %[r4],          0x1000          \n\t"
+    "addiu      %[r6],      %[r6],          0x1000          \n\t"
+    "sra        %[r0],      %[r0],          13              \n\t"
+    "sra        %[r2],      %[r2],          13              \n\t"
+    "sra        %[r4],      %[r4],          13              \n\t"
+    "sra        %[r6],      %[r6],          13              \n\t"
+    "slt        %[r1],      %[r0],          %[sat_pos]      \n\t"
+    "slt        %[r3],      %[r2],          %[sat_pos]      \n\t"
+    "slt        %[r5],      %[r4],          %[sat_pos]      \n\t"
+    "slt        %[r7],      %[r6],          %[sat_pos]      \n\t"
+    "movz       %[r0],      %[sat_pos],     %[r1]           \n\t"
+    "movz       %[r2],      %[sat_pos],     %[r3]           \n\t"
+    "movz       %[r4],      %[sat_pos],     %[r5]           \n\t"
+    "movz       %[r6],      %[sat_pos],     %[r7]           \n\t"
+    "lh         %[r1],      0(%[synthBuf])                  \n\t"
+    "lh         %[r3],      2(%[synthBuf])                  \n\t"
+    "lh         %[r5],      4(%[synthBuf])                  \n\t"
+    "lh         %[r7],      6(%[synthBuf])                  \n\t"
+    "addu       %[r0],      %[r0],          %[r1]           \n\t"
+    "addu       %[r2],      %[r2],          %[r3]           \n\t"
+    "addu       %[r4],      %[r4],          %[r5]           \n\t"
+    "addu       %[r6],      %[r6],          %[r7]           \n\t"
+    "slt        %[r1],      %[r0],          %[sat_pos]      \n\t"
+    "slt        %[r3],      %[r2],          %[sat_pos]      \n\t"
+    "slt        %[r5],      %[r4],          %[sat_pos]      \n\t"
+    "slt        %[r7],      %[r6],          %[sat_pos]      \n\t"
+    "movz       %[r0],      %[sat_pos],     %[r1]           \n\t"
+    "movz       %[r2],      %[sat_pos],     %[r3]           \n\t"
+    "movz       %[r4],      %[sat_pos],     %[r5]           \n\t"
+    "movz       %[r6],      %[sat_pos],     %[r7]           \n\t"
+    "slt        %[r1],      %[r0],          %[sat_neg]      \n\t"
+    "slt        %[r3],      %[r2],          %[sat_neg]      \n\t"
+    "slt        %[r5],      %[r4],          %[sat_neg]      \n\t"
+    "slt        %[r7],      %[r6],          %[sat_neg]      \n\t"
+    "movn       %[r0],      %[sat_neg],     %[r1]           \n\t"
+    "movn       %[r2],      %[sat_neg],     %[r3]           \n\t"
+    "movn       %[r4],      %[sat_neg],     %[r5]           \n\t"
+    "movn       %[r6],      %[sat_neg],     %[r7]           \n\t"
+    "sh         %[r0],      0(%[synthBuf])                  \n\t"
+    "sh         %[r2],      2(%[synthBuf])                  \n\t"
+    "sh         %[r4],      4(%[synthBuf])                  \n\t"
+    "sh         %[r6],      6(%[synthBuf])                  \n\t"
+    "sh         %[r0],      0(%[out])                       \n\t"
+    "sh         %[r2],      2(%[out])                       \n\t"
+    "sh         %[r4],      4(%[out])                       \n\t"
+    "sh         %[r6],      6(%[out])                       \n\t"
+    "addiu      %[window],  %[window],      8               \n\t"
+    "addiu      %[real],    %[real],        8               \n\t"
+    "addiu      %[synthBuf],%[synthBuf],    8               \n\t"
+    "addiu      %[out],     %[out],         8               \n\t"
+    "b          1b                                          \n\t"
+    " addiu     %[iters],   %[iters],       -1              \n\t"
+   "2:                                                      \n\t"
+    "blez       %[after],   3f                              \n\t"
+    " subu      %[block10], %[anaLen],      %[block10]      \n\t"
+    "lh         %[r0],      0(%[window])                    \n\t"
+    "lh         %[r1],      0(%[real])                      \n\t"
+    "mul        %[r0],      %[r0],          %[r1]           \n\t"
+    "addiu      %[window],  %[window],      2               \n\t"
+    "addiu      %[real],    %[real],        2               \n\t"
+    "addiu      %[r0],      %[r0],          0x2000          \n\t"
+    "sra        %[r0],      %[r0],          14              \n\t"
+    "mul        %[r0],      %[r0],          %[gain_factor]  \n\t"
+    "addiu      %[r0],      %[r0],          0x1000          \n\t"
+    "sra        %[r0],      %[r0],          13              \n\t"
+    "slt        %[r1],      %[r0],          %[sat_pos]      \n\t"
+    "movz       %[r0],      %[sat_pos],     %[r1]           \n\t"
+    "lh         %[r1],      0(%[synthBuf])                  \n\t"
+    "addu       %[r0],      %[r0],          %[r1]           \n\t"
+    "slt        %[r1],      %[r0],          %[sat_pos]      \n\t"
+    "movz       %[r0],      %[sat_pos],     %[r1]           \n\t"
+    "slt        %[r1],      %[r0],          %[sat_neg]      \n\t"
+    "movn       %[r0],      %[sat_neg],     %[r1]           \n\t"
+    "sh         %[r0],      0(%[synthBuf])                  \n\t"
+    "sh         %[r0],      0(%[out])                       \n\t"
+    "addiu      %[synthBuf],%[synthBuf],    2               \n\t"
+    "addiu      %[out],     %[out],         2               \n\t"
+    "b          2b                                          \n\t"
+    " addiu     %[after],   %[after],       -1              \n\t"
+   "3:                                                      \n\t"
+    "sra        %[iters],   %[block10],     2               \n\t"
+   "4:                                                      \n\t"
+    "blez       %[iters],   5f                              \n\t"
+    " andi      %[after],   %[block10],     3               \n\t"
+    "lh         %[r0],      0(%[window])                    \n\t"
+    "lh         %[r1],      0(%[real])                      \n\t"
+    "lh         %[r2],      2(%[window])                    \n\t"
+    "lh         %[r3],      2(%[real])                      \n\t"
+    "lh         %[r4],      4(%[window])                    \n\t"
+    "lh         %[r5],      4(%[real])                      \n\t"
+    "lh         %[r6],      6(%[window])                    \n\t"
+    "lh         %[r7],      6(%[real])                      \n\t"
+    "mul        %[r0],      %[r0],          %[r1]           \n\t"
+    "mul        %[r2],      %[r2],          %[r3]           \n\t"
+    "mul        %[r4],      %[r4],          %[r5]           \n\t"
+    "mul        %[r6],      %[r6],          %[r7]           \n\t"
+    "addiu      %[r0],      %[r0],          0x2000          \n\t"
+    "addiu      %[r2],      %[r2],          0x2000          \n\t"
+    "addiu      %[r4],      %[r4],          0x2000          \n\t"
+    "addiu      %[r6],      %[r6],          0x2000          \n\t"
+    "sra        %[r0],      %[r0],          14              \n\t"
+    "sra        %[r2],      %[r2],          14              \n\t"
+    "sra        %[r4],      %[r4],          14              \n\t"
+    "sra        %[r6],      %[r6],          14              \n\t"
+    "mul        %[r0],      %[r0],          %[gain_factor]  \n\t"
+    "mul        %[r2],      %[r2],          %[gain_factor]  \n\t"
+    "mul        %[r4],      %[r4],          %[gain_factor]  \n\t"
+    "mul        %[r6],      %[r6],          %[gain_factor]  \n\t"
+    "addiu      %[r0],      %[r0],          0x1000          \n\t"
+    "addiu      %[r2],      %[r2],          0x1000          \n\t"
+    "addiu      %[r4],      %[r4],          0x1000          \n\t"
+    "addiu      %[r6],      %[r6],          0x1000          \n\t"
+    "sra        %[r0],      %[r0],          13              \n\t"
+    "sra        %[r2],      %[r2],          13              \n\t"
+    "sra        %[r4],      %[r4],          13              \n\t"
+    "sra        %[r6],      %[r6],          13              \n\t"
+    "slt        %[r1],      %[r0],          %[sat_pos]      \n\t"
+    "slt        %[r3],      %[r2],          %[sat_pos]      \n\t"
+    "slt        %[r5],      %[r4],          %[sat_pos]      \n\t"
+    "slt        %[r7],      %[r6],          %[sat_pos]      \n\t"
+    "movz       %[r0],      %[sat_pos],     %[r1]           \n\t"
+    "movz       %[r2],      %[sat_pos],     %[r3]           \n\t"
+    "movz       %[r4],      %[sat_pos],     %[r5]           \n\t"
+    "movz       %[r6],      %[sat_pos],     %[r7]           \n\t"
+    "lh         %[r1],      0(%[synthBuf])                  \n\t"
+    "lh         %[r3],      2(%[synthBuf])                  \n\t"
+    "lh         %[r5],      4(%[synthBuf])                  \n\t"
+    "lh         %[r7],      6(%[synthBuf])                  \n\t"
+    "addu       %[r0],      %[r0],          %[r1]           \n\t"
+    "addu       %[r2],      %[r2],          %[r3]           \n\t"
+    "addu       %[r4],      %[r4],          %[r5]           \n\t"
+    "addu       %[r6],      %[r6],          %[r7]           \n\t"
+    "slt        %[r1],      %[r0],          %[sat_pos]      \n\t"
+    "slt        %[r3],      %[r2],          %[sat_pos]      \n\t"
+    "slt        %[r5],      %[r4],          %[sat_pos]      \n\t"
+    "slt        %[r7],      %[r6],          %[sat_pos]      \n\t"
+    "movz       %[r0],      %[sat_pos],     %[r1]           \n\t"
+    "movz       %[r2],      %[sat_pos],     %[r3]           \n\t"
+    "movz       %[r4],      %[sat_pos],     %[r5]           \n\t"
+    "movz       %[r6],      %[sat_pos],     %[r7]           \n\t"
+    "slt        %[r1],      %[r0],          %[sat_neg]      \n\t"
+    "slt        %[r3],      %[r2],          %[sat_neg]      \n\t"
+    "slt        %[r5],      %[r4],          %[sat_neg]      \n\t"
+    "slt        %[r7],      %[r6],          %[sat_neg]      \n\t"
+    "movn       %[r0],      %[sat_neg],     %[r1]           \n\t"
+    "movn       %[r2],      %[sat_neg],     %[r3]           \n\t"
+    "movn       %[r4],      %[sat_neg],     %[r5]           \n\t"
+    "movn       %[r6],      %[sat_neg],     %[r7]           \n\t"
+    "sh         %[r0],      0(%[synthBuf])                  \n\t"
+    "sh         %[r2],      2(%[synthBuf])                  \n\t"
+    "sh         %[r4],      4(%[synthBuf])                  \n\t"
+    "sh         %[r6],      6(%[synthBuf])                  \n\t"
+    "addiu      %[window],  %[window],      8               \n\t"
+    "addiu      %[real],    %[real],        8               \n\t"
+    "addiu      %[synthBuf],%[synthBuf],    8               \n\t"
+    "b          4b                                          \n\t"
+    " addiu     %[iters],   %[iters],       -1              \n\t"
+   "5:                                                      \n\t"
+    "blez       %[after],   6f                              \n\t"
+    " nop                                                   \n\t"
+    "lh         %[r0],      0(%[window])                    \n\t"
+    "lh         %[r1],      0(%[real])                      \n\t"
+    "mul        %[r0],      %[r0],          %[r1]           \n\t"
+    "addiu      %[window],  %[window],      2               \n\t"
+    "addiu      %[real],    %[real],        2               \n\t"
+    "addiu      %[r0],      %[r0],          0x2000          \n\t"
+    "sra        %[r0],      %[r0],          14              \n\t"
+    "mul        %[r0],      %[r0],          %[gain_factor]  \n\t"
+    "addiu      %[r0],      %[r0],          0x1000          \n\t"
+    "sra        %[r0],      %[r0],          13              \n\t"
+    "slt        %[r1],      %[r0],          %[sat_pos]      \n\t"
+    "movz       %[r0],      %[sat_pos],     %[r1]           \n\t"
+    "lh         %[r1],      0(%[synthBuf])                  \n\t"
+    "addu       %[r0],      %[r0],          %[r1]           \n\t"
+    "slt        %[r1],      %[r0],          %[sat_pos]      \n\t"
+    "movz       %[r0],      %[sat_pos],     %[r1]           \n\t"
+    "slt        %[r1],      %[r0],          %[sat_neg]      \n\t"
+    "movn       %[r0],      %[sat_neg],     %[r1]           \n\t"
+    "sh         %[r0],      0(%[synthBuf])                  \n\t"
+    "addiu      %[synthBuf],%[synthBuf],    2               \n\t"
+    "b          2b                                          \n\t"
+    " addiu     %[after],   %[after],       -1              \n\t"
+   "6:                                                      \n\t"
+    ".set       pop                                         \n\t"
+    : [r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2),
+      [r3] "=&r" (r3), [r4] "=&r" (r4), [r5] "=&r" (r5),
+      [r6] "=&r" (r6), [r7] "=&r" (r7), [iters] "+r" (iters),
+      [after] "+r" (after), [block10] "+r" (block10),
+      [window] "+r" (window), [real] "+r" (real),
+      [synthBuf] "+r" (synthBuf), [out] "+r" (out)
+    : [gain_factor] "r" (gain_factor), [sat_pos] "r" (sat_pos),
+      [sat_neg] "r" (sat_neg), [anaLen] "r" (anaLen)
+    : "memory", "hi", "lo"
+  );
+
+  // update synthesis buffer
+  WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
+                        inst->synthesisBuffer + inst->blockLen10ms,
+                        inst->anaLen - inst->blockLen10ms);
+  WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
+      + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
+}
+
+// Filter the data in the frequency domain, and create spectrum.
+void WebRtcNsx_PrepareSpectrum_mips(NsxInst_t* inst, int16_t* freq_buf) {
+
+  uint16_t *noiseSupFilter = inst->noiseSupFilter;
+  int16_t *real = inst->real;
+  int16_t *imag = inst->imag;
+  int32_t loop_count = 2;
+  int16_t tmp_1, tmp_2, tmp_3, tmp_4, tmp_5, tmp_6;
+  int16_t tmp16 = (inst->anaLen << 1) - 4;
+  int16_t* freq_buf_f = freq_buf;
+  int16_t* freq_buf_s = &freq_buf[tmp16];
+
+  __asm __volatile (
+    ".set       push                                                 \n\t"
+    ".set       noreorder                                            \n\t"
+    //first sample
+    "lh         %[tmp_1],           0(%[noiseSupFilter])             \n\t"
+    "lh         %[tmp_2],           0(%[real])                       \n\t"
+    "lh         %[tmp_3],           0(%[imag])                       \n\t"
+    "mul        %[tmp_2],           %[tmp_2],             %[tmp_1]   \n\t"
+    "mul        %[tmp_3],           %[tmp_3],             %[tmp_1]   \n\t"
+    "sra        %[tmp_2],           %[tmp_2],             14         \n\t"
+    "sra        %[tmp_3],           %[tmp_3],             14         \n\t"
+    "sh         %[tmp_2],           0(%[real])                       \n\t"
+    "sh         %[tmp_3],           0(%[imag])                       \n\t"
+    "negu       %[tmp_3],           %[tmp_3]                         \n\t"
+    "sh         %[tmp_2],           0(%[freq_buf_f])                 \n\t"
+    "sh         %[tmp_3],           2(%[freq_buf_f])                 \n\t"
+    "addiu      %[real],            %[real],              2          \n\t"
+    "addiu      %[imag],            %[imag],              2          \n\t"
+    "addiu      %[noiseSupFilter],  %[noiseSupFilter],    2          \n\t"
+    "addiu      %[freq_buf_f],      %[freq_buf_f],        4          \n\t"
+   "1:                                                               \n\t"
+    "lh         %[tmp_1],           0(%[noiseSupFilter])             \n\t"
+    "lh         %[tmp_2],           0(%[real])                       \n\t"
+    "lh         %[tmp_3],           0(%[imag])                       \n\t"
+    "lh         %[tmp_4],           2(%[noiseSupFilter])             \n\t"
+    "lh         %[tmp_5],           2(%[real])                       \n\t"
+    "lh         %[tmp_6],           2(%[imag])                       \n\t"
+    "mul        %[tmp_2],           %[tmp_2],             %[tmp_1]   \n\t"
+    "mul        %[tmp_3],           %[tmp_3],             %[tmp_1]   \n\t"
+    "mul        %[tmp_5],           %[tmp_5],             %[tmp_4]   \n\t"
+    "mul        %[tmp_6],           %[tmp_6],             %[tmp_4]   \n\t"
+    "addiu      %[loop_count],      %[loop_count],        2          \n\t"
+    "sra        %[tmp_2],           %[tmp_2],             14         \n\t"
+    "sra        %[tmp_3],           %[tmp_3],             14         \n\t"
+    "sra        %[tmp_5],           %[tmp_5],             14         \n\t"
+    "sra        %[tmp_6],           %[tmp_6],             14         \n\t"
+    "addiu      %[noiseSupFilter],  %[noiseSupFilter],    4          \n\t"
+    "sh         %[tmp_2],           0(%[real])                       \n\t"
+    "sh         %[tmp_2],           4(%[freq_buf_s])                 \n\t"
+    "sh         %[tmp_3],           0(%[imag])                       \n\t"
+    "sh         %[tmp_3],           6(%[freq_buf_s])                 \n\t"
+    "negu       %[tmp_3],           %[tmp_3]                         \n\t"
+    "sh         %[tmp_5],           2(%[real])                       \n\t"
+    "sh         %[tmp_5],           0(%[freq_buf_s])                 \n\t"
+    "sh         %[tmp_6],           2(%[imag])                       \n\t"
+    "sh         %[tmp_6],           2(%[freq_buf_s])                 \n\t"
+    "negu       %[tmp_6],           %[tmp_6]                         \n\t"
+    "addiu      %[freq_buf_s],      %[freq_buf_s],        -8         \n\t"
+    "addiu      %[real],            %[real],              4          \n\t"
+    "addiu      %[imag],            %[imag],              4          \n\t"
+    "sh         %[tmp_2],           0(%[freq_buf_f])                 \n\t"
+    "sh         %[tmp_3],           2(%[freq_buf_f])                 \n\t"
+    "sh         %[tmp_5],           4(%[freq_buf_f])                 \n\t"
+    "sh         %[tmp_6],           6(%[freq_buf_f])                 \n\t"
+    "blt        %[loop_count],      %[loop_size],         1b         \n\t"
+    " addiu     %[freq_buf_f],      %[freq_buf_f],        8          \n\t"
+    //last two samples:
+    "lh         %[tmp_1],           0(%[noiseSupFilter])             \n\t"
+    "lh         %[tmp_2],           0(%[real])                       \n\t"
+    "lh         %[tmp_3],           0(%[imag])                       \n\t"
+    "lh         %[tmp_4],           2(%[noiseSupFilter])             \n\t"
+    "lh         %[tmp_5],           2(%[real])                       \n\t"
+    "lh         %[tmp_6],           2(%[imag])                       \n\t"
+    "mul        %[tmp_2],           %[tmp_2],             %[tmp_1]   \n\t"
+    "mul        %[tmp_3],           %[tmp_3],             %[tmp_1]   \n\t"
+    "mul        %[tmp_5],           %[tmp_5],             %[tmp_4]   \n\t"
+    "mul        %[tmp_6],           %[tmp_6],             %[tmp_4]   \n\t"
+    "sra        %[tmp_2],           %[tmp_2],             14         \n\t"
+    "sra        %[tmp_3],           %[tmp_3],             14         \n\t"
+    "sra        %[tmp_5],           %[tmp_5],             14         \n\t"
+    "sra        %[tmp_6],           %[tmp_6],             14         \n\t"
+    "sh         %[tmp_2],           0(%[real])                       \n\t"
+    "sh         %[tmp_2],           4(%[freq_buf_s])                 \n\t"
+    "sh         %[tmp_3],           0(%[imag])                       \n\t"
+    "sh         %[tmp_3],           6(%[freq_buf_s])                 \n\t"
+    "negu       %[tmp_3],           %[tmp_3]                         \n\t"
+    "sh         %[tmp_2],           0(%[freq_buf_f])                 \n\t"
+    "sh         %[tmp_3],           2(%[freq_buf_f])                 \n\t"
+    "sh         %[tmp_5],           4(%[freq_buf_f])                 \n\t"
+    "sh         %[tmp_6],           6(%[freq_buf_f])                 \n\t"
+    "sh         %[tmp_5],           2(%[real])                       \n\t"
+    "sh         %[tmp_6],           2(%[imag])                       \n\t"
+    ".set       pop                                                  \n\t"
+    : [real] "+r" (real), [imag] "+r" (imag),
+      [freq_buf_f] "+r" (freq_buf_f), [freq_buf_s] "+r" (freq_buf_s),
+      [loop_count] "+r" (loop_count), [noiseSupFilter] "+r" (noiseSupFilter),
+      [tmp_1] "=&r" (tmp_1), [tmp_2] "=&r" (tmp_2), [tmp_3] "=&r" (tmp_3),
+      [tmp_4] "=&r" (tmp_4), [tmp_5] "=&r" (tmp_5), [tmp_6] "=&r" (tmp_6)
+    : [loop_size] "r" (inst->anaLen2)
+    : "memory", "hi", "lo"
+  );
+}
+
+#if defined(MIPS_DSP_R1_LE)
+// Denormalize the real-valued signal |in|, the output from inverse FFT.
+void WebRtcNsx_Denormalize_mips(NsxInst_t* inst, int16_t* in, int factor) {
+  int32_t r0, r1, r2, r3, t0;
+  int len = inst->anaLen;
+  int16_t *out = &inst->real[0];
+  int shift = factor - inst->normData;
+
+  __asm __volatile (
+    ".set          push                                \n\t"
+    ".set          noreorder                           \n\t"
+    "beqz          %[len],     8f                      \n\t"
+    " nop                                              \n\t"
+    "bltz          %[shift],   4f                      \n\t"
+    " sra          %[t0],      %[len],      2          \n\t"
+    "beqz          %[t0],      2f                      \n\t"
+    " andi         %[len],     %[len],      3          \n\t"
+   "1:                                                 \n\t"
+    "lh            %[r0],      0(%[in])                \n\t"
+    "lh            %[r1],      2(%[in])                \n\t"
+    "lh            %[r2],      4(%[in])                \n\t"
+    "lh            %[r3],      6(%[in])                \n\t"
+    "shllv_s.ph    %[r0],      %[r0],       %[shift]   \n\t"
+    "shllv_s.ph    %[r1],      %[r1],       %[shift]   \n\t"
+    "shllv_s.ph    %[r2],      %[r2],       %[shift]   \n\t"
+    "shllv_s.ph    %[r3],      %[r3],       %[shift]   \n\t"
+    "addiu         %[in],      %[in],       8          \n\t"
+    "addiu         %[t0],      %[t0],       -1         \n\t"
+    "sh            %[r0],      0(%[out])               \n\t"
+    "sh            %[r1],      2(%[out])               \n\t"
+    "sh            %[r2],      4(%[out])               \n\t"
+    "sh            %[r3],      6(%[out])               \n\t"
+    "bgtz          %[t0],      1b                      \n\t"
+    " addiu        %[out],     %[out],      8          \n\t"
+   "2:                                                 \n\t"
+    "beqz          %[len],     8f                      \n\t"
+    " nop                                              \n\t"
+   "3:                                                 \n\t"
+    "lh            %[r0],      0(%[in])                \n\t"
+    "addiu         %[in],      %[in],       2          \n\t"
+    "addiu         %[len],     %[len],      -1         \n\t"
+    "shllv_s.ph    %[r0],      %[r0],       %[shift]   \n\t"
+    "addiu         %[out],     %[out],      2          \n\t"
+    "bgtz          %[len],     3b                      \n\t"
+    " sh           %[r0],      -2(%[out])              \n\t"
+    "b             8f                                  \n\t"
+   "4:                                                 \n\t"
+    "negu          %[shift],   %[shift]                \n\t"
+    "beqz          %[t0],      6f                      \n\t"
+    " andi         %[len],     %[len],      3          \n\t"
+   "5:                                                 \n\t"
+    "lh            %[r0],      0(%[in])                \n\t"
+    "lh            %[r1],      2(%[in])                \n\t"
+    "lh            %[r2],      4(%[in])                \n\t"
+    "lh            %[r3],      6(%[in])                \n\t"
+    "srav          %[r0],      %[r0],       %[shift]   \n\t"
+    "srav          %[r1],      %[r1],       %[shift]   \n\t"
+    "srav          %[r2],      %[r2],       %[shift]   \n\t"
+    "srav          %[r3],      %[r3],       %[shift]   \n\t"
+    "addiu         %[in],      %[in],       8          \n\t"
+    "addiu         %[t0],      %[t0],       -1         \n\t"
+    "sh            %[r0],      0(%[out])               \n\t"
+    "sh            %[r1],      2(%[out])               \n\t"
+    "sh            %[r2],      4(%[out])               \n\t"
+    "sh            %[r3],      6(%[out])               \n\t"
+    "bgtz          %[t0],      5b                      \n\t"
+    " addiu        %[out],     %[out],      8          \n\t"
+   "6:                                                 \n\t"
+    "beqz          %[len],     8f                      \n\t"
+    " nop                                              \n\t"
+   "7:                                                 \n\t"
+    "lh            %[r0],      0(%[in])                \n\t"
+    "addiu         %[in],      %[in],       2          \n\t"
+    "addiu         %[len],     %[len],      -1         \n\t"
+    "srav          %[r0],      %[r0],       %[shift]   \n\t"
+    "addiu         %[out],     %[out],      2          \n\t"
+    "bgtz          %[len],     7b                      \n\t"
+    " sh           %[r0],      -2(%[out])              \n\t"
+   "8:                                                 \n\t"
+    ".set          pop                                 \n\t"
+    : [t0] "=&r" (t0), [r0] "=&r" (r0), [r1] "=&r" (r1),
+      [r2] "=&r" (r2), [r3] "=&r" (r3)
+    : [len] "r" (len), [shift] "r" (shift), [in] "r" (in),
+      [out] "r" (out)
+    : "memory"
+  );
+}
+#endif
+
+// Normalize the real-valued signal |in|, the input to forward FFT.
+void WebRtcNsx_NormalizeRealBuffer_mips(NsxInst_t* inst,
+                                        const int16_t* in,
+                                        int16_t* out) {
+  int32_t r0, r1, r2, r3, t0;
+  int len = inst->anaLen;
+  int shift = inst->normData;
+
+  __asm __volatile (
+    ".set          push                                \n\t"
+    ".set          noreorder                           \n\t"
+    "beqz          %[len],     4f                      \n\t"
+    " sra          %[t0],      %[len],      2          \n\t"
+    "beqz          %[t0],      2f                      \n\t"
+    " andi         %[len],     %[len],      3          \n\t"
+   "1:                                                 \n\t"
+    "lh            %[r0],      0(%[in])                \n\t"
+    "lh            %[r1],      2(%[in])                \n\t"
+    "lh            %[r2],      4(%[in])                \n\t"
+    "lh            %[r3],      6(%[in])                \n\t"
+    "sllv          %[r0],      %[r0],       %[shift]   \n\t"
+    "sllv          %[r1],      %[r1],       %[shift]   \n\t"
+    "sllv          %[r2],      %[r2],       %[shift]   \n\t"
+    "sllv          %[r3],      %[r3],       %[shift]   \n\t"
+    "addiu         %[in],      %[in],       8          \n\t"
+    "addiu         %[t0],      %[t0],       -1         \n\t"
+    "sh            %[r0],      0(%[out])               \n\t"
+    "sh            %[r1],      2(%[out])               \n\t"
+    "sh            %[r2],      4(%[out])               \n\t"
+    "sh            %[r3],      6(%[out])               \n\t"
+    "bgtz          %[t0],      1b                      \n\t"
+    " addiu        %[out],     %[out],      8          \n\t"
+   "2:                                                 \n\t"
+    "beqz          %[len],     4f                      \n\t"
+    " nop                                              \n\t"
+   "3:                                                 \n\t"
+    "lh            %[r0],      0(%[in])                \n\t"
+    "addiu         %[in],      %[in],       2          \n\t"
+    "addiu         %[len],     %[len],      -1         \n\t"
+    "sllv          %[r0],      %[r0],       %[shift]   \n\t"
+    "addiu         %[out],     %[out],      2          \n\t"
+    "bgtz          %[len],     3b                      \n\t"
+    " sh           %[r0],      -2(%[out])              \n\t"
+   "4:                                                 \n\t"
+    ".set          pop                                 \n\t"
+    : [t0] "=&r" (t0), [r0] "=&r" (r0), [r1] "=&r" (r1),
+      [r2] "=&r" (r2), [r3] "=&r" (r3)
+    : [len] "r" (len), [shift] "r" (shift), [in] "r" (in),
+      [out] "r" (out)
+    : "memory"
+  );
+}
+