Optimized WebRtcIsacfix_NormLatticeFilterMa() function for iSAC fix for ARM Neon
architecture with intrinsics and assembly code. The total iSAC codec speech improved
about 3~5%.

Notes
(1) The Neon version after this optimization is not bit-exact with the generic
C version. The out quality, however, is not worse as verified by test vectors ouput,
and undertandably in theory (32bit x 32bit in Neon is more accurate than the approximation
C code in the generic version).
(2) In Android, a isac neon library will be built. Along with some new function structures,
it is partly for preparation of introducing a run time detection of Neon architecture soon.
Review URL: http://webrtc-codereview.appspot.com/268016

git-svn-id: http://webrtc.googlecode.com/svn/trunk@1192 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/Android.mk b/Android.mk
index e744a89..2380e0b 100644
--- a/Android.mk
+++ b/Android.mk
@@ -103,6 +103,7 @@
 LOCAL_PATH := $(call my-dir)
 
 include $(CLEAR_VARS)
+include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk
 
 LOCAL_ARM_MODE := arm
 LOCAL_MODULE := libwebrtc
@@ -137,6 +138,15 @@
     libwebrtc_jpeg \
     libwebrtc_vpx
 
+# Add Neon libraries.
+ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS)))
+LOCAL_WHOLE_STATIC_LIBRARIES += \
+    libwebrtc_isacfix_neon
+else ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_WHOLE_STATIC_LIBRARIES += \
+    libwebrtc_isacfix_neon
+endif
+
 LOCAL_SHARED_LIBRARIES := \
     libcutils \
     libdl \
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk b/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk
index 786695b..3bedfe9 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk
@@ -6,6 +6,9 @@
 # in the file PATENTS.  All contributing project authors may
 # be found in the AUTHORS file in the root of the source tree.
 
+#############################
+# Build the non-neon library.
+
 LOCAL_PATH := $(call my-dir)
 
 include $(CLEAR_VARS)
@@ -68,8 +71,41 @@
 endif
 include $(BUILD_STATIC_LIBRARY)
 
+#########################
+# Build the neon library.
 
+include $(CLEAR_VARS)
+
+LOCAL_ARM_MODE := arm
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+LOCAL_MODULE := libwebrtc_isacfix_neon
+LOCAL_MODULE_TAGS := optional
+LOCAL_SRC_FILES := \
+    filters_neon.c \
+    lattice_neon.S #.S extention is for including a header file in assembly.
+# TODO(kma): Check with C compiler team and on line community for any status
+# in the file name (.s vs .S), for a better solution.
+
+# Flags passed to both C and C++ files.
+LOCAL_CFLAGS := \
+    $(MY_WEBRTC_COMMON_DEFS) \
+    -mfpu=neon \
+    -flax-vector-conversions
+
+LOCAL_C_INCLUDES := \
+    $(LOCAL_PATH)/../interface \
+    $(LOCAL_PATH)/../../../../../.. \
+    $(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include 
+
+
+ifndef NDK_ROOT
+include external/stlport/libstlport.mk
+endif
+include $(BUILD_STATIC_LIBRARY)
+
+###########################
 # isac test app
+
 include $(CLEAR_VARS)
 
 LOCAL_MODULE_TAGS := tests
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h b/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h
index 279c2c5..4a8d281 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h
@@ -122,7 +122,6 @@
                                        WebRtc_Word16 lo_hi,
                                        WebRtc_Word16 *lat_outQ9);
 
-
 void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
                                        WebRtc_Word16 *stateGQ0,
                                        WebRtc_Word32 *lat_inQ25,
@@ -131,10 +130,54 @@
                                        WebRtc_Word16 lo_hi,
                                        WebRtc_Word16 *lat_outQ0);
 
-int WebRtcIsacfix_AutocorrFix(WebRtc_Word32* __restrict r,
-                              const WebRtc_Word16*  __restrict x,
-                              WebRtc_Word16 N,
-                              WebRtc_Word16 order,
-                              WebRtc_Word16* __restrict scale);
+int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r,
+                            const WebRtc_Word16* __restrict x,
+                            WebRtc_Word16 N,
+                            WebRtc_Word16 order,
+                            WebRtc_Word16* __restrict scale);
+
+void WebRtcIsacfix_FilterMaLoopC(int16_t input0,
+                                 int16_t input1,
+                                 int32_t input2,
+                                 int32_t* ptr0,
+                                 int32_t* ptr1,
+                                 int32_t* ptr2);
+
+// Functions for ARM-Neon platforms, in place of the above two generic C ones.
+#if (defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
+int WebRtcIsacfix_AutocorrNeon(WebRtc_Word32* __restrict r,
+                               const WebRtc_Word16* __restrict x,
+                               WebRtc_Word16 N,
+                               WebRtc_Word16 order,
+                               WebRtc_Word16* __restrict scale);
+
+void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
+                                    int16_t input1,
+                                    int32_t input2,
+                                    int32_t* ptr0,
+                                    int32_t* ptr1,
+                                    int32_t* ptr2);
+#endif
+
+/**** Function pointers associated with 
+ **** WebRtcIsacfix_AutocorrC() / WebRtcIsacfix_AutocorrNeon()
+ **** and WebRtcIsacfix_FilterMaLoopC() / WebRtcIsacfix_FilterMaLoopNeon().
+ ****/
+
+typedef int (*AutocorrFix)(WebRtc_Word32* __restrict__ r,
+                           const WebRtc_Word16* __restrict__ x,
+                           WebRtc_Word16 N,
+                           WebRtc_Word16 order,
+                           WebRtc_Word16* __restrict__ scale);
+extern AutocorrFix WebRtcIsacfix_AutocorrFix;
+
+typedef void (*FilterMaLoopFix)(int16_t input0,
+                                int16_t input1,
+                                int32_t input2,
+                                int32_t* ptr0,
+                                int32_t* ptr1,
+                                int32_t* ptr2);
+extern FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix;
+
 
 #endif /* WEBRTC_MODULES_AUDIO_CODING_CODECS_ISAC_FIX_SOURCE_CODEC_H_ */
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c b/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
index 940bb56..6ee0477 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
@@ -11,7 +11,7 @@
 /*
  * filters.c
  *
- * This file contains function WebRtcIsacfix_AutocorrFix,
+ * This file contains function WebRtcIsacfix_AutocorrC,
  * AllpassFilterForDec32, and WebRtcIsacfix_DecimateAllpass32
  *
  */
@@ -22,16 +22,13 @@
 #include "lpc_masking_model.h"
 #include "codec.h"
 
-#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
 // Autocorrelation function in fixed point.
 // NOTE! Different from SPLIB-version in how it scales the signal.
-int WebRtcIsacfix_AutocorrFix(
-    WebRtc_Word32* __restrict r,
-    const WebRtc_Word16* __restrict x,
-    WebRtc_Word16 N,
-    WebRtc_Word16 order,
-    WebRtc_Word16* __restrict scale) {
-
+int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r,
+                            const WebRtc_Word16* __restrict x,
+                            WebRtc_Word16 N,
+                            WebRtc_Word16 order,
+                            WebRtc_Word16* __restrict scale) {
   int i = 0;
   int j = 0;
   int16_t scaling = 0;
@@ -67,7 +64,6 @@
 
   return(order + 1);
 }
-#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
 
 static const WebRtc_Word32 kApUpperQ15[ALLPASSSECTIONS] = { 1137, 12537 };
 static const WebRtc_Word32 kApLowerQ15[ALLPASSSECTIONS] = { 5059, 24379 };
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c b/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c
index 0b44886..8270359 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c
@@ -11,7 +11,7 @@
 /*
  * filters_neon.c
  *
- * This file contains function WebRtcIsacfix_AutocorrFix, optimized for
+ * This file contains function WebRtcIsacfix_AutocorrNeon, optimized for
  * ARM Neon platform.
  *
  */
@@ -23,7 +23,7 @@
 
 // Autocorrelation function in fixed point.
 // NOTE! Different from SPLIB-version in how it scales the signal.
-int WebRtcIsacfix_AutocorrFix(
+int WebRtcIsacfix_AutocorrNeon(
     WebRtc_Word32* __restrict r,
     const WebRtc_Word16* __restrict x,
     WebRtc_Word16 N,
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c b/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c
index a8c55e1..3a37785 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c
@@ -246,11 +246,18 @@
   WebRtcIsacfix_InitPostFilterbank(&ISAC_inst->ISACenc_obj.interpolatorstr_obj);
 #endif
 
+  // Initiaze function pointers.
+  WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrC;
+  WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopC;
+
+#ifdef WEBRTC_ARCH_ARM_NEON
+  WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrNeon;
+  WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon;
+#endif
 
   return statusInit;
 }
 
-
 /****************************************************************************
  * WebRtcIsacfix_Encode(...)
  *
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c
index e2db729..0f80d58 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c
@@ -18,6 +18,64 @@
 #include "codec.h"
 #include "settings.h"
 
+#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32)                  \
+  ((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32))))
+/* This macro is FORBIDDEN to use elsewhere than in a function in this file and
+   its corresponding neon version. It might give unpredictable results, since a
+   general WebRtc_Word32*WebRtc_Word32 multiplication results in a 64 bit value.
+   The result is then shifted just 16 steps to the right, giving need for 48
+   bits, i.e. in the generel case, it will NOT fit in a WebRtc_Word32. In the
+   cases used in here, the WebRtc_Word32 will be enough, since (for a good
+   reason) the involved multiplicands aren't big enough to overflow a
+   WebRtc_Word32 after shifting right 16 bits. I have compared the result of a
+   multiplication between t32 and tmp32, done in two ways:
+   1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0);
+   2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
+   By running 25 files, I haven't found any bigger diff than 64 - this was in the
+   case when  method 1) gave 650235648 and 2) gave 650235712.
+*/
+
+/* Inner loop used for function WebRtcIsacfix_NormLatticeFilterMa().
+   It does:
+   for 0 <= n < HALF_SUBFRAMELEN - 1:
+     *ptr2 = input2 * (*ptr2) + input0 * (*ptr0));
+     *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
+*/
+void WebRtcIsacfix_FilterMaLoopC(int16_t input0,  // Filter coefficient
+                                 int16_t input1,  // Filter coefficient
+                                 int32_t input2,  // Inverse coeff. (1/input1)
+                                 int32_t* ptr0,   // Sample buffer
+                                 int32_t* ptr1,   // Sample buffer
+                                 int32_t* ptr2) { // Sample buffer
+  int n = 0;
+
+  // Separate the 32-bit variable input2 into two 16-bit integers (high 16 and
+  // low 16 bits), for using LATTICE_MUL_32_32_RSFT16 in the loop.
+  int16_t t16a = (int16_t)(input2 >> 16);
+  int16_t t16b = (int16_t)input2;
+  if (t16b < 0) t16a++;
+
+  // The loop filtering the samples *ptr0, *ptr1, *ptr2 with filter coefficients
+  // input0, input1, and input2.
+  for(n = 0; n < HALF_SUBFRAMELEN - 1; n++, ptr0++, ptr1++, ptr2++) {
+    int32_t tmp32a = 0;
+    int32_t tmp32b = 0;
+
+    // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
+    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); // Q15 * Q15 >> 15 = Q15
+    tmp32b = *ptr2 + tmp32a; // Q15 + Q15 = Q15
+    *ptr2 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
+
+    // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
+    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); // Q15*Q15>>15 = Q15
+    tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); // Q15*Q15>>15 = Q15
+    *ptr1 = tmp32a + tmp32b; // Q15 + Q15 = Q15
+  }
+}
+
+// Declare a function pointer.
+FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix;
+
 /* filter the signal using normalized lattice filter */
 /* MA filter */
 void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
@@ -47,30 +105,6 @@
   WebRtc_Word16 t16a;
   WebRtc_Word16 t16b;
 
-#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32)                  \
-  ((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32))))
-  /* This macro is FORBIDDEN to use elsewhere than in two places in this file
-     since it might give unpredictable results, since a general WebRtc_Word32*WebRtc_Word32
-     multiplication results in a 64 bit value. The result is then shifted just
-     16 steps to the right, giving need for 48 bits, i.e. in the generel case,
-     it will NOT fit in a WebRtc_Word32. In the cases used in here, the WebRtc_Word32 will be
-     enough, since (FOR SOME REASON!!!) the involved multiplicands aren't big
-     enough to overflow a WebRtc_Word32 after shifting right 16 bits. I have compared
-     the result of a multiplication between t32 and tmp32, done in two ways:
-
-     1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0);
-
-     2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
-
-     By running 25 files, I haven't found any bigger diff than 64 - this was in the
-     case when  method 1) gave 650235648 and 2) gave 650235712.
-
-     It might be good to investigate this further, in order to PROVE why it seems to
-     work without any problems. This might be done, by using the properties of
-     all reflection coefficients etc.
-
-  */
-
   for (u=0;u<SUBFRAMES;u++)
   {
     /* set the Direct Form coefficients */
@@ -133,24 +167,11 @@
     /* save the states */
     for(k=0;k<orderCoef;k++)
     {
-      for(n=0;n<HALF_SUBFRAMELEN-1;n++)
-      {
-        // Calculate f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]);
-        tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], gQ15[k][n]);//Q15*Q15>>15 = Q15
-        tmp32b= fQ15vec[n+1] + tmp32; //Q15+Q15=Q15
-        tmp32 = inv_cthQ16[k]; //Q16
-        t16a = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32, 16);
-        t16b = (WebRtc_Word16) (tmp32-WEBRTC_SPL_LSHIFT_W32(((WebRtc_Word32)t16a), 16));
-        if (t16b<0) t16a++;
-        tmp32 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
-        fQ15vec[n+1] = tmp32; // Q15
-
-        // Calculate g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1];
-        tmp32  = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], gQ15[k][n]); //Q15*Q15>>15 = Q15
-        tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], fQ15vec[n+1]); //Q15*Q15>>15 = Q15
-        tmp32  = tmp32 + tmp32b;//Q15+Q15 = Q15
-        gQ15[k+1][n+1] = tmp32; // Q15
-      }
+      // for 0 <= n < HALF_SUBFRAMELEN - 1:
+      //   f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]);
+      //   g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1];
+      WebRtcIsacfix_FilterMaLoopFix(sthQ15[k], cthQ15[k], inv_cthQ16[k],
+                                    &gQ15[k][0], &gQ15[k+1][1], &fQ15vec[1]);
     }
 
     fQ15vec[0] = fQtmp;
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S
new file mode 100644
index 0000000..a59b6e3
--- /dev/null
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S
@@ -0,0 +1,155 @@
+@
+@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ lattice_neon.s
+@
+@ Contains a function for the core loop in the normalized lattice MA 
+@ filter routine for iSAC codec, optimized for ARM Neon platform.
+@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
+@                                     int16_t input1,
+@                                     int32_t input2,
+@                                     int32_t* ptr0,
+@                                     int32_t* ptr1,
+@                                     int32_t* __restrict ptr2);
+@ It calculates
+@   *ptr2 = input2 * (*ptr2) + input0 * (*ptr0));
+@   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
+@ in Q15 domain.
+@
+@ Reference code in lattice.c.
+@ Output is not bit-exact with the reference C code, due to the replacement
+@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
+@ instructions, smulwb, and smull. Speech quality was not degraded by
+@ testing speech and tone vectors.
+
+.arch armv7-a
+.fpu neon
+
+#include "settings.h"
+
+.global WebRtcIsacfix_FilterMaLoopNeon
+
+.align  2
+
+WebRtcIsacfix_FilterMaLoopNeon:
+.fnstart
+
+.save {r4-r8}
+  push        {r4-r8}
+
+  vdup.32     d28, r0             @ Initialize Neon register with input0
+  vdup.32     d29, r1             @ Initialize Neon register with input1
+  vdup.32     d30, r2             @ Initialize Neon register with input2
+  ldr         r4, [sp, #20]       @ ptr1
+  ldr         r12, [sp, #24]      @ ptr2
+
+  @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2
+  @ Leftover samples after the loop, in r6:
+  @    r6 = (HALF_SUBFRAMELEN - 1) - (HALF_SUBFRAMELEN - 1) >> 2 << 2
+  mov         r6, #HALF_SUBFRAMELEN
+  sub         r6, #1
+  lsr         r5, r6, #2
+  sub         r6, r5, lsl #2
+
+  @ First r5 iterations in a loop.
+
+LOOP:
+  vld1.32     {d0, d1}, [r3]!     @ *ptr0
+
+  vmull.s32   q10, d0, d28        @ tmp32a = input0 * (*ptr0)
+  vmull.s32   q11, d1, d28        @ tmp32a = input0 * (*ptr0)
+  vmull.s32   q12, d0, d29        @ input1 * (*ptr0)
+  vmull.s32   q13, d1, d29        @ input1 * (*ptr0)
+                                  
+  vrshrn.i64  d4, q10, #15        
+  vrshrn.i64  d5, q11, #15        
+                                  
+  vld1.32     {d2, d3}, [r12]     @ *ptr2
+  vadd.i32    q3, q2, q1          @ tmp32b = *ptr2 + tmp32a
+                                  
+  vrshrn.i64  d0, q12, #15        
+                                  
+  vmull.s32   q10, d6, d30        @ input2 * (*ptr2 + tmp32b)
+  vmull.s32   q11, d7, d30        @ input2 * (*ptr2 + tmp32b)
+
+  vrshrn.i64  d16, q10, #16
+  vrshrn.i64  d17, q11, #16
+
+  vmull.s32   q10, d16, d28       @ input0 * (*ptr2)
+  vmull.s32   q11, d17, d28       @ input0 * (*ptr2)
+
+  vrshrn.i64  d1, q13, #15
+  vrshrn.i64  d18, q10, #15
+  vrshrn.i64  d19, q11, #15
+
+  vst1.32     {d16, d17}, [r12]!  @ *ptr2
+
+  vadd.i32    q9, q0, q9
+  subs        r5, #1
+  vst1.32     {d18, d19}, [r4]!   @ *ptr1
+
+  bgt         LOOP
+
+  @ Check how many samples still need to be processed.
+  subs        r6, #2
+  blt         LAST_SAMPLE
+
+  @ Process two more samples:
+  vld1.32     d0, [r3]!           @ *ptr0
+
+  vmull.s32   q11, d0, d28        @ tmp32a = input0 * (*ptr0)
+  vmull.s32   q13, d0, d29        @ input1 * (*ptr0)
+
+  vld1.32     d18, [r12]          @ *ptr2
+  vrshrn.i64  d4, q11, #15
+
+  vadd.i32    d7, d4, d18         @ tmp32b = *ptr2 + tmp32a
+  vmull.s32   q11, d7, d30        @ input2 * (*ptr2 + tmp32b)
+  vrshrn.i64  d16, q11, #16
+
+  vmull.s32   q11, d16, d28       @ input0 * (*ptr2)
+  vst1.32     d16, [r12]!         @ *ptr2
+
+  vrshrn.i64  d0, q13, #15
+  vrshrn.i64  d19, q11, #15
+  vadd.i32    d19, d0, d19
+
+  vst1.32     d19, [r4]!          @ *ptr1
+
+  @ If there's still one more sample, process it here.
+LAST_SAMPLE:
+  cmp         r6, #1
+  bne         END
+
+  @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
+  
+  ldr         r7, [r3]            @ *ptr0
+  ldr         r8, [r12]           @ *ptr2
+
+  smulwb      r5, r7, r0          @ tmp32a = *ptr0 * input0 >> 16
+  add         r8, r8, r5, lsl #1  @ tmp32b = *ptr2 + (tmp32a << 1)
+  smull       r5, r6, r8, r2      @ tmp32b * input2, in 64 bits
+  lsl         r6, #16
+  add         r6, r5, lsr #16     @ Only take the middle 32 bits
+  str         r6, [r12]           @ Output (*ptr2, as 32 bits)
+
+  @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
+
+  smulwb      r5, r7, r1          @ tmp32a = *ptr0 * input1 >> 16
+  smulwb      r6, r6, r0          @ tmp32b = *ptr2 * input0 >> 16
+  lsl         r5, r5, #1
+  add         r5, r6, lsl #1
+  str         r5, [r4]            @ Output (*ptr1)
+
+END:
+  pop         {r4-r8}
+  bx          lr
+
+.fnend