Optimized WebRtcIsacfix_NormLatticeFilterMa() function for iSAC fix for ARM Neon
architecture with intrinsics and assembly code. The total iSAC codec speech improved
about 3~5%.
Notes
(1) The Neon version after this optimization is not bit-exact with the generic
C version. The out quality, however, is not worse as verified by test vectors ouput,
and undertandably in theory (32bit x 32bit in Neon is more accurate than the approximation
C code in the generic version).
(2) In Android, a isac neon library will be built. Along with some new function structures,
it is partly for preparation of introducing a run time detection of Neon architecture soon.
Review URL: http://webrtc-codereview.appspot.com/268016
git-svn-id: http://webrtc.googlecode.com/svn/trunk@1192 4adac7df-926f-26a2-2b94-8c16560cd09d
diff --git a/Android.mk b/Android.mk
index e744a89..2380e0b 100644
--- a/Android.mk
+++ b/Android.mk
@@ -103,6 +103,7 @@
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
+include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk
LOCAL_ARM_MODE := arm
LOCAL_MODULE := libwebrtc
@@ -137,6 +138,15 @@
libwebrtc_jpeg \
libwebrtc_vpx
+# Add Neon libraries.
+ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS)))
+LOCAL_WHOLE_STATIC_LIBRARIES += \
+ libwebrtc_isacfix_neon
+else ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_WHOLE_STATIC_LIBRARIES += \
+ libwebrtc_isacfix_neon
+endif
+
LOCAL_SHARED_LIBRARIES := \
libcutils \
libdl \
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk b/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk
index 786695b..3bedfe9 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk
@@ -6,6 +6,9 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
+#############################
+# Build the non-neon library.
+
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
@@ -68,8 +71,41 @@
endif
include $(BUILD_STATIC_LIBRARY)
+#########################
+# Build the neon library.
+include $(CLEAR_VARS)
+
+LOCAL_ARM_MODE := arm
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+LOCAL_MODULE := libwebrtc_isacfix_neon
+LOCAL_MODULE_TAGS := optional
+LOCAL_SRC_FILES := \
+ filters_neon.c \
+ lattice_neon.S #.S extention is for including a header file in assembly.
+# TODO(kma): Check with C compiler team and on line community for any status
+# in the file name (.s vs .S), for a better solution.
+
+# Flags passed to both C and C++ files.
+LOCAL_CFLAGS := \
+ $(MY_WEBRTC_COMMON_DEFS) \
+ -mfpu=neon \
+ -flax-vector-conversions
+
+LOCAL_C_INCLUDES := \
+ $(LOCAL_PATH)/../interface \
+ $(LOCAL_PATH)/../../../../../.. \
+ $(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include
+
+
+ifndef NDK_ROOT
+include external/stlport/libstlport.mk
+endif
+include $(BUILD_STATIC_LIBRARY)
+
+###########################
# isac test app
+
include $(CLEAR_VARS)
LOCAL_MODULE_TAGS := tests
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h b/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h
index 279c2c5..4a8d281 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/codec.h
@@ -122,7 +122,6 @@
WebRtc_Word16 lo_hi,
WebRtc_Word16 *lat_outQ9);
-
void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
WebRtc_Word16 *stateGQ0,
WebRtc_Word32 *lat_inQ25,
@@ -131,10 +130,54 @@
WebRtc_Word16 lo_hi,
WebRtc_Word16 *lat_outQ0);
-int WebRtcIsacfix_AutocorrFix(WebRtc_Word32* __restrict r,
- const WebRtc_Word16* __restrict x,
- WebRtc_Word16 N,
- WebRtc_Word16 order,
- WebRtc_Word16* __restrict scale);
+int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r,
+ const WebRtc_Word16* __restrict x,
+ WebRtc_Word16 N,
+ WebRtc_Word16 order,
+ WebRtc_Word16* __restrict scale);
+
+void WebRtcIsacfix_FilterMaLoopC(int16_t input0,
+ int16_t input1,
+ int32_t input2,
+ int32_t* ptr0,
+ int32_t* ptr1,
+ int32_t* ptr2);
+
+// Functions for ARM-Neon platforms, in place of the above two generic C ones.
+#if (defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
+int WebRtcIsacfix_AutocorrNeon(WebRtc_Word32* __restrict r,
+ const WebRtc_Word16* __restrict x,
+ WebRtc_Word16 N,
+ WebRtc_Word16 order,
+ WebRtc_Word16* __restrict scale);
+
+void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
+ int16_t input1,
+ int32_t input2,
+ int32_t* ptr0,
+ int32_t* ptr1,
+ int32_t* ptr2);
+#endif
+
+/**** Function pointers associated with
+ **** WebRtcIsacfix_AutocorrC() / WebRtcIsacfix_AutocorrNeon()
+ **** and WebRtcIsacfix_FilterMaLoopC() / WebRtcIsacfix_FilterMaLoopNeon().
+ ****/
+
+typedef int (*AutocorrFix)(WebRtc_Word32* __restrict__ r,
+ const WebRtc_Word16* __restrict__ x,
+ WebRtc_Word16 N,
+ WebRtc_Word16 order,
+ WebRtc_Word16* __restrict__ scale);
+extern AutocorrFix WebRtcIsacfix_AutocorrFix;
+
+typedef void (*FilterMaLoopFix)(int16_t input0,
+ int16_t input1,
+ int32_t input2,
+ int32_t* ptr0,
+ int32_t* ptr1,
+ int32_t* ptr2);
+extern FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix;
+
#endif /* WEBRTC_MODULES_AUDIO_CODING_CODECS_ISAC_FIX_SOURCE_CODEC_H_ */
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c b/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
index 940bb56..6ee0477 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
@@ -11,7 +11,7 @@
/*
* filters.c
*
- * This file contains function WebRtcIsacfix_AutocorrFix,
+ * This file contains function WebRtcIsacfix_AutocorrC,
* AllpassFilterForDec32, and WebRtcIsacfix_DecimateAllpass32
*
*/
@@ -22,16 +22,13 @@
#include "lpc_masking_model.h"
#include "codec.h"
-#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
// Autocorrelation function in fixed point.
// NOTE! Different from SPLIB-version in how it scales the signal.
-int WebRtcIsacfix_AutocorrFix(
- WebRtc_Word32* __restrict r,
- const WebRtc_Word16* __restrict x,
- WebRtc_Word16 N,
- WebRtc_Word16 order,
- WebRtc_Word16* __restrict scale) {
-
+int WebRtcIsacfix_AutocorrC(WebRtc_Word32* __restrict r,
+ const WebRtc_Word16* __restrict x,
+ WebRtc_Word16 N,
+ WebRtc_Word16 order,
+ WebRtc_Word16* __restrict scale) {
int i = 0;
int j = 0;
int16_t scaling = 0;
@@ -67,7 +64,6 @@
return(order + 1);
}
-#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
static const WebRtc_Word32 kApUpperQ15[ALLPASSSECTIONS] = { 1137, 12537 };
static const WebRtc_Word32 kApLowerQ15[ALLPASSSECTIONS] = { 5059, 24379 };
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c b/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c
index 0b44886..8270359 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c
@@ -11,7 +11,7 @@
/*
* filters_neon.c
*
- * This file contains function WebRtcIsacfix_AutocorrFix, optimized for
+ * This file contains function WebRtcIsacfix_AutocorrNeon, optimized for
* ARM Neon platform.
*
*/
@@ -23,7 +23,7 @@
// Autocorrelation function in fixed point.
// NOTE! Different from SPLIB-version in how it scales the signal.
-int WebRtcIsacfix_AutocorrFix(
+int WebRtcIsacfix_AutocorrNeon(
WebRtc_Word32* __restrict r,
const WebRtc_Word16* __restrict x,
WebRtc_Word16 N,
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c b/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c
index a8c55e1..3a37785 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/isacfix.c
@@ -246,11 +246,18 @@
WebRtcIsacfix_InitPostFilterbank(&ISAC_inst->ISACenc_obj.interpolatorstr_obj);
#endif
+ // Initiaze function pointers.
+ WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrC;
+ WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopC;
+
+#ifdef WEBRTC_ARCH_ARM_NEON
+ WebRtcIsacfix_AutocorrFix = WebRtcIsacfix_AutocorrNeon;
+ WebRtcIsacfix_FilterMaLoopFix = WebRtcIsacfix_FilterMaLoopNeon;
+#endif
return statusInit;
}
-
/****************************************************************************
* WebRtcIsacfix_Encode(...)
*
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c
index e2db729..0f80d58 100644
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice.c
@@ -18,6 +18,64 @@
#include "codec.h"
#include "settings.h"
+#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32) \
+ ((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32))))
+/* This macro is FORBIDDEN to use elsewhere than in a function in this file and
+ its corresponding neon version. It might give unpredictable results, since a
+ general WebRtc_Word32*WebRtc_Word32 multiplication results in a 64 bit value.
+ The result is then shifted just 16 steps to the right, giving need for 48
+ bits, i.e. in the generel case, it will NOT fit in a WebRtc_Word32. In the
+ cases used in here, the WebRtc_Word32 will be enough, since (for a good
+ reason) the involved multiplicands aren't big enough to overflow a
+ WebRtc_Word32 after shifting right 16 bits. I have compared the result of a
+ multiplication between t32 and tmp32, done in two ways:
+ 1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0);
+ 2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
+ By running 25 files, I haven't found any bigger diff than 64 - this was in the
+ case when method 1) gave 650235648 and 2) gave 650235712.
+*/
+
+/* Inner loop used for function WebRtcIsacfix_NormLatticeFilterMa().
+ It does:
+ for 0 <= n < HALF_SUBFRAMELEN - 1:
+ *ptr2 = input2 * (*ptr2) + input0 * (*ptr0));
+ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
+*/
+void WebRtcIsacfix_FilterMaLoopC(int16_t input0, // Filter coefficient
+ int16_t input1, // Filter coefficient
+ int32_t input2, // Inverse coeff. (1/input1)
+ int32_t* ptr0, // Sample buffer
+ int32_t* ptr1, // Sample buffer
+ int32_t* ptr2) { // Sample buffer
+ int n = 0;
+
+ // Separate the 32-bit variable input2 into two 16-bit integers (high 16 and
+ // low 16 bits), for using LATTICE_MUL_32_32_RSFT16 in the loop.
+ int16_t t16a = (int16_t)(input2 >> 16);
+ int16_t t16b = (int16_t)input2;
+ if (t16b < 0) t16a++;
+
+ // The loop filtering the samples *ptr0, *ptr1, *ptr2 with filter coefficients
+ // input0, input1, and input2.
+ for(n = 0; n < HALF_SUBFRAMELEN - 1; n++, ptr0++, ptr1++, ptr2++) {
+ int32_t tmp32a = 0;
+ int32_t tmp32b = 0;
+
+ // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
+ tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); // Q15 * Q15 >> 15 = Q15
+ tmp32b = *ptr2 + tmp32a; // Q15 + Q15 = Q15
+ *ptr2 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
+
+ // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
+ tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); // Q15*Q15>>15 = Q15
+ tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); // Q15*Q15>>15 = Q15
+ *ptr1 = tmp32a + tmp32b; // Q15 + Q15 = Q15
+ }
+}
+
+// Declare a function pointer.
+FilterMaLoopFix WebRtcIsacfix_FilterMaLoopFix;
+
/* filter the signal using normalized lattice filter */
/* MA filter */
void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
@@ -47,30 +105,6 @@
WebRtc_Word16 t16a;
WebRtc_Word16 t16b;
-#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32) \
- ((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32))))
- /* This macro is FORBIDDEN to use elsewhere than in two places in this file
- since it might give unpredictable results, since a general WebRtc_Word32*WebRtc_Word32
- multiplication results in a 64 bit value. The result is then shifted just
- 16 steps to the right, giving need for 48 bits, i.e. in the generel case,
- it will NOT fit in a WebRtc_Word32. In the cases used in here, the WebRtc_Word32 will be
- enough, since (FOR SOME REASON!!!) the involved multiplicands aren't big
- enough to overflow a WebRtc_Word32 after shifting right 16 bits. I have compared
- the result of a multiplication between t32 and tmp32, done in two ways:
-
- 1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0);
-
- 2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
-
- By running 25 files, I haven't found any bigger diff than 64 - this was in the
- case when method 1) gave 650235648 and 2) gave 650235712.
-
- It might be good to investigate this further, in order to PROVE why it seems to
- work without any problems. This might be done, by using the properties of
- all reflection coefficients etc.
-
- */
-
for (u=0;u<SUBFRAMES;u++)
{
/* set the Direct Form coefficients */
@@ -133,24 +167,11 @@
/* save the states */
for(k=0;k<orderCoef;k++)
{
- for(n=0;n<HALF_SUBFRAMELEN-1;n++)
- {
- // Calculate f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]);
- tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], gQ15[k][n]);//Q15*Q15>>15 = Q15
- tmp32b= fQ15vec[n+1] + tmp32; //Q15+Q15=Q15
- tmp32 = inv_cthQ16[k]; //Q16
- t16a = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32, 16);
- t16b = (WebRtc_Word16) (tmp32-WEBRTC_SPL_LSHIFT_W32(((WebRtc_Word32)t16a), 16));
- if (t16b<0) t16a++;
- tmp32 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
- fQ15vec[n+1] = tmp32; // Q15
-
- // Calculate g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1];
- tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], gQ15[k][n]); //Q15*Q15>>15 = Q15
- tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], fQ15vec[n+1]); //Q15*Q15>>15 = Q15
- tmp32 = tmp32 + tmp32b;//Q15+Q15 = Q15
- gQ15[k+1][n+1] = tmp32; // Q15
- }
+ // for 0 <= n < HALF_SUBFRAMELEN - 1:
+ // f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]);
+ // g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1];
+ WebRtcIsacfix_FilterMaLoopFix(sthQ15[k], cthQ15[k], inv_cthQ16[k],
+ &gQ15[k][0], &gQ15[k+1][1], &fQ15vec[1]);
}
fQ15vec[0] = fQtmp;
diff --git a/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S
new file mode 100644
index 0000000..a59b6e3
--- /dev/null
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/lattice_neon.S
@@ -0,0 +1,155 @@
+@
+@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ lattice_neon.s
+@
+@ Contains a function for the core loop in the normalized lattice MA
+@ filter routine for iSAC codec, optimized for ARM Neon platform.
+@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
+@ int16_t input1,
+@ int32_t input2,
+@ int32_t* ptr0,
+@ int32_t* ptr1,
+@ int32_t* __restrict ptr2);
+@ It calculates
+@ *ptr2 = input2 * (*ptr2) + input0 * (*ptr0));
+@ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
+@ in Q15 domain.
+@
+@ Reference code in lattice.c.
+@ Output is not bit-exact with the reference C code, due to the replacement
+@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
+@ instructions, smulwb, and smull. Speech quality was not degraded by
+@ testing speech and tone vectors.
+
+.arch armv7-a
+.fpu neon
+
+#include "settings.h"
+
+.global WebRtcIsacfix_FilterMaLoopNeon
+
+.align 2
+
+WebRtcIsacfix_FilterMaLoopNeon:
+.fnstart
+
+.save {r4-r8}
+ push {r4-r8}
+
+ vdup.32 d28, r0 @ Initialize Neon register with input0
+ vdup.32 d29, r1 @ Initialize Neon register with input1
+ vdup.32 d30, r2 @ Initialize Neon register with input2
+ ldr r4, [sp, #20] @ ptr1
+ ldr r12, [sp, #24] @ ptr2
+
+ @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2
+ @ Leftover samples after the loop, in r6:
+ @ r6 = (HALF_SUBFRAMELEN - 1) - (HALF_SUBFRAMELEN - 1) >> 2 << 2
+ mov r6, #HALF_SUBFRAMELEN
+ sub r6, #1
+ lsr r5, r6, #2
+ sub r6, r5, lsl #2
+
+ @ First r5 iterations in a loop.
+
+LOOP:
+ vld1.32 {d0, d1}, [r3]! @ *ptr0
+
+ vmull.s32 q10, d0, d28 @ tmp32a = input0 * (*ptr0)
+ vmull.s32 q11, d1, d28 @ tmp32a = input0 * (*ptr0)
+ vmull.s32 q12, d0, d29 @ input1 * (*ptr0)
+ vmull.s32 q13, d1, d29 @ input1 * (*ptr0)
+
+ vrshrn.i64 d4, q10, #15
+ vrshrn.i64 d5, q11, #15
+
+ vld1.32 {d2, d3}, [r12] @ *ptr2
+ vadd.i32 q3, q2, q1 @ tmp32b = *ptr2 + tmp32a
+
+ vrshrn.i64 d0, q12, #15
+
+ vmull.s32 q10, d6, d30 @ input2 * (*ptr2 + tmp32b)
+ vmull.s32 q11, d7, d30 @ input2 * (*ptr2 + tmp32b)
+
+ vrshrn.i64 d16, q10, #16
+ vrshrn.i64 d17, q11, #16
+
+ vmull.s32 q10, d16, d28 @ input0 * (*ptr2)
+ vmull.s32 q11, d17, d28 @ input0 * (*ptr2)
+
+ vrshrn.i64 d1, q13, #15
+ vrshrn.i64 d18, q10, #15
+ vrshrn.i64 d19, q11, #15
+
+ vst1.32 {d16, d17}, [r12]! @ *ptr2
+
+ vadd.i32 q9, q0, q9
+ subs r5, #1
+ vst1.32 {d18, d19}, [r4]! @ *ptr1
+
+ bgt LOOP
+
+ @ Check how many samples still need to be processed.
+ subs r6, #2
+ blt LAST_SAMPLE
+
+ @ Process two more samples:
+ vld1.32 d0, [r3]! @ *ptr0
+
+ vmull.s32 q11, d0, d28 @ tmp32a = input0 * (*ptr0)
+ vmull.s32 q13, d0, d29 @ input1 * (*ptr0)
+
+ vld1.32 d18, [r12] @ *ptr2
+ vrshrn.i64 d4, q11, #15
+
+ vadd.i32 d7, d4, d18 @ tmp32b = *ptr2 + tmp32a
+ vmull.s32 q11, d7, d30 @ input2 * (*ptr2 + tmp32b)
+ vrshrn.i64 d16, q11, #16
+
+ vmull.s32 q11, d16, d28 @ input0 * (*ptr2)
+ vst1.32 d16, [r12]! @ *ptr2
+
+ vrshrn.i64 d0, q13, #15
+ vrshrn.i64 d19, q11, #15
+ vadd.i32 d19, d0, d19
+
+ vst1.32 d19, [r4]! @ *ptr1
+
+ @ If there's still one more sample, process it here.
+LAST_SAMPLE:
+ cmp r6, #1
+ bne END
+
+ @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
+
+ ldr r7, [r3] @ *ptr0
+ ldr r8, [r12] @ *ptr2
+
+ smulwb r5, r7, r0 @ tmp32a = *ptr0 * input0 >> 16
+ add r8, r8, r5, lsl #1 @ tmp32b = *ptr2 + (tmp32a << 1)
+ smull r5, r6, r8, r2 @ tmp32b * input2, in 64 bits
+ lsl r6, #16
+ add r6, r5, lsr #16 @ Only take the middle 32 bits
+ str r6, [r12] @ Output (*ptr2, as 32 bits)
+
+ @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
+
+ smulwb r5, r7, r1 @ tmp32a = *ptr0 * input1 >> 16
+ smulwb r6, r6, r0 @ tmp32b = *ptr2 * input0 >> 16
+ lsl r5, r5, #1
+ add r5, r6, lsl #1
+ str r5, [r4] @ Output (*ptr1)
+
+END:
+ pop {r4-r8}
+ bx lr
+
+.fnend