| @ |
| @ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. |
| @ |
| @ Use of this source code is governed by a BSD-style license |
| @ that can be found in the LICENSE file in the root of the source |
| @ tree. An additional intellectual property rights grant can be found |
| @ in the file PATENTS. All contributing project authors may |
| @ be found in the AUTHORS file in the root of the source tree. |
| @ |
| |
| @ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for |
| @ ARMv7 platform. The description header can be found in |
| @ signal_processing_library.h |
| @ |
| @ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and |
| @ the reference C code at end of this file. |
| |
| @ Assumptions: |
| @ (1) data_length > 0 |
| @ (2) coefficients_length > 1 |
| |
| @ Register usage: |
| @ |
| @ r0: &data_in[i] |
| @ r1: &data_out[i], for result output |
| @ r2: &coefficients[0] |
| @ r3: coefficients_length |
| @ r4: Iteration counter for the outer loop. |
| @ r5: data_out[j] as multiplication inputs |
| @ r6: Calculated value for output data_out[]; iteration counter for inner loop |
| @ r7: Partial sum of a filtering multiplication results |
| @ r8: Partial sum of a filtering multiplication results |
| @ r9: &data_out[], for filtering input; data_in[i] |
| @ r10: coefficients[j] |
| @ r11: Scratch |
| @ r12: &coefficients[j] |
| |
| #include "rtc_base/system/asm_defines.h" |
| |
| @ ---------------------------------------------------------------------------- |
| @ void WebRtcSpl_FilterARFastQ12(int16_t* data_in, |
| @                                int16_t* data_out, |
| @                                int16_t* coefficients, |
| @                                size_t coefficients_length, |
| @                                size_t data_length) |
| @ |
| @ In:    r0 = data_in, r1 = data_out, r2 = coefficients, |
| @        r3 = coefficients_length, first stack word = data_length. |
| @ Out:   data_out[0 .. data_length - 1] written; no return value. |
| @ Saved: r4-r11 are pushed on entry and popped at END; r12 is used freely. |
| @ |
| @ The main loop produces two output samples per pass (sum1 feeds |
| @ data_out[i], sum2 feeds data_out[i + 1]). When data_length is odd the last |
| @ sample is produced by the ODD_LENGTH tail. The code reads data_out[] at |
| @ indices below the current one, starting at i - coefficients_length + 1, so |
| @ the caller must provide valid history in front of data_out[0]. |
| @ 32-bit loads are issued on addresses that are only guaranteed to be |
| @ halfword aligned. |
| @ ---------------------------------------------------------------------------- |
| GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12 |
| .align 2 |
| DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12 |
|   push {r4-r11} @ 8 registers = 32 bytes; stack args now start at sp + 32 |
| |
|   @ data_length is a size_t in the C prototype, so load the whole word. |
|   @ A 16-bit signed load here would truncate lengths >= 32768. |
|   ldr r12, [sp, #32] @ data_length |
|   subs r4, r12, #1 @ Outer counter = data_length - 1 (tested as signed) |
|   beq ODD_LENGTH @ jump if data_length == 1 |
| |
| @ Each pass computes data_out[i] and data_out[i + 1]. |
| LOOP_LENGTH: |
|   add r12, r2, r3, lsl #1 |
|   sub r12, #4 @ &coefficients[coefficients_length - 2] |
|   sub r9, r1, r3, lsl #1 |
|   add r9, #2 @ &data_out[i - coefficients_length + 1] |
|   ldr r5, [r9], #4 @ data_out[i - coefficients_length + {1,2}] |
| |
|   mov r7, #0 @ sum1 |
|   mov r8, #0 @ sum2 |
|   subs r6, r3, #3 @ Iteration counter for inner loop. |
|   beq ODD_A_LENGTH @ branch if coefficients_length == 3 |
|   blt POST_LOOP_A_LENGTH @ branch if coefficients_length == 2 |
| |
| @ Two coefficients per pass: r10 = {coefficients[j - 1] (bottom), |
| @ coefficients[j] (top)}, r5 = {data_out[i - j] (bottom), |
| @ data_out[i - j + 1] (top)}. |
| LOOP_A_LENGTH: |
|   ldr r10, [r12], #-4 @ coefficients[j - 1], coefficients[j] |
|   subs r6, #2 @ Flags from here drive the bgt/blt below |
|   smlatt r8, r10, r5, r8 @ sum2 += coefficients[j] * data_out[i - j + 1]; |
|   smlatb r7, r10, r5, r7 @ sum1 += coefficients[j] * data_out[i - j]; |
|   smlabt r7, r10, r5, r7 @ sum1 += coefficients[j - 1] * data_out[i - j + 1]; |
|   ldr r5, [r9], #4 @ data_out[i - j + 2], data_out[i - j + 3] |
|   smlabb r8, r10, r5, r8 @ sum2 += coefficients[j - 1] * data_out[i - j + 2]; |
|   bgt LOOP_A_LENGTH |
|   blt POST_LOOP_A_LENGTH @ Even coefficients_length: no coefficients[2] term left |
| |
| @ Odd coefficients_length: one unpaired term, coefficients[2], remains. |
| ODD_A_LENGTH: |
|   ldrsh r10, [r12, #2] @ Filter coefficients coefficients[2] |
|   sub r12, #2 @ &coefficients[0] |
|   smlabb r7, r10, r5, r7 @ sum1 += coefficients[2] * data_out[i - 2]; |
|   smlabt r8, r10, r5, r8 @ sum2 += coefficients[2] * data_out[i - 1]; |
|   ldr r5, [r9, #-2] @ data_out[i - 1], data_out[i] |
| |
| @ Here r12 = &coefficients[0] and the bottom half of r5 = data_out[i - 1]. |
| POST_LOOP_A_LENGTH: |
|   ldr r10, [r12] @ coefficients[0], coefficients[1] |
|   smlatb r7, r10, r5, r7 @ sum1 += coefficients[1] * data_out[i - 1]; |
| |
|   ldr r9, [r0], #4 @ data_in[i], data_in[i + 1] |
|   smulbb r6, r10, r9 @ output1 = coefficients[0] * data_in[i]; |
|   sub r6, r7 @ output1 -= sum1; |
| |
|   @ Round and saturate to int16: add the rounding constant 2048 only when |
|   @ (output1 >> 12) already fits in 16 bits, then saturate the shifted value. |
|   sbfx r11, r6, #12, #16 @ Bits [27:12] of output1, sign-extended |
|   ssat r7, #16, r6, asr #12 @ Saturated (output1 >> 12) |
|   cmp r7, r11 @ Equal => no saturation occurred |
|   addeq r6, r6, #2048 |
|   ssat r6, #16, r6, asr #12 @ Clamp again in case rounding overflowed |
|   strh r6, [r1], #2 @ Store data_out[i] |
| |
|   smlatb r8, r10, r6, r8 @ sum2 += coefficients[1] * data_out[i]; |
|   smulbt r6, r10, r9 @ output2 = coefficients[0] * data_in[i + 1]; |
|   sub r6, r8 @ output2 -= sum2; |
| |
|   @ Same round-and-saturate sequence as for output1. |
|   sbfx r11, r6, #12, #16 |
|   ssat r7, #16, r6, asr #12 |
|   cmp r7, r11 |
|   addeq r6, r6, #2048 |
|   ssat r6, #16, r6, asr #12 |
|   strh r6, [r1], #2 @ Store data_out[i + 1] |
| |
|   subs r4, #2 |
|   bgt LOOP_LENGTH |
|   blt END @ For even data_length, it's done. Jump to END. |
| |
| @ Process i = data_length -1, for the case of an odd length. |
| ODD_LENGTH: |
|   add r12, r2, r3, lsl #1 |
|   sub r12, #4 @ &coefficients[coefficients_length - 2] |
|   sub r9, r1, r3, lsl #1 |
|   add r9, #2 @ &data_out[i - coefficients_length + 1] |
|   mov r7, #0 @ sum1, first partial accumulator |
|   mov r8, #0 @ sum1, second partial accumulator (both are subtracted below) |
|   subs r6, r3, #2 @ inner loop counter |
|   beq EVEN_A_LENGTH @ branch if coefficients_length == 2 |
| |
| LOOP2_A_LENGTH: |
|   ldr r10, [r12], #-4 @ coefficients[j - 1], coefficients[j] |
|   ldr r5, [r9], #4 @ data_out[i - j], data_out[i - j + 1] |
|   subs r6, #2 |
|   smlatb r7, r10, r5, r7 @ sum1 += coefficients[j] * data_out[i - j]; |
|   smlabt r8, r10, r5, r8 @ sum1 += coefficients[j - 1] * data_out[i - j + 1]; |
|   bgt LOOP2_A_LENGTH |
|   addlt r12, #2 @ Odd coefficients_length: step r12 back to &coefficients[0] |
|   blt POST_LOOP2_A_LENGTH |
| |
| @ Even coefficients_length: the coefficients[1] term is still outstanding. |
| EVEN_A_LENGTH: |
|   ldrsh r10, [r12, #2] @ Filter coefficients coefficients[1] |
|   ldrsh r5, [r9] @ data_out[i - 1] |
|   smlabb r7, r10, r5, r7 @ sum1 += coefficients[1] * data_out[i - 1]; |
| |
| POST_LOOP2_A_LENGTH: |
|   ldrsh r10, [r12] @ Filter coefficients coefficients[0] |
|   ldrsh r9, [r0] @ data_in[i] |
|   smulbb r6, r10, r9 @ output1 = coefficients[0] * data_in[i]; |
|   sub r6, r7 @ output1 -= sum1 (first partial); |
|   sub r6, r8 @ output1 -= sum1 (second partial); |
|   @ Round and saturate to int16, same sequence as in the main loop. |
|   sbfx r8, r6, #12, #16 |
|   ssat r7, #16, r6, asr #12 |
|   cmp r7, r8 |
|   addeq r6, r6, #2048 |
|   ssat r6, #16, r6, asr #12 |
|   strh r6, [r1] @ Store the data_out[i] |
| |
| END: |
|   pop {r4-r11} |
|   bx lr |
| |
| @Reference C code: |
| @ |
| @void WebRtcSpl_FilterARFastQ12(int16_t* data_in, |
| @ int16_t* data_out, |
| @ int16_t* __restrict coefficients, |
| @ size_t coefficients_length, |
| @ size_t data_length) { |
| @ size_t i = 0; |
| @ size_t j = 0; |
| @ |
| @ assert(data_length > 0); |
| @ assert(coefficients_length > 1); |
| @ |
| @ for (i = 0; i < data_length - 1; i += 2) { |
| @ int32_t output1 = 0; |
| @ int32_t sum1 = 0; |
| @ int32_t output2 = 0; |
| @ int32_t sum2 = 0; |
| @ |
| @ for (j = coefficients_length - 1; j > 2; j -= 2) { |
| @ sum1 += coefficients[j] * data_out[i - j]; |
| @ sum1 += coefficients[j - 1] * data_out[i - j + 1]; |
| @ sum2 += coefficients[j] * data_out[i - j + 1]; |
| @ sum2 += coefficients[j - 1] * data_out[i - j + 2]; |
| @ } |
| @ |
| @ if (j == 2) { |
| @ sum1 += coefficients[2] * data_out[i - 2]; |
| @ sum2 += coefficients[2] * data_out[i - 1]; |
| @ } |
| @ |
| @ sum1 += coefficients[1] * data_out[i - 1]; |
| @ output1 = coefficients[0] * data_in[i]; |
| @ output1 -= sum1; |
| @ // Saturate and store the output. |
| @ output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728); |
| @ data_out[i] = (int16_t)((output1 + 2048) >> 12); |
| @ |
| @ sum2 += coefficients[1] * data_out[i]; |
| @ output2 = coefficients[0] * data_in[i + 1]; |
| @ output2 -= sum2; |
| @ // Saturate and store the output. |
| @ output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728); |
| @ data_out[i + 1] = (int16_t)((output2 + 2048) >> 12); |
| @ } |
| @ |
| @ if (i == data_length - 1) { |
| @ int32_t output1 = 0; |
| @ int32_t sum1 = 0; |
| @ |
| @ for (j = coefficients_length - 1; j > 1; j -= 2) { |
| @ sum1 += coefficients[j] * data_out[i - j]; |
| @ sum1 += coefficients[j - 1] * data_out[i - j + 1]; |
| @ } |
| @ |
| @ if (j == 1) { |
| @ sum1 += coefficients[1] * data_out[i - 1]; |
| @ } |
| @ |
| @ output1 = coefficients[0] * data_in[i]; |
| @ output1 -= sum1; |
| @ // Saturate and store the output. |
| @ output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728); |
| @ data_out[i] = (int16_t)((output1 + 2048) >> 12); |
| @ } |
| @} |