@ webrtc / src / webrtc / 1f11d1a0fdec5d469ab0df6946a1c5f63c5ed4cb / . / common_audio / signal_processing / filter_ar_fast_q12_armv7.S

@ | |

@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. | |

@ | |

@ Use of this source code is governed by a BSD-style license | |

@ that can be found in the LICENSE file in the root of the source | |

@ tree. An additional intellectual property rights grant can be found | |

@ in the file PATENTS. All contributing project authors may | |

@ be found in the AUTHORS file in the root of the source tree. | |

@ | |

@ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for | |

@ ARMv7 platform. The description header can be found in | |

@ signal_processing_library.h | |

@ | |

@ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and | |

@ the reference C code at end of this file. | |

@ Assumptions: | |

@ (1) data_length > 0 | |

@ (2) coefficients_length > 1 | |

@ Register usage:
@
@ r0:  &data_in[i]
@ r1:  &data_out[i], for result output
@ r2:  &coefficients[0]
@ r3:  coefficients_length
@ r4:  Iteration counter for the outer loop.
@ r5:  data_out[j] as multiplication inputs
@ r6:  Calculated value for output data_out[]; iteration counter for inner loop
@ r7:  Partial sum of the filtering multiplication results
@ r8:  Partial sum of the filtering multiplication results
@ r9:  &data_out[], for filtering input; data_in[i]
@ r10: coefficients[j]
@ r11: Scratch
@ r12: &coefficients[j]

#include "webrtc/system_wrappers/include/asm_defines.h" | |

@ -----------------------------------------------------------------------------
@ void WebRtcSpl_FilterARFastQ12(int16_t* data_in,             -> r0
@                                int16_t* data_out,            -> r1
@                                int16_t* coefficients,        -> r2
@                                size_t   coefficients_length, -> r3
@                                size_t   data_length)         -> [sp] on entry
@
@ Auto-regressive filter with Q12 coefficients. Two output samples are produced
@ per outer-loop iteration (sum1/output1 and sum2/output2); a trailing single
@ sample is handled by the ODD_LENGTH tail.
@
@ Preconditions (not checked here): data_length > 0, coefficients_length > 1.
@ The code reads data_out[i - coefficients_length + 1 .. i - 1], i.e. samples
@ located before the data_out pointer that is passed in.
@
@ Saves and restores r4-r11; uses r12 as scratch. Leaf function, no calls.
@ GLOBAL_FUNCTION / DEFINE_FUNCTION come from asm_defines.h (presumably they
@ export the symbol and emit its label -- confirm against that header).
@ -----------------------------------------------------------------------------
GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
.align  2
DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
  push {r4-r11}                @ 8 registers = 32 bytes, so the 5th argument
                               @ is now found at [sp, #32].

  @ data_length is a size_t: load the complete 32-bit stack word. A signed
  @ halfword load here would truncate every length above 32767.
  ldr r12, [sp, #32]           @ data_length
  subs r4, r12, #1             @ r4 = data_length - 1 (outer loop counter)
  beq ODD_LENGTH               @ jump if data_length == 1

@ ---- Main loop: two output samples (i and i + 1) per iteration --------------
LOOP_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                  @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                   @ &data_out[i - coefficients_length + 1]
  ldr r5, [r9], #4             @ data_out[i - coefficients_length + {1,2}]

  mov r7, #0                   @ sum1
  mov r8, #0                   @ sum2
  subs r6, r3, #3              @ Iteration counter for inner loop.
  beq ODD_A_LENGTH             @ branch if coefficients_length == 3
  blt POST_LOOP_A_LENGTH       @ branch if coefficients_length == 2

  @ Each pass consumes two coefficients (one word) and feeds both sums.
  @ The flags set by "subs r6, #2" stay valid until the branches at the
  @ bottom: none of the instructions in between writes N, Z, C or V.
LOOP_A_LENGTH:
  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
  subs r6, #2
  smlatt r8, r10, r5, r8       @ sum2 += coefficients[j] * data_out[i - j + 1];
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r7, r10, r5, r7       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  ldr r5, [r9], #4             @ data_out[i - j + 2], data_out[i - j + 3]
  smlabb r8, r10, r5, r8       @ sum2 += coefficients[j - 1] * data_out[i - j + 2];
  bgt LOOP_A_LENGTH
  blt POST_LOOP_A_LENGTH       @ even coefficients_length: no coefficients[2] step

  @ Odd coefficients_length: coefficients[2] is left over without a partner.
ODD_A_LENGTH:
  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[2]
  sub r12, #2                  @ &coefficients[0]
  smlabb r7, r10, r5, r7       @ sum1 += coefficients[2] * data_out[i - 2];
  smlabt r8, r10, r5, r8       @ sum2 += coefficients[2] * data_out[i - 1];
  @ This address is 2 bytes away from the previous word load through r9, so
  @ the two loads cannot both be word-aligned.
  ldr r5, [r9, #-2]            @ data_out[i - 1], data_out[i]

POST_LOOP_A_LENGTH:
  ldr r10, [r12]               @ coefficients[0], coefficients[1]
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];

  ldr r9, [r0], #4             @ data_in[i], data_in[i + 1]
  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                   @ output1 -= sum1;

  @ Round and saturate: data_out[i] = sat16((output1 + 2048) >> 12).
  @ The rounding constant is only added when (output1 >> 12) already fits in
  @ 16 bits (sbfx and ssat results agree); otherwise the value is saturated
  @ as it is.
  sbfx r11, r6, #12, #16       @ bits [27:12] of output1, sign-extended
  ssat r7, #16, r6, asr #12    @ sat16(output1 >> 12)
  cmp r7, r11
  addeq r6, r6, #2048          @ no saturation: apply rounding offset
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2            @ Store data_out[i]

  smlatb r8, r10, r6, r8       @ sum2 += coefficients[1] * data_out[i];
  smulbt r6, r10, r9           @ output2 = coefficients[0] * data_in[i + 1];
  sub r6, r8                   @ output2 -= sum2;

  @ Same rounding and saturation as above, for output2.
  sbfx r11, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r11
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2            @ Store data_out[i + 1]

  subs r4, #2
  bgt LOOP_LENGTH
  blt END                      @ Even data_length: all samples done.

@ ---- Tail: process i = data_length - 1 for an odd data_length ---------------
ODD_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                  @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                   @ &data_out[i - coefficients_length + 1]
  mov r7, #0                   @ sum1, first partial accumulator
  mov r8, #0                   @ sum1, second partial accumulator
  subs r6, r3, #2              @ inner loop counter
  beq EVEN_A_LENGTH            @ branch if coefficients_length == 2

LOOP2_A_LENGTH:
  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
  ldr r5, [r9], #4             @ data_out[i - j], data_out[i - j + 1]
  subs r6, #2
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r8, r10, r5, r8       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  bgt LOOP2_A_LENGTH
  addlt r12, #2                @ odd coefficients_length: r12 = &coefficients[0]
  blt POST_LOOP2_A_LENGTH

  @ Even coefficients_length: coefficients[1] is left over without a partner.
EVEN_A_LENGTH:
  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[1]
  ldrsh r5, [r9]               @ data_out[i - 1]
  smlabb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];

POST_LOOP2_A_LENGTH:
  ldrsh r10, [r12]             @ Filter coefficients coefficients[0]
  ldrsh r9, [r0]               @ data_in[i]
  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                   @ output1 -= sum1 (first partial accumulator);
  sub r6, r8                   @ output1 -= sum1 (second partial accumulator);

  @ Round and saturate exactly as in the main loop.
  sbfx r8, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r8
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1]                @ Store the data_out[i]

END:
  pop {r4-r11}
  bx lr

@Reference C code:
@
@void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
@                               int16_t* data_out,
@                               int16_t* __restrict coefficients,
@                               size_t coefficients_length,
@                               size_t data_length) {
@  size_t i = 0;
@  size_t j = 0;
@
@  assert(data_length > 0);
@  assert(coefficients_length > 1);
@
@  for (i = 0; i < data_length - 1; i += 2) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@    int32_t output2 = 0;
@    int32_t sum2 = 0;
@
@    for (j = coefficients_length - 1; j > 2; j -= 2) {
@      sum1 += coefficients[j] * data_out[i - j];
@      sum1 += coefficients[j - 1] * data_out[i - j + 1];
@      sum2 += coefficients[j] * data_out[i - j + 1];
@      sum2 += coefficients[j - 1] * data_out[i - j + 2];
@    }
@
@    if (j == 2) {
@      sum1 += coefficients[2] * data_out[i - 2];
@      sum2 += coefficients[2] * data_out[i - 1];
@    }
@
@    sum1 += coefficients[1] * data_out[i - 1];
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@
@    sum2 += coefficients[1] * data_out[i];
@    output2 = coefficients[0] * data_in[i + 1];
@    output2 -= sum2;
@    // Saturate and store the output.
@    output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728);
@    data_out[i + 1] = (int16_t)((output2 + 2048) >> 12);
@  }
@
@  if (i == data_length - 1) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@
@    for (j = coefficients_length - 1; j > 1; j -= 2) {
@      sum1 += coefficients[j] * data_out[i - j];
@      sum1 += coefficients[j - 1] * data_out[i - j + 1];
@    }
@
@    if (j == 1) {
@      sum1 += coefficients[1] * data_out[i - 1];
@    }
@
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@  }
@}