@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ This file contains the function WebRtcSpl_DownsampleFastNeon(), optimized for
@ ARM Neon platform. The description header can be found in
@ signal_processing_library.h
@
@ The reference C code is in file downsample_fast.c. Bit-exact.
|  |  | 
|  | #include "webrtc/system_wrappers/interface/asm_defines.h" | 
|  |  | 
@ WebRtcSpl_DownsampleFastNeon
@
@ FIR-filters data_in[] with coefficients[] and keeps one output every
@ `factor` input samples:
@
@   endpos = delay + factor * (data_out_length - 1) + 1
@   for (i = delay; i < endpos; i += factor)
@     *data_out++ = sat16((2048 + sum_j coefficients[j] * data_in[i - j]) >> 12)
@   with j = 0 .. coefficients_length - 1.
@
@ Arguments, as used by the code below:
@   r0         data_in
@   r1         data_in_length
@   r2         data_out
@   r3         data_out_length
@   [sp, #32]  coefficients          (offsets are after the push of r4-r11)
@   [sp, #36]  coefficients_length
@   [sp, #40]  factor
@   [sp, #44]  delay
@
@ Returns r0 = 0 on success, or -1 when data_out_length <= 0,
@ coefficients_length <= 0 or data_in_length < endpos.
@
@ Register roles once set up:
@   r0  coefficients_length          r7   running coefficient pointer
@   r1  running data pointer (NEON)  r8   &coefficients[coefficients_length]
@       / accumulator (scalar)       r9   i, the current input index
@   r2  output pointer               r10  pointer rewind offset (NEON)
@   r3  loop bound                   r11  &data_in[i]
@   r4  -2, coefficient step (NEON)  r12  inner loop counter
@   r5  factor (factor * 2 inside    r6   endpos (first part) / data pointer
@       the general NEON loop)            (second part)
@
@ Only d0-d7 and d16-d21 are used, so no NEON register needs saving.
@
@ NOTE(review): smulbb multiplies only the low halfwords of factor and
@ (data_out_length - 1), and delay is loaded with ldrsh, so values outside the
@ int16 range are not handled. Confirm against the prototype in
@ signal_processing_library.h.

GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
.align  2
DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
  push {r4-r11}

  cmp r3, #0                                @ data_out_length <= 0?
  movle r0, #-1
  ble END

  ldrsh r12, [sp, #44]                      @ r12: delay
  ldr r5, [sp, #40]                         @ r5: factor
  add r4, r12, #1                           @ r4: delay + 1
  sub r3, r3, #1                            @ r3: data_out_length - 1
  smulbb r3, r5, r3                         @ r3: factor * (data_out_length - 1)
  ldr r8, [sp, #32]                         @ &coefficients[0]
  mov r9, r12                               @ Iteration counter for outer loops.
  add r3, r4                                @ r3: endpos

  cmp r3, r1                                @ data_in_length < endpos?
  movgt r0, #-1
  bgt END

  @ Initializations.
  mov r6, r3                                @ r6: endpos, kept for the second part.
  sub r3, r5, asl #3
  add r11, r0, r12, asl #1                  @ &data_in[delay]
  ldr r0, [sp, #36]                         @ coefficients_length
  add r3, r5                                @ r3: endpos - factor * 7

  cmp r0, #0                                @ coefficients_length <= 0 ?
  movle r0, #-1
  ble END

  add r8, r0, asl #1                        @ &coefficients[coefficients_length]
  cmp r9, r3
  bge POST_LOOP_ENDPOS                      @ Fewer than 8 outputs: scalar only.

@
@ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others)
@
  mov r4, #-2                               @ Coefficients are walked backwards.

  @ Direct program flow to the right channel.
  cmp r5, #2
  cmpne r5, #4
  bne SETUP_GENERAL

  @ factor == 2 or factor == 4. Each block of 8 outputs is fetched with two
  @ structure loads that together read data_in[i - j .. i - j + factor * 8 - 1],
  @ i.e. (factor - 1) samples more than the filter needs. Limit the vector
  @ loop to blocks for which i + factor * 8 - 1 < data_in_length so that no
  @ sample beyond the input buffer is touched; the scalar loop in the second
  @ part produces the remaining outputs with identical arithmetic.
  sub r12, r1, r5, asl #3                   @ data_in_length - factor * 8
  add r12, #1
  cmp r3, r12
  movgt r3, r12                             @ r3 = min(r3, in_length - factor*8 + 1)
  cmp r9, r3
  bge POST_LOOP_ENDPOS

  @ r10 is an offset to &data_in[] in the loop. Within one coefficient step the
  @ first vld2 advances the pointer by 16 bytes; the second one must move it
  @ back to the start and then 2 bytes forward to the next sample.
  cmp r5, #2
  moveq r10, #-14
  beq LOOP_ENDPOS_FACTOR2                   @ Branch when factor == 2

  @ Similar here: the first vld4 advances 32 bytes, the second one moves back
  @ to the start and then 2 bytes forward to the next sample.
  mov r10, #-30
  b LOOP_ENDPOS_FACTOR4                     @ factor == 4

SETUP_GENERAL:
  @ For r10, we need to move the pointer back to original after advancing
  @ (factor * 7 * 2) bytes, then move 2 bytes forward to increment one sample:
  @ r10 = 2 - factor * 14.
  mov r10, r5, asl #4
  rsb r10, #2
  add r10, r5, asl #1
  lsl r5, #1                                @ r5 = factor * sizeof(data_in[0])

@ The general case (factor != 2 && factor != 4)
LOOP_ENDPOS_GENERAL:
  @ Initializations.
  vmov.i32 q2, #2048                        @ Rounding term for outputs 0..3.
  vmov.i32 q3, #2048                        @ Rounding term for outputs 4..7.
  sub r7, r8, #2                            @ &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r1, r11, r12, asl #1                  @ &data_in[i - j]

LOOP_COEFF_LENGTH_GENERAL:
  vld1.16 {d2[], d3[]}, [r7], r4            @ coefficients[j] in every lane
  vld1.16 d0[0], [r1], r5                   @ data_in[i - j]
  vld1.16 d0[1], [r1], r5                   @ data_in[i + factor - j]
  vld1.16 d0[2], [r1], r5                   @ data_in[i + factor * 2 - j]
  vld1.16 d0[3], [r1], r5                   @ data_in[i + factor * 3 - j]
  vld1.16 d1[0], [r1], r5                   @ data_in[i + factor * 4 - j]
  vld1.16 d1[1], [r1], r5                   @ data_in[i + factor * 5 - j]
  vld1.16 d1[2], [r1], r5                   @ data_in[i + factor * 6 - j]
  vld1.16 d1[3], [r1], r10                  @ data_in[i + factor * 7 - j]
  subs r12, #1                              @ Flags survive the NEON ops below.
  vmlal.s16 q2, d0, d2
  vmlal.s16 q3, d1, d3
  bge LOOP_COEFF_LENGTH_GENERAL

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  add r11, r5, asl #3                       @ r11 -> &data_in[i + factor * 8]
  add r9, r5, asl #2                        @ Counter i += factor * 8.
  cmp r9, r3                                @ i < endpos - factor * 7 ?
  blt LOOP_ENDPOS_GENERAL
  asr r5, #1                                @ Restore r5 to the value of factor.
  b POST_LOOP_ENDPOS

@ The case for factor == 2.
LOOP_ENDPOS_FACTOR2:
  @ Initializations.
  vmov.i32 q2, #2048
  vmov.i32 q3, #2048
  sub r7, r8, #2                            @ &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r1, r11, r12, asl #1                  @ &data_in[i - j]

LOOP_COEFF_LENGTH_FACTOR2:
  vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j] in every lane
  vld2.16 {d0, d1}, [r1]!                   @ d0: every 2nd sample, outputs 0..3
  vld2.16 {d2, d3}, [r1], r10               @ d2: every 2nd sample, outputs 4..7
  subs r12, #1
  vmlal.s16 q2, d0, d16
  vmlal.s16 q3, d2, d17
  bge LOOP_COEFF_LENGTH_FACTOR2

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
  add r9, r5, asl #3                        @ Counter i += factor * 8.
  cmp r9, r3                                @ i < vector loop bound ?
  blt LOOP_ENDPOS_FACTOR2
  b POST_LOOP_ENDPOS

@ The case for factor == 4.
LOOP_ENDPOS_FACTOR4:
  @ Initializations.
  vmov.i32 q2, #2048
  vmov.i32 q3, #2048
  sub r7, r8, #2                            @ &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r1, r11, r12, asl #1                  @ &data_in[i - j]

LOOP_COEFF_LENGTH_FACTOR4:
  vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j] in every lane
  vld4.16 {d0, d1, d2, d3}, [r1]!           @ d0: every 4th sample, outputs 0..3
  vld4.16 {d18, d19, d20, d21}, [r1], r10   @ d18: every 4th sample, outputs 4..7
  subs r12, #1
  vmlal.s16 q2, d0, d16
  vmlal.s16 q3, d18, d17
  bge LOOP_COEFF_LENGTH_FACTOR4

  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
  add r9, r5, asl #3                        @ Counter i += factor * 8.

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  cmp r9, r3                                @ i < vector loop bound ?
  vst1.16 {d0, d1}, [r2]!

  blt LOOP_ENDPOS_FACTOR4

@
@ Second part, do the rest iterations (if any).
@

POST_LOOP_ENDPOS:
  mov r3, r6                                @ Restore r3 to endpos.
  cmp r9, r3
  movge r0, #0
  bge END

LOOP2_ENDPOS:
  @ Initializations.
  mov r7, r8                                @ &coefficients[coefficients_length]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r6, r11, r12, asl #1                  @ &data_in[i - j]

  mov r1, #2048                             @ Rounding term.

LOOP2_COEFF_LENGTH:
  ldrsh r4, [r7, #-2]!                      @ coefficients[j]
  ldrsh r10, [r6], #2                       @ data_in[i - j]
  smlabb r1, r4, r10, r1
  subs r12, #1
  bge LOOP2_COEFF_LENGTH

  @ Shift, saturate, and store the result.
  ssat r1, #16, r1, asr #12
  strh r1, [r2], #2

  add r11, r5, asl #1                       @ r11 -> &data_in[i + factor]
  add r9, r5                                @ Counter i += factor.
  cmp r9, r3                                @ i < endpos?
  blt LOOP2_ENDPOS

  mov r0, #0

END:
  pop {r4-r11}
  bx  lr