blob: 056dc275d3912ec2402fa6d92e79f0d2630e7498 [file] [log] [blame]
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
// MIPS optimized implementation of the Autocorrelation function in fixed point.
// NOTE! Different from SPLIB-version in how it scales the signal.
//
// Overview of what the code below does:
//   1. Accumulates sum(x[k] * x[k]) for k = 0..N-1 in the HI/LO accumulator
//      ($ac0) and derives a common right-shift |scaling| from it:
//        t       = 32-bit value of (accumulator >> 31)
//        scaling = (t < 1) ? 0 : 32 - clz(t)
//      r[0] is the accumulator shifted right by |scaling|.
//   2. For every lag i = 1..order, accumulates sum(x[k] * x[k + i]) for
//      k = 0..N-i-1 and stores the accumulator shifted right by the same
//      |scaling| in r[i].
//
// Parameters:
//   r     - output array; elements r[0]..r[order] are written.
//   x     - input samples; x[0]..x[N-1] are read.
//   N     - number of input samples.
//   order - highest lag that is computed.
//   scale - output; receives the shift that was applied to every r[i].
//
// Returns order + 1.
//
// Build-time selection: MIPS_DSP_R2_LE switches the unrolled loops to paired
// halfword loads (ulw) and dpa.w.ph; MIPS_DSP_R1_LE switches accumulator
// read-out to extr.w / extrv.w. Without them plain lh / madd / mfhi / mflo
// are used.
//
// All asm blocks use ".set noreorder", so the instruction written directly
// after each branch (shown with a leading space inside the string) executes in
// the branch delay slot, i.e. whether or not the branch is taken.
//
// NOTE(review): no guard for N - i < 0 is visible here; the lag loops appear to
// rely on order <= N (a negative loop_size would never reach zero by
// decrementing in the expected number of iterations) - confirm against callers.
int WebRtcIsacfix_AutocorrMIPS(int32_t* __restrict r,
const int16_t* __restrict x,
int16_t N,
int16_t order,
int16_t* __restrict scale) {
int i = 0;
int16_t scaling = 0;
// Non-const working pointer; the asm advances it while reading samples.
int16_t* in = (int16_t*)x;
// Number of full 8-sample iterations and number of leftover samples.
int loop_size = (int)(N >> 3);
int count = (int)(N & 7);
// Declare temporary variables used as register values.
int32_t r0, r1, r2, r3;
#if !defined(MIPS_DSP_R2_LE)
// For non-DSPR2 optimizations 4 more registers are used.
int32_t r4, r5, r6, r7;
#endif
// Calculate r[0] and scaling needed.
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
// 0 * 0 -> HI/LO: clears the accumulator.
"mult $0, $0 \n\t"
// Loop is unrolled 8 times, set accumulator to zero in branch delay slot.
"beqz %[loop_size], 2f \n\t"
" mult $0, $0 \n\t"
"1: \n\t"
// Load 8 samples per loop iteration.
#if defined(MIPS_DSP_R2_LE)
// Four unaligned word loads, each holding two 16-bit samples.
"ulw %[r0], 0(%[in]) \n\t"
"ulw %[r1], 4(%[in]) \n\t"
"ulw %[r2], 8(%[in]) \n\t"
"ulw %[r3], 12(%[in]) \n\t"
#else
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 2(%[in]) \n\t"
"lh %[r2], 4(%[in]) \n\t"
"lh %[r3], 6(%[in]) \n\t"
"lh %[r4], 8(%[in]) \n\t"
"lh %[r5], 10(%[in]) \n\t"
"lh %[r6], 12(%[in]) \n\t"
"lh %[r7], 14(%[in]) \n\t"
#endif
"addiu %[loop_size], %[loop_size], -1 \n\t"
// Multiply and accumulate.
#if defined(MIPS_DSP_R2_LE)
// Each dpa.w.ph squares both halfwords of a register and adds to $ac0.
"dpa.w.ph $ac0, %[r0], %[r0] \n\t"
"dpa.w.ph $ac0, %[r1], %[r1] \n\t"
"dpa.w.ph $ac0, %[r2], %[r2] \n\t"
"dpa.w.ph $ac0, %[r3], %[r3] \n\t"
#else
"madd %[r0], %[r0] \n\t"
"madd %[r1], %[r1] \n\t"
"madd %[r2], %[r2] \n\t"
"madd %[r3], %[r3] \n\t"
"madd %[r4], %[r4] \n\t"
"madd %[r5], %[r5] \n\t"
"madd %[r6], %[r6] \n\t"
"madd %[r7], %[r7] \n\t"
#endif
// Advance the input pointer by 8 samples (16 bytes) in the delay slot.
"bnez %[loop_size], 1b \n\t"
" addiu %[in], %[in], 16 \n\t"
"2: \n\t"
// No leftover samples: skip to 4. The delay slot already reads the
// accumulator so the value is available at label 4 on that path.
"beqz %[count], 4f \n\t"
#if defined(MIPS_DSP_R1_LE)
" extr.w %[r0], $ac0, 31 \n\t"
#else
" mfhi %[r2] \n\t"
#endif
// Process remaining samples (if any).
"3: \n\t"
// r0 is reused as the sample register here, so the accumulator is read
// again once the tail loop has finished.
"lh %[r0], 0(%[in]) \n\t"
"addiu %[count], %[count], -1 \n\t"
"madd %[r0], %[r0] \n\t"
"bnez %[count], 3b \n\t"
" addiu %[in], %[in], 2 \n\t"
#if defined(MIPS_DSP_R1_LE)
"extr.w %[r0], $ac0, 31 \n\t"
#else
"mfhi %[r2] \n\t"
#endif
"4: \n\t"
#if !defined(MIPS_DSP_R1_LE)
// Build r0 = (HI << 1) + (LO >> 31), i.e. the accumulator shifted right by
// 31, from r2 = HI and r3 = LO.
"mflo %[r3] \n\t"
"sll %[r0], %[r2], 1 \n\t"
"srl %[r1], %[r3], 31 \n\t"
"addu %[r0], %[r0], %[r1] \n\t"
#endif
// Calculate scaling (the value of shifting).
// scaling = 32 - clz(r0), forced to 0 when r0 < 1.
"clz %[r1], %[r0] \n\t"
"addiu %[r1], %[r1], -32 \n\t"
"subu %[scaling], $0, %[r1] \n\t"
"slti %[r1], %[r0], 0x1 \n\t"
"movn %[scaling], $0, %[r1] \n\t"
#if defined(MIPS_DSP_R1_LE)
// r0 = accumulator shifted right by |scaling|; r2 = HI for the fix-up below.
"extrv.w %[r0], $ac0, %[scaling] \n\t"
"mfhi %[r2], $ac0 \n\t"
#else
// r0 = (HI << (32 - scaling)) + (LO >> scaling), with r2 = HI and r3 = LO.
"addiu %[r1], %[scaling], -32 \n\t"
"subu %[r1], $0, %[r1] \n\t"
"sllv %[r1], %[r2], %[r1] \n\t"
"srlv %[r0], %[r3], %[scaling] \n\t"
"addu %[r0], %[r0], %[r1] \n\t"
#endif
// If scaling is not below 32, the result is HI itself (r2). Presumably
// needed because register-specified shift amounts only use their low 5
// bits, so a shift by 32 cannot be expressed above.
"slti %[r1], %[scaling], 32 \n\t"
"movz %[r0], %[r2], %[r1] \n\t"
".set pop \n\t"
: [loop_size] "+r" (loop_size), [in] "+r" (in), [r0] "=&r" (r0),
[r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
#if !defined(MIPS_DSP_R2_LE)
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
#endif
[count] "+r" (count), [scaling] "=r" (scaling)
// NOTE(review): %[N] is passed in but never referenced by the template above.
: [N] "r" (N)
: "memory", "hi", "lo"
);
r[0] = r0;
// Correlation calculation is divided in 3 cases depending on the scaling
// value (different accumulator manipulation needed). Three slightly different
// loops are written in order to avoid branches inside the loop.
if (scaling == 0) {
// In this case, the result will be in low part of the accumulator.
for (i = 1; i < order + 1; i++) {
// in walks x[k], in1 walks x[k + i]; N - i products are accumulated.
in = (int16_t*)x;
int16_t* in1 = (int16_t*)x + i;
count = N - i;
loop_size = (count) >> 2;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
// Clear HI/LO.
"mult $0, $0 \n\t"
// The delay slot reduces count to the number of leftover pairs (count & 3);
// it runs whether or not the branch is taken.
"beqz %[loop_size], 2f \n\t"
" andi %[count], %[count], 0x3 \n\t"
// Loop processing 4 pairs of samples per iteration.
"1: \n\t"
#if defined(MIPS_DSP_R2_LE)
"ulw %[r0], 0(%[in]) \n\t"
"ulw %[r1], 0(%[in1]) \n\t"
"ulw %[r2], 4(%[in]) \n\t"
"ulw %[r3], 4(%[in1]) \n\t"
#else
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"lh %[r2], 2(%[in]) \n\t"
"lh %[r3], 2(%[in1]) \n\t"
"lh %[r4], 4(%[in]) \n\t"
"lh %[r5], 4(%[in1]) \n\t"
"lh %[r6], 6(%[in]) \n\t"
"lh %[r7], 6(%[in1]) \n\t"
#endif
"addiu %[loop_size], %[loop_size], -1 \n\t"
#if defined(MIPS_DSP_R2_LE)
"dpa.w.ph $ac0, %[r0], %[r1] \n\t"
"dpa.w.ph $ac0, %[r2], %[r3] \n\t"
#else
"madd %[r0], %[r1] \n\t"
"madd %[r2], %[r3] \n\t"
"madd %[r4], %[r5] \n\t"
"madd %[r6], %[r7] \n\t"
#endif
// Advance both pointers by 4 samples (8 bytes); in1 moves in the delay slot.
"addiu %[in], %[in], 8 \n\t"
"bnez %[loop_size], 1b \n\t"
" addiu %[in1], %[in1], 8 \n\t"
"2: \n\t"
// No leftovers: the delay slot has already put LO in r0, which is the result.
"beqz %[count], 4f \n\t"
" mflo %[r0] \n\t"
// Process remaining samples (if any).
"3: \n\t"
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"addiu %[count], %[count], -1 \n\t"
"addiu %[in], %[in], 2 \n\t"
"madd %[r0], %[r1] \n\t"
"bnez %[count], 3b \n\t"
" addiu %[in1], %[in1], 2 \n\t"
// r0 was used as a sample register in the tail loop; reload the result.
"mflo %[r0] \n\t"
"4: \n\t"
".set pop \n\t"
: [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
#if !defined(MIPS_DSP_R2_LE)
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
#endif
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
[count] "+r" (count)
:
: "memory", "hi", "lo"
);
r[i] = r0;
}
} else if (scaling == 32) {
// In this case, the result will be high part of the accumulator.
for (i = 1; i < order + 1; i++) {
// Same accumulation as the scaling == 0 case; only the read-out differs
// (mfhi instead of mflo).
in = (int16_t*)x;
int16_t* in1 = (int16_t*)x + i;
count = N - i;
loop_size = (count) >> 2;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
// Clear HI/LO.
"mult $0, $0 \n\t"
// Delay slot: count becomes the number of leftover pairs (count & 3).
"beqz %[loop_size], 2f \n\t"
" andi %[count], %[count], 0x3 \n\t"
// Loop processing 4 pairs of samples per iteration.
"1: \n\t"
#if defined(MIPS_DSP_R2_LE)
"ulw %[r0], 0(%[in]) \n\t"
"ulw %[r1], 0(%[in1]) \n\t"
"ulw %[r2], 4(%[in]) \n\t"
"ulw %[r3], 4(%[in1]) \n\t"
#else
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"lh %[r2], 2(%[in]) \n\t"
"lh %[r3], 2(%[in1]) \n\t"
"lh %[r4], 4(%[in]) \n\t"
"lh %[r5], 4(%[in1]) \n\t"
"lh %[r6], 6(%[in]) \n\t"
"lh %[r7], 6(%[in1]) \n\t"
#endif
"addiu %[loop_size], %[loop_size], -1 \n\t"
#if defined(MIPS_DSP_R2_LE)
"dpa.w.ph $ac0, %[r0], %[r1] \n\t"
"dpa.w.ph $ac0, %[r2], %[r3] \n\t"
#else
"madd %[r0], %[r1] \n\t"
"madd %[r2], %[r3] \n\t"
"madd %[r4], %[r5] \n\t"
"madd %[r6], %[r7] \n\t"
#endif
"addiu %[in], %[in], 8 \n\t"
"bnez %[loop_size], 1b \n\t"
" addiu %[in1], %[in1], 8 \n\t"
"2: \n\t"
// No leftovers: HI, read in the delay slot, is the result.
"beqz %[count], 4f \n\t"
" mfhi %[r0] \n\t"
// Process remaining samples (if any).
"3: \n\t"
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"addiu %[count], %[count], -1 \n\t"
"addiu %[in], %[in], 2 \n\t"
"madd %[r0], %[r1] \n\t"
"bnez %[count], 3b \n\t"
" addiu %[in1], %[in1], 2 \n\t"
// Reload the result after the tail loop reused r0.
"mfhi %[r0] \n\t"
"4: \n\t"
".set pop \n\t"
: [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
#if !defined(MIPS_DSP_R2_LE)
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
#endif
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
[count] "+r" (count)
:
: "memory", "hi", "lo"
);
r[i] = r0;
}
} else {
// In this case, the result is obtained by combining low and high parts
// of the accumulator.
#if !defined(MIPS_DSP_R1_LE)
// Left-shift applied to HI when the result is assembled without extrv.w.
int32_t tmp_shift = 32 - scaling;
#endif
for (i = 1; i < order + 1; i++) {
in = (int16_t*)x;
int16_t* in1 = (int16_t*)x + i;
count = N - i;
loop_size = (count) >> 2;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
// Clear HI/LO.
"mult $0, $0 \n\t"
// Delay slot: count becomes the number of leftover pairs (count & 3).
"beqz %[loop_size], 2f \n\t"
" andi %[count], %[count], 0x3 \n\t"
// Loop processing 4 pairs of samples per iteration.
"1: \n\t"
#if defined(MIPS_DSP_R2_LE)
"ulw %[r0], 0(%[in]) \n\t"
"ulw %[r1], 0(%[in1]) \n\t"
"ulw %[r2], 4(%[in]) \n\t"
"ulw %[r3], 4(%[in1]) \n\t"
#else
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"lh %[r2], 2(%[in]) \n\t"
"lh %[r3], 2(%[in1]) \n\t"
"lh %[r4], 4(%[in]) \n\t"
"lh %[r5], 4(%[in1]) \n\t"
"lh %[r6], 6(%[in]) \n\t"
"lh %[r7], 6(%[in1]) \n\t"
#endif
"addiu %[loop_size], %[loop_size], -1 \n\t"
#if defined(MIPS_DSP_R2_LE)
"dpa.w.ph $ac0, %[r0], %[r1] \n\t"
"dpa.w.ph $ac0, %[r2], %[r3] \n\t"
#else
"madd %[r0], %[r1] \n\t"
"madd %[r2], %[r3] \n\t"
"madd %[r4], %[r5] \n\t"
"madd %[r6], %[r7] \n\t"
#endif
"addiu %[in], %[in], 8 \n\t"
"bnez %[loop_size], 1b \n\t"
" addiu %[in1], %[in1], 8 \n\t"
"2: \n\t"
// No leftovers: the delay slot reads the accumulator for use at label 4
// (already shifted with extrv.w, or raw HI to be combined with LO below).
"beqz %[count], 4f \n\t"
#if defined(MIPS_DSP_R1_LE)
" extrv.w %[r0], $ac0, %[scaling] \n\t"
#else
" mfhi %[r0] \n\t"
#endif
// Process remaining samples (if any).
"3: \n\t"
"lh %[r0], 0(%[in]) \n\t"
"lh %[r1], 0(%[in1]) \n\t"
"addiu %[count], %[count], -1 \n\t"
"addiu %[in], %[in], 2 \n\t"
"madd %[r0], %[r1] \n\t"
"bnez %[count], 3b \n\t"
" addiu %[in1], %[in1], 2 \n\t"
// Read the accumulator again; r0 was reused as a sample register above.
#if defined(MIPS_DSP_R1_LE)
"extrv.w %[r0], $ac0, %[scaling] \n\t"
#else
"mfhi %[r0] \n\t"
#endif
"4: \n\t"
#if !defined(MIPS_DSP_R1_LE)
// r0 = (HI << tmp_shift) + (LO >> scaling), with r0 = HI on entry.
"mflo %[r1] \n\t"
"sllv %[r0], %[r0], %[tmp_shift] \n\t"
"srlv %[r1], %[r1], %[scaling] \n\t"
"addu %[r0], %[r0], %[r1] \n\t"
#endif
".set pop \n\t"
: [loop_size] "+r" (loop_size), [in] "+r" (in), [in1] "+r" (in1),
#if !defined(MIPS_DSP_R2_LE)
[r4] "=&r" (r4), [r5] "=&r" (r5), [r6] "=&r" (r6), [r7] "=&r" (r7),
#endif
[r0] "=&r" (r0), [r1] "=&r" (r1), [r2] "=&r" (r2), [r3] "=&r" (r3),
[count] "+r" (count)
: [scaling] "r" (scaling)
#if !defined(MIPS_DSP_R1_LE)
, [tmp_shift] "r" (tmp_shift)
#endif
: "memory", "hi", "lo"
);
r[i] = r0;
}
}
// Report the shift that was applied to every r[i].
*scale = scaling;
return (order + 1);
}