/* godot/thirdparty/libvpx/vp8/common/arm/neon/dequant_idct_neon.c */

/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

/*
 * Fixed-point multipliers for the inverse DCT, in units of 1/65536:
 *
 *   cospi8sqrt2minus1 = 20091 ~= (sqrt(2) * cos(pi / 8) - 1) * 65536
 *   sinpi8sqrt2       = 35468 ~=  sqrt(2) * sin(pi / 8)      * 65536
 *
 * 35468 does not fit in int16_t (maximum 32767).  Initialising the constant
 * with the plain literal relied on implementation-defined narrowing to obtain
 * the wrapped value -30068, so the wrap is written out explicitly instead.
 * The code that multiplies by these constants adds the operand back after the
 * multiply, which turns x * (35468 - 65536) / 65536 into x * 35468 / 65536.
 */
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2       = 35468 - 65536;  /* == -30068 */

/*
 * Transposes, in place, the 4x4 matrix of 16-bit values whose rows are
 * v[0]..v[3].
 */
static inline void dequant_idct_transpose_4x4(int16x4_t v[4]) {
    /* First swap the 2x2 blocks (32-bit granularity) ... */
    const int32x2x2_t rows02 = vtrn_s32(vreinterpret_s32_s16(v[0]),
                                        vreinterpret_s32_s16(v[2]));
    const int32x2x2_t rows13 = vtrn_s32(vreinterpret_s32_s16(v[1]),
                                        vreinterpret_s32_s16(v[3]));
    /* ... then swap the elements inside each block (16-bit granularity). */
    const int16x4x2_t cols01 = vtrn_s16(vreinterpret_s16_s32(rows02.val[0]),
                                        vreinterpret_s16_s32(rows13.val[0]));
    const int16x4x2_t cols23 = vtrn_s16(vreinterpret_s16_s32(rows02.val[1]),
                                        vreinterpret_s16_s32(rows13.val[1]));

    v[0] = cols01.val[0];
    v[1] = cols01.val[1];
    v[2] = cols23.val[0];
    v[3] = cols23.val[1];
}

/*
 * Runs one 1-D pass of the 4-point inverse transform, in place, on
 * v[0]..v[3].  Each of the four lanes is transformed independently.
 * All additions and subtractions saturate to the int16_t range.
 */
static inline void dequant_idct_pass(int16x4_t v[4]) {
    /* Even half: sum and difference of inputs 0 and 2. */
    const int16x4_t even_sum  = vqadd_s16(v[0], v[2]);
    const int16x4_t even_diff = vqsub_s16(v[0], v[2]);

    /* Odd half: inputs 1 (low lanes) and 3 (high lanes) are scaled together. */
    const int16x8_t odd = vcombine_s16(v[1], v[3]);

    /*
     * vqdmulh yields (2 * x * k) >> 16; the extra shift by one undoes the
     * doubling.  Adding x back afterwards supplies the "+ 1" that is missing
     * from cospi8sqrt2minus1 and compensates for sinpi8sqrt2 being held as a
     * wrapped (negative) 16-bit value.
     */
    int16x8_t scaled_sin = vshrq_n_s16(vqdmulhq_n_s16(odd, sinpi8sqrt2), 1);
    int16x8_t scaled_cos =
        vshrq_n_s16(vqdmulhq_n_s16(odd, cospi8sqrt2minus1), 1);
    int16x4_t odd_diff, odd_sum;

    scaled_sin = vqaddq_s16(scaled_sin, odd);
    scaled_cos = vqaddq_s16(scaled_cos, odd);

    odd_diff = vqsub_s16(vget_low_s16(scaled_sin), vget_high_s16(scaled_cos));
    odd_sum  = vqadd_s16(vget_high_s16(scaled_sin), vget_low_s16(scaled_cos));

    /* Final butterfly. */
    v[0] = vqadd_s16(even_sum, odd_sum);
    v[1] = vqadd_s16(even_diff, odd_diff);
    v[2] = vqsub_s16(even_diff, odd_diff);
    v[3] = vqsub_s16(even_sum, odd_sum);
}

/*
 * Dequantises a 4x4 block of coefficients, applies the 2-D inverse transform
 * and adds the result to the 4x4 block of bytes at dst.
 *
 * input  - 16 coefficients; read, then overwritten with zeros.
 * dq     - 16 dequantisation factors, multiplied element-wise with input.
 * dst    - top-left byte of the 4x4 block that is read and updated in place;
 *          four bytes are accessed on each of four rows.
 * stride - distance in bytes between consecutive rows of dst.
 */
void vp8_dequant_idct_add_neon(
        int16_t *input,
        int16_t *dq,
        unsigned char *dst,
        int stride) {
    unsigned char *const row1 = dst + stride;
    unsigned char *const row2 = row1 + stride;
    unsigned char *const row3 = row2 + stride;
    const int16x8_t zeros = vdupq_n_s16(0);
    int16x8_t coef01, coef23, factor01, factor23, rows01, rows23;
    int32x2_t pix01 = vdup_n_s32(0);
    int32x2_t pix23 = vdup_n_s32(0);
    int16x4_t v[4];

    /* Fetch the coefficients and clear the buffer behind us. */
    coef01 = vld1q_s16(input);
    vst1q_s16(input, zeros);
    coef23 = vld1q_s16(input + 8);
    vst1q_s16(input + 8, zeros);

    /* Fetch the dequantisation factors. */
    factor01 = vld1q_s16(dq);
    factor23 = vld1q_s16(dq + 8);

    /* Fetch the current destination block, one 4-byte row per 32-bit lane. */
    pix01 = vld1_lane_s32((const int32_t *)dst, pix01, 0);
    pix01 = vld1_lane_s32((const int32_t *)row1, pix01, 1);
    pix23 = vld1_lane_s32((const int32_t *)row2, pix23, 0);
    pix23 = vld1_lane_s32((const int32_t *)row3, pix23, 1);

    /* Dequantise: element-wise product, keeping the low 16 bits. */
    rows01 = vreinterpretq_s16_u16(
        vmulq_u16(vreinterpretq_u16_s16(coef01),
                  vreinterpretq_u16_s16(factor01)));
    rows23 = vreinterpretq_s16_u16(
        vmulq_u16(vreinterpretq_u16_s16(coef23),
                  vreinterpretq_u16_s16(factor23)));

    v[0] = vget_low_s16(rows01);
    v[1] = vget_high_s16(rows01);
    v[2] = vget_low_s16(rows23);
    v[3] = vget_high_s16(rows23);

    /* First 1-D pass, then transpose so the second pass covers the other axis. */
    dequant_idct_pass(v);
    dequant_idct_transpose_4x4(v);

    /* Second 1-D pass, followed by the final rounding shift by 3. */
    dequant_idct_pass(v);
    v[0] = vrshr_n_s16(v[0], 3);
    v[1] = vrshr_n_s16(v[1], 3);
    v[2] = vrshr_n_s16(v[2], 3);
    v[3] = vrshr_n_s16(v[3], 3);

    /* Transpose back to row order. */
    dequant_idct_transpose_4x4(v);

    rows01 = vcombine_s16(v[0], v[1]);
    rows23 = vcombine_s16(v[2], v[3]);

    /* Add the destination bytes (widened to 16 bits) to the residual. */
    rows01 = vreinterpretq_s16_u16(
        vaddw_u8(vreinterpretq_u16_s16(rows01), vreinterpret_u8_s32(pix01)));
    rows23 = vreinterpretq_s16_u16(
        vaddw_u8(vreinterpretq_u16_s16(rows23), vreinterpret_u8_s32(pix23)));

    /* Narrow back to bytes with unsigned saturation. */
    pix01 = vreinterpret_s32_u8(vqmovun_s16(rows01));
    pix23 = vreinterpret_s32_u8(vqmovun_s16(rows23));

    /* Write the four rows back. */
    vst1_lane_s32((int32_t *)dst, pix01, 0);
    vst1_lane_s32((int32_t *)row1, pix01, 1);
    vst1_lane_s32((int32_t *)row2, pix23, 0);
    vst1_lane_s32((int32_t *)row3, pix23, 1);
}
Add libvpx thirdparty library Only necessary files 2016-09-14 20:10:03 +00:00			`/*`
			`* Copyright (c) 2014 The WebM project authors. All Rights Reserved.`
			`*`
			`* Use of this source code is governed by a BSD-style license`
			`* that can be found in the LICENSE file in the root of the source`
			`* tree. An additional intellectual property rights grant can be found`
			`* in the file PATENTS. All contributing project authors may`
			`* be found in the AUTHORS file in the root of the source tree.`
			`*/`

			`#include <arm_neon.h>`

			`static const int16_t cospi8sqrt2minus1 = 20091;`
			`static const int16_t sinpi8sqrt2 = 35468;`

			`void vp8_dequant_idct_add_neon(`
			`int16_t *input,`
			`int16_t *dq,`
			`unsigned char *dst,`
			`int stride) {`
			`unsigned char *dst0;`
			`int32x2_t d14, d15;`
			`int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;`
			`int16x8_t q1, q2, q3, q4, q5, q6;`
			`int16x8_t qEmpty = vdupq_n_s16(0);`
			`int32x2x2_t d2tmp0, d2tmp1;`
			`int16x4x2_t d2tmp2, d2tmp3;`

			`d14 = d15 = vdup_n_s32(0);`

			`// load input`
			`q3 = vld1q_s16(input);`
			`vst1q_s16(input, qEmpty);`
			`input += 8;`
			`q4 = vld1q_s16(input);`
			`vst1q_s16(input, qEmpty);`

			`// load dq`
			`q5 = vld1q_s16(dq);`
			`dq += 8;`
			`q6 = vld1q_s16(dq);`

			`// load src from dst`
			`dst0 = dst;`
			`d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);`
			`dst0 += stride;`
			`d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);`
			`dst0 += stride;`
			`d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);`
			`dst0 += stride;`
			`d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);`

			`q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),`
			`vreinterpretq_u16_s16(q5)));`
			`q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),`
			`vreinterpretq_u16_s16(q6)));`

			`d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));`
			`d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));`

			`q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));`

			`q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);`
			`q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);`

			`q3 = vshrq_n_s16(q3, 1);`
			`q4 = vshrq_n_s16(q4, 1);`

			`q3 = vqaddq_s16(q3, q2);`
			`q4 = vqaddq_s16(q4, q2);`

			`d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));`
			`d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));`

			`d2 = vqadd_s16(d12, d11);`
			`d3 = vqadd_s16(d13, d10);`
			`d4 = vqsub_s16(d13, d10);`
			`d5 = vqsub_s16(d12, d11);`

			`d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));`
			`d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));`
			`d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),`
			`vreinterpret_s16_s32(d2tmp1.val[0]));`
			`d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),`
			`vreinterpret_s16_s32(d2tmp1.val[1]));`

			`// loop 2`
			`q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);`

			`q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);`
			`q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);`

			`d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);`
			`d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);`

			`q3 = vshrq_n_s16(q3, 1);`
			`q4 = vshrq_n_s16(q4, 1);`

			`q3 = vqaddq_s16(q3, q2);`
			`q4 = vqaddq_s16(q4, q2);`

			`d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));`
			`d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));`

			`d2 = vqadd_s16(d12, d11);`
			`d3 = vqadd_s16(d13, d10);`
			`d4 = vqsub_s16(d13, d10);`
			`d5 = vqsub_s16(d12, d11);`

			`d2 = vrshr_n_s16(d2, 3);`
			`d3 = vrshr_n_s16(d3, 3);`
			`d4 = vrshr_n_s16(d4, 3);`
			`d5 = vrshr_n_s16(d5, 3);`

			`d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));`
			`d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));`
			`d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),`
			`vreinterpret_s16_s32(d2tmp1.val[0]));`
			`d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),`
			`vreinterpret_s16_s32(d2tmp1.val[1]));`

			`q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);`
			`q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);`

			`q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),`
			`vreinterpret_u8_s32(d14)));`
			`q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),`
			`vreinterpret_u8_s32(d15)));`

			`d14 = vreinterpret_s32_u8(vqmovun_s16(q1));`
			`d15 = vreinterpret_s32_u8(vqmovun_s16(q2));`

			`dst0 = dst;`
			`vst1_lane_s32((int32_t *)dst0, d14, 0);`
			`dst0 += stride;`
			`vst1_lane_s32((int32_t *)dst0, d14, 1);`
			`dst0 += stride;`
			`vst1_lane_s32((int32_t *)dst0, d15, 0);`
			`dst0 += stride;`
			`vst1_lane_s32((int32_t *)dst0, d15, 1);`
			`return;`
			`}`