summaryrefslogtreecommitdiff
path: root/media/libwebp/dsp/dec_neon.c
diff options
context:
space:
mode:
Diffstat (limited to 'media/libwebp/dsp/dec_neon.c')
-rw-r--r--media/libwebp/dsp/dec_neon.c73
1 files changed, 42 insertions, 31 deletions
diff --git a/media/libwebp/dsp/dec_neon.c b/media/libwebp/dsp/dec_neon.c
index e8341327e4..ea690646d6 100644
--- a/media/libwebp/dsp/dec_neon.c
+++ b/media/libwebp/dsp/dec_neon.c
@@ -1283,12 +1283,12 @@ static void DC4_NEON(uint8_t* dst) { // DC
const uint8x8_t A = vld1_u8(dst - BPS); // top row
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
- const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
- const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
- const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
- const uint16x8_t s0 = vaddq_u16(L0, L1);
- const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+ const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+ const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+ const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+ const uint16x8_t s0 = vaddl_u8(L0, L1);
+ const uint16x8_t s1 = vaddl_u8(L2, L3);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
@@ -1361,7 +1361,8 @@ static void RD4_NEON(uint8_t* dst) { // Down-right
const uint32_t J = dst[-1 + 1 * BPS];
const uint32_t K = dst[-1 + 2 * BPS];
const uint32_t L = dst[-1 + 3 * BPS];
- const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
+ const uint64x1_t LKJI____ =
+ vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
@@ -1427,25 +1428,30 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) {
const uint8x8_t A = vld1_u8(dst - BPS); // top row
+#if defined(__aarch64__)
+ const uint16_t p2 = vaddlv_u8(A);
+ sum_top = vdupq_n_u16(p2);
+#else
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
const uint16x4_t p1 = vpadd_u16(p0, p0);
const uint16x4_t p2 = vpadd_u16(p1, p1);
sum_top = vcombine_u16(p2, p2);
+#endif
}
if (do_left) {
- const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
- const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
- const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
- const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
- const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
- const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
- const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
- const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
- const uint16x8_t s0 = vaddq_u16(L0, L1);
- const uint16x8_t s1 = vaddq_u16(L2, L3);
- const uint16x8_t s2 = vaddq_u16(L4, L5);
- const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+ const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+ const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+ const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+ const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
+ const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
+ const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
+ const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
+ const uint16x8_t s0 = vaddl_u8(L0, L1);
+ const uint16x8_t s1 = vaddl_u8(L2, L3);
+ const uint16x8_t s2 = vaddl_u8(L4, L5);
+ const uint16x8_t s3 = vaddl_u8(L6, L7);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t s23 = vaddq_u16(s2, s3);
sum_left = vaddq_u16(s01, s23);
@@ -1505,29 +1511,34 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) {
const uint8x16_t A = vld1q_u8(dst - BPS); // top row
+#if defined(__aarch64__)
+ const uint16_t p3 = vaddlvq_u8(A);
+ sum_top = vdupq_n_u16(p3);
+#else
const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
const uint16x4_t p2 = vpadd_u16(p1, p1);
const uint16x4_t p3 = vpadd_u16(p2, p2);
sum_top = vcombine_u16(p3, p3);
+#endif
}
if (do_left) {
int i;
sum_left = vdupq_n_u16(0);
for (i = 0; i < 16; i += 8) {
- const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
- const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
- const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
- const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
- const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
- const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
- const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
- const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
- const uint16x8_t s0 = vaddq_u16(L0, L1);
- const uint16x8_t s1 = vaddq_u16(L2, L3);
- const uint16x8_t s2 = vaddq_u16(L4, L5);
- const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
+ const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
+ const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
+ const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
+ const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
+ const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
+ const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
+ const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
+ const uint16x8_t s0 = vaddl_u8(L0, L1);
+ const uint16x8_t s1 = vaddl_u8(L2, L3);
+ const uint16x8_t s2 = vaddl_u8(L4, L5);
+ const uint16x8_t s3 = vaddl_u8(L6, L7);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t s23 = vaddq_u16(s2, s3);
const uint16x8_t sum = vaddq_u16(s01, s23);