Diffstat (limited to 'media/libaom/src/aom_dsp/arm/sad_neon.c')
-rw-r--r-- | media/libaom/src/aom_dsp/arm/sad_neon.c | 356
1 file changed, 348 insertions(+), 8 deletions(-)
diff --git a/media/libaom/src/aom_dsp/arm/sad_neon.c b/media/libaom/src/aom_dsp/arm/sad_neon.c
index a39de91d60..4f0a1990ca 100644
--- a/media/libaom/src/aom_dsp/arm/sad_neon.c
+++ b/media/libaom/src/aom_dsp/arm/sad_neon.c
@@ -10,13 +10,12 @@
  */
 
 #include <arm_neon.h>
-
 #include "config/aom_config.h"
-
+#include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
 
-unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
-                              unsigned char *ref_ptr, int ref_stride) {
+unsigned int aom_sad8x16_neon(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride) {
   uint8x8_t d0, d8;
   uint16x8_t q12;
   uint32x4_t q1;
@@ -46,8 +45,8 @@ unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
   return vget_lane_u32(d5, 0);
 }
 
-unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
-                             unsigned char *ref_ptr, int ref_stride) {
+unsigned int aom_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
   uint8x8_t d0, d8;
   uint16x8_t q12;
   uint32x2_t d1;
@@ -74,8 +73,8 @@ unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
   return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
 }
 
-unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
-                              unsigned char *ref_ptr, int ref_stride) {
+unsigned int aom_sad16x8_neon(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride) {
   uint8x16_t q0, q4;
   uint16x8_t q12, q13;
   uint32x4_t q1;
@@ -164,6 +163,77 @@ unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
   return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
 }
 
+unsigned int aom_sad128x128_neon(const uint8_t *src, int src_stride,
+                                 const uint8_t *ref, int ref_stride) {
+  uint16x8_t vec_accum_lo, vec_accum_hi;
+  uint32x4_t vec_accum_32lo = vdupq_n_u32(0);
+  uint32x4_t vec_accum_32hi = vdupq_n_u32(0);
+  uint16x8_t tmp;
+  for (int i = 0; i < 128; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+    const uint8x16_t vec_src_64 = vld1q_u8(src + 64);
+    const uint8x16_t vec_src_80 = vld1q_u8(src + 80);
+    const uint8x16_t vec_src_96 = vld1q_u8(src + 96);
+    const uint8x16_t vec_src_112 = vld1q_u8(src + 112);
+    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+    const uint8x16_t vec_ref_64 = vld1q_u8(ref + 64);
+    const uint8x16_t vec_ref_80 = vld1q_u8(ref + 80);
+    const uint8x16_t vec_ref_96 = vld1q_u8(ref + 96);
+    const uint8x16_t vec_ref_112 = vld1q_u8(ref + 112);
+    src += src_stride;
+    ref += ref_stride;
+    vec_accum_lo = vdupq_n_u16(0);
+    vec_accum_hi = vdupq_n_u16(0);
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
+                            vget_low_u8(vec_ref_00));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+                            vget_high_u8(vec_ref_00));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+                            vget_low_u8(vec_ref_16));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+                            vget_high_u8(vec_ref_16));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
+                            vget_low_u8(vec_ref_32));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
+                            vget_high_u8(vec_ref_32));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
+                            vget_low_u8(vec_ref_48));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
+                            vget_high_u8(vec_ref_48));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_64),
+                            vget_low_u8(vec_ref_64));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_64),
+                            vget_high_u8(vec_ref_64));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_80),
+                            vget_low_u8(vec_ref_80));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_80),
+                            vget_high_u8(vec_ref_80));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_96),
+                            vget_low_u8(vec_ref_96));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_96),
+                            vget_high_u8(vec_ref_96));
+    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_112),
+                            vget_low_u8(vec_ref_112));
+    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_112),
+                            vget_high_u8(vec_ref_112));
+
+    tmp = vaddq_u16(vec_accum_lo, vec_accum_hi);
+    vec_accum_32lo = vaddw_u16(vec_accum_32lo, vget_low_u16(tmp));
+    vec_accum_32hi = vaddw_u16(vec_accum_32hi, vget_high_u16(tmp));
+  }
+  const uint32x4_t a = vaddq_u32(vec_accum_32lo, vec_accum_32hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
 unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
   int i;
@@ -222,3 +292,273 @@ unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
   }
   return horizontal_add_16x8(vec_accum);
 }
+
+static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint16x8_t q3 = vdupq_n_u16(0);
+
+    uint8x16_t q0 = vld1q_u8(src_ptr);
+    uint8x16_t q1 = vld1q_u8(ref_ptr);
+    uint8x16_t q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 16);
+    q1 = vld1q_u8(ref_ptr + 16);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 32);
+    q1 = vld1q_u8(ref_ptr + 32);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 48);
+    q1 = vld1q_u8(ref_ptr + 48);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 64);
+    q1 = vld1q_u8(ref_ptr + 64);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 80);
+    q1 = vld1q_u8(ref_ptr + 80);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 96);
+    q1 = vld1q_u8(ref_ptr + 96);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 112);
+    q1 = vld1q_u8(ref_ptr + 112);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    sum += horizontal_add_16x8(q3);
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint16x8_t q3 = vdupq_n_u16(0);
+
+    uint8x16_t q0 = vld1q_u8(src_ptr);
+    uint8x16_t q1 = vld1q_u8(ref_ptr);
+    uint8x16_t q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 16);
+    q1 = vld1q_u8(ref_ptr + 16);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 32);
+    q1 = vld1q_u8(ref_ptr + 32);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 48);
+    q1 = vld1q_u8(ref_ptr + 48);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    sum += horizontal_add_16x8(q3);
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint16x8_t q3 = vdupq_n_u16(0);
+
+    uint8x16_t q0 = vld1q_u8(src_ptr);
+    uint8x16_t q1 = vld1q_u8(ref_ptr);
+    uint8x16_t q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    q0 = vld1q_u8(src_ptr + 16);
+    q1 = vld1q_u8(ref_ptr + 16);
+    q2 = vabdq_u8(q0, q1);
+    q3 = vpadalq_u8(q3, q2);
+
+    sum += horizontal_add_16x8(q3);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int sum = 0;
+  for (int i = 0; i < h; i++) {
+    uint8x8_t q0 = vld1_u8(src_ptr);
+    uint8x8_t q1 = vld1_u8(ref_ptr);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 0);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 1);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 2);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 3);
+    q0 = vld1_u8(src_ptr + 8);
+    q1 = vld1_u8(ref_ptr + 8);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 0);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 1);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 2);
+    sum += vget_lane_u16(vpaddl_u8(vabd_u8(q0, q1)), 3);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  return sum;
+}
+
+static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t q3 = vdupq_n_u16(0);
+  for (int y = 0; y < h; y++) {
+    uint8x8_t q0 = vld1_u8(src_ptr);
+    uint8x8_t q1 = vld1_u8(ref_ptr);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    q3 = vabal_u8(q3, q0, q1);
+  }
+  return horizontal_add_16x8(q3);
+}
+
+static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t q3 = vdupq_n_u16(0);
+  uint32x2_t q0 = vdup_n_u32(0);
+  uint32x2_t q1 = vdup_n_u32(0);
+  uint32_t src4, ref4;
+  for (int y = 0; y < h / 2; y++) {
+    memcpy(&src4, src_ptr, 4);
+    memcpy(&ref4, ref_ptr, 4);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    q0 = vset_lane_u32(src4, q0, 0);
+    q1 = vset_lane_u32(ref4, q1, 0);
+
+    memcpy(&src4, src_ptr, 4);
+    memcpy(&ref4, ref_ptr, 4);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    q0 = vset_lane_u32(src4, q0, 1);
+    q1 = vset_lane_u32(ref4, q1, 1);
+
+    q3 = vabal_u8(q3, vreinterpret_u8_u32(q0), vreinterpret_u8_u32(q1));
+  }
+  return horizontal_add_16x8(q3);
+}
+
+#define FSADS128_H(h)                                                       \
+  unsigned int aom_sad_skip_128x##h##_neon(                                 \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride) {                                                     \
+    const uint32_t sum = sad128xh_neon(src_ptr, 2 * src_stride, ref_ptr,    \
+                                       2 * ref_stride, h / 2);              \
+    return 2 * sum;                                                         \
+  }
+
+FSADS128_H(128)
+FSADS128_H(64)
+
+#undef FSADS128_H
+
+#define FSADS64_H(h)                                                         \
+  unsigned int aom_sad_skip_64x##h##_neon(                                   \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return 2 * sad64xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                          \
+  }
+
+FSADS64_H(128)
+FSADS64_H(64)
+FSADS64_H(32)
+FSADS64_H(16)
+
+#undef FSADS64_H
+
+#define FSADS32_H(h)                                                         \
+  unsigned int aom_sad_skip_32x##h##_neon(                                   \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return 2 * sad32xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                          \
+  }
+
+FSADS32_H(64)
+FSADS32_H(32)
+FSADS32_H(16)
+FSADS32_H(8)
+
+#undef FSADS32_H
+
+#define FSADS16_H(h)                                                         \
+  unsigned int aom_sad_skip_16x##h##_neon(                                   \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return 2 * sad16xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                          \
+  }
+
+FSADS16_H(64)
+FSADS16_H(32)
+FSADS16_H(16)
+FSADS16_H(8)
+
+#undef FSADS16_H
+
+#define FSADS8_H(h)                                                          \
+  unsigned int aom_sad_skip_8x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return 2 * sad8xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                           h / 2);                                           \
+  }
+
+FSADS8_H(32)
+FSADS8_H(16)
+FSADS8_H(8)
+
+#undef FSADS8_H
+
+#define FSADS4_H(h)                                                          \
+  unsigned int aom_sad_skip_4x##h##_neon(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return 2 * sad4xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                           h / 2);                                           \
+  }
+
+FSADS4_H(16)
+FSADS4_H(8)
+
+#undef FSADS4_H
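
A note on accumulator widths in the new aom_sad128x128_neon: the kernel zeroes its 16-bit accumulators at the top of every row and folds each row's partial sums into the 32-bit lanes of vec_accum_32lo/vec_accum_32hi before moving on. The bound that appears to force this can be checked with a few lines of integer arithmetic; the following is a standalone sketch of that reasoning, not libaom code:

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* In aom_sad128x128_neon, each lane of a uint16x8_t accumulator absorbs
   * one absolute difference (<= 255) per vabal_u8 step, and there are 8
   * such steps per accumulator per row. */
  const uint32_t per_lane_per_row = 8u * 255u; /* 2040: one row fits u16 */
  assert(per_lane_per_row <= UINT16_MAX);
  /* Carrying all 128 rows in 16 bits would overflow, hence the per-row
   * widening into vec_accum_32lo/vec_accum_32hi. */
  assert(128u * per_lane_per_row > UINT16_MAX); /* 261120 */
  /* The widened total is safe: 128 * 128 * 255 = 4177920 < UINT32_MAX. */
  assert(128u * 128u * 255u <= UINT32_MAX);
  return 0;
}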
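
The aom_sad_skip_WxH_neon entry points added at the bottom of the file all follow one contract: compute the SAD over every other row (by doubling both strides and halving h) and double the result, approximating the full-block SAD at roughly half the memory traffic. A minimal scalar sketch of that contract, using a hypothetical helper name sad_skip_ref_c that is not part of libaom:

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar cross-check (not part of libaom): SAD over even
 * rows only, doubled -- the quantity the skip kernels above return. */
static unsigned int sad_skip_ref_c(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h) {
  unsigned int sum = 0;
  for (int y = 0; y < h; y += 2) { /* doubling the stride skips odd rows */
    for (int x = 0; x < w; ++x) {
      sum += abs(src[y * src_stride + x] - ref[y * ref_stride + x]);
    }
  }
  return 2 * sum; /* scale up to stand in for the skipped rows */
}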
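
Each FSADSn_H macro stamps out one such public entry point per supported height, delegating to the shared sadNxh_neon row loop. FSADS64_H(32), for instance, expands (modulo whitespace) to:

unsigned int aom_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride) {
  return 2 * sad64xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2,
                          32 / 2);
}

That is, the 64x32 skip SAD reads only 16 of the block's 32 rows and doubles the sum.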