summaryrefslogtreecommitdiff
path: root/media/libaom/src/av1/common/arm/selfguided_neon.c
diff options
context:
space:
mode:
Diffstat (limited to 'media/libaom/src/av1/common/arm/selfguided_neon.c')
-rw-r--r--media/libaom/src/av1/common/arm/selfguided_neon.c140
1 files changed, 111 insertions, 29 deletions
diff --git a/media/libaom/src/av1/common/arm/selfguided_neon.c b/media/libaom/src/av1/common/arm/selfguided_neon.c
index b3a37c4cb8..fc404a64ab 100644
--- a/media/libaom/src/av1/common/arm/selfguided_neon.c
+++ b/media/libaom/src/av1/common/arm/selfguided_neon.c
@@ -19,8 +19,8 @@
#include "aom_dsp/txfm_common.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
#include "av1/common/common.h"
-#include "av1/common/onyxc_int.h"
#include "av1/common/resize.h"
#include "av1/common/restoration.h"
#include "av1/common/arm/mem_neon.h"
@@ -86,7 +86,7 @@ static INLINE void calc_ab_fast_internal_common(
for (int x = 0; x < 4; x++) {
for (int y = 0; y < 4; y++) {
- dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+ dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
}
}
load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);
@@ -214,7 +214,7 @@ static INLINE void calc_ab_internal_common(
for (int x = 0; x < 4; x++) {
for (int y = 0; y < 8; y++) {
- dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+ dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
}
}
load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7);
@@ -376,6 +376,21 @@ static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
w -= 8;
count++;
} while (w > 0);
+
+ // memset needed for row pixels as 2nd stage of boxsum filter uses
+ // first 2 rows of dst16, dst2 buffer which is not filled in first stage.
+ for (int x = 0; x < 2; x++) {
+ memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16));
+ memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
+ }
+
+ // memset needed for extra columns as 2nd stage of boxsum filter uses
+ // last 2 columns of dst16, dst2 buffer which is not filled in first stage.
+ for (int x = 2; x < height + 2; x++) {
+ int dst_offset = x * dst_stride + width + 2;
+ memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16));
+ memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
+ }
}
{
@@ -467,7 +482,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,
const uint32_t n = (2 * r + 1) * (2 * r + 1);
const uint32x4_t const_n_val = vdupq_n_u32(n);
const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
- const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+ const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]);
const uint32x4_t const_val = vdupq_n_u32(255);
uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7;
@@ -509,6 +524,7 @@ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,
} while (h > 0);
}
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
uint16_t *B16, int32_t *B,
const int buf_stride, const int width,
@@ -522,7 +538,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
const uint32x4_t const_n_val = vdupq_n_u32(n);
const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
- const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+ const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]);
const uint32x4_t const_val = vdupq_n_u32(255);
int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
@@ -573,6 +589,7 @@ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
h -= (ht_inc * 4);
} while (h > 0);
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
int32_t *B, const int buf_stride,
@@ -584,7 +601,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
const uint32_t n = (2 * r + 1) * (2 * r + 1);
const uint32x4_t const_n_val = vdupq_n_u32(n);
const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
- const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+ const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]);
const uint32x4_t const_val = vdupq_n_u32(255);
int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
@@ -626,6 +643,7 @@ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
} while (h > 0);
}
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
int32_t *B, const int buf_stride,
const int width, const int height,
@@ -638,7 +656,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
const uint32x4_t const_n_val = vdupq_n_u32(n);
const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
- const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+ const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]);
const uint32x4_t const_val = vdupq_n_u32(255);
int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
@@ -679,6 +697,7 @@ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
h -= (ht_inc * 4);
} while (h > 0);
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
int32_t *dst2, const int dst_stride, const int width,
@@ -788,6 +807,21 @@ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
w -= 8;
count++;
} while (w > 0);
+
+ // memset needed for row pixels as 2nd stage of boxsum filter uses
+ // first 2 rows of dst1, dst2 buffer which is not filled in first stage.
+ for (int x = 0; x < 2; x++) {
+ memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1));
+ memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
+ }
+
+ // memset needed for extra columns as 2nd stage of boxsum filter uses
+ // last 2 columns of dst1, dst2 buffer which is not filled in first stage.
+ for (int x = 2; x < height + 2; x++) {
+ int dst_offset = x * dst_stride + width + 2;
+ memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1));
+ memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
+ }
}
{
@@ -1145,7 +1179,7 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width,
int32_t *dst, int dst_stride,
int bit_depth, int sgr_params_idx,
int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -1181,17 +1215,25 @@ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width,
// Calculation of a, b. a output is in 16bit tmp_buf which is in range of
// [1, 256] for all bit depths. b output is kept in 32bit buffer.
- if (8 == bit_depth) {
- calc_ab_fast_internal_lbd(
- (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
- (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
- params->s[radius_idx], 2);
- } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (bit_depth > 8) {
calc_ab_fast_internal_hbd(
(square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
(sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2,
bit_depth, r, params->s[radius_idx], 2);
+ } else {
+ calc_ab_fast_internal_lbd(
+ (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
+ (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
+ params->s[radius_idx], 2);
}
+#else
+ (void)bit_depth;
+ calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1),
+ (tmp16_buf - buf_stride - 1),
+ (sum_buf - buf_stride - 1), buf_stride * 2,
+ width + 2, height + 2, r, params->s[radius_idx], 2);
+#endif
final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16,
dgd_stride, dst, dst_stride, width, height);
}
@@ -1200,7 +1242,7 @@ static INLINE void restoration_internal(uint16_t *dgd16, int width, int height,
int dgd_stride, int32_t *dst,
int dst_stride, int bit_depth,
int sgr_params_idx, int radius_idx) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
const int r = params->r[radius_idx];
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
@@ -1235,19 +1277,27 @@ static INLINE void restoration_internal(uint16_t *dgd16, int width, int height,
A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+#if CONFIG_AV1_HIGHBITDEPTH
// Calculation of a, b. a output is in 16bit tmp_buf which is in range of
// [1, 256] for all bit depths. b output is kept in 32bit buffer.
- if (8 == bit_depth) {
- calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
+ if (bit_depth > 8) {
+ calc_ab_internal_hbd((square_sum_buf - buf_stride - 1),
(A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
(B - buf_stride - 1), buf_stride, width + 2,
- height + 2, r, params->s[radius_idx], 1);
+ height + 2, bit_depth, r, params->s[radius_idx], 1);
} else {
- calc_ab_internal_hbd((square_sum_buf - buf_stride - 1),
+ calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
(A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
(B - buf_stride - 1), buf_stride, width + 2,
- height + 2, bit_depth, r, params->s[radius_idx], 1);
+ height + 2, r, params->s[radius_idx], 1);
}
+#else
+ (void)bit_depth;
+ calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
+ (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
+ (B - buf_stride - 1), buf_stride, width + 2, height + 2,
+ r, params->s[radius_idx], 1);
+#endif
final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst,
dst_stride, width, height);
}
@@ -1299,8 +1349,14 @@ static INLINE void src_convert_u8_to_u16(const uint8_t *src,
dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride];
}
}
+
+ // memset uninitialized rows of src buffer as they are needed for the
+ // boxsum filter calculation.
+ for (int x = height; x < height + 5; x++)
+ memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
}
+#if CONFIG_AV1_HIGHBITDEPTH
static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
uint16_t *dst, const int dst_stride,
int width, int height) {
@@ -1339,13 +1395,18 @@ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride),
sizeof(uint16_t) * width);
}
+ // memset uninitialized rows of src buffer as they are needed for the
+ // boxsum filter calculation.
+ for (int x = height; x < height + 5; x++)
+ memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
}
+#endif // CONFIG_AV1_HIGHBITDEPTH
int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
int stride, int32_t *flt0, int32_t *flt1,
int flt_stride, int sgr_params_idx,
int bit_depth, int highbd) {
- const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
assert(!(params->r[0] == 0 && params->r[1] == 0));
uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
@@ -1356,6 +1417,7 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
const int dgd_stride = stride;
+#if CONFIG_AV1_HIGHBITDEPTH
if (highbd) {
const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
src_convert_hbd_copy(
@@ -1370,6 +1432,13 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
dgd16_stride, width_ext, height_ext);
}
+#else
+ (void)highbd;
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+#endif
if (params->r[0] > 0)
restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
@@ -1380,11 +1449,11 @@ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
return 0;
}
-void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
- int height, int stride, int eps,
- const int *xqd, uint8_t *dst8,
- int dst_stride, int32_t *tmpbuf,
- int bit_depth, int highbd) {
+void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
int32_t *flt0 = tmpbuf;
int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
assert(width * height <= RESTORATION_UNITPELS_MAX);
@@ -1395,11 +1464,12 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
const int dgd_stride = stride;
- const sgr_params_type *const params = &sgr_params[eps];
+ const sgr_params_type *const params = &av1_sgr_params[eps];
int xq[2];
assert(!(params->r[0] == 0 && params->r[1] == 0));
+#if CONFIG_AV1_HIGHBITDEPTH
if (highbd) {
const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
src_convert_hbd_copy(
@@ -1414,7 +1484,13 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
dgd16_stride, width_ext, height_ext);
}
-
+#else
+ (void)highbd;
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+#endif
if (params->r[0] > 0)
restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
bit_depth, eps, 0);
@@ -1422,7 +1498,7 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
bit_depth, eps, 1);
- decode_xq(xqd, xq, params);
+ av1_decode_xq(xqd, xq, params);
{
int16_t *src_ptr;
@@ -1485,6 +1561,7 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
+#if CONFIG_AV1_HIGHBITDEPTH
if (highbd) {
r4 = vminq_u16(r4, max);
vst1q_u16(dst16_ptr, r4);
@@ -1492,6 +1569,11 @@ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
t0 = vqmovn_u16(r4);
vst1_u8(dst_ptr, t0);
}
+#else
+ (void)max;
+ t0 = vqmovn_u16(r4);
+ vst1_u8(dst_ptr, t0);
+#endif
w -= 8;
count += 8;
dst_ptr += 8;