diff options
Diffstat (limited to 'third_party/aom/av1/common/warped_motion.c')
-rw-r--r-- | third_party/aom/av1/common/warped_motion.c | 603 |
1 files changed, 459 insertions, 144 deletions
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c index fc832681a7..75ae08723e 100644 --- a/third_party/aom/av1/common/warped_motion.c +++ b/third_party/aom/av1/common/warped_motion.c @@ -17,6 +17,9 @@ #include "./av1_rtcd.h" #include "av1/common/warped_motion.h" +#include "av1/common/scale.h" + +#define WARP_ERROR_BLOCK 32 /* clang-format off */ static const int error_measure_lut[512] = { @@ -90,6 +93,8 @@ static const int error_measure_lut[512] = { static ProjectPointsFunc get_project_points_type(TransformationType type) { switch (type) { + case VERTRAPEZOID: return project_points_vertrapezoid; + case HORTRAPEZOID: return project_points_hortrapezoid; case HOMOGRAPHY: return project_points_homography; case AFFINE: return project_points_affine; case ROTZOOM: return project_points_rotzoom; @@ -279,29 +284,6 @@ void project_points_homography(const int32_t *mat, int *points, int *proj, } } -// 'points' are at original scale, output 'proj's are scaled up by -// 1 << WARPEDPIXEL_PREC_BITS -void project_points(const WarpedMotionParams *wm_params, int *points, int *proj, - const int n, const int stride_points, const int stride_proj, - const int subsampling_x, const int subsampling_y) { - switch (wm_params->wmtype) { - case AFFINE: - project_points_affine(wm_params->wmmat, points, proj, n, stride_points, - stride_proj, subsampling_x, subsampling_y); - break; - case ROTZOOM: - project_points_rotzoom(wm_params->wmmat, points, proj, n, stride_points, - stride_proj, subsampling_x, subsampling_y); - break; - case HOMOGRAPHY: - project_points_homography(wm_params->wmmat, points, proj, n, - stride_points, stride_proj, subsampling_x, - subsampling_y); - break; - default: assert(0 && "Invalid warped motion type!"); return; - } -} - static const int16_t filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = { #if WARPEDPIXEL_PREC_BITS == 6 @@ -911,11 +893,14 @@ static INLINE int highbd_error_measure(int err, int bd) { error_measure_lut[256 + e1] * e2; } -static void highbd_warp_plane_old( - const WarpedMotionParams *const wm, const uint8_t *const ref8, int width, - int height, int stride, const uint8_t *const pred8, int p_col, int p_row, - int p_width, int p_height, int p_stride, int subsampling_x, - int subsampling_y, int x_scale, int y_scale, int bd, int comp_avg) { +static void highbd_warp_plane_old(const WarpedMotionParams *const wm, + const uint8_t *const ref8, int width, + int height, int stride, + const uint8_t *const pred8, int p_col, + int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, + int subsampling_y, int x_scale, int y_scale, + int bd, ConvolveParams *conv_params) { int i, j; ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); @@ -929,7 +914,7 @@ static void highbd_warp_plane_old( projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y); out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4); out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4); - if (comp_avg) + if (conv_params->do_average) pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO( pred[(j - p_col) + (i - p_row) * p_stride] + highbd_warp_interpolate(ref, out[0], out[1], width, height, @@ -949,10 +934,10 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, - int subsampling_y, int bd, int comp_avg, - int16_t alpha, int16_t beta, int16_t gamma, - int16_t delta) { - uint32_t tmp[15 * 8]; + int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int32_t tmp[15 * 8]; int i, j, k, l, m; for (i = p_row; i < p_row + p_height; i += 8) { @@ -1037,7 +1022,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, assert(0 <= sum && sum < (1 << (bd + 2))); uint16_t px = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd); - if (comp_avg) + if (conv_params->do_average) *p = ROUND_POWER_OF_TWO(*p + px, 1); else *p = px; @@ -1048,18 +1033,125 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, } } +#if CONFIG_CONVOLVE_ROUND +void av1_highbd_warp_affine_post_round_c( + const int32_t *mat, const uint16_t *ref, int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + (void)pred; + (void)p_stride; + int32_t tmp[15 * 8]; + int i, j, k, l, m; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); + + for (i = p_row; i < p_row + p_height; i += 8) { + for (j = p_col; j < p_col + p_width; j += 8) { + int32_t x4, y4, ix4, sx4, iy4, sy4; + if (subsampling_x) + x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 + + (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; + else + x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0]; + + if (subsampling_y) + y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 + + (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; + else + y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1]; + + ix4 = x4 >> WARPEDMODEL_PREC_BITS; + sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + iy4 = y4 >> WARPEDMODEL_PREC_BITS; + sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + for (k = -7; k < 8; ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + int sx = sx4 + beta * (k + 4); + for (l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (m = 0; m < 8; ++m) { + int sample_x = ix + m; + if (sample_x < 0) + sample_x = 0; + else if (sample_x > width - 1) + sample_x = width - 1; + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0); + assert(0 <= sum && + sum < (1 << (bd + FILTER_BITS + 1 - conv_params->round_0))); + tmp[(k + 7) * 8 + (l + 4)] = sum; + sx += alpha; + } + } + + // Vertical filter + for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + for (l = -4; l < 4; ++l) { + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = warped_filter[offs]; + + int32_t sum = 1 << offset_bits_vert; + for (m = 0; m < 8; ++m) { + sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; + } + + sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + (1 << (offset_bits_horiz + FILTER_BITS - conv_params->round_0 - + conv_params->round_1)) - + (1 << (offset_bits_vert - conv_params->round_1)); + CONV_BUF_TYPE *p = + &conv_params->dst[(i - p_row + k + 4) * conv_params->dst_stride + + (j - p_col + l + 4)]; + *p += sum; + sy += gamma; + } + } + } + } +} +#endif + static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height, int stride, const uint8_t *const pred8, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int x_scale, - int y_scale, int bd, int comp_avg) { + int y_scale, int bd, + ConvolveParams *conv_params) { if (wm->wmtype == ROTZOOM) { wm->wmmat[5] = wm->wmmat[2]; wm->wmmat[4] = -wm->wmmat[3]; } - if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 && - y_scale == 16) { + if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && + x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) { const int32_t *const mat = wm->wmmat; const int16_t alpha = wm->alpha; const int16_t beta = wm->beta; @@ -1068,26 +1160,40 @@ static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8, const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); +#if CONFIG_CONVOLVE_ROUND + if (conv_params->round == CONVOLVE_OPT_NO_ROUND) { + conv_params->do_post_rounding = 1; + av1_highbd_warp_affine_post_round( + mat, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, bd, conv_params, + alpha, beta, gamma, delta); + } else { + av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, + p_row, p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); + } +#else av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, - subsampling_y, bd, comp_avg, alpha, beta, gamma, + subsampling_y, bd, conv_params, alpha, beta, gamma, delta); +#endif } else { highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, p_row, p_width, p_height, p_stride, subsampling_x, - subsampling_y, x_scale, y_scale, bd, comp_avg); + subsampling_y, x_scale, y_scale, bd, conv_params); } } static int64_t highbd_frame_error(const uint16_t *const ref, int stride, - const uint16_t *const dst, int p_col, - int p_row, int p_width, int p_height, - int p_stride, int bd) { + const uint16_t *const dst, int p_width, + int p_height, int p_stride, int bd) { int64_t sum_error = 0; for (int i = 0; i < p_height; ++i) { for (int j = 0; j < p_width; ++j) { - sum_error += highbd_error_measure( - dst[j + i * p_stride] - ref[(j + p_col) + (i + p_row) * stride], bd); + sum_error += + highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd); } } return sum_error; @@ -1097,19 +1203,31 @@ static int64_t highbd_warp_error( WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height, int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, - int x_scale, int y_scale, int bd) { + int x_scale, int y_scale, int bd, int64_t best_error) { int64_t gm_sumerr = 0; - uint16_t *tmp = aom_malloc(p_width * p_height * sizeof(*tmp)); - if (!tmp) return INT64_MAX; - - highbd_warp_plane(wm, ref8, width, height, stride, CONVERT_TO_BYTEPTR(tmp), - p_col, p_row, p_width, p_height, p_width, subsampling_x, - subsampling_y, x_scale, y_scale, bd, 0); - - gm_sumerr = highbd_frame_error(tmp, p_width, CONVERT_TO_SHORTPTR(dst8), p_col, - p_row, p_width, p_height, p_stride, bd); - - aom_free(tmp); + int warp_w, warp_h; + int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; + + ConvolveParams conv_params = get_conv_params(0, 0, 0); + for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { + for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { + // avoid warping extra 8x8 blocks in the padded region of the frame + // when p_width and p_height are not multiples of WARP_ERROR_BLOCK + warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); + warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); + highbd_warp_plane(wm, ref8, width, height, stride, + CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h, + WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale, + y_scale, bd, &conv_params); + + gm_sumerr += highbd_frame_error( + tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride, + warp_w, warp_h, p_stride, bd); + if (gm_sumerr > best_error) return gm_sumerr; + } + } return gm_sumerr; } #endif // CONFIG_HIGHBITDEPTH @@ -1123,7 +1241,7 @@ static void warp_plane_old(const WarpedMotionParams *const wm, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int x_scale, - int y_scale, int comp_avg) { + int y_scale, ConvolveParams *conv_params) { int i, j; ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype); if (projectpoints == NULL) return; @@ -1135,7 +1253,7 @@ static void warp_plane_old(const WarpedMotionParams *const wm, projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y); out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4); out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4); - if (comp_avg) + if (conv_params->do_average) pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO( pred[(j - p_col) + (i - p_row) * p_stride] + warp_interpolate(ref, out[0], out[1], width, height, stride), @@ -1235,10 +1353,10 @@ static void warp_plane_old(const WarpedMotionParams *const wm, void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int comp_avg, - int16_t alpha, int16_t beta, int16_t gamma, - int16_t delta) { - uint16_t tmp[15 * 8]; + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta) { + int32_t tmp[15 * 8]; int i, j, k, l, m; const int bd = 8; @@ -1329,7 +1447,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS); assert(0 <= sum && sum < (1 << (bd + 2))); uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd)); - if (comp_avg) + if (conv_params->do_average) *p = ROUND_POWER_OF_TWO(*p + px, 1); else *p = px; @@ -1340,41 +1458,170 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, } } +#if CONFIG_CONVOLVE_ROUND +void av1_warp_affine_post_round_c(const int32_t *mat, const uint8_t *ref, + int width, int height, int stride, + uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + (void)pred; + (void)p_stride; + int32_t tmp[15 * 8]; + int i, j, k, l, m; + const int bd = 8; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS); + + for (i = p_row; i < p_row + p_height; i += 8) { + for (j = p_col; j < p_col + p_width; j += 8) { + int32_t x4, y4, ix4, sx4, iy4, sy4; + if (subsampling_x) + x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 + + (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; + else + x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0]; + + if (subsampling_y) + y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 + + (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) / + 4; + else + y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1]; + + ix4 = x4 >> WARPEDMODEL_PREC_BITS; + sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + iy4 = y4 >> WARPEDMODEL_PREC_BITS; + sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + for (k = -7; k < 8; ++k) { + // Clamp to top/bottom edge of the frame + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + int sx = sx4 + beta * (k + 4); + + for (l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + // At this point, sx = sx4 + alpha * l + beta * k + const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (m = 0; m < 8; ++m) { + // Clamp to left/right edge of the frame + int sample_x = ix + m; + if (sample_x < 0) + sample_x = 0; + else if (sample_x > width - 1) + sample_x = width - 1; + + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0); + assert(0 <= sum && + sum < (1 << (bd + FILTER_BITS + 1 - conv_params->round_0))); + tmp[(k + 7) * 8 + (l + 4)] = sum; + sx += alpha; + } + } + + // Vertical filter + for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + // At this point, sy = sy4 + gamma * l + delta * k + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = warped_filter[offs]; + + int32_t sum = 1 << offset_bits_vert; + + for (m = 0; m < 8; ++m) { + sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; + } + + sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + (1 << (offset_bits_horiz + FILTER_BITS - conv_params->round_0 - + conv_params->round_1)) - + (1 << (offset_bits_vert - conv_params->round_1)); + CONV_BUF_TYPE *p = + &conv_params->dst[(i - p_row + k + 4) * conv_params->dst_stride + + (j - p_col + l + 4)]; + *p += sum; + sy += gamma; + } + } + } + } +} +#endif // CONFIG_CONVOLVE_ROUND + static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, - int x_scale, int y_scale, int comp_avg) { + int x_scale, int y_scale, ConvolveParams *conv_params) { if (wm->wmtype == ROTZOOM) { wm->wmmat[5] = wm->wmmat[2]; wm->wmmat[4] = -wm->wmmat[3]; } - if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 && - y_scale == 16) { + if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && + x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) { const int32_t *const mat = wm->wmmat; const int16_t alpha = wm->alpha; const int16_t beta = wm->beta; const int16_t gamma = wm->gamma; const int16_t delta = wm->delta; +#if CONFIG_CONVOLVE_ROUND + if (conv_params->round == CONVOLVE_OPT_NO_ROUND) { + conv_params->do_post_rounding = 1; + av1_warp_affine_post_round(mat, ref, width, height, stride, pred, p_col, + p_row, p_width, p_height, p_stride, + subsampling_x, subsampling_y, conv_params, + alpha, beta, gamma, delta); + } else { + av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, subsampling_y, + conv_params, alpha, beta, gamma, delta); + } +#else av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, - comp_avg, alpha, beta, gamma, delta); + conv_params, alpha, beta, gamma, delta); +#endif } else { warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, x_scale, - y_scale, comp_avg); + y_scale, conv_params); } } static int64_t frame_error(const uint8_t *const ref, int stride, - const uint8_t *const dst, int p_col, int p_row, - int p_width, int p_height, int p_stride) { + const uint8_t *const dst, int p_width, int p_height, + int p_stride) { int64_t sum_error = 0; for (int i = 0; i < p_height; ++i) { for (int j = 0; j < p_width; ++j) { - sum_error += (int64_t)error_measure( - dst[j + i * p_stride] - ref[(j + p_col) + (i + p_row) * stride]); + sum_error += + (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]); } } return sum_error; @@ -1385,19 +1632,29 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, const uint8_t *const dst, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int x_scale, - int y_scale) { + int y_scale, int64_t best_error) { int64_t gm_sumerr = 0; - uint8_t *tmp = aom_malloc(p_width * p_height); - if (!tmp) return INT64_MAX; - - warp_plane(wm, ref, width, height, stride, tmp, p_col, p_row, p_width, - p_height, p_width, subsampling_x, subsampling_y, x_scale, y_scale, - 0); - - gm_sumerr = - frame_error(tmp, p_width, dst, p_col, p_row, p_width, p_height, p_stride); - - aom_free(tmp); + int warp_w, warp_h; + int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; + ConvolveParams conv_params = get_conv_params(0, 0, 0); + + for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { + for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { + // avoid warping extra 8x8 blocks in the padded region of the frame + // when p_width and p_height are not multiples of WARP_ERROR_BLOCK + warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); + warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); + warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h, + WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale, + y_scale, &conv_params); + + gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride, + warp_w, warp_h, p_stride); + if (gm_sumerr > best_error) return gm_sumerr; + } + } return gm_sumerr; } @@ -1405,17 +1662,16 @@ int64_t av1_frame_error( #if CONFIG_HIGHBITDEPTH int use_hbd, int bd, #endif // CONFIG_HIGHBITDEPTH - const uint8_t *ref, int stride, uint8_t *dst, int p_col, int p_row, - int p_width, int p_height, int p_stride) { + const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height, + int p_stride) { #if CONFIG_HIGHBITDEPTH if (use_hbd) { return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride, - CONVERT_TO_SHORTPTR(dst), p_col, p_row, p_width, - p_height, p_stride, bd); + CONVERT_TO_SHORTPTR(dst), p_width, p_height, + p_stride, bd); } #endif // CONFIG_HIGHBITDEPTH - return frame_error(ref, stride, dst, p_col, p_row, p_width, p_height, - p_stride); + return frame_error(ref, stride, dst, p_width, p_height, p_stride); } int64_t av1_warp_error(WarpedMotionParams *wm, @@ -1425,18 +1681,19 @@ int64_t av1_warp_error(WarpedMotionParams *wm, const uint8_t *ref, int width, int height, int stride, uint8_t *dst, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, - int subsampling_y, int x_scale, int y_scale) { + int subsampling_y, int x_scale, int y_scale, + int64_t best_error) { if (wm->wmtype <= AFFINE) if (!get_shear_params(wm)) return 1; #if CONFIG_HIGHBITDEPTH if (use_hbd) return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width, p_height, p_stride, subsampling_x, - subsampling_y, x_scale, y_scale, bd); + subsampling_y, x_scale, y_scale, bd, best_error); #endif // CONFIG_HIGHBITDEPTH return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, x_scale, - y_scale); + y_scale, best_error); } void av1_warp_plane(WarpedMotionParams *wm, @@ -1446,17 +1703,18 @@ void av1_warp_plane(WarpedMotionParams *wm, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, - int subsampling_y, int x_scale, int y_scale, int comp_avg) { + int subsampling_y, int x_scale, int y_scale, + ConvolveParams *conv_params) { #if CONFIG_HIGHBITDEPTH if (use_hbd) highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, - x_scale, y_scale, bd, comp_avg); + x_scale, y_scale, bd, conv_params); else #endif // CONFIG_HIGHBITDEPTH warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, x_scale, - y_scale, comp_avg); + y_scale, conv_params); } #if CONFIG_WARPED_MOTION @@ -1492,6 +1750,83 @@ void av1_warp_plane(WarpedMotionParams *wm, #define LS_PRODUCT2(a, b) \ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2) +#define USE_LIMITED_PREC_MULT 0 + +#if USE_LIMITED_PREC_MULT + +#define MUL_PREC_BITS 16 +static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) { + int msb = 0; + uint16_t mult = 0; + *shift = 0; + if (D != 0) { + msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 + : get_msb((unsigned int)D)); + if (msb >= MUL_PREC_BITS) { + mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS); + *shift = msb + 1 - MUL_PREC_BITS; + } else { + mult = (uint16_t)D; + *shift = 0; + } + } + return mult; +} + +static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) { + int32_t ret; + int16_t mshift; + uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift); + int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1); + shift -= mshift; + if (shift > 0) { + return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } else { + return (int32_t)clamp(v * (1 << (-shift)), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } + return ret; +} + +static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { + int16_t mshift; + uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift); + int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1); + shift -= mshift; + if (shift > 0) { + return (int32_t)clamp( + ROUND_POWER_OF_TWO_SIGNED(v, shift), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } else { + return (int32_t)clamp( + v * (1 << (-shift)), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } +} + +#else + +static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) { + int64_t v = Px * (int64_t)iDet; + return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); +} + +static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { + int64_t v = Px * (int64_t)iDet; + return (int32_t)clamp64( + ROUND_POWER_OF_TWO_SIGNED_64(v, shift), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); +} +#endif // USE_LIMITED_PREC_MULT + static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, int mvx, WarpedMotionParams *wm, int mi_row, int mi_col) { @@ -1502,8 +1837,10 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; - const int suy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1) * 8; - const int sux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1) * 8; + const int isuy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1); + const int isux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1); + const int suy = isuy * 8; + const int sux = isux * 8; const int duy = suy + mvy; const int dux = sux + mvx; @@ -1590,61 +1927,39 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, shift = 0; } - int64_t v; - v = Px[0] * (int64_t)iDet; - wm->wmmat[2] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift)); - v = Px[1] * (int64_t)iDet; - wm->wmmat[3] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift)); - v = ((int64_t)dux * (1 << WARPEDMODEL_PREC_BITS)) - - (int64_t)sux * wm->wmmat[2] - (int64_t)suy * wm->wmmat[3]; - wm->wmmat[0] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3)); - - v = Py[0] * (int64_t)iDet; - wm->wmmat[4] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift)); - v = Py[1] * (int64_t)iDet; - wm->wmmat[5] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift)); - v = ((int64_t)duy * (1 << WARPEDMODEL_PREC_BITS)) - - (int64_t)sux * wm->wmmat[4] - (int64_t)suy * wm->wmmat[5]; - wm->wmmat[1] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3)); + wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift); + wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift); + wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift); + wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift); + + // Note: In the vx, vy expressions below, the max value of each of the + // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room + // for the first term so that the overall sum in the worst case fits + // within 32 bits overall. + int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) + + isuy * wm->wmmat[3]); + int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * wm->wmmat[4] + + isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS))); + wm->wmmat[0] = + clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); + wm->wmmat[1] = + clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); wm->wmmat[6] = wm->wmmat[7] = 0; - - // Clamp values - wm->wmmat[0] = clamp(wm->wmmat[0], -WARPEDMODEL_TRANS_CLAMP, - WARPEDMODEL_TRANS_CLAMP - 1); - wm->wmmat[1] = clamp(wm->wmmat[1], -WARPEDMODEL_TRANS_CLAMP, - WARPEDMODEL_TRANS_CLAMP - 1); - wm->wmmat[2] = clamp(wm->wmmat[2], -WARPEDMODEL_DIAGAFFINE_CLAMP, - WARPEDMODEL_DIAGAFFINE_CLAMP - 1); - wm->wmmat[5] = clamp(wm->wmmat[5], -WARPEDMODEL_DIAGAFFINE_CLAMP, - WARPEDMODEL_DIAGAFFINE_CLAMP - 1); - wm->wmmat[3] = clamp(wm->wmmat[3], -WARPEDMODEL_NONDIAGAFFINE_CLAMP, - WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); - wm->wmmat[4] = clamp(wm->wmmat[4], -WARPEDMODEL_NONDIAGAFFINE_CLAMP, - WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); return 0; } int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy, int mvx, WarpedMotionParams *wm_params, int mi_row, int mi_col) { - int result = 1; - switch (wm_params->wmtype) { - case AFFINE: - result = find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, - mi_row, mi_col); - break; - default: assert(0 && "Invalid warped motion type!"); return 1; - } + assert(wm_params->wmtype == AFFINE); + const int result = find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, + mi_row, mi_col); if (result == 0) { - if (wm_params->wmtype == ROTZOOM) { - wm_params->wmmat[5] = wm_params->wmmat[2]; - wm_params->wmmat[4] = -wm_params->wmmat[3]; - } - if (wm_params->wmtype == AFFINE || wm_params->wmtype == ROTZOOM) { - // check compatibility with the fast warp filter - if (!get_shear_params(wm_params)) return 1; - } + // check compatibility with the fast warp filter + if (!get_shear_params(wm_params)) return 1; } return result; |