1 files changed, 459 insertions, 144 deletions
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
index fc832681a7..75ae08723e 100644
--- a/third_party/aom/av1/common/warped_motion.c
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -17,6 +17,9 @@
 
 #include "./av1_rtcd.h"
 #include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+#define WARP_ERROR_BLOCK 32
 
 /* clang-format off */
 static const int error_measure_lut[512] = {
@@ -90,6 +93,8 @@ static const int error_measure_lut[512] = {
 
 static ProjectPointsFunc get_project_points_type(TransformationType type) {
   switch (type) {
+    case VERTRAPEZOID: return project_points_vertrapezoid;
+    case HORTRAPEZOID: return project_points_hortrapezoid;
     case HOMOGRAPHY: return project_points_homography;
     case AFFINE: return project_points_affine;
     case ROTZOOM: return project_points_rotzoom;
@@ -279,29 +284,6 @@ void project_points_homography(const int32_t *mat, int *points, int *proj,
   }
 }
 
-// 'points' are at original scale, output 'proj's are scaled up by
-// 1 << WARPEDPIXEL_PREC_BITS
-void project_points(const WarpedMotionParams *wm_params, int *points, int *proj,
-                    const int n, const int stride_points, const int stride_proj,
-                    const int subsampling_x, const int subsampling_y) {
-  switch (wm_params->wmtype) {
-    case AFFINE:
-      project_points_affine(wm_params->wmmat, points, proj, n, stride_points,
-                            stride_proj, subsampling_x, subsampling_y);
-      break;
-    case ROTZOOM:
-      project_points_rotzoom(wm_params->wmmat, points, proj, n, stride_points,
-                             stride_proj, subsampling_x, subsampling_y);
-      break;
-    case HOMOGRAPHY:
-      project_points_homography(wm_params->wmmat, points, proj, n,
-                                stride_points, stride_proj, subsampling_x,
-                                subsampling_y);
-      break;
-    default: assert(0 && "Invalid warped motion type!"); return;
-  }
-}
-
 static const int16_t
     filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = {
 #if WARPEDPIXEL_PREC_BITS == 6
@@ -911,11 +893,14 @@ static INLINE int highbd_error_measure(int err, int bd) {
          error_measure_lut[256 + e1] * e2;
 }
 
-static void highbd_warp_plane_old(
-    const WarpedMotionParams *const wm, const uint8_t *const ref8, int width,
-    int height, int stride, const uint8_t *const pred8, int p_col, int p_row,
-    int p_width, int p_height, int p_stride, int subsampling_x,
-    int subsampling_y, int x_scale, int y_scale, int bd, int comp_avg) {
+static void highbd_warp_plane_old(const WarpedMotionParams *const wm,
+                                  const uint8_t *const ref8, int width,
+                                  int height, int stride,
+                                  const uint8_t *const pred8, int p_col,
+                                  int p_row, int p_width, int p_height,
+                                  int p_stride, int subsampling_x,
+                                  int subsampling_y, int x_scale, int y_scale,
+                                  int bd, ConvolveParams *conv_params) {
   int i, j;
   ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
@@ -929,7 +914,7 @@ static void highbd_warp_plane_old(
       projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
       out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
       out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
-      if (comp_avg)
+      if (conv_params->do_average)
         pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
             pred[(j - p_col) + (i - p_row) * p_stride] +
                 highbd_warp_interpolate(ref, out[0], out[1], width, height,
@@ -949,10 +934,10 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                               int width, int height, int stride, uint16_t *pred,
                               int p_col, int p_row, int p_width, int p_height,
                               int p_stride, int subsampling_x,
-                              int subsampling_y, int bd, int comp_avg,
-                              int16_t alpha, int16_t beta, int16_t gamma,
-                              int16_t delta) {
-  uint32_t tmp[15 * 8];
+                              int subsampling_y, int bd,
+                              ConvolveParams *conv_params, int16_t alpha,
+                              int16_t beta, int16_t gamma, int16_t delta) {
+  int32_t tmp[15 * 8];
   int i, j, k, l, m;
 
   for (i = p_row; i < p_row + p_height; i += 8) {
@@ -1037,7 +1022,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
           assert(0 <= sum && sum < (1 << (bd + 2)));
           uint16_t px =
               clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
-          if (comp_avg)
+          if (conv_params->do_average)
             *p = ROUND_POWER_OF_TWO(*p + px, 1);
           else
             *p = px;
@@ -1048,18 +1033,125 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
   }
 }
 
+#if CONFIG_CONVOLVE_ROUND
+void av1_highbd_warp_affine_post_round_c(
+    const int32_t *mat, const uint16_t *ref, int width, int height, int stride,
+    uint16_t *pred, int p_col, int p_row, int p_width, int p_height,
+    int p_stride, int subsampling_x, int subsampling_y, int bd,
+    ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma,
+    int16_t delta) {
+  (void)pred;      // unused: results are accumulated into conv_params->dst
+  (void)p_stride;  // unused: rows are addressed with conv_params->dst_stride
+  int32_t tmp[15 * 8];  // 15 rows x 8 horizontally filtered samples
+  int i, j, k, l, m;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - conv_params->round_0;
+  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
+  // Walk the p_width x p_height block in 8x8 tiles.
+  for (i = p_row; i < p_row + p_height; i += 8) {
+    for (j = p_col; j < p_col + p_width; j += 8) {
+      int32_t x4, y4, ix4, sx4, iy4, sy4;
+      if (subsampling_x)
+        x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 +
+              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
+      else
+        x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+      if (subsampling_y)
+        y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 +
+              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
+      else
+        y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+      // Split x4/y4 into integer (ix4, iy4) and fractional (sx4, sy4) parts.
+      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      // Step the fractional parts back by 4 columns and 4 rows.
+      sx4 += alpha * (-4) + beta * (-4);
+      sy4 += gamma * (-4) + delta * (-4);
+      // Clear the low WARP_PARAM_REDUCE_BITS bits of both offsets.
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+      // Horizontal filter: fills all 15 rows of tmp
+      for (k = -7; k < 8; ++k) {
+        int iy = iy4 + k;  // source row, clamped to [0, height - 1] below
+        if (iy < 0)
+          iy = 0;
+        else if (iy > height - 1)
+          iy = height - 1;
+
+        int sx = sx4 + beta * (k + 4);
+        for (l = -4; l < 4; ++l) {
+          int ix = ix4 + l - 3;
+          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+                           WARPEDPIXEL_PREC_SHIFTS;
+          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+          const int16_t *coeffs = warped_filter[offs];
+
+          int32_t sum = 1 << offset_bits_horiz;
+          for (m = 0; m < 8; ++m) {
+            int sample_x = ix + m;  // clamped to [0, width - 1] below
+            if (sample_x < 0)
+              sample_x = 0;
+            else if (sample_x > width - 1)
+              sample_x = width - 1;
+            sum += ref[iy * stride + sample_x] * coeffs[m];
+          }
+          sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+          assert(0 <= sum &&
+                 sum < (1 << (bd + FILTER_BITS + 1 - conv_params->round_0)));
+          tmp[(k + 7) * 8 + (l + 4)] = sum;
+          sx += alpha;
+        }
+      }
+
+      // Vertical filter: rows and columns are clipped to the predicted block
+      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+        int sy = sy4 + delta * (k + 4);
+        for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+                           WARPEDPIXEL_PREC_SHIFTS;
+          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+          const int16_t *coeffs = warped_filter[offs];
+
+          int32_t sum = 1 << offset_bits_vert;
+          for (m = 0; m < 8; ++m) {
+            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+          }
+          // Round, then subtract the scaled offsets that seeded both sums.
+          sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                (1 << (offset_bits_horiz + FILTER_BITS - conv_params->round_0 -
+                       conv_params->round_1)) -
+                (1 << (offset_bits_vert - conv_params->round_1));
+          CONV_BUF_TYPE *p =
+              &conv_params->dst[(i - p_row + k + 4) * conv_params->dst_stride +
+                                (j - p_col + l + 4)];
+          *p += sum;  // accumulate into the convolve buffer
+          sy += gamma;
+        }
+      }
+    }
+  }
+}
+#endif
+
 static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
                               int width, int height, int stride,
                               const uint8_t *const pred8, int p_col, int p_row,
                               int p_width, int p_height, int p_stride,
                               int subsampling_x, int subsampling_y, int x_scale,
-                              int y_scale, int bd, int comp_avg) {
+                              int y_scale, int bd,
+                              ConvolveParams *conv_params) {
   if (wm->wmtype == ROTZOOM) {
     wm->wmmat[5] = wm->wmmat[2];
     wm->wmmat[4] = -wm->wmmat[3];
   }
-  if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 &&
-      y_scale == 16) {
+  if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) &&
+      x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) {
     const int32_t *const mat = wm->wmmat;
     const int16_t alpha = wm->alpha;
     const int16_t beta = wm->beta;
@@ -1068,26 +1160,40 @@ static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
 
     const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
     uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+#if CONFIG_CONVOLVE_ROUND
+    if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
+      conv_params->do_post_rounding = 1;
+      av1_highbd_warp_affine_post_round(
+          mat, ref, width, height, stride, pred, p_col, p_row, p_width,
+          p_height, p_stride, subsampling_x, subsampling_y, bd, conv_params,
+          alpha, beta, gamma, delta);
+    } else {
+      av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col,
+                             p_row, p_width, p_height, p_stride, subsampling_x,
+                             subsampling_y, bd, conv_params, alpha, beta, gamma,
+                             delta);
+    }
+#else
     av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                            p_width, p_height, p_stride, subsampling_x,
-                           subsampling_y, bd, comp_avg, alpha, beta, gamma,
+                           subsampling_y, bd, conv_params, alpha, beta, gamma,
                            delta);
+#endif
   } else {
     highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, p_row,
                           p_width, p_height, p_stride, subsampling_x,
-                          subsampling_y, x_scale, y_scale, bd, comp_avg);
+                          subsampling_y, x_scale, y_scale, bd, conv_params);
   }
 }
 
 static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
-                                  const uint16_t *const dst, int p_col,
-                                  int p_row, int p_width, int p_height,
-                                  int p_stride, int bd) {
+                                  const uint16_t *const dst, int p_width,
+                                  int p_height, int p_stride, int bd) {
   int64_t sum_error = 0;
   for (int i = 0; i < p_height; ++i) {
     for (int j = 0; j < p_width; ++j) {
-      sum_error += highbd_error_measure(
-          dst[j + i * p_stride] - ref[(j + p_col) + (i + p_row) * stride], bd);
+      sum_error +=
+          highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
     }
   }
   return sum_error;
@@ -1097,19 +1203,31 @@ static int64_t highbd_warp_error(
     WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
     int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
     int p_height, int p_stride, int subsampling_x, int subsampling_y,
-    int x_scale, int y_scale, int bd) {
+    int x_scale, int y_scale, int bd, int64_t best_error) {
   int64_t gm_sumerr = 0;
-  uint16_t *tmp = aom_malloc(p_width * p_height * sizeof(*tmp));
-  if (!tmp) return INT64_MAX;
-
-  highbd_warp_plane(wm, ref8, width, height, stride, CONVERT_TO_BYTEPTR(tmp),
-                    p_col, p_row, p_width, p_height, p_width, subsampling_x,
-                    subsampling_y, x_scale, y_scale, bd, 0);
-
-  gm_sumerr = highbd_frame_error(tmp, p_width, CONVERT_TO_SHORTPTR(dst8), p_col,
-                                 p_row, p_width, p_height, p_stride, bd);
-
-  aom_free(tmp);
+  int warp_w, warp_h;
+  int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+  int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+  uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
+
+  ConvolveParams conv_params = get_conv_params(0, 0, 0);
+  for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+    for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+      // avoid warping extra 8x8 blocks in the padded region of the frame
+      // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
+      warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
+      warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
+      highbd_warp_plane(wm, ref8, width, height, stride,
+                        CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
+                        WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale,
+                        y_scale, bd, &conv_params);
+
+      gm_sumerr += highbd_frame_error(
+          tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
+          warp_w, warp_h, p_stride, bd);
+      if (gm_sumerr > best_error) return gm_sumerr;
+    }
+  }
   return gm_sumerr;
 }
 #endif  // CONFIG_HIGHBITDEPTH
@@ -1123,7 +1241,7 @@ static void warp_plane_old(const WarpedMotionParams *const wm,
                            int stride, uint8_t *pred, int p_col, int p_row,
                            int p_width, int p_height, int p_stride,
                            int subsampling_x, int subsampling_y, int x_scale,
-                           int y_scale, int comp_avg) {
+                           int y_scale, ConvolveParams *conv_params) {
   int i, j;
   ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
   if (projectpoints == NULL) return;
@@ -1135,7 +1253,7 @@ static void warp_plane_old(const WarpedMotionParams *const wm,
       projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
       out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
       out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
-      if (comp_avg)
+      if (conv_params->do_average)
         pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
             pred[(j - p_col) + (i - p_row) * p_stride] +
                 warp_interpolate(ref, out[0], out[1], width, height, stride),
@@ -1235,10 +1353,10 @@ static void warp_plane_old(const WarpedMotionParams *const wm,
 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                        int height, int stride, uint8_t *pred, int p_col,
                        int p_row, int p_width, int p_height, int p_stride,
-                       int subsampling_x, int subsampling_y, int comp_avg,
-                       int16_t alpha, int16_t beta, int16_t gamma,
-                       int16_t delta) {
-  uint16_t tmp[15 * 8];
+                       int subsampling_x, int subsampling_y,
+                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
+                       int16_t gamma, int16_t delta) {
+  int32_t tmp[15 * 8];
   int i, j, k, l, m;
   const int bd = 8;
 
@@ -1329,7 +1447,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
           assert(0 <= sum && sum < (1 << (bd + 2)));
           uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
-          if (comp_avg)
+          if (conv_params->do_average)
             *p = ROUND_POWER_OF_TWO(*p + px, 1);
           else
             *p = px;
@@ -1340,41 +1458,170 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
   }
 }
 
+#if CONFIG_CONVOLVE_ROUND
+void av1_warp_affine_post_round_c(const int32_t *mat, const uint8_t *ref,
+                                  int width, int height, int stride,
+                                  uint8_t *pred, int p_col, int p_row,
+                                  int p_width, int p_height, int p_stride,
+                                  int subsampling_x, int subsampling_y,
+                                  ConvolveParams *conv_params, int16_t alpha,
+                                  int16_t beta, int16_t gamma, int16_t delta) {
+  (void)pred;      // unused: results are accumulated into conv_params->dst
+  (void)p_stride;  // unused: rows are addressed with conv_params->dst_stride
+  int32_t tmp[15 * 8];  // 15 rows x 8 horizontally filtered samples
+  int i, j, k, l, m;
+  const int bd = 8;  // ref holds uint8_t samples
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - conv_params->round_0;
+  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
+  // Walk the p_width x p_height block in 8x8 tiles.
+  for (i = p_row; i < p_row + p_height; i += 8) {
+    for (j = p_col; j < p_col + p_width; j += 8) {
+      int32_t x4, y4, ix4, sx4, iy4, sy4;
+      if (subsampling_x)
+        x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 +
+              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
+      else
+        x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+      if (subsampling_y)
+        y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 +
+              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
+      else
+        y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+      // Split x4/y4 into integer (ix4, iy4) and fractional (sx4, sy4) parts.
+      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      // Step the fractional parts back by 4 columns and 4 rows.
+      sx4 += alpha * (-4) + beta * (-4);
+      sy4 += gamma * (-4) + delta * (-4);
+      // Clear the low WARP_PARAM_REDUCE_BITS bits of both offsets.
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+      // Horizontal filter: fills all 15 rows of tmp
+      for (k = -7; k < 8; ++k) {
+        // Clamp the source row to [0, height - 1] (top/bottom frame edge)
+        int iy = iy4 + k;
+        if (iy < 0)
+          iy = 0;
+        else if (iy > height - 1)
+          iy = height - 1;
+
+        int sx = sx4 + beta * (k + 4);
+
+        for (l = -4; l < 4; ++l) {
+          int ix = ix4 + l - 3;
+          // Here sx == sx4 + alpha * (l + 4) + beta * (k + 4)
+          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+                           WARPEDPIXEL_PREC_SHIFTS;
+          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+          const int16_t *coeffs = warped_filter[offs];
+
+          int32_t sum = 1 << offset_bits_horiz;
+          for (m = 0; m < 8; ++m) {
+            // Clamp the source column to [0, width - 1] (left/right edge)
+            int sample_x = ix + m;
+            if (sample_x < 0)
+              sample_x = 0;
+            else if (sample_x > width - 1)
+              sample_x = width - 1;
+
+            sum += ref[iy * stride + sample_x] * coeffs[m];
+          }
+          sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+          assert(0 <= sum &&
+                 sum < (1 << (bd + FILTER_BITS + 1 - conv_params->round_0)));
+          tmp[(k + 7) * 8 + (l + 4)] = sum;
+          sx += alpha;
+        }
+      }
+
+      // Vertical filter: rows and columns are clipped to the predicted block
+      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+        int sy = sy4 + delta * (k + 4);
+        for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+          // Here sy == sy4 + gamma * (l + 4) + delta * (k + 4)
+          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+                           WARPEDPIXEL_PREC_SHIFTS;
+          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+          const int16_t *coeffs = warped_filter[offs];
+
+          int32_t sum = 1 << offset_bits_vert;
+
+          for (m = 0; m < 8; ++m) {
+            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+          }
+          // Round, then subtract the scaled offsets that seeded both sums.
+          sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                (1 << (offset_bits_horiz + FILTER_BITS - conv_params->round_0 -
+                       conv_params->round_1)) -
+                (1 << (offset_bits_vert - conv_params->round_1));
+          CONV_BUF_TYPE *p =
+              &conv_params->dst[(i - p_row + k + 4) * conv_params->dst_stride +
+                                (j - p_col + l + 4)];
+          *p += sum;  // accumulate into the convolve buffer
+          sy += gamma;
+        }
+      }
+    }
+  }
+}
+#endif  // CONFIG_CONVOLVE_ROUND
+
 static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
                        int width, int height, int stride, uint8_t *pred,
                        int p_col, int p_row, int p_width, int p_height,
                        int p_stride, int subsampling_x, int subsampling_y,
-                       int x_scale, int y_scale, int comp_avg) {
+                       int x_scale, int y_scale, ConvolveParams *conv_params) {
   if (wm->wmtype == ROTZOOM) {
     wm->wmmat[5] = wm->wmmat[2];
     wm->wmmat[4] = -wm->wmmat[3];
   }
-  if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 &&
-      y_scale == 16) {
+  if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) &&
+      x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) {
     const int32_t *const mat = wm->wmmat;
     const int16_t alpha = wm->alpha;
     const int16_t beta = wm->beta;
     const int16_t gamma = wm->gamma;
     const int16_t delta = wm->delta;
 
+#if CONFIG_CONVOLVE_ROUND
+    if (conv_params->round == CONVOLVE_OPT_NO_ROUND) {
+      conv_params->do_post_rounding = 1;
+      av1_warp_affine_post_round(mat, ref, width, height, stride, pred, p_col,
+                                 p_row, p_width, p_height, p_stride,
+                                 subsampling_x, subsampling_y, conv_params,
+                                 alpha, beta, gamma, delta);
+    } else {
+      av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
+                      p_width, p_height, p_stride, subsampling_x, subsampling_y,
+                      conv_params, alpha, beta, gamma, delta);
+    }
+#else
     av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                     p_width, p_height, p_stride, subsampling_x, subsampling_y,
-                    comp_avg, alpha, beta, gamma, delta);
+                    conv_params, alpha, beta, gamma, delta);
+#endif
   } else {
     warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
                    p_height, p_stride, subsampling_x, subsampling_y, x_scale,
-                   y_scale, comp_avg);
+                   y_scale, conv_params);
   }
 }
 
 static int64_t frame_error(const uint8_t *const ref, int stride,
-                           const uint8_t *const dst, int p_col, int p_row,
-                           int p_width, int p_height, int p_stride) {
+                           const uint8_t *const dst, int p_width, int p_height,
+                           int p_stride) {
   int64_t sum_error = 0;
   for (int i = 0; i < p_height; ++i) {
     for (int j = 0; j < p_width; ++j) {
-      sum_error += (int64_t)error_measure(
-          dst[j + i * p_stride] - ref[(j + p_col) + (i + p_row) * stride]);
+      sum_error +=
+          (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
     }
   }
   return sum_error;
@@ -1385,19 +1632,29 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
                           const uint8_t *const dst, int p_col, int p_row,
                           int p_width, int p_height, int p_stride,
                           int subsampling_x, int subsampling_y, int x_scale,
-                          int y_scale) {
+                          int y_scale, int64_t best_error) {
   int64_t gm_sumerr = 0;
-  uint8_t *tmp = aom_malloc(p_width * p_height);
-  if (!tmp) return INT64_MAX;
-
-  warp_plane(wm, ref, width, height, stride, tmp, p_col, p_row, p_width,
-             p_height, p_width, subsampling_x, subsampling_y, x_scale, y_scale,
-             0);
-
-  gm_sumerr =
-      frame_error(tmp, p_width, dst, p_col, p_row, p_width, p_height, p_stride);
-
-  aom_free(tmp);
+  int warp_w, warp_h;
+  int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+  int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+  uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
+  ConvolveParams conv_params = get_conv_params(0, 0, 0);
+
+  for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+    for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+      // avoid warping extra 8x8 blocks in the padded region of the frame
+      // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
+      warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
+      warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
+      warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
+                 WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale,
+                 y_scale, &conv_params);
+
+      gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
+                               warp_w, warp_h, p_stride);
+      if (gm_sumerr > best_error) return gm_sumerr;
+    }
+  }
   return gm_sumerr;
 }
 
@@ -1405,17 +1662,16 @@ int64_t av1_frame_error(
 #if CONFIG_HIGHBITDEPTH
     int use_hbd, int bd,
 #endif  // CONFIG_HIGHBITDEPTH
-    const uint8_t *ref, int stride, uint8_t *dst, int p_col, int p_row,
-    int p_width, int p_height, int p_stride) {
+    const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height,
+    int p_stride) {
 #if CONFIG_HIGHBITDEPTH
   if (use_hbd) {
     return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
-                              CONVERT_TO_SHORTPTR(dst), p_col, p_row, p_width,
-                              p_height, p_stride, bd);
+                              CONVERT_TO_SHORTPTR(dst), p_width, p_height,
+                              p_stride, bd);
   }
 #endif  // CONFIG_HIGHBITDEPTH
-  return frame_error(ref, stride, dst, p_col, p_row, p_width, p_height,
-                     p_stride);
+  return frame_error(ref, stride, dst, p_width, p_height, p_stride);
 }
 
 int64_t av1_warp_error(WarpedMotionParams *wm,
@@ -1425,18 +1681,19 @@ int64_t av1_warp_error(WarpedMotionParams *wm,
                        const uint8_t *ref, int width, int height, int stride,
                        uint8_t *dst, int p_col, int p_row, int p_width,
                        int p_height, int p_stride, int subsampling_x,
-                       int subsampling_y, int x_scale, int y_scale) {
+                       int subsampling_y, int x_scale, int y_scale,
+                       int64_t best_error) {
   if (wm->wmtype <= AFFINE)
     if (!get_shear_params(wm)) return 1;
 #if CONFIG_HIGHBITDEPTH
   if (use_hbd)
     return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
                              p_width, p_height, p_stride, subsampling_x,
-                             subsampling_y, x_scale, y_scale, bd);
+                             subsampling_y, x_scale, y_scale, bd, best_error);
 #endif  // CONFIG_HIGHBITDEPTH
   return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
                     p_height, p_stride, subsampling_x, subsampling_y, x_scale,
-                    y_scale);
+                    y_scale, best_error);
 }
 
 void av1_warp_plane(WarpedMotionParams *wm,
@@ -1446,17 +1703,18 @@ void av1_warp_plane(WarpedMotionParams *wm,
                     const uint8_t *ref, int width, int height, int stride,
                     uint8_t *pred, int p_col, int p_row, int p_width,
                     int p_height, int p_stride, int subsampling_x,
-                    int subsampling_y, int x_scale, int y_scale, int comp_avg) {
+                    int subsampling_y, int x_scale, int y_scale,
+                    ConvolveParams *conv_params) {
 #if CONFIG_HIGHBITDEPTH
   if (use_hbd)
     highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
                       p_width, p_height, p_stride, subsampling_x, subsampling_y,
-                      x_scale, y_scale, bd, comp_avg);
+                      x_scale, y_scale, bd, conv_params);
   else
 #endif  // CONFIG_HIGHBITDEPTH
     warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
                p_height, p_stride, subsampling_x, subsampling_y, x_scale,
-               y_scale, comp_avg);
+               y_scale, conv_params);
 }
 
 #if CONFIG_WARPED_MOTION
@@ -1492,6 +1750,83 @@ void av1_warp_plane(WarpedMotionParams *wm,
 #define LS_PRODUCT2(a, b) \
   (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
 
+#define USE_LIMITED_PREC_MULT 0
+
+#if USE_LIMITED_PREC_MULT
+// resolve_multiplier_64() approximates D as mult << *shift, mult < 2^16.
+#define MUL_PREC_BITS 16
+static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
+  int msb = 0;
+  uint16_t mult = 0;
+  *shift = 0;
+  if (D != 0) {
+    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
+                              : get_msb((unsigned int)D));
+    if (msb >= MUL_PREC_BITS) {
+      *shift = (int16_t)(msb + 1 - MUL_PREC_BITS);
+      // Rounding can carry into bit 16; saturate so mult cannot wrap to 0.
+      mult = (uint16_t)AOMMIN(ROUND_POWER_OF_TWO_64(D, *shift), UINT16_MAX);
+    } else {
+      mult = (uint16_t)D;  // D already fits in 16 bits; *shift stays 0
+    }
+  }
+  return mult;
+}
+// Limited-precision (Px * iDet) >> shift, clamped to the non-diagonal range.
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+  int32_t ret;
+  int16_t mshift;
+  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+  shift -= mshift;  // Mul already dropped mshift low bits of |Px|
+  if (shift > 0) {
+    ret = (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
+                         -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                         WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  } else {
+    ret = (int32_t)clamp(v * (1 << (-shift)),
+                         -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                         WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  }
+  return ret;
+}
+// Limited-precision (Px * iDet) >> shift, clamped around 1 << PREC_BITS.
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+  int16_t mshift;
+  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+  shift -= mshift;  // Mul already dropped mshift low bits of |Px|
+  if (shift > 0) {
+    return (int32_t)clamp(
+        ROUND_POWER_OF_TWO_SIGNED(v, shift),
+        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  } else {
+    return (int32_t)clamp(
+        v * (1 << (-shift)),
+        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  }
+}
+
+#else
+// Rounded (Px * iDet) >> shift in 64 bits, clamped to the non-diagonal range.
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+  int64_t v = Px * (int64_t)iDet;
+  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
+                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+// Rounded (Px * iDet) >> shift in 64 bits, clamped around 1 << PREC_BITS.
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+  int64_t v = Px * (int64_t)iDet;
+  return (int32_t)clamp64(
+      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
+      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+#endif  // USE_LIMITED_PREC_MULT
+
 static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
                            int mvy, int mvx, WarpedMotionParams *wm, int mi_row,
                            int mi_col) {
@@ -1502,8 +1837,10 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
 
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
-  const int suy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1) * 8;
-  const int sux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1) * 8;
+  const int isuy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1);
+  const int isux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1);
+  const int suy = isuy * 8;
+  const int sux = isux * 8;
   const int duy = suy + mvy;
   const int dux = sux + mvx;
 
@@ -1590,61 +1927,39 @@ static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
     shift = 0;
   }
 
-  int64_t v;
-  v = Px[0] * (int64_t)iDet;
-  wm->wmmat[2] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
-  v = Px[1] * (int64_t)iDet;
-  wm->wmmat[3] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
-  v = ((int64_t)dux * (1 << WARPEDMODEL_PREC_BITS)) -
-      (int64_t)sux * wm->wmmat[2] - (int64_t)suy * wm->wmmat[3];
-  wm->wmmat[0] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3));
-
-  v = Py[0] * (int64_t)iDet;
-  wm->wmmat[4] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
-  v = Py[1] * (int64_t)iDet;
-  wm->wmmat[5] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED_64(v, shift));
-  v = ((int64_t)duy * (1 << WARPEDMODEL_PREC_BITS)) -
-      (int64_t)sux * wm->wmmat[4] - (int64_t)suy * wm->wmmat[5];
-  wm->wmmat[1] = (int32_t)(ROUND_POWER_OF_TWO_SIGNED(v, 3));
+  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
+  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
+  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
+  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
+
+  // Note: In the vx, vy expressions below, the max value of each of the
+  // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room
+  // for the first term so that the sum, even in the worst case, fits
+  // within 32 bits.
+  int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+               (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
+                isuy * wm->wmmat[3]);
+  int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+               (isux * wm->wmmat[4] +
+                isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
+  wm->wmmat[0] =
+      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+  wm->wmmat[1] =
+      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
 
   wm->wmmat[6] = wm->wmmat[7] = 0;
-
-  // Clamp values
-  wm->wmmat[0] = clamp(wm->wmmat[0], -WARPEDMODEL_TRANS_CLAMP,
-                       WARPEDMODEL_TRANS_CLAMP - 1);
-  wm->wmmat[1] = clamp(wm->wmmat[1], -WARPEDMODEL_TRANS_CLAMP,
-                       WARPEDMODEL_TRANS_CLAMP - 1);
-  wm->wmmat[2] = clamp(wm->wmmat[2], -WARPEDMODEL_DIAGAFFINE_CLAMP,
-                       WARPEDMODEL_DIAGAFFINE_CLAMP - 1);
-  wm->wmmat[5] = clamp(wm->wmmat[5], -WARPEDMODEL_DIAGAFFINE_CLAMP,
-                       WARPEDMODEL_DIAGAFFINE_CLAMP - 1);
-  wm->wmmat[3] = clamp(wm->wmmat[3], -WARPEDMODEL_NONDIAGAFFINE_CLAMP,
-                       WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
-  wm->wmmat[4] = clamp(wm->wmmat[4], -WARPEDMODEL_NONDIAGAFFINE_CLAMP,
-                       WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
   return 0;
 }
 
 int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
                     int mvx, WarpedMotionParams *wm_params, int mi_row,
                     int mi_col) {
-  int result = 1;
-  switch (wm_params->wmtype) {
-    case AFFINE:
-      result = find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params,
-                               mi_row, mi_col);
-      break;
-    default: assert(0 && "Invalid warped motion type!"); return 1;
-  }
+  assert(wm_params->wmtype == AFFINE);
+  const int result = find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params,
+                                     mi_row, mi_col);
   if (result == 0) {
-    if (wm_params->wmtype == ROTZOOM) {
-      wm_params->wmmat[5] = wm_params->wmmat[2];
-      wm_params->wmmat[4] = -wm_params->wmmat[3];
-    }
-    if (wm_params->wmtype == AFFINE || wm_params->wmtype == ROTZOOM) {
-      // check compatibility with the fast warp filter
-      if (!get_shear_params(wm_params)) return 1;
-    }
+    // check compatibility with the fast warp filter
+    if (!get_shear_params(wm_params)) return 1;
   }
 
   return result;